mirror of
https://github.com/pkivolowitz/asm_book.git
synced 2026-06-21 02:26:59 +08:00
big refactoring and sprucing up of programs and text
This commit is contained in:
parent
8b00b33c75
commit
5fc9422143
8 changed files with 295 additions and 210 deletions
|
|
@ -1,4 +1,6 @@
|
|||
.global main
|
||||
#include "apple-linux-convergence.S"
|
||||
|
||||
GLABEL main
|
||||
.text
|
||||
.align 2
|
||||
|
||||
|
|
@ -7,77 +9,95 @@ dmore .req d21
|
|||
ndless .req d22
|
||||
ndmore .req d23
|
||||
|
||||
Emit: str x30, [sp, -16]!
|
||||
bl printf
|
||||
ldr x30, [sp], 16
|
||||
Emit:
|
||||
START_PROC
|
||||
PUSH_P x29, x30
|
||||
mov x29, sp
|
||||
#if defined(__APPLE__)
|
||||
PUSH_P x1, x2
|
||||
CRT printf
|
||||
add sp, sp, 16
|
||||
#else
|
||||
CRT printf
|
||||
#endif
|
||||
POP_P x29, x30
|
||||
ret
|
||||
END_PROC
|
||||
|
||||
main: str x30, [sp, -16]!
|
||||
stp dless, dmore, [sp, -16]!
|
||||
stp ndless, ndmore, [sp, -16]!
|
||||
MAIN
|
||||
START_PROC
|
||||
PUSH_P x29, x30
|
||||
stp dless, dmore, [sp, -16]!
|
||||
stp ndless, ndmore, [sp, -16]!
|
||||
mov x29, sp
|
||||
|
||||
ldr x0, =vless
|
||||
ldr dless, [x0]
|
||||
ldr dmore, [x0, 8]
|
||||
ldr ndless, [x0, 16]
|
||||
ldr ndmore, [x0, 24]
|
||||
LLD_ADDR x0, leg
|
||||
CRT printf
|
||||
|
||||
//-fcvtps----------------------
|
||||
fcvtps x1, dless
|
||||
fcvtps x2, dmore
|
||||
ldr x0, =fmt1
|
||||
bl Emit
|
||||
LLD_ADDR x0, vless
|
||||
ldr dless, [x0]
|
||||
ldr dmore, [x0, 8]
|
||||
ldr ndless, [x0, 16]
|
||||
ldr ndmore, [x0, 24]
|
||||
|
||||
fcvtps x1, ndless
|
||||
fcvtps x2, ndmore
|
||||
ldr x0, =fmt1
|
||||
bl Emit
|
||||
//-fcvtms-----------------------
|
||||
fcvtms x1, dless
|
||||
fcvtms x2, dmore
|
||||
ldr x0, =fmt2
|
||||
bl Emit
|
||||
//-fcvtps- Floating-point Convert to Signed integer, rounding toward Plus infinity
|
||||
fcvtps x1, dless
|
||||
fcvtps x2, dmore
|
||||
LLD_ADDR x0, fmt1
|
||||
bl Emit
|
||||
|
||||
fcvtms x1, ndless
|
||||
fcvtms x2, ndmore
|
||||
ldr x0, =fmt2
|
||||
bl Emit
|
||||
//-fcvtzs-----------------------
|
||||
fcvtzs x1, dless
|
||||
fcvtzs x2, dmore
|
||||
ldr x0, =fmt4
|
||||
bl Emit
|
||||
fcvtps x1, ndless
|
||||
fcvtps x2, ndmore
|
||||
LLD_ADDR x0, fmt1
|
||||
bl Emit
|
||||
//-fcvtns- Floating-point Convert to Signed integer, rounding to nearest with ties to even (scalar).
|
||||
fcvtns x1, dless
|
||||
fcvtns x2, dmore
|
||||
LLD_ADDR x0, fmt2
|
||||
bl Emit
|
||||
|
||||
fcvtzs x1, ndless
|
||||
fcvtzs x2, ndmore
|
||||
ldr x0, =fmt4
|
||||
bl Emit
|
||||
//-fcvtas-----------------------
|
||||
fcvtas x1, dless
|
||||
fcvtas x2, dmore
|
||||
ldr x0, =fmt3
|
||||
bl Emit
|
||||
fcvtns x1, ndless
|
||||
fcvtns x2, ndmore
|
||||
LLD_ADDR x0, fmt2
|
||||
bl Emit
|
||||
//-fcvtzs- Floating-point Convert to Signed integer, rounding toward Zero (scalar).
|
||||
fcvtzs x1, dless
|
||||
fcvtzs x2, dmore
|
||||
LLD_ADDR x0, fmt4
|
||||
bl Emit
|
||||
|
||||
fcvtas x1, ndless
|
||||
fcvtas x2, ndmore
|
||||
ldr x0, =fmt3
|
||||
bl Emit
|
||||
fcvtzs x1, ndless
|
||||
fcvtzs x2, ndmore
|
||||
LLD_ADDR x0, fmt4
|
||||
bl Emit
|
||||
//-fcvtas- Floating-point Convert to Signed integer, rounding to nearest with ties to Away (scalar).
|
||||
fcvtas x1, dless
|
||||
fcvtas x2, dmore
|
||||
LLD_ADDR x0, fmt3
|
||||
bl Emit
|
||||
|
||||
fcvtas x1, ndless
|
||||
fcvtas x2, ndmore
|
||||
LLD_ADDR x0, fmt3
|
||||
bl Emit
|
||||
//------------------------------
|
||||
|
||||
ldp ndless, ndmore, [sp], 16
|
||||
ldp dless, dmore, [sp], 16
|
||||
ldr x30, [sp], 16
|
||||
mov w0, wzr
|
||||
ldp ndless, ndmore, [sp], 16
|
||||
ldp dless, dmore, [sp], 16
|
||||
POP_P x29, x30
|
||||
mov w0, wzr
|
||||
ret
|
||||
END_PROC
|
||||
|
||||
.section .rodata
|
||||
.data
|
||||
vless: .double 5.49
|
||||
vmore: .double 5.51
|
||||
nvless: .double -5.49
|
||||
nvmore: .double -5.51
|
||||
fmt1: .asciz "fcvtps less: %d more: %d\n"
|
||||
fmt2: .asciz "fcvtms less: %d more: %d\n"
|
||||
fmt2: .asciz "fcvtns less: %d more: %d\n"
|
||||
fmt3: .asciz "fcvtta less: %d more: %d\n"
|
||||
fmt4: .asciz "fcvtzs less: %d more: %d\n"
|
||||
leg: .asciz "less values are +/- 5.49. more values are +/- 5.51.\n"
|
||||
|
||||
.end
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
/* Perry Kivolowitz
|
||||
Professor and Chair of Computer Science
|
||||
Carthage College
|
||||
A Gentle Introduction to Assembly Language
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
|
@ -12,24 +11,28 @@
|
|||
|
||||
using namespace std;
|
||||
|
||||
const int BIASD = 1023;
|
||||
const int BIASF = 127;
|
||||
const int BIASD = 1023; // biasing value for double exponents
|
||||
const int BIASF = 127; // biasing value for floats
|
||||
|
||||
const int FRAC_SIZD = 52;
|
||||
const int FRAC_SIZF = 23;
|
||||
// The mantissa controls precision.
|
||||
|
||||
const int EXPO_SIZD = 11;
|
||||
const int EXPO_SIZF = 8;
|
||||
const int FRAC_SIZD = 52; // number of bits in double's mantissa
|
||||
const int FRAC_SIZF = 23; // number of bits in float's mantissa
|
||||
|
||||
// The exponent controls range.
|
||||
|
||||
const int EXPO_SIZD = 11; // number of bits in a double's exponent
|
||||
const int EXPO_SIZF = 8; // number of bits in a float's exponent
|
||||
|
||||
const int SIGN_SIZE = 1;
|
||||
|
||||
struct SP {
|
||||
struct SP { // construction of a float
|
||||
unsigned int frac : FRAC_SIZF;
|
||||
unsigned int expo : EXPO_SIZF;
|
||||
unsigned int sign : SIGN_SIZE;
|
||||
};
|
||||
|
||||
struct DP {
|
||||
struct DP { // construction of a double
|
||||
unsigned long frac : FRAC_SIZD;
|
||||
unsigned long expo : EXPO_SIZD;
|
||||
unsigned long sign : SIGN_SIZE;
|
||||
|
|
@ -61,7 +64,9 @@ template<class T>
|
|||
string MakeEquation(T & u, int bias) {
|
||||
stringstream ss;
|
||||
bool is_double = (bias == BIASD);
|
||||
ss << (u.sign ? "-" : "") << dec << setprecision(11) << 1.0 + DeBinary(is_double, u.frac) << " x 2^" << (u.expo - bias);
|
||||
ss << (u.sign ? "-" : "") << dec << setprecision(11);
|
||||
ss << 1.0 + DeBinary(is_double, u.frac);
|
||||
ss << " x 2^" << (u.expo - bias);
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
|
|
@ -69,22 +74,26 @@ int main(int argc, char ** argv) {
|
|||
Double d;
|
||||
Single f;
|
||||
|
||||
const int fore_space = 20;
|
||||
const int field_space = 25;
|
||||
const int fore_space = 18;
|
||||
const int field_space = 20;
|
||||
|
||||
if (argc < 2) {
|
||||
cerr << "Must supply a floating point value as a command line argument.\n";
|
||||
cerr << "Requires a floating point value on command line .\n";
|
||||
return 1;
|
||||
}
|
||||
d.d = atof(argv[1]);
|
||||
f.f = float(d.d);
|
||||
|
||||
cout << left << setw(fore_space) << "Component" << left << setw(25) << "Double";
|
||||
cout << left << setw(field_space) << "Float" << "Comment" << endl;
|
||||
cout << left << setw(fore_space) << "Component" << left;
|
||||
cout << setw(field_space);
|
||||
cout << "Double" << left << setw(field_space) << "Float";
|
||||
cout << "Comment" << endl;
|
||||
|
||||
cout << left << setw(fore_space) << "Value:" << setw(25) << setprecision(10) << d.d;
|
||||
cout << left << setw(fore_space) << "Value:" << setw(field_space);
|
||||
cout << setprecision(10) << d.d;
|
||||
cout << setw(field_space) << setprecision(10) << f.f;
|
||||
cout << "Delta(F - D): " << setw(16) << setprecision(10) << f.f - d.d << endl;
|
||||
cout << "Delta(F - D): " << setw(16) << setprecision(10);
|
||||
cout << f.f - d.d << endl;
|
||||
|
||||
cout << left << setw(fore_space) << "Sign:";
|
||||
cout << setw(field_space) << (bool)d.D.sign;
|
||||
|
|
@ -107,28 +116,38 @@ int main(int argc, char ** argv) {
|
|||
cout << endl;
|
||||
|
||||
cout << setw(fore_space) << "Halves:";
|
||||
cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 1)) & 1);
|
||||
cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 1)) & 1);
|
||||
cout << setw(field_space) << hex;
|
||||
cout << ((d.D.frac >> (FRAC_SIZD - 1)) & 1);
|
||||
cout << setw(field_space) << hex;
|
||||
cout << ((f.F.frac >> (FRAC_SIZF - 1)) & 1);
|
||||
cout << endl;
|
||||
|
||||
cout << setw(fore_space) << "Quarters:";
|
||||
cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 2)) & 1);
|
||||
cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 2)) & 1);
|
||||
cout << setw(field_space) << hex;
|
||||
cout << ((d.D.frac >> (FRAC_SIZD - 2)) & 1);
|
||||
cout << setw(field_space) << hex;
|
||||
cout << ((f.F.frac >> (FRAC_SIZF - 2)) & 1);
|
||||
cout << endl;
|
||||
|
||||
cout << setw(fore_space) << "Eighths:";
|
||||
cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 3)) & 1);
|
||||
cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 3)) & 1);
|
||||
cout << setw(field_space) << hex;
|
||||
cout << ((d.D.frac >> (FRAC_SIZD - 3)) & 1);
|
||||
cout << setw(field_space) << hex;
|
||||
cout << ((f.F.frac >> (FRAC_SIZF - 3)) & 1);
|
||||
cout << endl;
|
||||
|
||||
cout << setw(fore_space) << "Sixteenths:";
|
||||
cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 4)) & 1);
|
||||
cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 4)) & 1);
|
||||
cout << setw(field_space) << hex;
|
||||
cout << ((d.D.frac >> (FRAC_SIZD - 4)) & 1);
|
||||
cout << setw(field_space) << hex;
|
||||
cout << ((f.F.frac >> (FRAC_SIZF - 4)) & 1);
|
||||
cout << endl;
|
||||
|
||||
cout << setw(fore_space) << "Thirty seconds:";
|
||||
cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 5)) & 1);
|
||||
cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 5)) & 1);
|
||||
cout << setw(field_space) << hex;
|
||||
cout << ((d.D.frac >> (FRAC_SIZD - 5)) & 1);
|
||||
cout << setw(field_space) << hex;
|
||||
cout << ((f.F.frac >> (FRAC_SIZF - 5)) & 1);
|
||||
cout << endl;
|
||||
|
||||
cout << setw(fore_space) << "Full fraction:";
|
||||
|
|
|
|||
|
|
@ -10,8 +10,8 @@ themselves, using floating point literals is extremely constrained.
|
|||
For example:
|
||||
|
||||
```asm
|
||||
fmov d0, 1 // 1
|
||||
fmov d0, 1.1 // 2
|
||||
fmov d0, 1 // 1
|
||||
fmov d0, 1.1 // 2
|
||||
```
|
||||
|
||||
`Line 1` will pass muster but `Line 2` will cause an error.
|
||||
|
|
@ -150,13 +150,13 @@ what that magic is.
|
|||
Build the program with the `-g` option to enable debugging using GDB.
|
||||
|
||||
```text
|
||||
$ gcc -g t.s
|
||||
% gcc -g t.s
|
||||
```
|
||||
|
||||
Then launch GDB on the executable:
|
||||
|
||||
```text
|
||||
$ gdb a.out
|
||||
% gdb a.out
|
||||
```
|
||||
|
||||
Set a breakpoint on line 6.
|
||||
|
|
|
|||
|
|
@ -1,38 +1,64 @@
|
|||
.global main
|
||||
#include "apple-linux-convergence.S"
|
||||
|
||||
GLABEL main
|
||||
.text
|
||||
.align 2
|
||||
.p2align 2
|
||||
|
||||
counter .req x20
|
||||
dptr .req x21
|
||||
fptr .req x22
|
||||
.equ max, 4
|
||||
|
||||
main: stp counter, x30, [sp, -16]!
|
||||
stp dptr, fptr, [sp, -16]!
|
||||
ldr dptr, =d
|
||||
ldr fptr, =f
|
||||
mov counter, xzr
|
||||
MAIN
|
||||
START_PROC
|
||||
PUSH_P counter, x30
|
||||
PUSH_P dptr, fptr
|
||||
PUSH_R x29
|
||||
mov x29, sp
|
||||
|
||||
1: cmp counter, max
|
||||
beq 2f
|
||||
LLD_ADDR dptr, d
|
||||
LLD_ADDR fptr, f
|
||||
mov counter, xzr
|
||||
|
||||
ldr d0, [dptr, counter, lsl 3]
|
||||
ldr s1, [fptr, counter, lsl 2]
|
||||
fcvt d1, s1
|
||||
ldr x0, =fmt
|
||||
add counter, counter, 1
|
||||
mov x1, counter
|
||||
bl printf
|
||||
b 1b
|
||||
1: cmp counter, max
|
||||
beq 2f
|
||||
ldr d0, [dptr, counter, lsl 3]
|
||||
ldr s1, [fptr, counter, lsl 2]
|
||||
fcvt d1, s1
|
||||
LLD_ADDR x0, fmt
|
||||
add counter, counter, 1
|
||||
mov x1, counter
|
||||
#if defined(__APPLE__)
|
||||
/*
|
||||
Give us some stack space. Then read the printf template
|
||||
string right to left. Variadics on the Mac are difficult
|
||||
to get right. Remember that printf never prints floats.
|
||||
Only doubles. Internally, floats are converted to double.
|
||||
See the fcvt instruction above.
|
||||
*/
|
||||
sub sp, sp, 32
|
||||
str d1, [sp, 16]
|
||||
str d0, [sp, 8]
|
||||
str x1, [sp]
|
||||
CRT printf
|
||||
add sp, sp, 32
|
||||
#else
|
||||
CRT printf
|
||||
#endif
|
||||
b 1b
|
||||
|
||||
2: ldp dptr, fptr, [sp], 16
|
||||
ldp counter, x30, [sp], 16
|
||||
mov w0, wzr
|
||||
2: POP_R x29
|
||||
POP_P dptr, fptr
|
||||
POP_P counter, x30
|
||||
mov w0, wzr
|
||||
END_PROC
|
||||
ret
|
||||
|
||||
.data
|
||||
fmt: .asciz "%d %f %f\n"
|
||||
d: .double 1.111111, 2.222222, 3.333333, 4.444444
|
||||
f: .float 1.111111, 2.222222, 3.333333, 4.444444
|
||||
fmt: .asciz "index %ld double %f float %f\n"
|
||||
.p2align 3
|
||||
d: .double 1.555555, 2.666666, 3.777777, 4.888888
|
||||
.p2align 2
|
||||
f: .float 1.111111, 2.222222, 3.333333, 4.444444
|
||||
|
||||
.end
|
||||
|
|
|
|||
|
|
@ -13,9 +13,11 @@ For this chapter, I will use:
|
|||
* Rounding means picking some fractional value and if the float's
|
||||
fraction is higher, you go one way and if lower, you go the other.
|
||||
|
||||
* Truncation means you don't look too closely at the fractional value.
|
||||
Instead, you just eliminate the fractional part and slam the whole
|
||||
number ... one way or the other.
|
||||
* Truncation means you don't care about the fractional value. You just
|
||||
eliminate the fractional part and slam the whole number ... one way or
|
||||
the other.
|
||||
|
||||
"One way or the other" is defined next.
|
||||
|
||||
## Truncation Towards Zero
|
||||
|
||||
|
|
@ -30,8 +32,8 @@ Diving a little deeper, there is a choice to be made as to whether or
|
|||
not `integer_variable` is signed or unsigned. And, whether or not
|
||||
`integer_variable` is a 32 bit or 64 bit value.
|
||||
|
||||
The instruction is `fcvtz` - convert towards zero. Then, the choice
|
||||
as to whether to produce a signed or unsigned result is defined by the
|
||||
The instruction is `fcvtz` - convert towards zero. Then, the choice as
|
||||
to whether to produce a signed or unsigned result is defined by the
|
||||
final letter: `u` or `s`.
|
||||
|
||||
| Mnemonic | Meaning |
|
||||
|
|
@ -41,7 +43,7 @@ final letterL `u` or `s`.
|
|||
|
||||
As an example of how the ARM documentation is confusing - this
|
||||
instruction which completely discards the fractional value is said by
|
||||
the ARM documentation as doing rounding.
|
||||
the ARM documentation to be doing rounding, not truncating.
|
||||
|
||||
The choice of source register defines whether you are converting
|
||||
a double or single precision floating point value.
|
||||
|
|
@ -60,14 +62,18 @@ Examples where `d` is a `double` and `f` is a `float`:
|
|||
|
||||
| C++ | Instruction |
|
||||
| --- | ----------- |
|
||||
| `int32_t(d)` | `fcvtzs w0, d0` |
|
||||
| `uint32_t(d)` | `fcvtzu w0, d0` |
|
||||
| `int64_t(d)` | `fcvtzs x0, d0` |
|
||||
| `uint64_t(d)` | `fcvtzu x0, d0` |
|
||||
| `int32_t(d)` | `fcvtzs w0, d0` |
|
||||
| `uint32_t(d)` | `fcvtzu w0, d0` |
|
||||
| `int64_t(d)` | `fcvtzs x0, d0` |
|
||||
| `uint64_t(d)` | `fcvtzu x0, d0` |
|
||||
|
||||
[Here](./asm_rounding.s) is a program which demonstrates various
|
||||
[Here](./asm_rounding.S) is a program which demonstrates various
|
||||
ways of converting doubles to integers.
|
||||
|
||||
Note: This source code has been updated using the author's
|
||||
Apple / Linux Convergence macros and can be built on both Apple Mac OS
|
||||
and Linux ARM systems.
|
||||
|
||||
Let's look at:
|
||||
|
||||
```text
|
||||
|
|
@ -105,23 +111,23 @@ Notice all the values were truncated to the whole number that is
|
|||
Truncation away from zero is not as easy. In fact, it cannot be
|
||||
performed with a single instruction.
|
||||
|
||||
In C and C++:
|
||||
In C (and C++):
|
||||
|
||||
```c
|
||||
iv = (int(fv) == fv) ? int(fv) : int(fv) + ((fv < 0) ? -1 : 1);
|
||||
```
|
||||
|
||||
If the `fv` is already equal to a whole number, the
|
||||
integer value will be that whole number. Other wise the `iv` is
|
||||
the whole number further *away from zero*.
|
||||
If the `fv` is already equal to a whole number, the integer value will
|
||||
be that whole number. Otherwise the `iv` is the whole number further
|
||||
*away from zero*.
|
||||
|
||||
In C++, a more sophisticated version would require `<cmath>` and
|
||||
could look like:
|
||||
In C++, a more sophisticated version would require `<cmath>` and could
|
||||
look like:
|
||||
|
||||
```c++
|
||||
template <typename T>
|
||||
int MyTruncate(T x) {
|
||||
return int((x < 0) ? floor(x) : ceil(x));
|
||||
return int((x < 0) ? floor(x) : ceil(x));
|
||||
}
|
||||
```
|
||||
|
||||
|
|
@ -136,15 +142,15 @@ given above.
|
|||
|
||||
```asm
|
||||
RoundAwayFromZero:
|
||||
fcmp d0, 0
|
||||
ble 1f
|
||||
// Value is positive, truncate towards positive infinity (ceil)
|
||||
frintp d0, d0
|
||||
b 2f
|
||||
1: // Value is negative, truncate towards negative infinity (floor)
|
||||
frintm d0, d0
|
||||
2: fcvtzs x0, d0
|
||||
ret
|
||||
fcmp d0, 0
|
||||
ble 1f
|
||||
// Value is positive, truncate towards positive infinity (ceil)
|
||||
frintp d0, d0
|
||||
b 2f
|
||||
1: // Value is negative, truncate towards negative infinity (floor)
|
||||
frintm d0, d0
|
||||
2: fcvtzs x0, d0
|
||||
ret
|
||||
```
|
||||
|
||||
`frintp` and `frintm` will honor the source register already being
|
||||
|
|
|
|||
|
|
@ -1,5 +1,19 @@
|
|||
#include <cinttypes>
|
||||
#include <stdio.h>
|
||||
|
||||
#define MAX 4
|
||||
double d[4] = { 1.555555, 2.666666, 3.777777, 4.888888 };
|
||||
float f[4] = { 1.111111, 2.222222, 3.333333, 4.444444 };
|
||||
|
||||
int main() {
|
||||
for (long counter = 0; counter < MAX; counter++) {
|
||||
printf("index %ld double %f float %f\n", counter, d[counter], f[counter]);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
extern "C" uint32_t T1(double d) {
|
||||
return uint32_t(d);
|
||||
}
|
||||
|
|
@ -19,3 +33,4 @@ extern "C" int32_t T4(float f) {
|
|||
extern "C" uint64_t T5(double d) {
|
||||
return uint64_t(d);
|
||||
}
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
# Section 2 / What Are Floating Point Numbers?
|
||||
|
||||
Before we introduce floating point instructions in the AARCH64 ISA, it is
|
||||
worth going over exactly what a floating point value is. Integers are easy.
|
||||
They're just powers of two summed together with a single bit at one end
|
||||
determining the sign (if the integer is signed).
|
||||
Before we introduce floating point instructions in the AARCH64 ISA, it
|
||||
is worth going over exactly what a floating point value is. Integers are
|
||||
easy. They're just powers of two summed together with a single bit at
|
||||
one end determining the sign (if the integer is signed).
|
||||
|
||||
But what are floating numbers?
|
||||
|
||||
|
|
@ -45,41 +45,42 @@ Full fraction: 0 0
|
|||
Equation: 1 x 2^0 1 x 2^0
|
||||
```
|
||||
|
||||
On the line marked "Value" you can see the values represented as double precision
|
||||
and as single precision. Under "Comment" you can see that there
|
||||
is no difference between the double and the single precision numbers. Remember
|
||||
the key thing about floating point numbers: they are approximations. Sometimes,
|
||||
as in the case of whole numbers like 1, the approximation is exact. When there
|
||||
is a difference, the difference will be small and printed in the Comment
|
||||
column.
|
||||
On the line marked "Value" you can see the values represented as double
|
||||
precision and as single precision. Under "Comment" you can see that
|
||||
there is no difference between the double and the single precision
|
||||
numbers. Remember the key thing about floating point numbers: they are
|
||||
approximations. Sometimes, as in the case of whole numbers like 1, the
|
||||
approximation is exact. When there is a difference, the difference will
|
||||
be small and printed in the Comment column.
|
||||
|
||||
The Sign field is 0. This indicates that the whole floating point value is positive.
|
||||
There are no other sign values including in the exponent. However, exponents can
|
||||
be negative... this is explained next.
|
||||
The Sign field is 0. This indicates that the whole floating point value
|
||||
is positive. There are no other sign bits, not even in the exponent.
|
||||
However, exponents can be negative... this is explained next.
|
||||
|
||||
First, notice that the double precision exponent is 11 bits wide while the single
|
||||
precision exponent is only 8 bits wide. Next, notice the values... 1023 and 127
|
||||
respectively. The value of 1 is 1 raised to the power of 0 base 2. So why 1023
|
||||
or 127?
|
||||
First, notice that the double precision exponent is 11 bits wide while
|
||||
the single precision exponent is only 8 bits wide. Next, notice the
|
||||
values... 1023 and 127 respectively. The value of 1 is 1 raised to the
|
||||
power of 0 base 2. So why 1023 or 127?
|
||||
|
||||
There is no sign bit for the exponent yet the exponent must support negative numbers.
|
||||
It does this by incorporating an offset of 1023 and 127 respectively (both representing
|
||||
0). Anything above 1023 and 127 are positive exponents. Anything below these values
|
||||
are negative exponents.
|
||||
There is no sign bit for the exponent yet the exponent must support
|
||||
negative numbers. It does this by incorporating an offset of 1023 and
|
||||
127 respectively (both representing 0). Anything above 1023 and 127 are
|
||||
positive exponents. Anything below these values are negative exponents.
|
||||
|
||||
The De-biased line are the values of the exponent with their bias removed.
|
||||
Notice they work out to 0. So, the value of 1 is represented by 1 raised to the power of 0.
|
||||
The De-biased line shows the values of the exponent with their bias
|
||||
removed. Notice they work out to 0. So, the value of 1 is represented by
|
||||
1 raised to the power of 0.
|
||||
|
||||
The Fraction has a value of zero. Where's the 1 that we've been talking about get stored?
|
||||
It isn't. A value of 1 is always assumed to be the only value in front of the decimal place
|
||||
in a `float` or `double`. Every floating point value is 1 plus a fraction all raised to
|
||||
some power of 2.
|
||||
The Fraction has a value of zero. Where's the 1 that we've been talking
|
||||
about get stored? It isn't. A value of 1 is always assumed to be the
|
||||
only value in front of the decimal place in a `float` or `double`. Every
|
||||
floating point value is 1 plus a fraction all raised to some power of 2.
|
||||
|
||||
We thought we'd highlight a few of the bits in the fractional part of a floating point
|
||||
number. These can be illuminating when the value being shown is in the range of
|
||||
-2 < x < 2. Notice the the values of -2 and 2 are outside this range. In other words,
|
||||
showing the first few bits of the fraction are illuminating when the exponent works
|
||||
out to 0.
|
||||
We thought we'd highlight a few of the bits in the fractional part of a
|
||||
floating point number. These can be illuminating when the value being
|
||||
shown is in the range of -2 < x < 2. Notice that the values of -2 and 2
|
||||
are outside this range. In other words, showing the first few bits of
|
||||
the fraction is illuminating when the exponent works out to 0.
|
||||
|
||||
* Halves - There are no halves in the value of 1.
|
||||
|
||||
|
|
@ -91,11 +92,12 @@ out to 0.
|
|||
|
||||
* Thirty Seconds - There are no thirty seconds in the value of 1.
|
||||
|
||||
Of course, there are more fractional values to `float` and `doubles` but listing them all
|
||||
wouldn't be a fun tasks and we're all about fun. :)
|
||||
Of course, there are more fractional values to `float` and `doubles` but
|
||||
listing them all wouldn't be a fun task and we're all about fun. :)
|
||||
|
||||
Finally, the Equation line rebuilds the floating point value in its actual "scientific"
|
||||
notation. The value of 1 is a 1 raised to the zeroth power of 2.
|
||||
Finally, the Equation line rebuilds the floating point value in its
|
||||
actual "scientific" notation. The value of 1 is a 1 raised to the zeroth
|
||||
power of 2.
|
||||
|
||||
How about a value of 1.5?
|
||||
|
||||
|
|
@ -138,11 +140,10 @@ Equation: 1.875 x 2^0 1.875 x 2^0
|
|||
|
||||
How about 8.5?
|
||||
|
||||
This is the first time we are looking at
|
||||
a value which increases the (de-biased) exponent to non-zero.
|
||||
Things get a little more complicated. Now, there isn't an
|
||||
obvious mapping of the fraction bits to the final number they
|
||||
represent. This is the impact of the non-zero exponent.
|
||||
This is the first time we are looking at a value which increases the
|
||||
(de-biased) exponent to non-zero. Things get a little more complicated.
|
||||
Now, there isn't an obvious mapping of the fraction bits to the final
|
||||
number they represent. This is the impact of the non-zero exponent.
|
||||
|
||||
```text
|
||||
Component Double Float Comment
|
||||
|
|
@ -160,17 +161,16 @@ Full fraction: 0.0625 0.0625
|
|||
Equation: 1.0625 x 2^3 1.0625 x 2^3
|
||||
```
|
||||
|
||||
Even though there is a half in eight and a half, the Halves bit
|
||||
is 0. What is 8? Eight is a 2 raised to the power of 3. In
|
||||
other words, the bit for the half in 8.5 is shifted to the
|
||||
right by three bits. Confirm this by looking at the
|
||||
Sixteenths. *There's our bit!*
|
||||
Even though there is a half in eight and a half, the Halves bit is 0.
|
||||
What is 8? Eight is a 2 raised to the power of 3. In other words, the
|
||||
bit for the half in 8.5 is shifted to the right by three bits. Confirm
|
||||
this by looking at the Sixteenths. *There's our bit!*
|
||||
|
||||
Turn your attention to the Equation. 1.0625 multiplied by 8
|
||||
is 8.5. Cool huh?
|
||||
Turn your attention to the Equation. 1.0625 multiplied by 8 is 8.5. Cool
|
||||
huh?
|
||||
|
||||
How about something harder? Like 8.51 - just a teensy bit
|
||||
different from the previous example.
|
||||
How about something harder? Like 8.51 - just a teensy bit different from
|
||||
the previous example.
|
||||
|
||||
```text
|
||||
Component Double Float Comment
|
||||
|
|
@ -189,19 +189,19 @@ Equation: 1.06375 x 2^3 1.0637500286 x 2^3
|
|||
```
|
||||
|
||||
For the first time we're seeing that 8.51 cannot be perfectly
|
||||
represented by `float`. `double` gets it right. The difference
|
||||
between the `double` and `float` is the very small number shown
|
||||
on the first line of output.
|
||||
represented by `float`. `double` gets it right. The difference between
|
||||
the `double` and `float` is the very small number shown on the first
|
||||
line of output.
|
||||
|
||||
## When a Number is Not a Number and How About Infinity?
|
||||
|
||||
`NaN` is an actual value. It means `not a number`.
|
||||
|
||||
[Here](./floatster.cpp) is the source code to another program we
|
||||
have written that explores both `NaN` and `Inf`.
|
||||
[Here](./floatster.cpp) is the source code to another program we have
|
||||
written that explores both `NaN` and `Inf`.
|
||||
|
||||
Let's examine `NaN` which is produced when you do naughty things
|
||||
like take the square root of a negative number.
|
||||
Let's examine `NaN` which is produced when you do naughty things like
|
||||
take the square root of a negative number.
|
||||
|
||||
```text
|
||||
Enter a number (-100 causes divide by 0, -200 causes sqrt(-1): -200
|
||||
|
|
@ -213,8 +213,8 @@ NaN: 1
|
|||
Inf: 0
|
||||
```
|
||||
|
||||
`Nan` is true (for `float`) when its exponent is 0xFF and fraction
|
||||
is not zero.
|
||||
`NaN` is true (for `float`) when its exponent is 0xFF and fraction is
|
||||
not zero.
|
||||
|
||||
You'll never get a `float` that is 2 raised to the power of 128 because
|
||||
that value is reserved for `NaN` and `Inf`.
|
||||
|
|
@ -232,13 +232,13 @@ Inf: 1
|
|||
```
|
||||
|
||||
Once again, notice the out-of-bounds value for the exponent: 0xFF.
|
||||
Secondly, the fraction is fully zero. The sign bit specifies negative
|
||||
or positive infinity.
|
||||
Secondly, the fraction is fully zero. The sign bit specifies negative or
|
||||
positive infinity.
|
||||
|
||||
## Testing for Naughty Values
|
||||
|
||||
Thankfully, there exists two functions that will do the inspection
|
||||
for you, looking for `Nan` and `Inf`.
|
||||
Thankfully, there exist two functions that will do the inspection for
|
||||
you, looking for `NaN` and `Inf`.
|
||||
|
||||
* `isnan(floating point value)` and
|
||||
|
||||
|
|
@ -246,6 +246,6 @@ for you, looking for `Nan` and `Inf`.
|
|||
|
||||
Both of these functions work with `double` and `float`.
|
||||
|
||||
Once a variable goes `NaN` or `Inf`, all subsequent operations
|
||||
will remain `NaN` or `Inf` until the variable is reset to a
|
||||
valid number. That is, 1 + `Inf` is `Inf`, for example.
|
||||
Once a variable goes `NaN` or `Inf`, all subsequent operations will
|
||||
remain `NaN` or `Inf` until the variable is reset to a valid number.
|
||||
That is, 1 + `Inf` is `Inf`, for example.
|
||||
|
|
|
|||
|
|
@ -35,7 +35,6 @@ It is worth noting early and often that you should not mix dealing
|
|||
with different precisions assuming that because of the overlaps in
|
||||
space, you'll get a meaningful result.
|
||||
|
||||
The above image does not show the corresponding layout of *half
|
||||
precision* floating point registers. `H0` sits in the least
|
||||
significant bits of `S0` and so on.
|
||||
|
||||
The above image does not show the corresponding layout of [half
|
||||
precision](./half.md) floating point registers. `H0` sits in the least
|
||||
significant bits of `S0` and so on.
|
||||
|
|
|
|||
Loading…
Reference in a new issue