big refactoring and sprucing up of programs and text

This commit is contained in:
Perry Kivolowitz 2023-01-20 17:59:43 -06:00
parent 8b00b33c75
commit 5fc9422143
8 changed files with 295 additions and 210 deletions

View file

@ -1,4 +1,6 @@
.global main #include "apple-linux-convergence.S"
GLABEL main
.text .text
.align 2 .align 2
@ -7,77 +9,95 @@ dmore .req d21
ndless .req d22 ndless .req d22
ndmore .req d23 ndmore .req d23
Emit: str x30, [sp, -16]! Emit:
bl printf START_PROC
ldr x30, [sp], 16 PUSH_P x29, x30
mov x29, sp
#if defined(__APPLE__)
PUSH_P x1, x2
CRT printf
add sp, sp, 16
#else
CRT printf
#endif
POP_P x29, x30
ret ret
END_PROC
main: str x30, [sp, -16]! MAIN
stp dless, dmore, [sp, -16]! START_PROC
stp ndless, ndmore, [sp, -16]! PUSH_P x29, x30
stp dless, dmore, [sp, -16]!
stp ndless, ndmore, [sp, -16]!
mov x29, sp
ldr x0, =vless LLD_ADDR x0, leg
ldr dless, [x0] CRT printf
ldr dmore, [x0, 8]
ldr ndless, [x0, 16]
ldr ndmore, [x0, 24]
//-fcvtps---------------------- LLD_ADDR x0, vless
fcvtps x1, dless ldr dless, [x0]
fcvtps x2, dmore ldr dmore, [x0, 8]
ldr x0, =fmt1 ldr ndless, [x0, 16]
bl Emit ldr ndmore, [x0, 24]
fcvtps x1, ndless //-fcvtps- Floating-point Convert to Signed integer, rounding toward Plus infinity
fcvtps x2, ndmore fcvtps x1, dless
ldr x0, =fmt1 fcvtps x2, dmore
bl Emit LLD_ADDR x0, fmt1
//-fcvtms----------------------- bl Emit
fcvtms x1, dless
fcvtms x2, dmore
ldr x0, =fmt2
bl Emit
fcvtms x1, ndless fcvtps x1, ndless
fcvtms x2, ndmore fcvtps x2, ndmore
ldr x0, =fmt2 LLD_ADDR x0, fmt1
bl Emit bl Emit
//-fcvtzs----------------------- //-fcvtns- Floating-point Convert to Signed integer, rounding to nearest with ties to even (scalar).
fcvtzs x1, dless fcvtns x1, dless
fcvtzs x2, dmore fcvtns x2, dmore
ldr x0, =fmt4 LLD_ADDR x0, fmt2
bl Emit bl Emit
fcvtzs x1, ndless fcvtns x1, ndless
fcvtzs x2, ndmore fcvtns x2, ndmore
ldr x0, =fmt4 LLD_ADDR x0, fmt2
bl Emit bl Emit
//-fcvtas----------------------- //-fcvtzs- Floating-point Convert to Signed integer, rounding toward Zero (scalar).
fcvtas x1, dless fcvtzs x1, dless
fcvtas x2, dmore fcvtzs x2, dmore
ldr x0, =fmt3 LLD_ADDR x0, fmt4
bl Emit bl Emit
fcvtas x1, ndless fcvtzs x1, ndless
fcvtas x2, ndmore fcvtzs x2, ndmore
ldr x0, =fmt3 LLD_ADDR x0, fmt4
bl Emit bl Emit
//-fcvtas- Floating-point Convert to Signed integer, rounding to nearest with ties to Away (scalar).
fcvtas x1, dless
fcvtas x2, dmore
LLD_ADDR x0, fmt3
bl Emit
fcvtas x1, ndless
fcvtas x2, ndmore
LLD_ADDR x0, fmt3
bl Emit
//------------------------------ //------------------------------
ldp ndless, ndmore, [sp], 16 ldp ndless, ndmore, [sp], 16
ldp dless, dmore, [sp], 16 ldp dless, dmore, [sp], 16
ldr x30, [sp], 16 POP_P x29, x30
mov w0, wzr mov w0, wzr
ret ret
END_PROC
.section .rodata
.data
vless: .double 5.49 vless: .double 5.49
vmore: .double 5.51 vmore: .double 5.51
nvless: .double -5.49 nvless: .double -5.49
nvmore: .double -5.51 nvmore: .double -5.51
fmt1: .asciz "fcvtps less: %d more: %d\n" fmt1: .asciz "fcvtps less: %d more: %d\n"
fmt2: .asciz "fcvtms less: %d more: %d\n" fmt2: .asciz "fcvtns less: %d more: %d\n"
fmt3: .asciz "fcvtta less: %d more: %d\n" fmt3: .asciz "fcvtta less: %d more: %d\n"
fmt4: .asciz "fcvtzs less: %d more: %d\n" fmt4: .asciz "fcvtzs less: %d more: %d\n"
leg: .asciz "less values are +/- 5.49. more values are +/- 5.51.\n"
.end .end

View file

@ -1,6 +1,5 @@
/* Perry Kivolowitz /* Perry Kivolowitz
Professor and Chair of Computer Science A Gentle Introduction to Assembly Language
Carthage College
*/ */
#include <iostream> #include <iostream>
@ -12,24 +11,28 @@
using namespace std; using namespace std;
const int BIASD = 1023; const int BIASD = 1023; // biasing value for double exponents
const int BIASF = 127; const int BIASF = 127; // biasing value for floats
const int FRAC_SIZD = 52; // The mantissa controls precision.
const int FRAC_SIZF = 23;
const int EXPO_SIZD = 11; const int FRAC_SIZD = 52; // number of bits in double's mantissa
const int EXPO_SIZF = 8; const int FRAC_SIZF = 23; // number of bits in float's mantissa
// The exponent controls range.
const int EXPO_SIZD = 11; // number of bits in a double's exponent
const int EXPO_SIZF = 8; // number of bits in a float's exponent
const int SIGN_SIZE = 1; const int SIGN_SIZE = 1;
struct SP { struct SP { // construction of a float
unsigned int frac : FRAC_SIZF; unsigned int frac : FRAC_SIZF;
unsigned int expo : EXPO_SIZF; unsigned int expo : EXPO_SIZF;
unsigned int sign : SIGN_SIZE; unsigned int sign : SIGN_SIZE;
}; };
struct DP { struct DP { // construction of a double
unsigned long frac : FRAC_SIZD; unsigned long frac : FRAC_SIZD;
unsigned long expo : EXPO_SIZD; unsigned long expo : EXPO_SIZD;
unsigned long sign : SIGN_SIZE; unsigned long sign : SIGN_SIZE;
@ -61,7 +64,9 @@ template<class T>
string MakeEquation(T & u, int bias) { string MakeEquation(T & u, int bias) {
stringstream ss; stringstream ss;
bool is_double = (bias == BIASD); bool is_double = (bias == BIASD);
ss << (u.sign ? "-" : "") << dec << setprecision(11) << 1.0 + DeBinary(is_double, u.frac) << " x 2^" << (u.expo - bias); ss << (u.sign ? "-" : "") << dec << setprecision(11);
ss << 1.0 + DeBinary(is_double, u.frac);
ss << " x 2^" << (u.expo - bias);
return ss.str(); return ss.str();
} }
@ -69,22 +74,26 @@ int main(int argc, char ** argv) {
Double d; Double d;
Single f; Single f;
const int fore_space = 20; const int fore_space = 18;
const int field_space = 25; const int field_space = 20;
if (argc < 2) { if (argc < 2) {
cerr << "Must supply a floating point value as a command line argument.\n"; cerr << "Requires a floating point value on command line .\n";
return 1; return 1;
} }
d.d = atof(argv[1]); d.d = atof(argv[1]);
f.f = float(d.d); f.f = float(d.d);
cout << left << setw(fore_space) << "Component" << left << setw(25) << "Double"; cout << left << setw(fore_space) << "Component" << left;
cout << left << setw(field_space) << "Float" << "Comment" << endl; cout << setw(field_space);
cout << "Double" << left << setw(field_space) << "Float";
cout << "Comment" << endl;
cout << left << setw(fore_space) << "Value:" << setw(25) << setprecision(10) << d.d; cout << left << setw(fore_space) << "Value:" << setw(field_space);
cout << setprecision(10) << d.d;
cout << setw(field_space) << setprecision(10) << f.f; cout << setw(field_space) << setprecision(10) << f.f;
cout << "Delta(F - D): " << setw(16) << setprecision(10) << f.f - d.d << endl; cout << "Delta(F - D): " << setw(16) << setprecision(10);
cout << f.f - d.d << endl;
cout << left << setw(fore_space) << "Sign:"; cout << left << setw(fore_space) << "Sign:";
cout << setw(field_space) << (bool)d.D.sign; cout << setw(field_space) << (bool)d.D.sign;
@ -107,28 +116,38 @@ int main(int argc, char ** argv) {
cout << endl; cout << endl;
cout << setw(fore_space) << "Halves:"; cout << setw(fore_space) << "Halves:";
cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 1)) & 1); cout << setw(field_space) << hex;
cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 1)) & 1); cout << ((d.D.frac >> (FRAC_SIZD - 1)) & 1);
cout << setw(field_space) << hex;
cout << ((f.F.frac >> (FRAC_SIZF - 1)) & 1);
cout << endl; cout << endl;
cout << setw(fore_space) << "Quarters:"; cout << setw(fore_space) << "Quarters:";
cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 2)) & 1); cout << setw(field_space) << hex;
cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 2)) & 1); cout << ((d.D.frac >> (FRAC_SIZD - 2)) & 1);
cout << setw(field_space) << hex;
cout << ((f.F.frac >> (FRAC_SIZF - 2)) & 1);
cout << endl; cout << endl;
cout << setw(fore_space) << "Eighths:"; cout << setw(fore_space) << "Eighths:";
cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 3)) & 1); cout << setw(field_space) << hex;
cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 3)) & 1); cout << ((d.D.frac >> (FRAC_SIZD - 3)) & 1);
cout << setw(field_space) << hex;
cout << ((f.F.frac >> (FRAC_SIZF - 3)) & 1);
cout << endl; cout << endl;
cout << setw(fore_space) << "Sixteenths:"; cout << setw(fore_space) << "Sixteenths:";
cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 4)) & 1); cout << setw(field_space) << hex;
cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 4)) & 1); cout << ((d.D.frac >> (FRAC_SIZD - 4)) & 1);
cout << setw(field_space) << hex;
cout << ((f.F.frac >> (FRAC_SIZF - 4)) & 1);
cout << endl; cout << endl;
cout << setw(fore_space) << "Thirty seconds:"; cout << setw(fore_space) << "Thirty seconds:";
cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 5)) & 1); cout << setw(field_space) << hex;
cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 5)) & 1); cout << ((d.D.frac >> (FRAC_SIZD - 5)) & 1);
cout << setw(field_space) << hex;
cout << ((f.F.frac >> (FRAC_SIZF - 5)) & 1);
cout << endl; cout << endl;
cout << setw(fore_space) << "Full fraction:"; cout << setw(fore_space) << "Full fraction:";

View file

@ -10,8 +10,8 @@ themselves, using floating point literals is extremely constrained.
For example: For example:
```asm ```asm
fmov d0, 1 // 1 fmov d0, 1 // 1
fmov d0, 1.1 // 2 fmov d0, 1.1 // 2
``` ```
`Line 1` will pass muster but `Line 2` will cause an error. `Line 1` will pass muster but `Line 2` will cause an error.
@ -150,13 +150,13 @@ what that magic is.
Build the program with the `-g` option to enable debugging using GDB. Build the program with the `-g` option to enable debugging using GDB.
```text ```text
$ gcc -g t.s % gcc -g t.s
``` ```
Then launch GDB on the executable: Then launch GDB on the executable:
```text ```text
$ gdb a.out % gdb a.out
``` ```
Set a breakpoint on line 6. Set a breakpoint on line 6.

View file

@ -1,38 +1,64 @@
.global main #include "apple-linux-convergence.S"
GLABEL main
.text .text
.align 2 .p2align 2
counter .req x20 counter .req x20
dptr .req x21 dptr .req x21
fptr .req x22 fptr .req x22
.equ max, 4 .equ max, 4
main: stp counter, x30, [sp, -16]! MAIN
stp dptr, fptr, [sp, -16]! START_PROC
ldr dptr, =d PUSH_P counter, x30
ldr fptr, =f PUSH_P dptr, fptr
mov counter, xzr PUSH_R x29
mov x29, sp
1: cmp counter, max LLD_ADDR dptr, d
beq 2f LLD_ADDR fptr, f
mov counter, xzr
ldr d0, [dptr, counter, lsl 3] 1: cmp counter, max
ldr s1, [fptr, counter, lsl 2] beq 2f
fcvt d1, s1 ldr d0, [dptr, counter, lsl 3]
ldr x0, =fmt ldr s1, [fptr, counter, lsl 2]
add counter, counter, 1 fcvt d1, s1
mov x1, counter LLD_ADDR x0, fmt
bl printf add counter, counter, 1
b 1b mov x1, counter
#if defined(__APPLE__)
/*
Give us some stack space. Then read the printf template
string right to left. Variadics on the Mac are difficult
to get right. Remember that printf never prints floats.
Only doubles. Internally, floats are converted to double.
See the fcvt instruction above.
*/
sub sp, sp, 32
str d1, [sp, 16]
str d0, [sp, 8]
str x1, [sp]
CRT printf
add sp, sp, 32
#else
CRT printf
#endif
b 1b
2: ldp dptr, fptr, [sp], 16 2: POP_R x29
ldp counter, x30, [sp], 16 POP_P dptr, fptr
mov w0, wzr POP_P counter, x30
mov w0, wzr
END_PROC
ret ret
.data .data
fmt: .asciz "%d %f %f\n" fmt: .asciz "index %ld double %f float %f\n"
d: .double 1.111111, 2.222222, 3.333333, 4.444444 .p2align 3
f: .float 1.111111, 2.222222, 3.333333, 4.444444 d: .double 1.555555, 2.666666, 3.777777, 4.888888
.p2align 2
f: .float 1.111111, 2.222222, 3.333333, 4.444444
.end .end

View file

@ -13,9 +13,11 @@ For this chapter, I will use:
* Rounding means picking some fractional value and if the float's * Rounding means picking some fractional value and if the float's
fraction is higher, you go one way and if lower, you go the other. fraction is higher, you go one way and if lower, you go the other.
* Truncation means you don't look too closely at the fractional value. * Truncation means you don't care about the fractional value. You just
Instead, you just eliminate the fractional part and slam the whole eliminate the fractional part and slam the whole number ... one way or
number ... one way or the other. the other.
"One way or the other" is defined next.
## Truncation Towards Zero ## Truncation Towards Zero
@ -30,8 +32,8 @@ Diving a little deeper, there is a choice to be made as to whether or
not `integer_variable` is signed or unsigned. And, whether or not not `integer_variable` is signed or unsigned. And, whether or not
`integer_variable` is a 32 bit or 64 bit value. `integer_variable` is a 32 bit or 64 bit value.
The instruction is `fcvtz` - convert towards zero. Then, the choice The instruction is `fcvtz` - convert towards zero. Then, the choice as
as to whether to produce a signed or unsigned result is defined by the to whether to produce a signed or unsigned result is defined by the
final letter: `u` or `s`. final letter: `u` or `s`.
| Mnemonic | Meaning | | Mnemonic | Meaning |
@ -41,7 +43,7 @@ final letterL `u` or `s`.
As an example of how the ARM documentation is confusing - this As an example of how the ARM documentation is confusing - this
instruction which completely discards the fractional value is said by instruction which completely discards the fractional value is said by
the ARM documentation to be doing rounding. the ARM documentation to be doing rounding, not truncating.
The choice of source register defines whether you are converting The choice of source register defines whether you are converting
a double or single precision floating point value. a double or single precision floating point value.
@ -60,14 +62,18 @@ Examples where `d` is a `double` and `f` is a `float`:
| C++ | Instruction | | C++ | Instruction |
| --- | ----------- | | --- | ----------- |
| `int32_t(d)` | `fcvtzs w0, d0` | | `int32_t(d)` | `fcvtzs w0, d0` |
| `uint32_t(d)` | `fcvtzu w0, d0` | | `uint32_t(d)` | `fcvtzu w0, d0` |
| `int64_t(d)` | `fcvtzs x0, d0` | | `int64_t(d)` | `fcvtzs x0, d0` |
| `uint64_t(d)` | `fcvtzu x0, d0` | | `uint64_t(d)` | `fcvtzu x0, d0` |
[Here](./asm_rounding.s) is a program which demonstrates various [Here](./asm_rounding.S) is a program which demonstrates various
ways of converting doubles to integers. ways of converting doubles to integers.
Note: This source code has been updated using the author's
Apple / Linux Convergence macros and can be built on both Apple Mac OS
and Linux ARM systems.
Let's look at: Let's look at:
```text ```text
@ -105,23 +111,23 @@ Notice all the values were truncated to the whole number that is
Truncation away from zero is not as easy. In fact, it cannot be Truncation away from zero is not as easy. In fact, it cannot be
performed with a single instruction. performed with a single instruction.
In C and C++: In C (and C++):
```c ```c
iv = (int(fv) == fv) ? int(fv) : int(fv) + ((fv < 0) ? -1 : 1); iv = (int(fv) == fv) ? int(fv) : int(fv) + ((fv < 0) ? -1 : 1);
``` ```
If the `fv` is already equal to a whole number, the If the `fv` is already equal to a whole number, the integer value will
integer value will be that whole number. Otherwise the `iv` is be that whole number. Otherwise the `iv` is the whole number further
the whole number further *away from zero*. *away from zero*.
In C++, a more sophisticated version would require `<cmath>` and In C++, a more sophisticated version would require `<cmath>` and could
could look like: look like:
```c++ ```c++
template <typename T> template <typename T>
int MyTruncate(T x) { int MyTruncate(T x) {
return int((x < 0) ? floor(x) : ceil(x)); return int((x < 0) ? floor(x) : ceil(x));
} }
``` ```
@ -136,15 +142,15 @@ given above.
```asm ```asm
RoundAwayFromZero: RoundAwayFromZero:
fcmp d0, 0 fcmp d0, 0
ble 1f ble 1f
// Value is positive, truncate towards positive infinity (ceil) // Value is positive, truncate towards positive infinity (ceil)
frintp d0, d0 frintp d0, d0
b 2f b 2f
1: // Value is negative, truncate towards negative infinity (floor) 1: // Value is negative, truncate towards negative infinity (floor)
frintm d0, d0 frintm d0, d0
2: fcvtzs x0, d0 2: fcvtzs x0, d0
ret ret
``` ```
`frintp` and `frintm` will honor the source register already being `frintp` and `frintm` will honor the source register already being

View file

@ -1,5 +1,19 @@
#include <cinttypes> #include <cinttypes>
#include <stdio.h>
#define MAX 4
double d[4] = { 1.555555, 2.666666, 3.777777, 4.888888 };
float f[4] = { 1.111111, 2.222222, 3.333333, 4.444444 };
// Prints one line per array slot: the slot index followed by the
// corresponding entries of d (double) and f (float). The float argument
// is promoted to double when passed through printf's variadic parameters,
// which is why both values use the %f conversion.
int main() {
    long counter = 0;
    while (counter < MAX) {
        printf("index %ld double %f float %f\n", counter, d[counter], f[counter]);
        ++counter;
    }
    return 0;
}
/*
extern "C" uint32_t T1(double d) { extern "C" uint32_t T1(double d) {
return uint32_t(d); return uint32_t(d);
} }
@ -19,3 +33,4 @@ extern "C" int32_t T4(float f) {
extern "C" uint64_t T5(double d) { extern "C" uint64_t T5(double d) {
return uint64_t(d); return uint64_t(d);
} }
*/

View file

@ -1,9 +1,9 @@
# Section 2 / What Are Floating Point Numbers? # Section 2 / What Are Floating Point Numbers?
Before we introduce floating point instructions in the AARCH64 ISA, it is Before we introduce floating point instructions in the AARCH64 ISA, it
worth going over exactly what a floating point value is. Integers are easy. is worth going over exactly what a floating point value is. Integers are
They're just powers of two summed together with a single bit at one end easy. They're just powers of two summed together with a single bit at
determining the sign (if the integer is signed). one end determining the sign (if the integer is signed).
But what are floating numbers? But what are floating numbers?
@ -45,41 +45,42 @@ Full fraction: 0 0
Equation: 1 x 2^0 1 x 2^0 Equation: 1 x 2^0 1 x 2^0
``` ```
On the line marked "Value" you can see the values represented as double precision On the line marked "Value" you can see the values represented as double
and as single precision. Under "Comment" you can see that there precision and as single precision. Under "Comment" you can see that
is no difference between the double and the single precision numbers. Remember there is no difference between the double and the single precision
the key thing about floating point numbers: they are approximations. Sometimes, numbers. Remember the key thing about floating point numbers: they are
as in the case of whole numbers like 1, the approximation is exact. When there approximations. Sometimes, as in the case of whole numbers like 1, the
is a difference, the difference will be small and printed in the Comment approximation is exact. When there is a difference, the difference will
column. be small and printed in the Comment column.
The Sign field is 0. This indicates that the whole floating point value is positive. The Sign field is 0. This indicates that the whole floating point value
There are no other sign values including in the exponent. However, exponents can is positive. There are no other sign values including in the exponent.
be negative... this is explained next. However, exponents can be negative... this is explained next.
First, notice that the double precision exponent is 11 bits wide while the single First, notice that the double precision exponent is 11 bits wide while
precision exponent is only 8 bits wide. Next, notice the values... 1023 and 127 the single precision exponent is only 8 bits wide. Next, notice the
respectively. The value of 1 is 1 raised to the power of 0 base 2. So why 1023 values... 1023 and 127 respectively. The value of 1 is 1 raised to the
or 127? power of 0 base 2. So why 1023 or 127?
There is no sign bit for the exponent yet the exponent must support negative numbers. There is no sign bit for the exponent yet the exponent must support
It does this by incorporating an offset of 1023 and 127 respectively (both representing negative numbers. It does this by incorporating an offset of 1023 and
0). Anything above 1023 and 127 are positive exponents. Anything below these values 127 respectively (both representing 0). Anything above 1023 and 127 are
are negative exponents. positive exponents. Anything below these values are negative exponents.
The De-biased line shows the values of the exponent with their bias removed. The De-biased line shows the values of the exponent with their bias
Notice they work out to 0. So, the value of 1 is represented by 1 raised to the power of 0. removed. Notice they work out to 0. So, the value of 1 is represented by
1 raised to the power of 0.
The Fraction has a value of zero. Where does the 1 that we've been talking about get stored? The Fraction has a value of zero. Where does the 1 that we've been talking
It isn't. A value of 1 is always assumed to be the only value in front of the decimal place about get stored? It isn't. A value of 1 is always assumed to be the
in a `float` or `double`. Every floating point value is 1 plus a fraction all raised to only value in front of the decimal place in a `float` or `double`. Every
some power of 2. floating point value is 1 plus a fraction all raised to some power of 2.
We thought we'd highlight a few of the bits in the fractional part of a floating point We thought we'd highlight a few of the bits in the fractional part of a
number. These can be illuminating when the value being shown is in the range of floating point number. These can be illuminating when the value being
-2 < x < 2. Notice that the values of -2 and 2 are outside this range. In other words, shown is in the range of -2 < x < 2. Notice that the values of -2 and 2
showing the first few bits of the fraction are illuminating when the exponent works are outside this range. In other words, showing the first few bits of
out to 0. the fraction are illuminating when the exponent works out to 0.
* Halves - There are no halves in the value of 1. * Halves - There are no halves in the value of 1.
@ -91,11 +92,12 @@ out to 0.
* Thirty Seconds - There are no thirty seconds in the value of 1. * Thirty Seconds - There are no thirty seconds in the value of 1.
Of course, there are more fractional values to `float` and `doubles` but listing them all Of course, there are more fractional values to `float` and `doubles` but
wouldn't be a fun task and we're all about fun. :) listing them all wouldn't be a fun task and we're all about fun. :)
Finally, the Equation line rebuilds the floating point value in its actual "scientific" Finally, the Equation line rebuilds the floating point value in its
notation. The value of 1 is a 1 raised to the zeroth power of 2. actual "scientific" notation. The value of 1 is a 1 raised to the zeroth
power of 2.
How about a value of 1.5? How about a value of 1.5?
@ -115,7 +117,7 @@ Full fraction: 0.5 0.5
Equation: 1.5 x 2^0 1.5 x 2^0 Equation: 1.5 x 2^0 1.5 x 2^0
``` ```
The only difference is that there is a bit turned on in the fraction. The only difference is that there is a bit turned on in the fraction.
It is the most significant bit... there is a half in one and a half. It is the most significant bit... there is a half in one and a half.
How about 1.875? How about 1.875?
@ -138,11 +140,10 @@ Equation: 1.875 x 2^0 1.875 x 2^0
How about 8.5? How about 8.5?
This is the first time we are looking at This is the first time we are looking at a value which increases the
a value which increases the (de-biased) exponent to non-zero. (de-biased) exponent to non-zero. Things get a little more complicated.
Things get a little more complicated. Now, there isn't an Now, there isn't an obvious mapping of the fraction bits to the final
obvious mapping of the fraction bits to the final number they number they represent. This is the impact of the non-zero exponent.
represent. This is the impact of the non-zero exponent.
```text ```text
Component Double Float Comment Component Double Float Comment
@ -160,17 +161,16 @@ Full fraction: 0.0625 0.0625
Equation: 1.0625 x 2^3 1.0625 x 2^3 Equation: 1.0625 x 2^3 1.0625 x 2^3
``` ```
Even though there is a half in eight and a half, the Halves bit Even though there is a half in eight and a half, the Halves bit is 0.
is 0. What is 8? Eight is a 2 raised to the power of 3. In What is 8? Eight is a 2 raised to the power of 3. In other words, the
other words, the bit for the half in 8.5 is shifted to the bit for the half in 8.5 is shifted to the right by three bits. Confirm
right by three bits. Confirm this by looking at the this by looking at the Sixteenths. *There's our bit!*
Sixteenths. *There's our bit!*
Turn your attention to the Equation. 1.0625 multiplied by 8 Turn your attention to the Equation. 1.0625 multiplied by 8 is 8.5. Cool
is 8.5. Cool huh? huh?
How about something harder? Like 8.51 - just a teensy bit How about something harder? Like 8.51 - just a teensy bit different from
different from the previous example. the previous example.
```text ```text
Component Double Float Comment Component Double Float Comment
@ -189,19 +189,19 @@ Equation: 1.06375 x 2^3 1.0637500286 x 2^3
``` ```
For the first time we're seeing that 8.51 cannot be perfectly For the first time we're seeing that 8.51 cannot be perfectly
represented by `float`. `double` gets it right. The difference represented by `float`. `double` gets it right. The difference between
between the `double` and `float` is the very small number shown the `double` and `float` is the very small number shown on the first
on the first line of output. line of output.
## When a Number is Not a Number and How About Infinity? ## When a Number is Not a Number and How About Infinity?
`NaN` is an actual value. It means `not a number`. `NaN` is an actual value. It means `not a number`.
[Here](./floatster.cpp) is the source code to another program we [Here](./floatster.cpp) is the source code to another program we have
have written that explores both `NaN` and `Inf`. written that explores both `NaN` and `Inf`.
Let's examine `NaN` which is produced when you do naughty things Let's examine `NaN` which is produced when you do naughty things like
like take the square root of a negative number. take the square root of a negative number.
```text ```text
Enter a number (-100 causes divide by 0, -200 causes sqrt(-1): -200 Enter a number (-100 causes divide by 0, -200 causes sqrt(-1): -200
@ -213,8 +213,8 @@ NaN: 1
Inf: 0 Inf: 0
``` ```
`NaN` is true (for `float`) when its exponent is 0xFF and fraction `NaN` is true (for `float`) when its exponent is 0xFF and fraction is
is not zero. not zero.
You'll never get a `float` that is 2 raised to the power of 128 because You'll never get a `float` that is 2 raised to the power of 128 because
that value is reserved for `NaN` and `Inf`. that value is reserved for `NaN` and `Inf`.
@ -232,13 +232,13 @@ Inf: 1
``` ```
Once again, notice the out-of-bounds value for the exponent: 0xFF. Once again, notice the out-of-bounds value for the exponent: 0xFF.
Secondly, the fraction is fully zero. The sign bit specifies negative Secondly, the fraction is fully zero. The sign bit specifies negative or
or positive infinity. positive infinity.
## Testing for Naughty Values ## Testing for Naughty Values
Thankfully, there exist two functions that will do the inspection Thankfully, there exist two functions that will do the inspection for
for you, looking for `NaN` and `Inf`. you, looking for `NaN` and `Inf`.
* `isnan(floating point value)` and * `isnan(floating point value)` and
@ -246,6 +246,6 @@ for you, looking for `Nan` and `Inf`.
Both of these functions work with `double` and `float`. Both of these functions work with `double` and `float`.
Once a variable goes `NaN` or `Inf`, all subsequent operations Once a variable goes `NaN` or `Inf`, all subsequent operations will
will remain `NaN` or `Inf` until the variable is reset to a remain `NaN` or `Inf` until the variable is reset to a valid number.
valid number. That is, 1 + `Inf` is `Inf`, for example. That is, 1 + `Inf` is `Inf`, for example.

View file

@ -26,7 +26,7 @@ We say aliases because, like the integer registers, how you reference a
floating point register determines how it is interpreted. floating point register determines how it is interpreted.
For example, in the following image, note the overlap of two single For example, in the following image, note the overlap of two single
precision floats within a single double precision floating point precision floats within a single double precision floating point
register. register.
![regs](./regs.png) ![regs](./regs.png)
@ -35,7 +35,6 @@ It is worth noting early and often that you should not mix dealing
with different precisions assuming that because of the overlaps in with different precisions assuming that because of the overlaps in
space, you'll get a meaningful result. space, you'll get a meaningful result.
The above image does not show the corresponding layout of *half The above image does not show the corresponding layout of [half
precision* floating point registers. `H0` sits in the least precision](./half.md) floating point registers. `H0` sits in the least
significant bits of `S0` and so on. significant bits of `S0` and so on.