diff --git a/section_2/float/asm_rounding.s b/section_2/float/asm_rounding.s index f43787e..accca03 100644 --- a/section_2/float/asm_rounding.s +++ b/section_2/float/asm_rounding.s @@ -1,4 +1,6 @@ - .global main +#include "apple-linux-convergence.S" + + GLABEL main .text .align 2 @@ -7,77 +9,95 @@ dmore .req d21 ndless .req d22 ndmore .req d23 -Emit: str x30, [sp, -16]! - bl printf - ldr x30, [sp], 16 +Emit: + START_PROC + PUSH_P x29, x30 + mov x29, sp +#if defined(__APPLE__) + PUSH_P x1, x2 + CRT printf + add sp, sp, 16 +#else + CRT printf +#endif + POP_P x29, x30 ret + END_PROC -main: str x30, [sp, -16]! - stp dless, dmore, [sp, -16]! - stp ndless, ndmore, [sp, -16]! +MAIN + START_PROC + PUSH_P x29, x30 + stp dless, dmore, [sp, -16]! + stp ndless, ndmore, [sp, -16]! + mov x29, sp - ldr x0, =vless - ldr dless, [x0] - ldr dmore, [x0, 8] - ldr ndless, [x0, 16] - ldr ndmore, [x0, 24] + LLD_ADDR x0, leg + CRT printf -//-fcvtps---------------------- - fcvtps x1, dless - fcvtps x2, dmore - ldr x0, =fmt1 - bl Emit + LLD_ADDR x0, vless + ldr dless, [x0] + ldr dmore, [x0, 8] + ldr ndless, [x0, 16] + ldr ndmore, [x0, 24] - fcvtps x1, ndless - fcvtps x2, ndmore - ldr x0, =fmt1 - bl Emit -//-fcvtms----------------------- - fcvtms x1, dless - fcvtms x2, dmore - ldr x0, =fmt2 - bl Emit +//-fcvtps- Floating-point Convert to Signed integer, rounding toward Plus infinity + fcvtps x1, dless + fcvtps x2, dmore + LLD_ADDR x0, fmt1 + bl Emit - fcvtms x1, ndless - fcvtms x2, ndmore - ldr x0, =fmt2 - bl Emit -//-fcvtzs----------------------- - fcvtzs x1, dless - fcvtzs x2, dmore - ldr x0, =fmt4 - bl Emit + fcvtps x1, ndless + fcvtps x2, ndmore + LLD_ADDR x0, fmt1 + bl Emit +//-fcvtns- Floating-point Convert to Signed integer, rounding to nearest with ties to even (scalar). 
+ fcvtns x1, dless + fcvtns x2, dmore + LLD_ADDR x0, fmt2 + bl Emit - fcvtzs x1, ndless - fcvtzs x2, ndmore - ldr x0, =fmt4 - bl Emit -//-fcvtas----------------------- - fcvtas x1, dless - fcvtas x2, dmore - ldr x0, =fmt3 - bl Emit + fcvtns x1, ndless + fcvtns x2, ndmore + LLD_ADDR x0, fmt2 + bl Emit +//-fcvtzs- Floating-point Convert to Signed integer, rounding toward Zero (scalar). + fcvtzs x1, dless + fcvtzs x2, dmore + LLD_ADDR x0, fmt4 + bl Emit - fcvtas x1, ndless - fcvtas x2, ndmore - ldr x0, =fmt3 - bl Emit + fcvtzs x1, ndless + fcvtzs x2, ndmore + LLD_ADDR x0, fmt4 + bl Emit +//-fcvtas- Floating-point Convert to Signed integer, rounding to nearest with ties to Away (scalar). + fcvtas x1, dless + fcvtas x2, dmore + LLD_ADDR x0, fmt3 + bl Emit + + fcvtas x1, ndless + fcvtas x2, ndmore + LLD_ADDR x0, fmt3 + bl Emit //------------------------------ - ldp ndless, ndmore, [sp], 16 - ldp dless, dmore, [sp], 16 - ldr x30, [sp], 16 - mov w0, wzr + ldp ndless, ndmore, [sp], 16 + ldp dless, dmore, [sp], 16 + POP_P x29, x30 + mov w0, wzr ret - - .section .rodata + END_PROC + + .data vless: .double 5.49 vmore: .double 5.51 nvless: .double -5.49 nvmore: .double -5.51 fmt1: .asciz "fcvtps less: %d more: %d\n" -fmt2: .asciz "fcvtms less: %d more: %d\n" +fmt2: .asciz "fcvtns less: %d more: %d\n" fmt3: .asciz "fcvtta less: %d more: %d\n" fmt4: .asciz "fcvtzs less: %d more: %d\n" +leg: .asciz "less values are +/- 5.49. 
more values are +/- 5.51.\n" .end diff --git a/section_2/float/float_dump.cpp b/section_2/float/float_dump.cpp index 3672b14..307705d 100644 --- a/section_2/float/float_dump.cpp +++ b/section_2/float/float_dump.cpp @@ -1,6 +1,5 @@ /* Perry Kivolowitz - Professor and Chair of Computer Science - Carthage College + A Gentle Introduction to Assembly Language */ #include @@ -12,24 +11,28 @@ using namespace std; -const int BIASD = 1023; -const int BIASF = 127; +const int BIASD = 1023; // biasing value for double exponents +const int BIASF = 127; // biasing value for floats -const int FRAC_SIZD = 52; -const int FRAC_SIZF = 23; +// The mantissa controls precision. -const int EXPO_SIZD = 11; -const int EXPO_SIZF = 8; +const int FRAC_SIZD = 52; // number of bits in double's mantissa +const int FRAC_SIZF = 23; // number of bits in float's mantissa + +// The exponent controls range. + +const int EXPO_SIZD = 11; // number of bits in a double's exponent +const int EXPO_SIZF = 8; // number of bits in a float's exponent const int SIGN_SIZE = 1; -struct SP { +struct SP { // construction of a float unsigned int frac : FRAC_SIZF; unsigned int expo : EXPO_SIZF; unsigned int sign : SIGN_SIZE; }; -struct DP { +struct DP { // construction of a double unsigned long frac : FRAC_SIZD; unsigned long expo : EXPO_SIZD; unsigned long sign : SIGN_SIZE; @@ -61,7 +64,9 @@ template string MakeEquation(T & u, int bias) { stringstream ss; bool is_double = (bias == BIASD); - ss << (u.sign ? "-" : "") << dec << setprecision(11) << 1.0 + DeBinary(is_double, u.frac) << " x 2^" << (u.expo - bias); + ss << (u.sign ? 
"-" : "") << dec << setprecision(11); + ss << 1.0 + DeBinary(is_double, u.frac); + ss << " x 2^" << (u.expo - bias); return ss.str(); } @@ -69,22 +74,26 @@ int main(int argc, char ** argv) { Double d; Single f; - const int fore_space = 20; - const int field_space = 25; + const int fore_space = 18; + const int field_space = 20; if (argc < 2) { - cerr << "Must supply a floating point value as a command line argument.\n"; + cerr << "Requires a floating point value on command line .\n"; return 1; } d.d = atof(argv[1]); f.f = float(d.d); - cout << left << setw(fore_space) << "Component" << left << setw(25) << "Double"; - cout << left << setw(field_space) << "Float" << "Comment" << endl; + cout << left << setw(fore_space) << "Component" << left; + cout << setw(field_space); + cout << "Double" << left << setw(field_space) << "Float"; + cout << "Comment" << endl; - cout << left << setw(fore_space) << "Value:" << setw(25) << setprecision(10) << d.d; + cout << left << setw(fore_space) << "Value:" << setw(field_space); + cout << setprecision(10) << d.d; cout << setw(field_space) << setprecision(10) << f.f; - cout << "Delta(F - D): " << setw(16) << setprecision(10) << f.f - d.d << endl; + cout << "Delta(F - D): " << setw(16) << setprecision(10); + cout << f.f - d.d << endl; cout << left << setw(fore_space) << "Sign:"; cout << setw(field_space) << (bool)d.D.sign; @@ -107,28 +116,38 @@ int main(int argc, char ** argv) { cout << endl; cout << setw(fore_space) << "Halves:"; - cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 1)) & 1); - cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 1)) & 1); + cout << setw(field_space) << hex; + cout << ((d.D.frac >> (FRAC_SIZD - 1)) & 1); + cout << setw(field_space) << hex; + cout << ((f.F.frac >> (FRAC_SIZF - 1)) & 1); cout << endl; cout << setw(fore_space) << "Quarters:"; - cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 2)) & 1); - cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 2)) & 
1); + cout << setw(field_space) << hex; + cout << ((d.D.frac >> (FRAC_SIZD - 2)) & 1); + cout << setw(field_space) << hex; + cout << ((f.F.frac >> (FRAC_SIZF - 2)) & 1); cout << endl; cout << setw(fore_space) << "Eighths:"; - cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 3)) & 1); - cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 3)) & 1); + cout << setw(field_space) << hex; + cout << ((d.D.frac >> (FRAC_SIZD - 3)) & 1); + cout << setw(field_space) << hex; + cout << ((f.F.frac >> (FRAC_SIZF - 3)) & 1); cout << endl; cout << setw(fore_space) << "Sixteenths:"; - cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 4)) & 1); - cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 4)) & 1); + cout << setw(field_space) << hex; + cout << ((d.D.frac >> (FRAC_SIZD - 4)) & 1); + cout << setw(field_space) << hex; + cout << ((f.F.frac >> (FRAC_SIZF - 4)) & 1); cout << endl; cout << setw(fore_space) << "Thirty seconds:"; - cout << setw(field_space) << hex << ((d.D.frac >> (FRAC_SIZD - 5)) & 1); - cout << setw(field_space) << hex << ((f.F.frac >> (FRAC_SIZF - 5)) & 1); + cout << setw(field_space) << hex; + cout << ((d.D.frac >> (FRAC_SIZD - 5)) & 1); + cout << setw(field_space) << hex; + cout << ((f.F.frac >> (FRAC_SIZF - 5)) & 1); cout << endl; cout << setw(fore_space) << "Full fraction:"; diff --git a/section_2/float/literals.md b/section_2/float/literals.md index 6c74405..3466976 100644 --- a/section_2/float/literals.md +++ b/section_2/float/literals.md @@ -10,8 +10,8 @@ themselves, using floating point literals is extremely constrained. For example: ```asm - fmov d0, 1 // 1 - fmov d0, 1.1 // 2 + fmov d0, 1 // 1 + fmov d0, 1.1 // 2 ``` `Line 1` will pass muster but `Line 2` will cause an error. @@ -150,13 +150,13 @@ what that magic is. Build the program with the `-g` option to enable debugging using GDB. 
```text -$ gcc -g t.s +% gcc -g t.s ``` Then launch GDB on the executable: ```text -$ gdb a.out +% gdb a.out ``` Set a breakpoint on line 6. diff --git a/section_2/float/literals.s b/section_2/float/literals.s index c537670..79b285c 100644 --- a/section_2/float/literals.s +++ b/section_2/float/literals.s @@ -1,38 +1,64 @@ - .global main +#include "apple-linux-convergence.S" + + GLABEL main .text - .align 2 + .p2align 2 counter .req x20 dptr .req x21 fptr .req x22 .equ max, 4 -main: stp counter, x30, [sp, -16]! - stp dptr, fptr, [sp, -16]! - ldr dptr, =d - ldr fptr, =f - mov counter, xzr +MAIN + START_PROC + PUSH_P counter, x30 + PUSH_P dptr, fptr + PUSH_R x29 + mov x29, sp -1: cmp counter, max - beq 2f + LLD_ADDR dptr, d + LLD_ADDR fptr, f + mov counter, xzr - ldr d0, [dptr, counter, lsl 3] - ldr s1, [fptr, counter, lsl 2] - fcvt d1, s1 - ldr x0, =fmt - add counter, counter, 1 - mov x1, counter - bl printf - b 1b +1: cmp counter, max + beq 2f + ldr d0, [dptr, counter, lsl 3] + ldr s1, [fptr, counter, lsl 2] + fcvt d1, s1 + LLD_ADDR x0, fmt + add counter, counter, 1 + mov x1, counter +#if defined(__APPLE__) + /* + Give us some stack space. Then read the printf template + string right to left. Variadics on the Mac are difficult + to get right. Remember that printf never prints floats. + Only doubles. Internally, floats are converted to double. + See the fcvt instruction above. 
+ */ + sub sp, sp, 32 + str d1, [sp, 16] + str d0, [sp, 8] + str x1, [sp] + CRT printf + add sp, sp, 32 +#else + CRT printf +#endif + b 1b -2: ldp dptr, fptr, [sp], 16 - ldp counter, x30, [sp], 16 - mov w0, wzr +2: POP_R x29 + POP_P dptr, fptr + POP_P counter, x30 + mov w0, wzr + END_PROC ret .data -fmt: .asciz "%d %f %f\n" -d: .double 1.111111, 2.222222, 3.333333, 4.444444 -f: .float 1.111111, 2.222222, 3.333333, 4.444444 +fmt: .asciz "index %ld double %f float %f\n" + .p2align 3 +d: .double 1.555555, 2.666666, 3.777777, 4.888888 + .p2align 2 +f: .float 1.111111, 2.222222, 3.333333, 4.444444 .end diff --git a/section_2/float/rounding.md b/section_2/float/rounding.md index 91c2d58..b7900d7 100644 --- a/section_2/float/rounding.md +++ b/section_2/float/rounding.md @@ -13,9 +13,11 @@ For this chapter, I will use: * Rounding means picking some fractional value and if the float's fraction is higher, you go one way and if lower, you go the other. -* Truncation means you don't look too closely at the fractional value. -Instead, you just eliminate the fractional part and slam the whole -number ... one way or the other. +* Truncation means you don't care about the fractional value. You just +eliminate the fractional part and slam the whole number ... one way or +the other. + +"One way or the other" is defined next. ## Truncation Towards Zero @@ -30,8 +32,8 @@ Diving a little deeper, there is a choice to be made as to whether or not `integer_variable` is signed or unsigned. And, whether or not `integer_variable` is a 32 bit or 64 bit value. -The instruction is `fcvtz` - convert towards zero. Then, the choice -as to whether to produce a signed or unsigned result is defined by the +The instruction is `fcvtz` - convert towards zero. Then, the choice as +to whether to produce a signed or unsigned result is defined by the final letterL `u` or `s`. | Mnemonic | Meaning | @@ -41,7 +43,7 @@ final letterL `u` or `s`. 
As an example of how the ARM documentation is confusing - this instruction which completely discards the fractional value is said by -the ARM documentation as doing rounding. +the ARM documentation to be doing rounding, not truncating. The the choice of source register defined whether you are converting a double or single precision floating point value. @@ -60,14 +62,18 @@ Examples where `d` is a `double` and `f` is a `float`: | C++ | Instruction | | --- | ----------- | -| `int32_t(d)` | `fcvtzs w0, d0` | -| `uint32_t(d)` | `fcvtzu w0, d0` | -| `int64_t(d)` | `fcvtzs x0, d0` | -| `uint64_t(d)` | `fcvtzu x0, d0` | +| `int32_t(d)` | `fcvtzs w0, d0` | +| `uint32_t(d)` | `fcvtzu w0, d0` | +| `int64_t(d)` | `fcvtzs x0, d0` | +| `uint64_t(d)` | `fcvtzu x0, d0` | -[Here](./asm_rounding.s) is a program which demonstrates various +[Here](./asm_rounding.S) is a program which demonstrates various ways of converting doubles to integers. +Note: This source code has been updated using the author's +Apple / Linux Convergence macros and can be built on both Apple macOS +and Linux ARM systems. + Let's look at: ```text @@ -105,23 +111,23 @@ Notice all the values were truncated to the whole number that is Truncation away from zero is not as easy. In fact, it cannot be performed with a single instruction. -In C and C++: +In C (and C++): ```c iv = (int(fv) == fv) ? int(fv) : int(fv) + ((fv < 0) ? -1 : 1); ``` -If the `fv` is already equal to a whole number, the -integer value will be that whole number. Other wise the `iv` is -the whole number further *away from zero*. +If the `fv` is already equal to a whole number, the integer value will +be that whole number. Otherwise the `iv` is the whole number further +*away from zero*. -In C++, a more sophisticated version would require `` and -could look like: +In C++, a more sophisticated version would require `<cmath>` and could +look like: ```c++ template int MyTruncate(T x) { - return int((x < 0) ? floor(x) : ceil(x)); + return int((x < 0) ? 
floor(x) : ceil(x)); } ``` @@ -136,15 +142,15 @@ given above. ```asm RoundAwayFromZero: - fcmp d0, 0 - ble 1f - // Value is positive, truncate towards positive infinity (ceil) - frintp d0, d0 - b 2f -1: // Value is negative, truncate towards negative infinity (floor) - frintm d0, d0 -2: fcvtzs x0, d0 - ret + fcmp d0, 0 + ble 1f + // Value is positive, truncate towards positive infinity (ceil) + frintp d0, d0 + b 2f +1: // Value is negative, truncate towards negative infinity (floor) + frintm d0, d0 +2: fcvtzs x0, d0 + ret ``` `frintp` and `frintm` will honor the source register already being diff --git a/section_2/float/test.cpp b/section_2/float/test.cpp index 466479b..f1e53f7 100644 --- a/section_2/float/test.cpp +++ b/section_2/float/test.cpp @@ -1,5 +1,19 @@ #include +#include +#define MAX 4 +double d[4] = { 1.555555, 2.666666, 3.777777, 4.888888 }; +float f[4] = { 1.111111, 2.222222, 3.333333, 4.444444 }; + +int main() { + for (long counter = 0; counter < MAX; counter++) { + printf("index %ld double %f float %f\n", counter, d[counter], f[counter]); + } + return 0; +} + + +/* extern "C" uint32_t T1(double d) { return uint32_t(d); } @@ -19,3 +33,4 @@ extern "C" int32_t T4(float f) { extern "C" uint64_t T5(double d) { return uint64_t(d); } +*/ diff --git a/section_2/float/what.md b/section_2/float/what.md index 99db59a..fd8f8e2 100644 --- a/section_2/float/what.md +++ b/section_2/float/what.md @@ -1,9 +1,9 @@ # Section 2 / What Are Floating Point Numbers? -Before we introduce floating point instructions in the AARCH64 ISA, it is -worth going over exactly what a floating point value is. Integers are easy. -They're just powers of two summed together with a single bit at one end -determining the sign (if the integer is signed). +Before we introduce floating point instructions in the AARCH64 ISA, it +is worth going over exactly what a floating point value is. Integers are +easy. 
They're just powers of two summed together with a single bit at +one end determining the sign (if the integer is signed). But what are floating numbers? @@ -45,41 +45,42 @@ Full fraction: 0 0 Equation: 1 x 2^0 1 x 2^0 ``` -On the line marked "Value" you can see the values represented as double precision -and as single precision. Under "Comment" you can see that there -is no difference between the double and the single precision numbers. Remember -the key thing about floating point numbers: they are approximations. Sometimes, -as in the case of whole numbers like 1, the approximation is exact. When there -is a difference, the difference will be small and printed in the Comment -column. +On the line marked "Value" you can see the values represented as double +precision and as single precision. Under "Comment" you can see that +there is no difference between the double and the single precision +numbers. Remember the key thing about floating point numbers: they are +approximations. Sometimes, as in the case of whole numbers like 1, the +approximation is exact. When there is a difference, the difference will +be small and printed in the Comment column. -The Sign field is 0. This indicates that the whole floating point value is positive. -There are no other sign values including in the exponent. However, exponents can -be negative... this is explained next. +The Sign field is 0. This indicates that the whole floating point value +is positive. There are no other sign values including in the exponent. +However, exponents can be negative... this is explained next. -First, notice that the double precision exponent is 11 bits wide while the single -precision exponent is only 8 bits wide. Next, notice the values... 1023 and 127 -respectively. The value of 1 is 1 raised to the power of 0 base 2. So why 1023 -or 127? +First, notice that the double precision exponent is 11 bits wide while +the single precision exponent is only 8 bits wide. Next, notice the +values... 
1023 and 127 respectively. The value of 1 is 1 times 2 raised to the +power of 0. So why 1023 or 127? -There is no sign bit for the exponent yet the exponent must support negative numbers. -It does this by incorporating an offset of 1023 and 127 respectively (both representing -0). Anything above 1023 and 127 are positive exponents. Anything below these values -are negative exponents. +There is no sign bit for the exponent yet the exponent must support +negative numbers. It does this by incorporating an offset of 1023 and +127 respectively (both representing 0). Values above 1023 and 127 are +positive exponents. Values below these are negative exponents. -The De-biased line are the values of the exponent with their bias removed. -Notice they work out to 0. So, the value of 1 is represented by 1 raised to the power of 0. +The De-biased line shows the values of the exponent with their bias +removed. Notice they work out to 0. So, the value of 1 is represented by +1 times 2 raised to the power of 0. -The Fraction has a value of zero. Where's the 1 that we've been talking about get stored? -It isn't. A value of 1 is always assumed to be the only value in front of the decimal place -in a `float` or `double`. Every floating point value is 1 plus a fraction all raised to -some power of 2. +The Fraction has a value of zero. Where does the 1 that we've been talking +about get stored? It isn't. A value of 1 is always assumed to be the +only value in front of the decimal place in a `float` or `double`. Every +floating point value is 1 plus a fraction, all multiplied by some power of 2. -We thought we'd highlight a few of the bits in the fractional part of a floating point -number. These can be illuminating when the value being shown is in the range of --2 < x < 2. Notice the the values of -2 and 2 are outside this range. In other words, -showing the first few bits of the fraction are illuminating when the exponent works -out to 0. 
+We thought we'd highlight a few of the bits in the fractional part of a +floating point number. These can be illuminating when the value being +shown is in the range of -2 < x < 2. Notice that the values of -2 and 2 +are outside this range. In other words, showing the first few bits of +the fraction is illuminating when the exponent works out to 0. * Halves - There are no halves in the value of 1. @@ -91,11 +92,12 @@ out to 0. * Thirty Seconds - There are no thirty seconds in the value of 1. -Of course, there are more fractional values to `float` and `doubles` but listing them all -wouldn't be a fun tasks and we're all about fun. :) +Of course, there are more fractional values to `float` and `double` but +listing them all wouldn't be a fun task and we're all about fun. :) -Finally, the Equation line rebuilds the floating point value in its actual "scientific" -notation. The value of 1 is a 1 raised to the zeroth power of 2. +Finally, the Equation line rebuilds the floating point value in its +actual "scientific" notation. The value of 1 is 1 times 2 raised to the +zeroth power. How about a value of 1.5? @@ -115,7 +117,7 @@ Full fraction: 0.5 0.5 Equation: 1.5 x 2^0 1.5 x 2^0 ``` -The only difference is that there is a bit turned on in the fraction. +The only difference is that there is a bit turned on in the fraction. It is the most significant bit... there is a half in one and a half. How about 1.875? @@ -138,11 +140,10 @@ Equation: 1.875 x 2^0 1.875 x 2^0 How about 8.5? -This is the first time we are looking at -a value which increases the (de-biased) exponent to non-zero. -Things get a little more complicated. Now, there isn't an -obvious mapping of the fraction bits to the final number they -represent. This is the impact of the non-zero exponent. +This is the first time we are looking at a value which increases the +(de-biased) exponent to non-zero. Things get a little more complicated. 
+Now, there isn't an obvious mapping of the fraction bits to the final +number they represent. This is the impact of the non-zero exponent. ```text Component Double Float Comment @@ -160,17 +161,16 @@ Full fraction: 0.0625 0.0625 Equation: 1.0625 x 2^3 1.0625 x 2^3 ``` -Even though there is a half in eight and a half, the Halves bit -is 0. What is 8? Eight is a 2 raised to the power of 3. In -other words, the bit for the half in 8.5 is shifted to the -right by three bits. Confirm this by looking at the -Sixteenths. *There's our bit!* +Even though there is a half in eight and a half, the Halves bit is 0. +What is 8? Eight is a 2 raised to the power of 3. In other words, the +bit for the half in 8.5 is shifted to the right by three bits. Confirm +this by looking at the Sixteenths. *There's our bit!* -Turn your attention to the Equation. 1.0625 multiplied by 8 -is 8.5. Cool huh? +Turn your attention to the Equation. 1.0625 multiplied by 8 is 8.5. Cool +huh? -How about something harder? Like 8.51 - just a teensy bit -different from the previous example. +How about something harder? Like 8.51 - just a teensy bit different from +the previous example. ```text Component Double Float Comment @@ -189,19 +189,19 @@ Equation: 1.06375 x 2^3 1.0637500286 x 2^3 ``` For the first time we're seeing that 8.51 cannot be perfectly -represented by `float`. `double` gets it right. The difference -between the `double` and `float` is the very small number shown -on the first line of output. +represented by `float`. `double` gets it right. The difference between +the `double` and `float` is the very small number shown on the first +line of output. ## When a Number is Not a Number and How About Infinity? `NaN` is an actual value. It means `not a number`. -[Here](./floatster.cpp) is the source code to another program we -have written that explores both `NaN` and `Inf`. +[Here](./floatster.cpp) is the source code to another program we have +written that explores both `NaN` and `Inf`. 
-Let's examine `NaN` which is produced when you do naughty things -like take the square root of a negative number. +Let's examine `NaN` which is produced when you do naughty things like +take the square root of a negative number. ```text Enter a number (-100 causes divide by 0, -200 causes sqrt(-1): -200 @@ -213,8 +213,8 @@ NaN: 1 Inf: 0 ``` -`Nan` is true (for `float`) when its exponent is 0xFF and fraction -is not zero. +`NaN` is true (for `float`) when its exponent is 0xFF and fraction is +not zero. You'll never get a `float` that is 2 raised to the power of 128 because that value is reserved for `NaN` and `Inf`. @@ -232,13 +232,13 @@ Inf: 1 ``` Once again, notice the out-of-bounds value for the exponent: 0xFF. -Secondly, the fraction is fully zero. The sign bit specifies negative -or positive infinity. +Secondly, the fraction is fully zero. The sign bit specifies negative or +positive infinity. ## Testing for Naughty Values -Thankfully, there exists two functions that will do the inspection -for you, looking for `Nan` and `Inf`. +Thankfully, there exist two functions that will do the inspection for +you, looking for `NaN` and `Inf`. * `isnan(floating point value)` and @@ -246,6 +246,6 @@ for you, looking for `Nan` and `Inf`. Both of these functions work with `double` and `float`. -Once a variable goes `NaN` or `Inf`, all subsequent operations -will remain `NaN` or `Inf` until the variable is reset to a -valid number. That is, 1 + `Inf` is `Inf`, for example. +Once a variable goes `NaN` or `Inf`, all subsequent operations will +remain `NaN` or `Inf` until the variable is reset to a valid number. +That is, 1 + `Inf` is `Inf`, for example. diff --git a/section_2/float/working.md b/section_2/float/working.md index 8b3dd22..d95e5c6 100644 --- a/section_2/float/working.md +++ b/section_2/float/working.md @@ -26,7 +26,7 @@ We say aliases because, like the integer registers, how you reference a floating point register determines how it is interpreted. 
For example, in the following image, note the overlap of two single -precision floats within a single double precision floating point +precision floats within a single double precision floating point register. ![regs](./regs.png) @@ -35,7 +35,6 @@ It is worth noting early and often that you should not mix dealing with different precisions assuming that because of the overlaps in space, you'll get a meaningful result. -The above image does not show the corresponding layout of *half -precision* floating point registers. `H0` sits in the least -significant bits of `S0` and so on. - +The above image does not show the corresponding layout of [half +precision](./half.md) floating point registers. `H0` sits in the least +significant bits of `S0` and so on.