diff --git a/README.md b/README.md index efe25ca..e04e95b 100644 --- a/README.md +++ b/README.md @@ -308,7 +308,11 @@ What would a book about assembly language be without bit bashing? ### Section 4 - More Stuff -In this section, we present miscellaneous material. +In this section, we present miscellaneous material including our "world +famous lecture" on debugging. This lecture has been invited at several +colleges and universities. It is intended for audiences working with +languages like C, C++ and assembly language but some of the lessons +contained therein are applicable to all languages. | Chapter | Markdown | PDF | | ------- | -------- | --- | @@ -319,6 +323,9 @@ In this section, we present miscellaneous material. | 5 | [Determining string literal lengths for C functions](./more/strlen_for_c/README.md) | [Link](./more/strlen_for_c/README.pdf) | | 6 | [Calling Assembly Language From Python](./python/) | [Link](./python/README.pdf) | | 7 | [Atomic Operations](./more/atomics/README.md) | [Link](./more/atomics/README.pdf) | +| 8 | [Jump Tables](./more/jump_tables/README.md) | [Link](./more/jump_tables/README.pdf) | +| 9 | [argv](./more/argv_example/jess1.S) | ASM CODE | +| - | [Debugging Lecture](./debugging/Discourses%20and%20Dialogs%20on%20Debugging.pptx) | PPTX | ## Macro Suite diff --git a/README.pdf b/README.pdf index a2f8e18..6bf96b2 100644 Binary files a/README.pdf and b/README.pdf differ diff --git a/debugging/Discourses and Dialogs on Debugging.pptx b/debugging/Discourses and Dialogs on Debugging.pptx new file mode 100644 index 0000000..fcd0d58 Binary files /dev/null and b/debugging/Discourses and Dialogs on Debugging.pptx differ diff --git a/more/apple_silicon/apple-linux-convergence.S b/more/apple_silicon/apple-linux-convergence.S index c60d2da..8827423 100644 --- a/more/apple_silicon/apple-linux-convergence.S +++ b/more/apple_silicon/apple-linux-convergence.S @@ -149,3 +149,8 @@ main: .p2align 2 \label: .asciz "\string" .endm + +.macro MOD src_a, 
src_b, dest, scratch + sdiv \scratch, \src_a, \src_b + msub \dest, \scratch, \src_b, \src_a +.endm diff --git a/more/argv_example/apple-linux-convergence.S b/more/argv_example/apple-linux-convergence.S new file mode 100644 index 0000000..8827423 --- /dev/null +++ b/more/argv_example/apple-linux-convergence.S @@ -0,0 +1,156 @@ +/* Macros to permit the "same" assembly language to build on ARM64 + Linux systems as well as Apple Silicon systems. + + See the fuller documentation at: + https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md + + Perry Kivolowitz + A Gentle Introduction to Assembly Language +*/ + +.macro GLD_PTR xreg, label +#if defined(__APPLE__) + adrp \xreg, _\label@GOTPAGE + ldr \xreg, [\xreg, _\label@GOTPAGEOFF] +#else + ldr \xreg, =\label + ldr \xreg, [\xreg] +#endif +.endm + +.macro GLD_ADDR xreg, label // Get a global address +#if defined(__APPLE__) + adrp \xreg, _\label@GOTPAGE + add \xreg, \xreg, _\label@GOTPAGEOFF +#else + ldr \xreg, =\label +#endif +.endm + +.macro LLD_ADDR xreg, label +#if defined(__APPLE__) + adrp \xreg, \label@PAGE + add \xreg, \xreg, \label@PAGEOFF +#else + ldr \xreg, =\label +#endif +.endm + +.macro LLD_DBL xreg, dreg, label +#if defined(__APPLE__) + adrp \xreg, \label@PAGE + add \xreg, \xreg, \label@PAGEOFF + ldur \dreg, [\xreg] +// fmov \dreg, \xreg +#else + ldr \xreg, =\label + ldur \dreg, [\xreg] +#endif +.endm + +.macro LLD_FLT xreg, sreg, label +#if defined(__APPLE__) + adrp \xreg, \label@PAGE + add \xreg, \xreg, \label@PAGEOFF + ldur \sreg, [\xreg] +#else + ldr \xreg, =\label + ldur \sreg, [\xreg] +#endif +.endm + +.macro GLABEL label +#if defined(__APPLE__) + .global _\label +#else + .global \label +#endif +.endm + +.macro MAIN +#if defined(__APPLE__) +_main: +#else +main: +#endif +.endm + +/* Fetching the address of the externally defined errno is quite + different on Apple and Linux. This macro leaves the address of + errno in x0. 
+*/ +.macro ERRNO_ADDR +#if defined(__APPLE__) + bl ___error +#else + bl __errno_location +#endif +.endm + +.macro CRT label +#if defined(__APPLE__) + bl _\label +#else + bl \label +#endif +.endm + +.macro START_PROC // after starting label + .cfi_startproc +.endm + +.macro END_PROC // after the return + .cfi_endproc +.endm + +.macro PUSH_P a, b + stp \a, \b, [sp, -16]! +.endm + +.macro PUSH_R a + str \a, [sp, -16]! +.endm + +.macro POP_P a, b + ldp \a, \b, [sp], 16 +.endm + +.macro POP_R a + ldr \a, [sp], 16 +.endm + +/* The smaller of src_a and src_b is put into dest. A cmp instruction + or other instruction that sets the flags must be performed first. + This macro makes it easy to remember which register does what in the + csel. + + Thank you to u/TNorthover for nudge to add the cmp. +*/ + +.macro MIN src_a, src_b, dest + cmp \src_a, \src_b + csel \dest, \src_a, \src_b, LT +.endm + +/* The larger of src_a and src_b is put into dest. A cmp instruction + or other instruction that sets the flags must be performed first. + This macro makes it easy to remember which register does what in the + csel. + + Thank you to u/TNorthover for nudge to add the cmp. +*/ + +.macro MAX src_a, src_b, dest + cmp \src_a, \src_b + csel \dest, \src_a, \src_b, GT +.endm + +.macro AASCIZ label, string + .p2align 2 +\label: .asciz "\string" +.endm + +.macro MOD src_a, src_b, dest, scratch + sdiv \scratch, \src_a, \src_b + msub \dest, \scratch, \src_b, \src_a +.endm diff --git a/more/argv_example/jess1.S b/more/argv_example/jess1.S new file mode 100644 index 0000000..b02d175 --- /dev/null +++ b/more/argv_example/jess1.S @@ -0,0 +1,111 @@ +#include "apple-linux-convergence.S" + + .p2align 2 + .text + GLABEL main + +/* This program will get a string followed by a double followed by an + integer from the command line demonstrating how each of these types + can be retrieved. + + Example: + ./a.out test 29.3 29 +*/ + +MAIN + PUSH_P x29, x30 + mov x29, sp + + // Check argc to see if it is 4. 
This is not the only way to + // validate command line arguments but it is an easy way. + cmp w0, 4 + bne 99f // take branch if argc isn't "right". + + // Skip past argv[0] + add x1, x1, 8 + + // Fetch argv[1] as a string. + // x1 is a pointer to a pointer to chars (i.e. the string). + // Being a pointer to a pointer, it must be dereferenced to + // make a pointer. + ldr x0, [x1] // dereference + // Now x0 contains a pointer to the command line argument. + // Print the string (as a string). But doing this causes a + // function call which will destroy x1. So, save x1 temporarily. + // This could be avoided if x1 were moved to a backed up x + // register (e.g. x20). + PUSH_R x1 + CRT puts // ptr is in x0 where puts() needs it. + POP_R x1 + + // Advance x1 once again to get to argv[2] which can be done + // in the same instruction as dereferencing it using a + // preincrement. + ldr x0, [x1, 8]! // dereference + + // Now the string version of argv[2] is now pointed to by x0. + // This is exactly where atof would want it. We need atof + // because it turns strings into numbers. BUT, same as before, + // calling a function would destroy x1 so let's do the same + // trick of backing up x1 on the stack and then restoring after + // the function call. + PUSH_R x1 + CRT atof // ptr is in x0 where atof() needs it. + POP_R x1 + // The string value will be converted to a double left in d0. + // d0 is also a scratch register so for our next call to atoi, + // d0 will have to be preserved on the stack - alternatively, + // we could have used a high d register backed up and restored + // at the start and ending of main(). + + // Advance x1 once again to get to argv[3] which can be done + // in the same instruction as dereferencing it using a + // preincrement. + ldr x0, [x1, 8]! // dereference + + // Now the string version of argv[3] is now pointed to by x0. + // This is exactly where atoi would want it. We need atoi + // because it turns strings into numbers.
BUT, same as before, + // calling a function would destroy x1 so let's do the same + // trick of backing up x1 on the stack and then restoring after + // the function call. We must also do the same for d0. Actually, + // we won't need argv after this so we will skip backing up x1. + + PUSH_R d0 + CRT atoi // ptr is in x0 where atoi() needs it. + POP_R d0 + // d0 now contains the double. + // x0 now contains the integer. + // x0 must be copied to x1 because x0 must be a pointer to fmt + // for printf to work. + mov x1, x0 + LLD_ADDR x0, fmt +#if defined(__APPLE__) + sub sp, sp, 16 + str x1, [sp, 8] + str d0, [sp] + CRT printf + add sp, sp, 16 +#else + bl printf +#endif + +99: POP_P x29, x30 + mov w0, wzr + ret + +/* What did we learn? + * x1 has argv when main begins. + * pointers to the arguments are the contents of argv NOT + the actual values. Therefore, x1, which is a pointer (to a pointer), + must be dereferenced to get to the actual pointer. In the code, + there are three lines with the comment "// dereference". + * all command line arguments are c-strings. If that's not what you + want, they must be converted - see the code for atoi and atof for + examples. +*/ + .data + +fmt: .asciz "double: %f integer: %d\n" + + .end diff --git a/more/jump_tables/.gdb_history b/more/jump_tables/.gdb_history new file mode 100644 index 0000000..eaf1197 --- /dev/null +++ b/more/jump_tables/.gdb_history @@ -0,0 +1,6 @@ +b MyMemSet +run +n +n +:q +q diff --git a/more/jump_tables/README.md b/more/jump_tables/README.md new file mode 100644 index 0000000..06c4d43 --- /dev/null +++ b/more/jump_tables/README.md @@ -0,0 +1,297 @@ +# Jump or Branch Tables + +A jump or branch table is a powerful instruction saving technique that +can be used to switch between multiple single instructions or even +choose one of a series of functions to call (or branches to take).
+ +This concept can be found as the implementation of some `switch` +statements and is found at the very very lowest end of an Operating +System (interrupt vectors, for example). + +The sections below show several ways to build one. + +## Single Instructions a la Duff's Device + +[Duff's Device](https://en.wikipedia.org/wiki/Duff%27s_device) +shoehorned a jump table into the middle of a `while` loop. At the same +time, it also demonstrates a simple case of *loop unrolling*. +It's very creative. + +Let's expand on Duff's Device. + +The full source code for this example can be found +[here](./branch_table.S). It demonstrates a branch table consisting of +instructions which are meant to be executed in sequence after jumping +into the middle of the sequence. + +Here: + +```asm + mov x6, 8 + MOD x2, x6, x4, x5 // x4 gets l % 8 + cbz x4, 10f // Handle evenly divisible case. + sub x4, x6, x4 // Invert sense of x4 e.g. 3 becomes 5 +``` + +we are performing this: *x4 is getting the result of modding the +number of times we want the instructions executed by the number of +times we unrolled the loop*. + +Specifically, this example does `length % 8`. However, the AARCH64 ISA +does not include a *mod* instruction. The `MOD` macro used above is +defined as: + +```asm +.macro MOD src_a, src_b, dest, scratch + sdiv \scratch, \src_a, \src_b + msub \dest, \scratch, \src_b, \src_a +.endm +``` + +`msub` is a cool instruction. It does this: + +```d = c - (b * a)``` + +Example: 13 % 8 == 5. First the `sdiv`: 13 / 8 is 1. Then, the `msub`: +13 - (1 * 8) is 5. + +Next: + +```asm + cbz x4, 10f // Handle evenly divisible case. + sub x4, x6, x4 // Invert sense of x4 e.g. 5 becomes 3 +``` + +This code is key. + +If the result of the `mod` is 0, then the entire table must be executed. +This is implemented by the `cbz`. + +If the result of the `mod` is not 0, then its value must be *flipped*. +The idea here is that if the result of the mod is 5, for example, we +have 5 stragglers. We want to execute 5 of the sequential instructions +below.
So, we want to jump 3 instructions into the table. Notice that +3 is 8 - 5. + +Finally, we have the computation of the address to where we jump into +the middle of the table. + +```asm + LLD_ADDR x5, 10f + add x5, x5, x4, lsl 2 + br x5 +``` + +Each of the lines above bears description: + +The `LLD_ADDR` is from the [*convergence +macros*](./apple-linux-convergence.S). It loads the address of the +beginning of the table. + +Next, the `add` instruction multiplies the flipped result of the `mod` +by 4 (the length of one instruction) THEN adds it to the base address of +the table. We have calculated *instruction addresses* exactly the way we +would with array dereferences. Thank you John von Neumann. + +Finally, we `br` which means branch to an address contained in a +register. + +```asm +10: str w1, [x0], 1 + str w1, [x0], 1 + str w1, [x0], 1 + str w1, [x0], 1 + str w1, [x0], 1 + str w1, [x0], 1 + str w1, [x0], 1 + str w1, [x0], 1 + // loop code not shown +``` + +## Performing Multiple Instructions + +If you need to execute more than one instruction you have two choices: + +### Multiple Instructions by Address Arithmetic + +Suppose you needed two instructions in each step of the sequence. +Simply multiply the index by 8 instead of 4 (i.e. the length of two +instructions). The same technique works with a larger number. E.g. +you need three instructions per step: multiply by 12. + +Suppose some need 3 instruction and some need 2. You must handle this +because using this technique requires that all steps in the sequence +of steps must be the same length so that the address arithmetic works. + +To deal with some cases being shorter than others, insert the occasional +`nop` instruction in the indexes that are shorter than the others. 
+ +### Multiple Instructions by Branch / Branch + +Here's another [example of code](./jmptbl.s) that implements a branch or +jump table: + +```asm +jt: b 0f + b 1f + b 2f + b 3f + b 4f + b 5f + b 6f + b 7f +``` + +You jump into the middle of the table as per above and then immediately +jump some place else. This is like: + +```c +if (index == 0) { + blah +} else if (index == 1) { + blah +} else if (index == 2) { + blah +} etc. +``` + +### Multiple Instructions by Branch / Call + +You can modify the above techniques to make something like: + +```asm +jt: bl func_0 + bl func_1 + bl func_2 + bl func_3 + bl func_4 + bl func_5 + bl func_6 + bl func_7 +``` + +or to be more similar to a `break` statement coming after each case: + +```asm +jt: bl func_0 + b common_label + bl func_1 + b common_label + bl func_2 + b common_label + bl func_3 + b common_label + bl func_4 + b common_label + bl func_5 + b common_label + bl func_6 + b common_label + bl func_7 + b common_label + + // perhaps some loop control... if none, the preceding + // b can be removed since can fall through to the common + // label. +common_label: +``` + +## Small Gaps in Sequential Indexes + +Suppose your range of indexes was 0 through 8 inclusive (notice there +are 9 integers in the range) but index 7 is skipped. That is, your +potential indexes are 0 through 6 inclusive and then 8 but never +7. + +In a `switch` statement, this would look like: + +```c++ +/* +// Ensure index is a valid value before getting here. In this case the +// valid range is 0 through 8 inclusive (a range of 9 values). To fill +// out to the next power of 2 (which would be 16), one could put in +// empty cases plus a default. 
+*/ +switch (index & 0xF) { + case 0: blah blah; + break; + case 1: blah blah; + break; + case 2: blah blah; + break; + case 3: blah blah; + break; + case 4: blah blah; + break; + case 5: blah blah; + break; + case 6: blah blah; + break; + case 8: blah blah; + break; +} +``` + +Gaps in the potential indexes present a surmountable problem if the +gaps are few. + +In the case where there are a small number of gaps simply fill them +with a branch to a common, otherwise "do nothing", label. For example, +you might have: + +```asm +b_table: b label0 + b label1 + b label2 + b label3 + b label4 + b label5 + b label6 + b do_nothing + b label8 +``` + +In the style of Duff's Device where you are executing sequential single +instructions, it might look like this: + +```asm +x_fer: str w1, [x0], 1 + str w1, [x0], 1 + str w1, [x0], 1 + str w1, [x0], 1 + str w1, [x0], 1 + str w1, [x0], 1 + str w1, [x0], 1 + nop + str w1, [x0], 1 +``` + +Here, the `nop` instruction means "no operation". It does nothing but +is a valid instruction meant to take up space (and decades ago, take +up time). + +In a high level language this might look like this: + +```c +for (int i = 0; i <= 8; i++) { + if (i == 7) + continue; + blah blah +} +``` + +## More about the `switch` statement + +`switch` statements are optimized using many more techniques than suggested +here. In fact, the implementation of optimized `switch` statements is +fascinating. There might be: + +* binary searches for large numbers of cases + +* separation of ranges where each sub-range is optimized in a different +way + +* degeneration into streams of if / else ifs + +and other techniques. The people who work on the compilers we take for +granted really are due some respect and *free beer*.
diff --git a/more/jump_tables/README.pdf b/more/jump_tables/README.pdf new file mode 100644 index 0000000..f7b6974 Binary files /dev/null and b/more/jump_tables/README.pdf differ diff --git a/more/jump_tables/apple-linux-convergence.S b/more/jump_tables/apple-linux-convergence.S new file mode 100644 index 0000000..8827423 --- /dev/null +++ b/more/jump_tables/apple-linux-convergence.S @@ -0,0 +1,156 @@ +/* Macros to permit the "same" assembly language to build on ARM64 + Linux systems as well as Apple Silicon systems. + + See the fuller documentation at: + https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md + + Perry Kivolowitz + A Gentle Introduction to Assembly Language +*/ + +.macro GLD_PTR xreg, label +#if defined(__APPLE__) + adrp \xreg, _\label@GOTPAGE + ldr \xreg, [\xreg, _\label@GOTPAGEOFF] +#else + ldr \xreg, =\label + ldr \xreg, [\xreg] +#endif +.endm + +.macro GLD_ADDR xreg, label // Get a global address +#if defined(__APPLE__) + adrp \xreg, _\label@GOTPAGE + add \xreg, \xreg, _\label@GOTPAGEOFF +#else + ldr \xreg, =\label +#endif +.endm + +.macro LLD_ADDR xreg, label +#if defined(__APPLE__) + adrp \xreg, \label@PAGE + add \xreg, \xreg, \label@PAGEOFF +#else + ldr \xreg, =\label +#endif +.endm + +.macro LLD_DBL xreg, dreg, label +#if defined(__APPLE__) + adrp \xreg, \label@PAGE + add \xreg, \xreg, \label@PAGEOFF + ldur \dreg, [\xreg] +// fmov \dreg, \xreg +#else + ldr \xreg, =\label + ldur \dreg, [\xreg] +#endif +.endm + +.macro LLD_FLT xreg, sreg, label +#if defined(__APPLE__) + adrp \xreg, \label@PAGE + add \xreg, \xreg, \label@PAGEOFF + ldur \sreg, [\xreg] +#else + ldr \xreg, =\label + ldur \sreg, [\xreg] +#endif +.endm + +.macro GLABEL label +#if defined(__APPLE__) + .global _\label +#else + .global \label +#endif +.endm + +.macro MAIN +#if defined(__APPLE__) +_main: +#else +main: +#endif +.endm + +/* Fetching the address of the externally defined errno is quite + different on Apple and Linux. 
This macro leaves the address of + errno in x0. +*/ +.macro ERRNO_ADDR +#if defined(__APPLE__) + bl ___error +#else + bl __errno_location +#endif +.endm + +.macro CRT label +#if defined(__APPLE__) + bl _\label +#else + bl \label +#endif +.endm + +.macro START_PROC // after starting label + .cfi_startproc +.endm + +.macro END_PROC // after the return + .cfi_endproc +.endm + +.macro PUSH_P a, b + stp \a, \b, [sp, -16]! +.endm + +.macro PUSH_R a + str \a, [sp, -16]! +.endm + +.macro POP_P a, b + ldp \a, \b, [sp], 16 +.endm + +.macro POP_R a + ldr \a, [sp], 16 +.endm + +/* The smaller of src_a and src_b is put into dest. A cmp instruction + or other instruction that sets the flags must be performed first. + This macro makes it easy to remember which register does what in the + csel. + + Thank you to u/TNorthover for nudge to add the cmp. +*/ + +.macro MIN src_a, src_b, dest + cmp \src_a, \src_b + csel \dest, \src_a, \src_b, LT +.endm + +/* The larger of src_a and src_b is put into dest. A cmp instruction + or other instruction that sets the flags must be performed first. + This macro makes it easy to remember which register does what in the + csel. + + Thank you to u/TNorthover for nudge to add the cmp. +*/ + +.macro MAX src_a, src_b, dest + cmp \src_a, \src_b + csel \dest, \src_a, \src_b, GT +.endm + +.macro AASCIZ label, string + .p2align 2 +\label: .asciz "\string" +.endm + +.macro MOD src_a, src_b, dest, scratch + sdiv \scratch, \src_a, \src_b + msub \dest, \scratch, \src_b, \src_a +.endm diff --git a/more/jump_tables/branch_table.S b/more/jump_tables/branch_table.S new file mode 100644 index 0000000..84d1b49 --- /dev/null +++ b/more/jump_tables/branch_table.S @@ -0,0 +1,57 @@ +#include "apple-linux-convergence.S" + + .p2align 2 + .text + GLABEL MyMemSet + +/* MyMemSet(unsigned char * b, unsigned char v, long l) + x0 w1 x2 + + The length is first checked against less than or equal to 0. If + so, the body of the function is skipped. + + The loop will be unrolled 8x. 
The length (x2) modulo 8 gets turned + into the number of instructions to jump to or beyond the initial + strb. A modulo of 0 is handled separately - it causes a branch to the + initial strb. + + This code can be dramatically improved by copying more than one byte + at a time. You will have to figure out how to do this optimally in + P6 - MemCpy +*/ +#if defined(__APPLE__) +_MyMemSet: +#else +MyMemSet: +#endif + START_PROC + PUSH_P x29, x30 + mov x29, sp + cmp x2, xzr // Test for bad length. + ble 99f // Take branch if 0 or less. + + add x3, x2, x0 // x3 gets address of one beyond buffer + mov x6, 8 + MOD x2, x6, x4, x5 // x4 gets l % 8 + cbz x4, 10f // Handle evenly divisible case. + sub x4, x6, x4 // Invert sense of x4 e.g. 3 becomes 5 + + LLD_ADDR x5, 10f + add x5, x5, x4, lsl 2 + br x5 + +10: strb w1, [x0], 1 // Byte store: a word str here would write 3 bytes too many. + strb w1, [x0], 1 + strb w1, [x0], 1 + strb w1, [x0], 1 + strb w1, [x0], 1 + strb w1, [x0], 1 + strb w1, [x0], 1 + strb w1, [x0], 1 + cmp x3, x0 + bhi 10b // Unsigned compare: x3 and x0 are addresses. + +99: POP_P x29, x30 + ret + END_PROC + diff --git a/more/jump_tables/jmptbl.s b/more/jump_tables/jmptbl.s new file mode 100644 index 0000000..97e272a --- /dev/null +++ b/more/jump_tables/jmptbl.s @@ -0,0 +1,83 @@ + .text + .align 4 + .global main + +main: str x30, [sp, -16]! + mov x0, xzr // set up call to time(nullptr) + bl time // call time setting up srand + bl srand // call srand setting up rand + bl rand // get a random number + and x0, x0, 7 // ensure its range is 0 to 7 + // note use of x register is on purpose + lsl x0, x0, 2 // multiply by 4 + ldr x1, =jt // load base address of jump table + add x1, x1, x0 // add offset to base address + br x1 + +// If, as in this case, all the "cases" have the same number of +// instructions then this intermediate jump table can be omitted saving +// some space and a tiny amount of time. To omit the intermediate jump +// table, you'd multiply by 12 above and not 4. Twelve because each +// "case" has 3 instructions (3 x 4 == 12).
+ +// Question for you: If you did omit the jump table, relative to what +// would you jump (since "jt" would be gone). + +jt: b 0f + b 1f + b 2f + b 3f + b 4f + b 5f + b 6f + b 7f + +0: ldr x0, =ZR + bl puts + b 99f + +1: ldr x0, =ON + bl puts + b 99f + +2: ldr x0, =TW + bl puts + b 99f + +3: ldr x0, =TH + bl puts + b 99f + +4: ldr x0, =FR + bl puts + b 99f + +5: ldr x0, =FV + bl puts + b 99f + +6: ldr x0, =SX + bl puts + b 99f + +7: ldr x0, =SV + bl puts + b 99f + +99: mov w0, wzr + ldr x30, [sp], 16 + ret + + .data + .section .rodata + +ZR: .asciz "0 returned" +ON: .asciz "1 returned" +TW: .asciz "2 returned" +TH: .asciz "3 returned" +FR: .asciz "4 returned" +FV: .asciz "5 returned" +SX: .asciz "6 returned" +SV: .asciz "7 returned" + + .end diff --git a/more/jump_tables/jt.c b/more/jump_tables/jt.c new file mode 100644 index 0000000..e4a56d1 --- /dev/null +++ b/more/jump_tables/jt.c @@ -0,0 +1,55 @@ +#include +#include +#include + +/* This is the prototype for the assembly language version. You may + have always thought that switch statements are implemented as a long + chain of if / else. Well, sometimes they are. Sometimes they are + implemented using binary search and still other times they are + implemented as jump tables. + + My assembly language version is found in jmptbl.s. 
+*/ + +int main() +{ + int r; + + srand(time(0)); + r = rand() & 7; + switch (r) + { + case 0: + puts("0 returned"); + break; + + case 1: + puts("1 returned"); + break; + + case 2: + puts("2 returned"); + break; + + case 3: + puts("3 returned"); + break; + + case 4: + puts("4 returned"); + break; + + case 5: + puts("5 returned"); + break; + + case 6: + puts("6 returned"); + break; + + case 7: + puts("7 returned"); + break; + } + return 0; +} \ No newline at end of file diff --git a/more/jump_tables/test_interop.cpp b/more/jump_tables/test_interop.cpp new file mode 100644 index 0000000..da41cb8 --- /dev/null +++ b/more/jump_tables/test_interop.cpp @@ -0,0 +1,31 @@ +#include + +extern "C" void MyMemSet(unsigned char *, unsigned char v, long length); + +/* MyMemSet(unsigned char *, unsigned char v, long length); +*/ + +/* +void MyMemSet(unsigned char * b, unsigned char v, long l) { + for (long i = 0; i < l; i++) { + b[i] = v; + } +} +*/ +const long BUFFER_SIZE = 1000; + +unsigned char buffer[BUFFER_SIZE]; + +int main() { + unsigned char before = buffer[-1]; + unsigned char after = buffer[BUFFER_SIZE]; + + MyMemSet(buffer, 0xF0, 3); + + if (before != buffer[-1]) + printf("Bytes prior to buffer are smashed.\n"); + if (after != buffer[BUFFER_SIZE]) + printf("Bytes after buffer are smashed.\n"); + + return 0; +} diff --git a/projects/SINE/README.md b/projects/SINE/README.md index 887be52..96ef142 100644 --- a/projects/SINE/README.md +++ b/projects/SINE/README.md @@ -17,7 +17,9 @@ sin x = x - x^3/3! + x^5/5! - x^7/7! ... Notice each term flips from addition to subtraction. -Notice each term is based on the odd integers starting at 1. +Notice each term is based on the odd integers starting at 1. While the +"1" case might look different, it is the same as all the others since +1 is just 1 to the first power divided by 1 factorial. ## Command line @@ -29,40 +31,76 @@ arguments are therefore required. be a double. * The number of terms to evaluate. 
The number of terms must lie between - 1 and 10 inclusive. + 1 and 10 inclusive. Note the value of 10 as an upper bound is new. It + was 8. ## C version To assist your efforts, [here](./c_version.c) is a version of this -project written in C. +project written in C. This has been updated to print nice debugging +output which is not part of the project. -## Errors to stderr - -Error messages must be sent to `stderr`. - -If you are using the convergence macros to allow your program to build -on both Apple Silicon Mac OS and Linux, note the special casing needed -to deal with `stderr`. If this is you, compile the C version on Mac OS -with the `-S` compiler option to see the generated assembly language and -search for `stderr`. +This C version also demonstrates a different way of calculating the +toggle. This version flips the sign of the toggle by multiplying by -1. +The previous version used odd and even values of the term. ## Sample executions ```text -SINE % ./a.out 0 8 -The sine of 0.00 degrees is 0.000000 in radians. -SINE % ./a.out 90 8 -The sine of 90.00 degrees is 1.000000 in radians. -SINE % ./a.out 180 8 -The sine of 180.00 degrees is -0.000001 in radians. -SINE % ./a.out 180 82 +pk_taylor_series > gcc main.S -o a +pk_taylor_series > ./a 0 10 +The sine of 0.00 degrees is 0.00000000. +pk_taylor_series > ./a 30 10 +The sine of 30.00 degrees is 0.50000000. +pk_taylor_series > ./a 45 10 +The sine of 45.00 degrees is 0.70710678. +pk_taylor_series > ./a 90 10 +The sine of 90.00 degrees is 1.00000000. +pk_taylor_series > ./a 180 10 +The sine of 180.00 degrees is -0.00000000. +pk_taylor_series > ./a 360 10 +The sine of 360.00 degrees is -0.00104818. +pk_taylor_series > ./a 360 100 Number of terms is out of range. -SINE % ./a.out 180 -10 +pk_taylor_series > ./a 360 -1 Number of terms is out of range. -SINE % echo $? -1 +pk_taylor_series > ``` +## Floating point instructions I used + +These are the floating point instructions I used in my implementation.
+ +* fmov + +* scvtf + +* fmul + +* fdiv + +* fadd + +## How I broke up the program + +I have functions named: + +* main + +* HandleOptions + +* Factorial + +* IntegerPower - x to the nth power + +* ComputeSine - The main calculation + +* PrintAnswer + +* ConvertTheta - Wrap D2R + +* D2R - Degrees to radians + ## CSC3510 The following applies to Carthage College CSC3510 students. @@ -74,4 +112,3 @@ Work is to be done solo. ### What to hand in Just the .S file. **Your name must be at the top of the file.** - diff --git a/projects/SINE/README.pdf b/projects/SINE/README.pdf index eba1271..dd10305 100644 Binary files a/projects/SINE/README.pdf and b/projects/SINE/README.pdf differ diff --git a/projects/SINE/c_version.c b/projects/SINE/c_version.c index 9c47c14..c7937b6 100644 --- a/projects/SINE/c_version.c +++ b/projects/SINE/c_version.c @@ -1,13 +1,14 @@ #include #include +#include -double pi = 3.14159265359; +double pi = 3.14159265358979323846; double D2R(double d) { return d * pi / 180.0; } -long Factorial(int n) { +double Factorial(int n) { long retval = 1; if (n > 0) { @@ -15,7 +16,7 @@ long Factorial(int n) { retval = retval * n--; } } - return retval; + return (double) retval; } double IntegerPower(double b, int e) { @@ -48,20 +49,20 @@ int main(int argc, char ** argv) { double r_angle = D2R(angle); + double toggle = 1.0; for (int term = 0, base = 1; term < terms; term++, base += 2) { - double toggle = (term & 1) ? 
-1.0 : 1.0; - + if (toggle > 0) { + printf("%+03.8e + %+03.8e / %+03.8e [term %2d is %+03.8e]\n", sin, IntegerPower(r_angle, base), + Factorial(base), term + 1, toggle * IntegerPower(r_angle, base) / Factorial(base)); + } else { + printf("%+03.8e - %+03.8e / %+03.8e [term %2d is %+03.8e]\n", sin, IntegerPower(r_angle, base), + Factorial(base), term + 1, toggle * IntegerPower(r_angle, base) / Factorial(base)); + } sin += toggle * IntegerPower(r_angle, base) / Factorial(base); - /* - if (toggle > 0) { - printf("adding %d p/b intermediate: %f\n", base, sin); - } else { - printf("subtracting %d p/b intermediate: %f\n", base, sin); - } - */ + toggle = toggle * -1; } - printf("The sine of %.2f degrees is %f in radians.\n", angle, sin); + printf("The sine of %0.4f degrees is %0.10f.\n", angle, sin); return 0; } \ No newline at end of file diff --git a/python/apple-linux-convergence.S b/python/apple-linux-convergence.S index c60d2da..8827423 100644 --- a/python/apple-linux-convergence.S +++ b/python/apple-linux-convergence.S @@ -149,3 +149,8 @@ main: .p2align 2 \label: .asciz "\string" .endm + +.macro MOD src_a, src_b, dest, scratch + sdiv \scratch, \src_a, \src_b + msub \dest, \scratch, \src_b, \src_a +.endm diff --git a/section_1/hello_world/apple-linux-convergence.S b/section_1/hello_world/apple-linux-convergence.S index c60d2da..8827423 100644 --- a/section_1/hello_world/apple-linux-convergence.S +++ b/section_1/hello_world/apple-linux-convergence.S @@ -149,3 +149,8 @@ main: .p2align 2 \label: .asciz "\string" .endm + +.macro MOD src_a, src_b, dest, scratch + sdiv \scratch, \src_a, \src_b + msub \dest, \scratch, \src_b, \src_a +.endm diff --git a/section_1/regs/apple-linux-convergence.S b/section_1/regs/apple-linux-convergence.S index c60d2da..8827423 100644 --- a/section_1/regs/apple-linux-convergence.S +++ b/section_1/regs/apple-linux-convergence.S @@ -149,3 +149,8 @@ main: .p2align 2 \label: .asciz "\string" .endm + +.macro MOD src_a, src_b, dest, scratch + sdiv 
\scratch, \src_a, \src_b + msub \dest, \scratch, \src_b, \src_a +.endm diff --git a/section_1/structs/apple-linux-convergence.S b/section_1/structs/apple-linux-convergence.S index c60d2da..8827423 100644 --- a/section_1/structs/apple-linux-convergence.S +++ b/section_1/structs/apple-linux-convergence.S @@ -149,3 +149,8 @@ main: .p2align 2 \label: .asciz "\string" .endm + +.macro MOD src_a, src_b, dest, scratch + sdiv \scratch, \src_a, \src_b + msub \dest, \scratch, \src_b, \src_a +.endm diff --git a/section_2/float/apple-linux-convergence.S b/section_2/float/apple-linux-convergence.S index c60d2da..8827423 100644 --- a/section_2/float/apple-linux-convergence.S +++ b/section_2/float/apple-linux-convergence.S @@ -149,3 +149,8 @@ main: .p2align 2 \label: .asciz "\string" .endm + +.macro MOD src_a, src_b, dest, scratch + sdiv \scratch, \src_a, \src_b + msub \dest, \scratch, \src_b, \src_a +.endm diff --git a/section_2/float/asm_rounding.s b/section_2/float/asm_rounding.S similarity index 100% rename from section_2/float/asm_rounding.s rename to section_2/float/asm_rounding.S diff --git a/section_2/float/fmov.md b/section_2/float/fmov.md index 0028533..6ab7af7 100644 --- a/section_2/float/fmov.md +++ b/section_2/float/fmov.md @@ -2,27 +2,23 @@ The `fmov` instruction is used to move floating point values in and out of floating point registers and to some degree, moving data between -integer and floating point registers. +integer and floating point registers. ## Loading Floating Point Numbers as Immediate Values -Just as we saw with integer -registers, some values can be used as immediate values and some cannot. +Just as we saw with integer registers, some values can be used as +immediate values and some cannot. It comes down to how many bits are +necessary to encode the value. Too many bits... not enough room to fit +in a 4 byte instruction plus the opcode. 
For example, this works: -`mov x0, 65536` +`mov x0, 65535` but this does not: `mov x0, 65537` -The reason is that all AARCH64 instructions must fit within a 32 bit -instruction that must hold the instruction's op code, its flags and -other bits and bobs plus any immediate value. In the above example we -can see that the `mov` instruction provides up to 16 bits for an -immediate value. - The constraints placed on immediate values for `fmov` are much tighter because floating point numbers are far more complex than integers. @@ -40,7 +36,7 @@ Let's take a look at some code: fmov d0, 1.96875 // Zoinks! ``` -From this we can see that an immediate value for an `fmov` seems to have +From this we can see that an immediate value for an `fmov` has 4 bits available for the mantissa. In fact, the only values that work as immediate values will be those floating point values whose fractional values are combinations of: @@ -56,6 +52,9 @@ values are combinations of: As far as exponents go, `fmov` can accommodate 3 bits. So, exponents of plus or minus 2**7 can be used. +Adding a sign bit brings the total number of bits available for +immediate moves to 8. + ## Loading / Storing Floating Point Numbers in General When in doubt, load fixed floating point numbers from memory. This is @@ -64,11 +63,16 @@ covered [in this chapter](./literals.md). ## SIMD `fmov` can also deal with the more complicated special cases induced by -SIMD instructions. +SIMD instructions. `fmov` is able to move values between the various +register widths such as single precision to double precision. **However, +no conversion of value is performed - `fmov` just copies bits.** + +If you need to change the precision of a floating point value, the +`fcvt` family of instructions must be used instead. ## Movement To / From Integer Registers -`fmov` can *bits* between the integer and floating point registers. We -emphasize the *bits*. No conversions are done using `fmov`. There exist -other instructions for that.
See [this chapter](./rounding.md) for more -information. +`fmov` can copy *bits* between the integer and floating point registers. +We emphasize the *bits*. No conversions are done using `fmov`. There +exist other instructions for that. See [this chapter](./rounding.md) for +more information. diff --git a/section_2/float/fmov.pdf b/section_2/float/fmov.pdf index 20d5dee..8c9e9f4 100644 Binary files a/section_2/float/fmov.pdf and b/section_2/float/fmov.pdf differ diff --git a/section_2/float/literals.s b/section_2/float/literals.S similarity index 100% rename from section_2/float/literals.s rename to section_2/float/literals.S diff --git a/section_2/float/literals.md b/section_2/float/literals.md index 3466976..89673c3 100644 --- a/section_2/float/literals.md +++ b/section_2/float/literals.md @@ -20,30 +20,32 @@ To load a `float`, you could translate the value to binary and do as the following: ```asm - .text // 1 - .global main // 2 - .align 2 // 3 - // 4 -main: str x30, [sp, -16]! // 5 - ldr s0, =0x3fc00000 // 6 - fcvt d0, s0 // 7 - ldr x0, =fmt // 8 - bl printf // 9 - ldr x30, [sp], 16 // 10 - mov w0, wzr // 11 - ret // 12 - // 13 - .data // 14 -fmt: .asciz "%f\n" // 15 - .end // 16 + .text // 1 + .global main // 2 + .align 2 // 3 + // 4 +main: str x30, [sp, -16]! // 5 + ldr s0, =0x3fc00000 // 6 + fcvt d0, s0 // 7 + ldr x0, =fmt // 8 + bl printf // 9 + ldr x30, [sp], 16 // 10 + mov w0, wzr // 11 + ret // 12 + // 13 + .data // 14 +fmt: .asciz "%f\n" // 15 + .end // 16 ``` -The above code is found [here](./t.s). +The above code is kind of found [here](./t.s) - the file is used +for miscellaneous testing. -`Line 6` puts the translated value of 1.5 into `s0` (since the value -is a `float` it goes in an `s` register). The assembler performs some -magic getting a 32 bit value seemingly fit into a 32 bit instruction. -See [below](./literals.md#fitting-32-bits-into-a-32-bit-bag). 
+`Line 6` puts the translated value of 1.5 into `s0` (since we are +thinking of the value as a `float` it goes in an `s` register). The +assembler performs some magic getting a 32 bit value to seemingly fit into +a 32 bit instruction. See +[below](./literals.md#fitting-32-bits-into-a-32-bit-bag). `Line 7` converts the single precision number into a double precision number for printing. @@ -136,6 +138,9 @@ Cool huh? ## Fitting 32 bits into a 32 bit bag +**This section is currently LINUX-centric - in the future it will +address both native Apple and Linux equally.** + AARCH64 instructions are 32 bits in width. Yet, `line 6` from [this](./t.s) program reads: @@ -195,15 +200,16 @@ Scan downward to find `0x7a0`: 0x7a0 .inst 0x3fc00000 ; undefined ``` -Hey look! Here's our literal float. The `.inst` is an ARM -specific GNU assembler directive what allows the programmer -to encode their own instruction. Note, the encoded instruction does not -have to make any sense - instead the compiler has emitted a make believe -instruction that happens to have the value of our literal. +Hey look! Here's our literal float. The `.inst` is an ARM specific GNU +assembler directive which, here, effectively says: `¯\_(-)_/¯`. + +Note, the encoded "instruction" does not have to make any sense - +instead the compiler has emitted a make believe instruction that happens +to have the value of our literal. What we're seeing the actual `line 6` doing is reaching ahead a short -distance to load the value of another "instruction" when really it is -our constant. +distance to load the value of another location in memory where our +constant is really found. Let us take this explanation further.
Notice we see: diff --git a/section_2/float/literals.pdf b/section_2/float/literals.pdf index 8090205..1a2ac50 100644 Binary files a/section_2/float/literals.pdf and b/section_2/float/literals.pdf differ diff --git a/section_2/float/t.s b/section_2/float/t.s index 645b851..135c68e 100644 --- a/section_2/float/t.s +++ b/section_2/float/t.s @@ -1,12 +1,16 @@ .text - .global main - .align 2 + .global _main + .align 2 -main: str x30, [sp, -16]! +_main: + str x30, [sp, -16]! + mov x0, 0xFFFFFFFF +/* ldr s0, =0x3fc00000 fcvt d0, s0 ldr x0, =fmt bl printf +*/ ldr x30, [sp], 16 mov w0, wzr ret