Merge branch 'main' of https://github.com/pkivolowitz/asm_book

ijoijoijoij
2026-06-21 01:46:46 +08:00 · 2023-04-15 22:22:49 -05:00 · 2023-04-15 22:22:49 -05:00 · 65931b77a0
commit 65931b77a0
parent f59c6c92cf 5ce8078652
29 changed files with 1125 additions and 84 deletions
--- a/README.md
+++ b/README.md
@ -308,7 +308,11 @@ What would a book about assembly language be without bit bashing?

 ### Section 4 - More Stuff

-In this section, we present miscellaneous material.
+In this section, we present miscellaneous material including our "world
+famous lecture" on debugging. This lecture has been invited at several
+colleges and universities. It is intended for audiences working with
+languages like C, C++ and assembly language but some of the lessons
+contained therein are applicable to all languages.

 | Chapter | Markdown | PDF |
 | ------- | -------- | --- |
@ -319,6 +323,9 @@ In this section, we present miscellaneous material.
 | 5 | [Determining string literal lengths for C functions](./more/strlen_for_c/README.md) | [Link](./more/strlen_for_c/README.pdf) |
 | 6 | [Calling Assembly Language From Python](./python/) | [Link](./python/README.pdf) |
 | 7 | [Atomic Operations](./more/atomics/README.md) | [Link](./more/atomics/README.pdf) |
+| 8 | [Jump Tables](./more/jump_tables/README.md) | [Link](./more/jump_tables/README.pdf) |
+| 9 | [argv](./more/argv_example/jess1.S) | ASM CODE |
+| - | [Debugging Lecture](./debugging/Discourses%20and%20Dialogs%20on%20Debugging.pptx) | PPTX |

 ## Macro Suite

--- a/README.pdf
+++ b/README.pdf
--- a/debugging/Discourses
+++ b/debugging/Discourses
--- a/more/apple_silicon/apple-linux-convergence.S
+++ b/more/apple_silicon/apple-linux-convergence.S
@ -149,3 +149,8 @@ main:
        .p2align    2
 \label: .asciz      "\string"
 .endm
+
+.macro  MOD         src_a, src_b, dest, scratch     // dest = src_a % src_b (signed)
+        sdiv        \scratch, \src_a, \src_b        // scratch = src_a / src_b
+        msub        \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b
+.endm
--- a/more/argv_example/apple-linux-convergence.S
+++ b/more/argv_example/apple-linux-convergence.S
@ -0,0 +1,156 @@
+/*  Macros to permit the "same" assembly language to build on ARM64
+    Linux systems as well as Apple Silicon systems.
+
+    See the fuller documentation at:
+    https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
+
+    Perry Kivolowitz
+    A Gentle Introduction to Assembly Language
+*/
+
+.macro  GLD_PTR     xreg, label
+#if defined(__APPLE__)
+        adrp	    \xreg, _\label@GOTPAGE
+        ldr	        \xreg, [\xreg, _\label@GOTPAGEOFF]
+#else
+        ldr         \xreg, =\label
+        ldr         \xreg, [\xreg]
+#endif
+.endm
+
+.macro  GLD_ADDR    xreg, label     // Get a global address
+#if defined(__APPLE__)
+        adrp	    \xreg, _\label@GOTPAGE
+        add         \xreg, \xreg, _\label@GOTPAGEOFF
+#else
+        ldr         \xreg, =\label
+#endif
+.endm
+
+.macro  LLD_ADDR xreg, label
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE
+        add     \xreg, \xreg, \label@PAGEOFF
+#else
+        ldr         \xreg, =\label
+#endif
+.endm
+
+.macro  LLD_DBL xreg, dreg, label
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE
+        add     \xreg, \xreg, \label@PAGEOFF
+        ldur    \dreg, [\xreg]
+//      fmov    \dreg, \xreg
+#else
+        ldr     \xreg, =\label
+        ldur    \dreg, [\xreg]
+#endif
+.endm
+
+.macro  LLD_FLT xreg, sreg, label
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE
+        add     \xreg, \xreg, \label@PAGEOFF
+        ldur    \sreg, [\xreg]
+#else
+        ldr     \xreg, =\label
+        ldur    \sreg, [\xreg]
+#endif
+.endm
+
+.macro GLABEL label
+#if defined(__APPLE__)
+        .global _\label
+#else
+        .global \label
+#endif
+.endm
+
+.macro MAIN
+#if defined(__APPLE__)
+_main:
+#else
+main:
+#endif
+.endm
+
+/*  Fetching the address of the externally defined errno is quite
+    different on Apple and Linux. This macro leaves the address of
+    errno in x0.
+*/
+.macro  ERRNO_ADDR
+#if defined(__APPLE__)
+        bl      ___error
+#else
+        bl      __errno_location
+#endif
+.endm
+
+.macro  CRT label
+#if defined(__APPLE__)
+        bl  _\label
+#else
+        bl  \label
+#endif
+.endm
+
+.macro  START_PROC          // after starting label
+        .cfi_startproc
+.endm
+
+.macro  END_PROC            // after the return
+        .cfi_endproc
+.endm
+
+.macro  PUSH_P  a, b 
+        stp     \a, \b, [sp, -16]!
+.endm
+
+.macro  PUSH_R  a 
+        str     \a, [sp, -16]!
+.endm
+
+.macro  POP_P   a, b 
+        ldp     \a, \b, [sp], 16
+.endm
+
+.macro  POP_R   a 
+        ldr     \a, [sp], 16
+.endm
+
+/*  The smaller of src_a and src_b is put into dest. The macro issues
+    its own cmp, so the caller need not set the flags first; note that
+    the condition flags are overwritten. LT is a signed comparison.
+    This macro makes it easy to remember which register does what.
+
+    Thank you to u/TNorthover for the nudge to add the cmp.
+*/
+
+.macro  MIN     src_a, src_b, dest
+        cmp     \src_a, \src_b
+        csel    \dest, \src_a, \src_b, LT
+.endm
+
+/*  The larger of src_a and src_b is put into dest. The macro issues
+    its own cmp, so the caller need not set the flags first; note that
+    the condition flags are overwritten. GT is a signed comparison.
+    This macro makes it easy to remember which register does what.
+
+    Thank you to u/TNorthover for the nudge to add the cmp.
+*/
+
+.macro  MAX     src_a, src_b, dest
+        cmp     \src_a, \src_b
+        csel    \dest, \src_a, \src_b, GT
+.endm
+
+.macro  AASCIZ      label, string
+        .p2align    2
+\label: .asciz      "\string"
+.endm
+
+.macro  MOD         src_a, src_b, dest, scratch     // dest = src_a % src_b (signed)
+        sdiv        \scratch, \src_a, \src_b        // scratch = src_a / src_b
+        msub        \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b
+.endm
--- a/more/argv_example/jess1.S
+++ b/more/argv_example/jess1.S
@ -0,0 +1,111 @@
+#include "apple-linux-convergence.S"
+
+        .p2align    2
+        .text
+        GLABEL      main
+
+/*  This program will get a string followed by a double followed by an
+    integer from the command line demonstrating how each of these types
+    can be retrieved.
+
+    Example:
+    ./a.out test 29.3 29
+*/
+
+MAIN
+        PUSH_P      x29, x30
+        mov         x29, sp
+
+        // Check argc to see if it is 4. This is not the only way to
+        // validate command line arguments but it is an easy way.
+        cmp         w0, 4
+        bne         99f         // take branch if argc isn't "right".
+
+        // Skip past argv[0]
+        add         x1, x1, 8
+
+        // Fetch argv[1] as a string.
+        // x1 is a pointer to a pointer to chars (i.e. the string).
+        // Being a pointer to a pointer, it must be dereferenced to
+        // make a pointer.
+        ldr         x0, [x1]    // dereference
+        // Now x0 contains a pointer to the command line argument.
+        // Print the string (as a string). But doing this causes a
+        // function call which may destroy x1. So, save x1 temporarily.
+        // This could be avoided if x1 were moved to a backed up x
+        // register (e.g. x20).
+        PUSH_R      x1
+        CRT         puts    // ptr is in x0 where puts() needs it.
+        POP_R       x1
+
+        // Advance x1 once again to get to argv[2]. This can be done
+        // in the same instruction as the dereference by using a
+        // pre-indexed load (note the trailing "!").
+        ldr         x0, [x1, 8]!    // dereference
+
+        // The string version of argv[2] is now pointed to by x0.
+        // This is exactly where atof would want it. We need atof
+        // because it turns strings into numbers. BUT, same as before,
+        // calling a function may destroy x1 so let's do the same
+        // trick of backing up x1 on the stack and then restoring after
+        // the function call.
+        PUSH_R      x1
+        CRT         atof    // ptr is in x0 where atof() needs it.
+        POP_R       x1
+        // The string value has been converted to a double left in d0.
+        // d0 is also a scratch register so for our next call to atoi,
+        // d0 will have to be preserved on the stack - alternatively,
+        // we could have used a high d register backed up and restored
+        // at the start and ending of main().
+
+        // Advance x1 once again to get to argv[3]. This can be done
+        // in the same instruction as the dereference by using a
+        // pre-indexed load (note the trailing "!").
+        ldr         x0, [x1, 8]!  // dereference
+
+        // The string version of argv[3] is now pointed to by x0.
+        // This is exactly where atoi would want it. We need atoi
+        // because it turns strings into numbers. BUT, same as before,
+        // calling a function may destroy x1 so we could do the same
+        // trick of backing up x1 on the stack and then restoring after
+        // the function call. We must also do the same for d0. Actually,
+        // we won't need argv after this so we will skip backing up x1.
+
+        PUSH_R      d0
+        CRT         atoi    // ptr is in x0 where atoi() needs it.
+        POP_R       d0
+        // d0 now contains the double.
+        // x0 now contains the integer.
+        // x0 must be copied to x1 because x0 must be a pointer to fmt
+        // for printf to work.
+        mov         x1, x0
+        LLD_ADDR    x0, fmt
+#if defined(__APPLE__)
+        sub         sp, sp, 16      // Apple: variadic args go on the stack
+        str         x1, [sp, 8]     // second variadic arg: int for %d
+        str         d0, [sp]        // first variadic arg: double for %f
+        CRT         printf
+        add         sp, sp, 16 
+#else
+        bl          printf
+#endif
+
+99:     POP_P       x29, x30
+        mov         w0, wzr
+        ret
+
+/* What did we learn?
+    * x1 has argv when main begins.
+    * pointers to the arguments are the contents of argv NOT
+    the actual values. Therefore, x1, which is a pointer (to a pointer),
+    must be dereferenced to get to the actual pointer. In the code,
+    there are three lines with the comment "// dereference".
+    * all command line arguments are c-strings. If that's not what you
+    want, they must be converted - see the code for atoi and atof for
+    examples.
+*/
+        .data
+
+fmt:    .asciz      "double: %f integer: %d\n"
+
+        .end
--- a/more/jump_tables/.gdb_history
+++ b/more/jump_tables/.gdb_history
@ -0,0 +1,6 @@
+b MyMemSet
+run
+n
+n
+:q
+q
--- a/more/jump_tables/README.md
+++ b/more/jump_tables/README.md
@ -0,0 +1,297 @@
+# Jump or Branch Tables
+
+A jump or branch table is a powerful instruction saving technique that
+can be used to switch between multiple single instructions or even
+choose one of a series of functions to call (or branches to take).
+
+This concept can be found as the implementation of some `switch`
+statements and is found at the very very lowest end of an Operating
+System (interrupt vectors, for example).
+
+The sections below walk through several variations on this idea.
+
+## Single Instructions a la Duff's Device
+
+[Duff's Device](https://en.wikipedia.org/wiki/Duff%27s_device)
+shoehorned a jump table into the middle of a `while` loop. At the same
+time, it also demonstrates a simple case of *loop unrolling*.
+It's very creative.
+
+Let's expand on Duff's Device.
+
+The full source code for this example can be found
+[here](./branch_table.S). It demonstrates a branch table consisting of
+instructions which are meant to be executed in sequence after jumping
+into the middle of the sequence.
+
+Here:
+
+```asm
+    mov         x6, 8
+    MOD         x2, x6, x4, x5  // x4 gets l % 8
+    cbz         x4, 10f         // Handle evenly divisible case.
+    sub         x4, x6, x4      // Invert sense of x4 e.g. 3 becomes 5
+```
+
+we are performing this: *x4 is getting the result of modding the
+number of times we want the instructions executed by the number of
+times we unrolled the loop*.
+
+Specifically, this example does `length % 8`. However, the AARCH64 ISA
+does not include a *mod* instruction. The `MOD` macro used above is
+defined as:
+
+```asm
+.macro  MOD         src_a, src_b, dest, scratch
+        sdiv        \scratch, \src_a, \src_b
+        msub        \dest, \scratch, \src_b, \src_a
+.endm
+```
+
+`msub` is a cool instruction. It does this:
+
+```d = c - (b * a)```
+
+Example: 13 % 8 == 5. First the `sdiv`: 13 / 8 is 1. Then, the `msub`:
+13 - (1 * 8) is 5.
+
+Next:
+
+```asm
+        cbz         x4, 10f         // Handle evenly divisible case.
+        sub         x4, x6, x4      // Invert sense of x4 e.g. 5 becomes 3
+```
+
+This code is key.
+
+If the result of the `mod` is 0, then the entire table must be executed.
+This is implemented by the `cbz`.
+
+If the result of the `mod` is not 0, then its value must be *flipped*.
+The idea here is that if the result of the mod is 5, for example, we
+have 5 stragglers. We want to execute 5 of the sequential instructions
+below. So, we want to jump 3 instructions into the table. Notice that
+3 is 8 - 5.
+
+Finally, we have the computation of the address to where we jump into
+the middle of the table.
+
+```asm
+        LLD_ADDR    x5, 10f
+        add         x5, x5, x4, lsl 2
+        br          x5
+```
+
+Each of the lines above bears description:
+
+The `LLD_ADDR` is from the [*convergence
+macros*](./apple-linux-convergence.S). It loads the address of the
+beginning of the table.
+
+Next, the `add` instruction multiplies the flipped result of the `mod`
+by 4 (the length of one instruction) THEN adds it to the base address of
+the table. We have calculated *instruction addresses* exactly the way we
+would with array dereferences. Thank you John von Neumann.
+
+Finally, we `br` which means branch to an address contained in a
+register.
+
+```asm
+10:     strb        w1, [x0], 1     // strb: exactly one byte per step
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        // loop code not shown
+```
+
+## Performing Multiple Instructions
+
+If you need to execute more than one instruction you have two choices:
+
+### Multiple Instructions by Address Arithmetic
+
+Suppose you needed two instructions in each step of the sequence.
+Simply multiply the index by 8 instead of 4 (i.e. the length of two
+instructions). The same technique works with a larger number. E.g.
+you need three instructions per step: multiply by 12.
+
+Suppose some need 3 instructions and some need 2. You must handle this
+because using this technique requires that all steps in the sequence
+of steps must be the same length so that the address arithmetic works.
+
+To deal with some cases being shorter than others, insert the occasional
+`nop` instruction in the indexes that are shorter than the others.
+
+### Multiple Instructions by Branch / Branch
+
+Here's another [example of code](./jmptbl.s) that implements a branch or
+jump table:
+
+```asm
+jt:     b       0f
+        b       1f
+        b       2f
+        b       3f
+        b       4f
+        b       5f
+        b       6f
+        b       7f
+```
+
+You jump into the middle of the table as per above and then immediately
+jump some place else. This is like:
+
+```c
+if (index == 0) {
+    blah
+} else if (index == 1) {
+    blah
+} else if (index == 2) {
+    blah
+} etc.
+```
+
+### Multiple Instructions by Branch / Call
+
+You can modify the above techniques to make something like:
+
+```asm
+jt:     bl       func_0
+        bl       func_1
+        bl       func_2
+        bl       func_3
+        bl       func_4
+        bl       func_5
+        bl       func_6
+        bl       func_7
+```
+
+or to be more similar to a `break` statement coming after each case:
+
+```asm
+jt:     bl       func_0
+        b        common_label
+        bl       func_1
+        b        common_label
+        bl       func_2
+        b        common_label
+        bl       func_3
+        b        common_label
+        bl       func_4
+        b        common_label
+        bl       func_5
+        b        common_label
+        bl       func_6
+        b        common_label
+        bl       func_7
+        b        common_label
+
+        // perhaps  some  loop control... if none, the preceding
+        // b can be removed since can fall through to the common
+        // label.
+common_label:
+```
+
+## Small Gaps in Sequential Indexes
+
+Suppose your range of indexes was 0 through 8 inclusive (notice there
+are 9 integers in the range) but index 7 is skipped. That is, your
+potential indexes are 0 through 6 inclusive and then 8 but never
+7.
+
+In a `switch` statement, this would look like:
+
+```c++
+/* 
+// Ensure index is a valid value before getting here. In this case the
+// valid range is 0 through 8 inclusive (a range of 9 values). To fill
+// out to the next power of 2 (which would be 16), one could put in
+// empty cases plus a default.
+*/
+switch (index & 0xF) {
+    case 0: blah blah;
+            break;
+    case 1: blah blah;
+            break;
+    case 2: blah blah;
+            break;
+    case 3: blah blah;
+            break;
+    case 4: blah blah;
+            break;
+    case 5: blah blah;
+            break;
+    case 6: blah blah;
+            break;
+    case 8: blah blah;
+            break;
+}
+```
+
+Gaps in the potential indexes present a surmountable problem if the
+gaps are few.
+
+In the case where there are a small number of gaps, simply fill them
+with a branch to a common, otherwise "do nothing", label. For example,
+you might have:
+
+```asm
+b_table:    b       label0
+            b       label1
+            b       label2
+            b       label3
+            b       label4
+            b       label5
+            b       label6
+            b       do_nothing
+            b       label8
+```
+
+In the style of Duff's Device, where you are executing sequential single
+instructions, it might look like this:
+
+```asm
+x_fer:      str     w1, [x0], 1
+            str     w1, [x0], 1
+            str     w1, [x0], 1
+            str     w1, [x0], 1
+            str     w1, [x0], 1
+            str     w1, [x0], 1
+            str     w1, [x0], 1
+            nop
+            str     w1, [x0], 1
+```
+
+Here, the `nop` instruction means "no operation". It does nothing but
+is a valid instruction meant to take up space (and decades ago, take
+up time).
+
+In a high level language this might look like this:
+
+```c
+for (int i = 0; i <= 8; i++) {
+    if (i == 7)
+        continue;
+    blah blah
+}
+```
+
+## More about the `switch` statement
+
+`switch` statements are optimized using many more techniques than are
+suggested here. In fact, the implementation of optimized `switch`
+statements is fascinating. There might be:
+
+* binary searches for large numbers of cases
+
+* separation of ranges where each sub-range is optimized in a different
+way
+
+* degeneration into streams of if / else ifs
+
+and other techniques. The people who work on the compilers we take for
+granted really are due some respect and *free beer*.
--- a/more/jump_tables/README.pdf
+++ b/more/jump_tables/README.pdf
--- a/more/jump_tables/apple-linux-convergence.S
+++ b/more/jump_tables/apple-linux-convergence.S
@ -0,0 +1,156 @@
+/*  Macros to permit the "same" assembly language to build on ARM64
+    Linux systems as well as Apple Silicon systems.
+
+    See the fuller documentation at:
+    https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
+
+    Perry Kivolowitz
+    A Gentle Introduction to Assembly Language
+*/
+
+.macro  GLD_PTR     xreg, label
+#if defined(__APPLE__)
+        adrp	    \xreg, _\label@GOTPAGE
+        ldr	        \xreg, [\xreg, _\label@GOTPAGEOFF]
+#else
+        ldr         \xreg, =\label
+        ldr         \xreg, [\xreg]
+#endif
+.endm
+
+.macro  GLD_ADDR    xreg, label     // Get a global address
+#if defined(__APPLE__)
+        adrp	    \xreg, _\label@GOTPAGE
+        add         \xreg, \xreg, _\label@GOTPAGEOFF
+#else
+        ldr         \xreg, =\label
+#endif
+.endm
+
+.macro  LLD_ADDR xreg, label
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE
+        add     \xreg, \xreg, \label@PAGEOFF
+#else
+        ldr         \xreg, =\label
+#endif
+.endm
+
+.macro  LLD_DBL xreg, dreg, label
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE
+        add     \xreg, \xreg, \label@PAGEOFF
+        ldur    \dreg, [\xreg]
+//      fmov    \dreg, \xreg
+#else
+        ldr     \xreg, =\label
+        ldur    \dreg, [\xreg]
+#endif
+.endm
+
+.macro  LLD_FLT xreg, sreg, label
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE
+        add     \xreg, \xreg, \label@PAGEOFF
+        ldur    \sreg, [\xreg]
+#else
+        ldr     \xreg, =\label
+        ldur    \sreg, [\xreg]
+#endif
+.endm
+
+.macro GLABEL label
+#if defined(__APPLE__)
+        .global _\label
+#else
+        .global \label
+#endif
+.endm
+
+.macro MAIN
+#if defined(__APPLE__)
+_main:
+#else
+main:
+#endif
+.endm
+
+/*  Fetching the address of the externally defined errno is quite
+    different on Apple and Linux. This macro leaves the address of
+    errno in x0.
+*/
+.macro  ERRNO_ADDR
+#if defined(__APPLE__)
+        bl      ___error
+#else
+        bl      __errno_location
+#endif
+.endm
+
+.macro  CRT label
+#if defined(__APPLE__)
+        bl  _\label
+#else
+        bl  \label
+#endif
+.endm
+
+.macro  START_PROC          // after starting label
+        .cfi_startproc
+.endm
+
+.macro  END_PROC            // after the return
+        .cfi_endproc
+.endm
+
+.macro  PUSH_P  a, b 
+        stp     \a, \b, [sp, -16]!
+.endm
+
+.macro  PUSH_R  a 
+        str     \a, [sp, -16]!
+.endm
+
+.macro  POP_P   a, b 
+        ldp     \a, \b, [sp], 16
+.endm
+
+.macro  POP_R   a 
+        ldr     \a, [sp], 16
+.endm
+
+/*  The smaller of src_a and src_b is put into dest. The macro issues
+    its own cmp, so the caller need not set the flags first; note that
+    the condition flags are overwritten. LT is a signed comparison.
+    This macro makes it easy to remember which register does what.
+
+    Thank you to u/TNorthover for the nudge to add the cmp.
+*/
+
+.macro  MIN     src_a, src_b, dest
+        cmp     \src_a, \src_b
+        csel    \dest, \src_a, \src_b, LT
+.endm
+
+/*  The larger of src_a and src_b is put into dest. The macro issues
+    its own cmp, so the caller need not set the flags first; note that
+    the condition flags are overwritten. GT is a signed comparison.
+    This macro makes it easy to remember which register does what.
+
+    Thank you to u/TNorthover for the nudge to add the cmp.
+*/
+
+.macro  MAX     src_a, src_b, dest
+        cmp     \src_a, \src_b
+        csel    \dest, \src_a, \src_b, GT
+.endm
+
+.macro  AASCIZ      label, string
+        .p2align    2
+\label: .asciz      "\string"
+.endm
+
+.macro  MOD         src_a, src_b, dest, scratch     // dest = src_a % src_b (signed)
+        sdiv        \scratch, \src_a, \src_b        // scratch = src_a / src_b
+        msub        \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b
+.endm
--- a/more/jump_tables/branch_table.S
+++ b/more/jump_tables/branch_table.S
@ -0,0 +1,57 @@
+#include    "apple-linux-convergence.S"
+
+        .p2align    2
+        .text
+        GLABEL      MyMemSet
+
+/*  MyMemSet(unsigned char * b, unsigned char v, long l)
+             x0                 w1               x2
+
+    The length is first checked against less than or equal to 0. If
+    so, the body of the function is skipped.
+
+    The loop is unrolled 8x. The length (x2) modulo 8 is turned into
+    the number of instructions to skip past the initial strb so that
+    the first pass stores only the stragglers. A modulo of 0 is handled
+    separately - it causes a branch to the initial strb.
+
+    Each step stores ONE byte (strb). Clobbers x0, x3 - x6 and flags.
+    This code can be dramatically improved by copying more than one byte
+    at a time. You will have to figure this out in P6 - MemCpy.
+*/
+#if defined(__APPLE__)
+_MyMemSet:
+#else
+MyMemSet:
+#endif
+        START_PROC
+        PUSH_P      x29, x30
+        mov         x29, sp
+        cmp         x2, xzr         // Test for bad length.
+        ble         99f             // Take branch if 0 or less.
+
+        add         x3, x2, x0      // x3 gets address of one beyond buffer
+        mov         x6, 8
+        MOD         x2, x6, x4, x5  // x4 gets l % 8
+        cbz         x4, 10f         // Handle evenly divisible case.
+        sub         x4, x6, x4      // Invert sense of x4 e.g. 3 becomes 5
+
+        LLD_ADDR    x5, 10f         // x5 gets address of the first strb
+        add         x5, x5, x4, lsl 2   // 4 bytes per skipped instruction
+        br          x5
+
+10:     strb        w1, [x0], 1     // strb, not str: str would write 4
+        strb        w1, [x0], 1     // bytes per 1 byte step and so run
+        strb        w1, [x0], 1     // 3 bytes past the end of the buffer.
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        cmp         x3, x0
+        bhi         10b             // Addresses compare as unsigned.
+
+99:     POP_P       x29, x30
+        ret
+        END_PROC
+
--- a/more/jump_tables/jmptbl.s
+++ b/more/jump_tables/jmptbl.s
@ -0,0 +1,83 @@
+        .text
+        .align  4
+        .global main
+
+main:   str     x30, [sp, -16]!
+        mov     x0, xzr             // set up call to time(nullptr)
+        bl      time                // call time setting up srand
+        bl      srand               // call srand setting up rand
+        bl      rand                // get a random number
+        and     x0, x0, 7           // ensure its range is 0 to 7
+                                    // note use of x register is on purpose
+        lsl     x0, x0, 2           // multiply by 4
+        ldr     x1, =jt             // load base address of jump table
+        add     x1, x1, x0          // add offset to base address
+        br      x1
+
+// If, as in this case, all the "cases" have the same number of 
+// instructions then this intermediate jump table can be omitted saving
+// some space and a tiny amount of time. To omit the intermediate jump
+// table, you'd multiply by 12 above and not 4. Twelve because each 
+// "case" has 3 instructions (3 x 4 == 12).
+
+// Question for you: If you did omit the jump table, relative to what
+// would you jump (since "jt" would be gone).
+
+jt:     b       0f
+        b       1f
+        b       2f
+        b       3f
+        b       4f
+        b       5f
+        b       6f
+        b       7f
+
+0:      ldr     x0, =ZR
+        bl      puts
+        b       99f
+
+1:      ldr     x0, =ON
+        bl      puts
+        b       99f
+
+2:      ldr     x0, =TW
+        bl      puts
+        b       99f
+
+3:      ldr     x0, =TH
+        bl      puts
+        b       99f
+
+4:      ldr     x0, =FR
+        bl      puts
+        b       99f
+
+5:      ldr     x0, =FV
+        bl      puts
+        b       99f
+
+6:      ldr     x0, =SX
+        bl      puts
+        b       99f
+
+7:      ldr     x0, =SV
+        bl      puts
+        b       99f
+
+99:     mov     w0, wzr
+        ldr     x30, [sp], 16
+        ret
+
+        .data
+        .section    .rodata
+
+ZR:     .asciz      "0 returned"
+ON:     .asciz      "1 returned"
+TW:     .asciz      "2 returned"
+TH:     .asciz      "3 returned"
+FR:     .asciz      "4 returned"
+FV:     .asciz      "5 returned"
+SX:     .asciz      "6 returned"
+SV:     .asciz      "7 returned"
+
+        .end
--- a/more/jump_tables/jt.c
+++ b/more/jump_tables/jt.c
@ -0,0 +1,55 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+
+/*	This is the prototype for the assembly language version. You may
+	have always thought that switch statements are implemented as a long
+	chain of if / else. Well, sometimes they are. Sometimes they are
+	implemented using binary search and still other times they are 
+	implemented as jump tables.
+
+	My assembly language version is found in jmptbl.s.
+*/
+
+int main()
+{
+    int r;
+
+    srand(time(0));
+    r = rand() & 7;
+    switch (r)
+    {
+        case 0: 
+            puts("0 returned");
+            break;
+
+        case 1:
+            puts("1 returned");
+            break;
+
+        case 2:
+            puts("2 returned");
+            break;
+
+        case 3:
+            puts("3 returned");
+            break;
+
+        case 4:
+            puts("4 returned");
+            break;
+
+        case 5:
+            puts("5 returned");
+            break;
+
+        case 6:
+            puts("6 returned");
+            break;
+
+        case 7:
+            puts("7 returned");
+            break;
+    }
+    return 0;
+}
--- a/more/jump_tables/test_interop.cpp
+++ b/more/jump_tables/test_interop.cpp
@ -0,0 +1,31 @@
+#include <stdio.h>
+
+extern "C" void MyMemSet(unsigned char *, unsigned char v, long length);
+
+/*      MyMemSet(unsigned char *, unsigned char v, long length);
+*/
+
+/*
+void MyMemSet(unsigned char * b, unsigned char v, long l) {
+    for (long i = 0; i < l; i++) {
+        b[i] = v;
+    }
+}
+*/
+const long BUFFER_SIZE = 1000;
+const unsigned char GUARD = 0xA5;
+
+// storage has one guard byte on each side of the BUFFER_SIZE bytes that
+// buffer exposes, so buffer[-1] and buffer[BUFFER_SIZE] are in bounds.
+unsigned char storage[BUFFER_SIZE + 2];
+unsigned char * const buffer = storage + 1;
+
+int main() {
+	buffer[-1] = buffer[BUFFER_SIZE] = GUARD;	// non-zero sentinel
+	MyMemSet(buffer, 0xF0, 3);
+	if (buffer[-1] != GUARD)
+		printf("Bytes prior to buffer are smashed.\n");
+	if (buffer[BUFFER_SIZE] != GUARD)
+		printf("Bytes after buffer are smashed.\n");
+	return 0;
+}
--- a/projects/SINE/README.md
+++ b/projects/SINE/README.md
@ -17,7 +17,9 @@ sin x = x - x^3/3! + x^5/5! - x^7/7! ...

 Notice each term flips from addition to subtraction.

-Notice each term is based on the odd integers starting at 1.
+Notice each term is based on the odd integers starting at 1. While the
+"1" case might look different, it is the same as all the others since
+1 is just 1 to the first power divided by 1 factorial.

 ## Command line

@ -29,40 +31,76 @@ arguments are therefore required.
  be a double.

 * The number of terms to evaluate. The number of terms must lie between
-  1 and 10 inclusive.
+  1 and 10 inclusive. Note the value of 10 as an upper bound is new. It
+  was 8.

 ## C version

 To assist your efforts, [here](./c_version.c) is a version of this
-project written in C.
+project written in C. This has been updated to print nice debugging
+output which is not part of the project.

-## Errors to stderr
-
-Error messages must be sent to `stderr`.
-
-If you are using the convergence macros to allow your program to build
-on both Apple Silicon Mac OS and Linux, note the special casing needed
-to deal with `stderr`. If this is you, compile the C version on Mac OS
-with the `-S` compiler option to see the generated assembly language and
-search for `stderr`.
+This C version also demonstrates a different way of calculating the
+toggle. This version flips the sign of the toggle by multiplying by -1.
+The previous version used odd and even values of the term.

 ## Sample executions

 ```text
-SINE % ./a.out 0 8
-The sine of 0.00 degrees is 0.000000 in radians.
-SINE % ./a.out 90 8
-The sine of 90.00 degrees is 1.000000 in radians.
-SINE % ./a.out 180 8
-The sine of 180.00 degrees is -0.000001 in radians.
-SINE % ./a.out 180 82
+pk_taylor_series > gcc main.S -o a
+pk_taylor_series > ./a 0 10
+The sine of 0.00 degrees is 0.00000000.
+pk_taylor_series > ./a 30 10
+The sine of 30.00 degrees is 0.50000000.
+pk_taylor_series > ./a 45 10
+The sine of 45.00 degrees is 0.70710678.
+pk_taylor_series > ./a 90 10
+The sine of 90.00 degrees is 1.00000000.
+pk_taylor_series > ./a 180 10
+The sine of 180.00 degrees is -0.00000000.
+pk_taylor_series > ./a 360 10
+The sine of 360.00 degrees is -0.00104818.
+pk_taylor_series > ./a 360 100
 Number of terms is out of range.
-SINE % ./a.out 180 -10
+pk_taylor_series > ./a 360 -1
 Number of terms is out of range.
-SINE % echo $?
-1
+pk_taylor_series > 
 ```

+## Floating point instructions I used
+
+These are the floating point instructions I used in my implementation.
+
+* fmov
+
+* scvtf
+
+* fmul
+
+* fdiv
+
+* fadd
+
+## How I broke up the program
+
+I have functions named:
+
+* main
+
+* HandleOptions
+
+* Factorial
+
+* IntegerPower - x to the nth power
+
+* ComputeSine - The main calculation
+
+* PrintAnswer
+
+* ConvertTheta - Wrap D2R
+
+* D2R - Degrees to radians
+
 ## CSC3510

 The following applies to Carthage College CSC3510 students.
@ -74,4 +112,3 @@ Work is to be done solo.
 ### What to hand in

 Just the .S file. **Your name must be at the top of the file.**
-
--- a/projects/SINE/README.pdf
+++ b/projects/SINE/README.pdf
--- a/projects/SINE/c_version.c
+++ b/projects/SINE/c_version.c
@ -1,13 +1,14 @@
 #include <stdio.h>
 #include <stdlib.h>
+#include <math.h>

-double pi = 3.14159265359;
+double pi = 3.14159265358979323846;

 double D2R(double d) {
    return d * pi / 180.0;
 }

-long Factorial(int n) {
+double Factorial(int n) {
    long retval = 1;

    if (n > 0) {
@ -15,7 +16,7 @@ long Factorial(int n) {
            retval = retval * n--;
        }
    }
-    return retval;
+    return (double) retval;
 }

 double IntegerPower(double b, int e) {
@ -48,20 +49,20 @@ int main(int argc, char ** argv) {

    double r_angle = D2R(angle);

+    double toggle = 1.0;
    for (int term = 0, base = 1; term < terms; term++, base += 2) {
-        double toggle = (term & 1) ? -1.0 : 1.0;
-
+		if (toggle > 0) {
+			printf("%+03.8e + %+03.8e / %+03.8e [term %2d is %+03.8e]\n", sin, IntegerPower(r_angle, base),
+				   Factorial(base), term + 1, toggle * IntegerPower(r_angle, base) / Factorial(base));
+		} else {
+			printf("%+03.8e - %+03.8e / %+03.8e [term %2d is %+03.8e]\n", sin, IntegerPower(r_angle, base),
+				   Factorial(base), term + 1, toggle * IntegerPower(r_angle, base) / Factorial(base));
+		}
 		sin += toggle *
 			   IntegerPower(r_angle, base) / Factorial(base);
-        /*
-		if (toggle > 0) {
-			printf("adding      %d p/b intermediate: %f\n", base, sin);
-		} else {
-			printf("subtracting %d p/b intermediate: %f\n", base, sin);
-		}
-        */
+        toggle = toggle * -1;
 	}
-	printf("The sine of %.2f degrees is %f in radians.\n", angle, sin);
+	printf("The sine of %0.4f degrees is %0.10f.\n", angle, sin);

    return 0;
 }
--- a/python/apple-linux-convergence.S
+++ b/python/apple-linux-convergence.S
@ -149,3 +149,8 @@ main:
        .p2align    2
 \label: .asciz      "\string"
 .endm
+
+.macro  MOD         src_a, src_b, dest, scratch    // dest = src_a % src_b (signed)
+        sdiv        \scratch, \src_a, \src_b       // scratch = src_a / src_b
+        msub        \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b; scratch must not alias src_a or src_b
+.endm
--- a/section_1/hello_world/apple-linux-convergence.S
+++ b/section_1/hello_world/apple-linux-convergence.S
@ -149,3 +149,8 @@ main:
        .p2align    2
 \label: .asciz      "\string"
 .endm
+
+.macro  MOD         src_a, src_b, dest, scratch    // dest = src_a % src_b (signed)
+        sdiv        \scratch, \src_a, \src_b       // scratch = src_a / src_b
+        msub        \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b; scratch must not alias src_a or src_b
+.endm
--- a/section_1/regs/apple-linux-convergence.S
+++ b/section_1/regs/apple-linux-convergence.S
@ -149,3 +149,8 @@ main:
        .p2align    2
 \label: .asciz      "\string"
 .endm
+
+.macro  MOD         src_a, src_b, dest, scratch    // dest = src_a % src_b (signed)
+        sdiv        \scratch, \src_a, \src_b       // scratch = src_a / src_b
+        msub        \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b; scratch must not alias src_a or src_b
+.endm
--- a/section_1/structs/apple-linux-convergence.S
+++ b/section_1/structs/apple-linux-convergence.S
@ -149,3 +149,8 @@ main:
        .p2align    2
 \label: .asciz      "\string"
 .endm
+
+.macro  MOD         src_a, src_b, dest, scratch    // dest = src_a % src_b (signed)
+        sdiv        \scratch, \src_a, \src_b       // scratch = src_a / src_b
+        msub        \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b; scratch must not alias src_a or src_b
+.endm
--- a/section_2/float/apple-linux-convergence.S
+++ b/section_2/float/apple-linux-convergence.S
@ -149,3 +149,8 @@ main:
        .p2align    2
 \label: .asciz      "\string"
 .endm
+
+.macro  MOD         src_a, src_b, dest, scratch    // dest = src_a % src_b (signed)
+        sdiv        \scratch, \src_a, \src_b       // scratch = src_a / src_b
+        msub        \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b; scratch must not alias src_a or src_b
+.endm
--- a/section_2/float/asm_rounding.S
+++ b/section_2/float/asm_rounding.S
--- a/section_2/float/fmov.md
+++ b/section_2/float/fmov.md
@ -2,27 +2,23 @@

 The `fmov` instruction is used to move floating point values in and out
 of floating point registers and to some degree, moving data between
-integer and floating point registers. 
+integer and floating point registers.

 ## Loading Floating Point Numbers as Immediate Values

-Just as we saw with integer
-registers, some values can be used as immediate values and some cannot.
+Just as we saw with integer registers, some values can be used as
+immediate values and some cannot. It comes down to how many bits are
+necessary to encode the value. Too many bits... not enough room to fit
+both the value and the opcode in a 4 byte instruction.

 For example, this works:

-`mov    x0, 65536`
+`mov    x0, 65535`

 but this does not:

 `mov    x0, 65537`

-The reason is that all AARCH64 instructions must fit within a 32 bit
-instruction that must hold the instruction's op code, its flags and
-other bits and bobs plus any immediate value. In the above example we
-can see that the `mov` instruction provides up to 16 bits for an
-immediate value.
-
 The constraints placed on immediate values for `fmov` are much tighter
 because floating point numbers are far more complex than integers.

@ -40,7 +36,7 @@ Let's take a look at some code:
        fmov        d0, 1.96875     // Zoinks!
 ```

-From this we can see that an immediate value for an `fmov` seems to have
+From this we can see that an immediate value for an `fmov` has
 4 bits available for the mantissa. In fact, the only values that work
 as immediate values will be those floating point values whose fractional
 values are combinations of:
@ -56,6 +52,9 @@ values are combinations of:
 As far as exponents go, `fmov` can accommodate 3 bits. So, exponents of
 plus or minus 2**7 can be used.

+Adding a sign bit brings the total number of bits available for
+immediate moves to 8.
+
 ## Loading / Storing Floating Point Numbers in General

 When in doubt, load fixed floating point numbers from memory. This is
@ -64,11 +63,16 @@ covered [in this chapter](./literals.md).
 ## SIMD

 `fmov` can also deal with the more complicated special cases induced by
-SIMD instructions.
+SIMD instructions. `fmov` is able to move values between the various
+register widths such as single precision to double precision. **However,
+no conversion of value is performed - `fmov` just copies bits.**
+
+If you need to change the precision of a floating point value, the
+`fcvt` family of instructions must be used instead.

 ## Movement To / From Integer Registers

-`fmov` can *bits* between the integer and floating point registers. We
-emphasize the *bits*. No conversions are done using `fmov`. There exist
-other instructions for that. See [this chapter](./rounding.md) for more
-information.
+`fmov` can copy *bits* between the integer and floating point registers.
+We emphasize the *bits*. No conversions are done using `fmov`. There
+exist other instructions for that. See [this chapter](./rounding.md) for
+more information.
--- a/section_2/float/fmov.pdf
+++ b/section_2/float/fmov.pdf
--- a/section_2/float/literals.S
+++ b/section_2/float/literals.S
--- a/section_2/float/literals.md
+++ b/section_2/float/literals.md
@ -20,30 +20,32 @@ To load a `float`, you could translate the value to binary and do
 as the following:

 ```asm
-        .text                                                       // 1 
-        .global main                                                // 2 
-        .align    2                                                 // 3 
-                                                                    // 4 
-main:   str        x30, [sp, -16]!                                  // 5 
-        ldr        s0, =0x3fc00000                                  // 6 
-        fcvt       d0, s0                                           // 7 
-        ldr        x0, =fmt                                         // 8 
-        bl         printf                                           // 9 
-        ldr        x30, [sp], 16                                    // 10 
-        mov        w0, wzr                                          // 11 
-        ret                                                         // 12 
-                                                                    // 13 
-        .data                                                       // 14 
-fmt:    .asciz    "%f\n"                                            // 15 
-        .end                                                        // 16 
+        .text                                                   // 1 
+        .global main                                            // 2 
+        .align    2                                             // 3 
+                                                                // 4 
+main:   str        x30, [sp, -16]!                              // 5 
+        ldr        s0, =0x3fc00000                              // 6 
+        fcvt       d0, s0                                       // 7 
+        ldr        x0, =fmt                                     // 8 
+        bl         printf                                       // 9 
+        ldr        x30, [sp], 16                                // 10 
+        mov        w0, wzr                                      // 11 
+        ret                                                     // 12 
+                                                                // 13 
+        .data                                                   // 14 
+fmt:    .asciz    "%f\n"                                        // 15 
+        .end                                                    // 16 
 ```

-The above code is found [here](./t.s).
+A version of the above code can be found [here](./t.s) - the file is
+used for miscellaneous testing.

-`Line 6` puts the translated value of 1.5 into `s0` (since the value
-is a `float` it goes in an `s` register). The assembler performs some
-magic getting a 32 bit value seemingly fit into a 32 bit instruction.
-See [below](./literals.md#fitting-32-bits-into-a-32-bit-bag).
+`Line 6` puts the translated value of 1.5 into `s0` (since we are
+thinking of the value as a `float` it goes in an `s` register). The
+assembler performs some magic getting a 32 bit value to seemingly fit
+into a 32 bit instruction. See
+[below](./literals.md#fitting-32-bits-into-a-32-bit-bag).

 `Line 7` converts the single precision number into a double precision
 number for printing.
@ -136,6 +138,9 @@ Cool huh?

 ## Fitting 32 bits into a 32 bit bag

+**This section is currently LINUX-centric - in the future it will
+address both native Apple and Linux equally.**
+
 AARCH64 instructions are 32 bits in width. Yet, `line 6` from
 [this](./t.s) program reads:

@ -195,15 +200,16 @@ Scan downward to find `0x7a0`:
   0x7a0 <main+32>         .inst   0x3fc00000 ; undefined  
 ```

-Hey look! Here's our literal float. The `.inst` is an ARM
-specific GNU assembler directive what allows the programmer
-to encode their own instruction. Note, the encoded instruction does not
-have to make any sense - instead the compiler has emitted a make believe
-instruction that happens to have the value of our literal.
+Hey look! Here's our literal float. The `.inst` is an ARM specific GNU
+assembler directive that says: `¯\_(-)_/¯`.
+
+Note, the encoded "instruction" does not have to make any sense -
+instead the compiler has emitted a make believe instruction that happens
+to have the value of our literal.

 What we're seeing the actual `line 6` doing is reaching ahead a short
-distance to load the value of another "instruction" when really it is
-our constant.
+distance to load the value of another location in memory where our
+constant is really found.

 Let us take this explanation further. Notice we see:

--- a/section_2/float/literals.pdf
+++ b/section_2/float/literals.pdf
--- a/section_2/float/t.s
+++ b/section_2/float/t.s
@ -1,12 +1,16 @@
        .text                     
-        .global main              
-        .align    2               
+        .global     _main 
+        .align      2               
    
-main:   str        x30, [sp, -16]!
+_main:   
+        str         x30, [sp, -16]!
+        mov         x0, 0xFFFFFFFF
+/*
        ldr        s0, =0x3fc00000
        fcvt       d0, s0         
        ldr        x0, =fmt       
        bl         printf         
+*/
        ldr        x30, [sp], 16   
        mov        w0, wzr         
        ret