added jump tables

2026-06-21 01:46:46 +08:00 · 2023-03-31 10:28:31 -05:00 · 2023-03-31 10:28:31 -05:00 · c74ef063e1
commit c74ef063e1
parent b4199955ed
9 changed files with 658 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -319,6 +319,7 @@ In this section, we present miscellaneous material.
 | 5 | [Determining string literal lengths for C functions](./more/strlen_for_c/README.md) | [Link](./more/strlen_for_c/README.pdf) |
 | 6 | [Calling Assembly Language From Python](./python/) | [Link](./python/README.pdf) |
 | 7 | [Atomic Operations](./more/atomics/README.md) | [Link](./more/atomics/README.pdf) |
+| 8 | [Jump Tables](./more/jump_tables/README.md) | [Link](./more/jump_tables/README.pdf) |

 ## Macro Suite

--- a/README.pdf
+++ b/README.pdf
--- a/more/jump_tables/README.md
+++ b/more/jump_tables/README.md
@ -0,0 +1,275 @@
+# Jump or Branch Tables
+
+A jump or branch table is a powerful instruction saving technique that
+can be used to switch between multiple single instructions or even
+choose one of a series of functions to call (or branches to take).
+
+This concept can be found as the implementation of some `switch`
+statements and is found at the very lowest levels of an Operating
+System (interrupt vectors, for example).
+
+The sections below show several ways to build a jump table in AARCH64
+assembly language.
+
+## Single Instructions a la Duff's Device
+
+[Duff's Device](https://en.wikipedia.org/wiki/Duff%27s_device)
+shoehorned a jump table into the middle of a `while` loop. At the same
+time, it also correctly demonstrates a simple case of *loop unrolling*.
+It's very creative.
+
+Let's expand on Duff's Device.
+
+The full source code for this example can be found
+[here](./branch_table.S). It demonstrates a branch table consisting of
+instructions which are meant to be executed in sequence after jumping
+into the middle of the sequence.
+
+Here:
+
+```asm
+    mov         x6, 8
+    MOD         x2, x6, x4, x5  // x4 gets l % 8
+    cbz         x4, 10f         // Handle evenly divisible case.
+    sub         x4, x6, x4      // Invert sense of x4 e.g. 3 becomes 5
+```
+
+we are performing this: *x4 is getting the result of modding the
+number of times we want the instructions executed by the number of
+times we unrolled the loop*.
+
+Specifically, this example does `length % 8`. However, the AARCH64 ISA
+does not include a *mod* instruction. The `MOD` macro used above is
+defined as:
+
+```asm
+.macro  MOD         src_a, src_b, dest, scratch
+        sdiv        \scratch, \src_a, \src_b
+        msub        \dest, \scratch, \src_b, \src_a
+.endm
+```
+
+`msub` is a cool instruction. It does this:
+
+```d = c - (b * a)```
+
+Example: 13 % 8 == 5. First the `sdiv`: 13 / 8 is 1. Then, the `msub`:
+13 - (1 * 8) is 5.
+
+Next:
+
+```asm
+        cbz         x4, 10f         // Handle evenly divisible case.
+        sub         x4, x6, x4      // Invert sense of x4 e.g. 5 becomes 3
+```
+
+This code is key.
+
+If the result of the `mod` is 0, then the entire table must be executed.
+This is implemented by the `cbz`.
+
+If the result of the `mod` is not 0, then its value must be *flipped*.
+This is the `sub` instruction. See the comment above.
+
+Finally, we have the computation of the address to where we jump into
+the middle of the table.
+
+```asm
+        LLD_ADDR    x5, 10f
+        add         x5, x5, x4, lsl 2
+        br          x5
+```
+
+Each of the lines above bears description:
+
+The `LLD_ADDR` is from the [*convergence
+macros*](./apple-linux-convergence.S). It loads the address of the
+beginning of the table.
+
+Next, the `add` instruction multiplies the flipped result of the `mod`
+by 4 (the length of one instruction) THEN adds it to the base address
+of the table. We have calculated *instruction addresses* exactly the
+way we would with array dereferences. Thank you John von Neumann.
+
+Finally, we `br` which means branch to an address contained in a
+register.
+
+```asm
+10:     strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        // loop code not shown
+```
+
+## Performing Multiple Instructions
+
+If you need to execute more than one instruction you have two choices:
+
+### Multiple Instructions by Address Arithmetic
+
+Suppose you needed two instructions in each step of the sequence.
+Simply multiply the index by 8 instead of 4 (i.e. the length of two
+instructions). The same technique works with a larger number. E.g.
+you need three instructions per step: multiply by 12.
+
+Suppose some steps need 3 instructions and some need 2. You must handle
+this because the technique requires that all steps in the sequence
+be the same length so that the address arithmetic holds.
+
+Simply insert the occasional `nop` instruction in the steps that are
+shorter than the others.
+
+### Multiple Instructions by Branch Branch
+
+Here's another [example of code](./jmptbl.s) that implements a branch or
+jump table:
+
+```asm
+jt:     b       0f
+        b       1f
+        b       2f
+        b       3f
+        b       4f
+        b       5f
+        b       6f
+        b       7f
+```
+
+You jump into the middle of the table and then immediately jump some
+place else. This is like:
+
+```c
+if (blah) {
+    blah
+} else if (blah) {
+    blah
+} else if (blah) {
+    blah
+}
+etc.
+```
+
+### Multiple Instructions by Branch Call
+
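+You can easily modify the above techniques to call functions instead
+of branching to labels. Use `bl` (branch with link) so that each
+function returns to the instruction that follows its call:
+
+```asm
+jt:     bl       func_0
+        bl       func_1
+        bl       func_2
+        bl       func_3
+        bl       func_4
+        bl       func_5
+        bl       func_6
+        bl       func_7
+```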
+
+or:
+
+```asm
+jt:     bl       func_0
+        b        common_label
+        bl       func_1
+        b        common_label
+        bl       func_2
+        b        common_label
+        bl       func_3
+        b        common_label
+        bl       func_4
+        b        common_label
+        bl       func_5
+        b        common_label
+        bl       func_6
+        b        common_label
+        bl       func_7
+        b        common_label
+        // perhaps some loop control... if there is none, the
+        // preceding b can be removed since execution can fall
+        // through to the common label.
+common_label:
+```
+
+The above looks like a `switch` statement where each case is terminated
+with a `break` statement.
+
+## Small Gaps in Sequential Indexes
+
+Suppose your range of indexes was 0 through 8 inclusive (notice there
+are 9 integers in the range) but index 7 is skipped. That is, your
+potential indexes are 0 through 6 inclusive and then 8 but never
+7.
+
+In a `switch` statement, this would look like:
+
+```c++
+switch (index) {
+    case 0: blah blah;
+            break;
+    case 1: blah blah;
+            break;
+    case 2: blah blah;
+            break;
+    case 3: blah blah;
+            break;
+    case 4: blah blah;
+            break;
+    case 5: blah blah;
+            break;
+    case 6: blah blah;
+            break;
+    case 8: blah blah;
+            break;
+}
+```
+
+Gaps in the potential indexes present a surmountable problem if the
+gaps are few.
+
+In the case where there are a small number of gaps, simply fill them
+with a branch to a common, otherwise "do nothing", label. For example,
+you might have:
+
+```asm
+b_table:    b       label0
+            b       label1
+            b       label2
+            b       label3
+            b       label4
+            b       label5
+            b       label6
+            b       do_nothing
+            b       label8
+```
+
+In a Duff's Device where you are executing sequential single
+instructions, it might look like this:
+
+```asm
+x_fer:      strb    w1, [x0], 1
+            strb    w1, [x0], 1
+            strb    w1, [x0], 1
+            strb    w1, [x0], 1
+            strb    w1, [x0], 1
+            strb    w1, [x0], 1
+            strb    w1, [x0], 1
+            nop
+            strb    w1, [x0], 1
+```
+
+Here, the `nop` instruction means "no operation". It does nothing but
+is a valid instruction meant to take up space (and decades ago, take
+up time).
+
+In a high level language this might look like this:
+
+```c
+for (int i = 0; i <= 8; i++) {
+    if (i == 7)
+        continue;
+    blah blah
+}
+```
--- a/more/jump_tables/README.pdf
+++ b/more/jump_tables/README.pdf
--- a/more/jump_tables/apple-linux-convergence.S
+++ b/more/jump_tables/apple-linux-convergence.S
@ -0,0 +1,156 @@
+/*  Macros to permit the "same" assembly language to build on ARM64
+    Linux systems as well as Apple Silicon systems.
+
+    See the fuller documentation at:
+    https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
+
+    Perry Kivolowitz
+    A Gentle Introduction to Assembly Language
+*/
+
+.macro  GLD_PTR     xreg, label     // Load a 64-bit value for global label into xreg
+#if defined(__APPLE__)
+        adrp	    \xreg, _\label@GOTPAGE                  // page holding the GOT slot for _label
+        ldr	        \xreg, [\xreg, _\label@GOTPAGEOFF]  // load the contents of that GOT slot
+#else   // NOTE(review): two loads here versus one on Apple - confirm both paths yield the same value
+        ldr         \xreg, =\label  // address of label, fetched from the literal pool
+        ldr         \xreg, [\xreg]  // then load the 64 bits stored at label
+#endif
+.endm
+
+.macro  GLD_ADDR    xreg, label     // Get a global address into xreg
+#if defined(__APPLE__)
+        adrp	    \xreg, _\label@GOTPAGE              // page holding the GOT slot for _label
+        add         \xreg, \xreg, _\label@GOTPAGEOFF    // NOTE(review): adds the slot offset rather than loading from the slot - confirm intended
+#else
+        ldr         \xreg, =\label  // address of label, fetched from the literal pool
+#endif
+.endm
+
+.macro  LLD_ADDR xreg, label        // Get the address of a local label into xreg
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE              // page containing label (no leading underscore)
+        add     \xreg, \xreg, \label@PAGEOFF    // plus the offset of label within that page
+#else
+        ldr         \xreg, =\label  // address of label, fetched from the literal pool
+#endif
+.endm
+
+.macro  LLD_DBL xreg, dreg, label   // Load the value stored at a local label into dreg
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE              // page containing label
+        add     \xreg, \xreg, \label@PAGEOFF    // xreg = address of label
+        ldur    \dreg, [\xreg]                  // dreg = value at label; xreg keeps the address
+//      fmov    \dreg, \xreg
+#else
+        ldr     \xreg, =\label      // xreg = address of label (literal pool)
+        ldur    \dreg, [\xreg]      // dreg = value at label; xreg keeps the address
+#endif
+.endm
+
+.macro  LLD_FLT xreg, sreg, label   // Load the value stored at a local label into sreg
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE              // page containing label
+        add     \xreg, \xreg, \label@PAGEOFF    // xreg = address of label
+        ldur    \sreg, [\xreg]                  // sreg = value at label; xreg keeps the address
+#else
+        ldr     \xreg, =\label      // xreg = address of label (literal pool)
+        ldur    \sreg, [\xreg]      // sreg = value at label; xreg keeps the address
+#endif
+.endm
+
+.macro GLABEL label                 // Export label as a global symbol
+#if defined(__APPLE__)
+        .global _\label             // Apple build: symbol name gets a leading underscore
+#else
+        .global \label              // other builds: symbol name used as written
+#endif
+.endm
+
+.macro MAIN                         // Emit the label that starts main
+#if defined(__APPLE__)
+_main:                              // Apple build spells it _main
+#else
+main:                               // other builds spell it main
+#endif
+.endm
+
+/*  Fetching the address of the externally defined errno is quite
+    different on Apple and Linux. This macro leaves the address of
+    errno in x0. It expands to a bl, i.e. a function call, so x30 is
+    overwritten with the return address.
+*/
+.macro  ERRNO_ADDR
+#if defined(__APPLE__)
+        bl      ___error            // Apple build: call ___error
+#else
+        bl      __errno_location    // other builds: call __errno_location
+#endif
+.endm
+
+.macro  CRT label                   // Call the function named label (bl overwrites x30)
+#if defined(__APPLE__)
+        bl  _\label                 // Apple build: symbol name gets a leading underscore
+#else
+        bl  \label                  // other builds: symbol name used as written
+#endif
+.endm
+
+.macro  START_PROC          // after starting label; pair with END_PROC
+        .cfi_startproc      // opens the region closed by .cfi_endproc
+.endm
+
+.macro  END_PROC            // after the return; pair with START_PROC
+        .cfi_endproc        // closes the region opened by .cfi_startproc
+.endm
+
+.macro  PUSH_P  a, b                // Push a register pair; undo with POP_P
+        stp     \a, \b, [sp, -16]!  // sp -= 16 first, then a at [sp] and b after it
+.endm
+
+.macro  PUSH_R  a                   // Push one register; undo with POP_R
+        str     \a, [sp, -16]!      // sp still moves by a full 16 bytes
+.endm
+
+.macro  POP_P   a, b                // Pop a register pair pushed by PUSH_P
+        ldp     \a, \b, [sp], 16    // load a and b from [sp], then sp += 16
+.endm
+
+.macro  POP_R   a                   // Pop one register pushed by PUSH_R
+        ldr     \a, [sp], 16        // load a from [sp], then sp += 16
+.endm
+
+/*  The smaller of src_a and src_b is put into dest. The macro performs
+    its own cmp, so no flag-setting instruction is needed beforehand;
+    the condition flags are overwritten. The comparison is signed (LT).
+    This macro makes it easy to remember which register does what in the
+    csel.
+    
+    Thank you to u/TNorthover for nudge to add the cmp.
+*/
+
+.macro  MIN     src_a, src_b, dest
+        cmp     \src_a, \src_b                  // sets the flags from src_a - src_b
+        csel    \dest, \src_a, \src_b, LT       // dest = src_a if src_a < src_b, else src_b
+.endm
+
+/*  The larger of src_a and src_b is put into dest. The macro performs
+    its own cmp, so no flag-setting instruction is needed beforehand;
+    the condition flags are overwritten. The comparison is signed (GT).
+    This macro makes it easy to remember which register does what in the
+    csel.
+
+    Thank you to u/TNorthover for nudge to add the cmp.
+*/
+
+.macro  MAX     src_a, src_b, dest
+        cmp     \src_a, \src_b                  // sets the flags from src_a - src_b
+        csel    \dest, \src_a, \src_b, GT       // dest = src_a if src_a > src_b, else src_b
+.endm
+
+.macro  AASCIZ      label, string   // Define label as an aligned, NUL-terminated string
+        .p2align    2               // align to a 4-byte (2^2) boundary
+\label: .asciz      "\string"
+.endm
+
+.macro  MOD         src_a, src_b, dest, scratch     // dest = src_a % src_b (signed)
+        sdiv        \scratch, \src_a, \src_b        // scratch = src_a / src_b; scratch must not be src_a or src_b
+        msub        \dest, \scratch, \src_b, \src_a // dest = src_a - (scratch * src_b)
+.endm
--- a/more/jump_tables/branch_table.S
+++ b/more/jump_tables/branch_table.S
@ -0,0 +1,57 @@
+#include    "apple-linux-convergence.S"
+
+        .p2align    2
+        .text
+        GLABEL      MyMemSet
+
+/*  MyMemSet(unsigned char * b, unsigned char v, long l)
+             x0                 w1               x2
+
+    Stores the low byte of w1 into each of the l bytes starting at b.
+    Nothing is returned. x0 and x2 through x6 are overwritten.
+
+    The length is first checked against less than or equal to 0. If
+    so, the body of the function is skipped.
+
+    The loop is unrolled 8x. The length (x2) modulo 8 gets turned
+    into the number of instructions to jump to or beyond the initial
+    strb. A modulo of 0 is handled separately - it causes a branch to
+    the initial strb. Every entry in the table must remain exactly one
+    4-byte instruction or the address arithmetic below breaks.
+
+    The stores must be strb (one byte). A str of w1 writes four bytes
+    per step and therefore writes three bytes beyond the end of the
+    buffer.
+
+    This code can be dramatically improved by copying more than one byte
+    at a time. You will have to figure out how to do this optimally in
+    P6 - MemCpy
+*/
+#if defined(__APPLE__)
+_MyMemSet:
+#else
+MyMemSet:
+#endif
+        START_PROC
+        PUSH_P      x29, x30
+        mov         x29, sp
+        cmp         x2, xzr         // Test for bad length.
+        ble         99f             // Take branch if 0 or less.
+
+        add         x3, x2, x0      // x3 gets address of one beyond buffer
+        mov         x6, 8
+        MOD         x2, x6, x4, x5  // x4 gets l % 8
+        cbz         x4, 10f         // Handle evenly divisible case.
+        sub         x4, x6, x4      // Invert sense of x4 e.g. 3 becomes 5
+
+        LLD_ADDR    x5, 10f         // x5 gets address of first table entry
+        add         x5, x5, x4, lsl 2   // skip x4 entries of 4 bytes each
+        br          x5
+
+10:     strb        w1, [x0], 1     // store one byte then advance x0 by 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        strb        w1, [x0], 1
+        cmp         x3, x0
+        b.hi        10b             // Addresses are unsigned: loop while end > x0
+
+99:     POP_P       x29, x30
+        ret
+        END_PROC
+
--- a/more/jump_tables/jmptbl.s
+++ b/more/jump_tables/jmptbl.s
@ -0,0 +1,83 @@
+        .text
+        .align  4
+        .global main
+
+main:   str     x30, [sp, -16]!     // push return address; sp drops by 16
+        mov     x0, xzr             // set up call to time(nullptr)
+        bl      time                // call time setting up srand
+        bl      srand               // call srand setting up rand
+        bl      rand                // get a random number
+        and     x0, x0, 7           // ensure its range is 0 to 7
+                                    // note use of x register is on purpose: it also clears bits 32 to 63
+        lsl     x0, x0, 2           // multiply by 4
+        ldr     x1, =jt             // load base address of jump table
+        add     x1, x1, x0          // add offset to base address
+        br      x1                  // jump to the selected entry of jt
+
+// If, as in this case, all the "cases" have the same number of 
+// instructions then this intermediate jump table can be omitted saving
+// some space and a tiny amount of time. To omit the intermediate jump
+// table, you'd multiply by 12 above and not 4. Twelve because each 
+// "case" has 3 instructions (3 x 4 == 12).
+
+// Question for you: If you did omit the jump table, relative to what
+// would you jump (since "jt" would be gone).
+
+jt:     b       0f                  // entry 0; each entry is one 4-byte branch
+        b       1f
+        b       2f
+        b       3f
+        b       4f
+        b       5f
+        b       6f
+        b       7f                  // entry 7, the last one the mask can select
+
+0:      ldr     x0, =ZR             // x0 = address of the string for case 0
+        bl      puts                // print it
+        b       99f                 // go to the common exit (acts like break)
+
+1:      ldr     x0, =ON
+        bl      puts
+        b       99f
+
+2:      ldr     x0, =TW
+        bl      puts
+        b       99f
+
+3:      ldr     x0, =TH
+        bl      puts
+        b       99f
+
+4:      ldr     x0, =FR
+        bl      puts
+        b       99f
+
+5:      ldr     x0, =FV
+        bl      puts
+        b       99f
+
+6:      ldr     x0, =SX
+        bl      puts
+        b       99f
+
+7:      ldr     x0, =SV
+        bl      puts
+        b       99f                 // could fall through; kept so all cases are 3 instructions
+
+99:     mov     w0, wzr             // common exit: main returns 0
+        ldr     x30, [sp], 16       // pop the return address pushed on entry
+        ret
+
+        .data
+        .section    .rodata
+
+ZR:     .asciz      "0 returned"
+ON:     .asciz      "1 returned"
+TW:     .asciz      "2 returned"
+TH:     .asciz      "3 returned"
+FR:     .asciz      "4 returned"
+FV:     .asciz      "5 returned"
+SX:     .asciz      "6 returned"
+SV:     .asciz      "7 returned"
+
+        .end
--- a/more/jump_tables/jt.c
+++ b/more/jump_tables/jt.c
@ -0,0 +1,55 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+
+/*	This is the prototype for the assembly language version. You may
+	have always thought that switch statements are implemented as a long
+	chain of if / else. Well, sometimes they are. Sometimes they are
+	implemented using binary search and still other times they are 
+	implemented as jump tables.
+
+	My assembly language version is found in jmptbl.s.
+*/
+
+/*	Prints one of eight fixed lines chosen by a random number in the
+	range 0 to 7. The switch is kept on purpose: it is the C
+	counterpart of the jump table. Always returns 0.
+*/
+int main(void)
+{
+    int r;
+
+    /* time() returns time_t; srand() takes unsigned int, so convert
+       explicitly rather than rely on an implicit narrowing. */
+    srand((unsigned int) time(NULL));
+
+    /* Masking with 7 keeps r in 0..7, so every value has a case. */
+    r = rand() & 7;
+    switch (r)
+    {
+        case 0:
+            puts("0 returned");
+            break;
+
+        case 1:
+            puts("1 returned");
+            break;
+
+        case 2:
+            puts("2 returned");
+            break;
+
+        case 3:
+            puts("3 returned");
+            break;
+
+        case 4:
+            puts("4 returned");
+            break;
+
+        case 5:
+            puts("5 returned");
+            break;
+
+        case 6:
+            puts("6 returned");
+            break;
+
+        case 7:
+            puts("7 returned");
+            break;
+
+        default:
+            /* Not reachable because of the mask above. */
+            break;
+    }
+    return 0;
+}
--- a/more/jump_tables/test_interop.cpp
+++ b/more/jump_tables/test_interop.cpp
@ -0,0 +1,31 @@
+#include <stdio.h>
+
+extern "C" void MyMemSet(unsigned char *, unsigned char v, long length);
+
+/*      MyMemSet(unsigned char *, unsigned char v, long length);
+*/
+
+/*
+void MyMemSet(unsigned char * b, unsigned char v, long l) {
+    for (long i = 0; i < l; i++) {
+        b[i] = v;
+    }
+}
+*/
+const long BUFFER_SIZE = 1000;
+
+/*	The buffer handed to MyMemSet sits between two guard bands so that
+	writes outside of it land in memory this program owns and can
+	inspect. Reading buffer[-1] or buffer[BUFFER_SIZE] of a bare array
+	is out of bounds and therefore undefined behavior.
+*/
+const long GUARD_SIZE = 16;
+const unsigned char GUARD_BYTE = 0xA5;	// differs from 0 and from FILL_BYTE
+const unsigned char FILL_BYTE = 0xF0;
+
+unsigned char storage[GUARD_SIZE + BUFFER_SIZE + GUARD_SIZE];
+unsigned char * const buffer = storage + GUARD_SIZE;
+
+/*	Runs MyMemSet for one length and checks three things: nothing
+	before the buffer changed, nothing at or after buffer[length]
+	changed, and every byte in [0, length) holds FILL_BYTE. Prints a
+	message for each problem found. Returns true when all checks pass.
+*/
+static bool TestLength(long length) {
+	const long total = GUARD_SIZE + BUFFER_SIZE + GUARD_SIZE;
+	bool ok = true;
+
+	for (long i = 0; i < total; i++)
+		storage[i] = GUARD_BYTE;
+
+	MyMemSet(buffer, FILL_BYTE, length);
+
+	for (long i = 0; i < GUARD_SIZE; i++) {
+		if (storage[i] != GUARD_BYTE) {
+			printf("Bytes prior to buffer are smashed.\n");
+			ok = false;
+			break;
+		}
+	}
+	// Everything from buffer[length] to the end of storage must be untouched.
+	for (long i = length; i < BUFFER_SIZE + GUARD_SIZE; i++) {
+		if (buffer[i] != GUARD_BYTE) {
+			printf("Bytes after buffer are smashed.\n");
+			ok = false;
+			break;
+		}
+	}
+	for (long i = 0; i < length; i++) {
+		if (buffer[i] != FILL_BYTE) {
+			printf("Bytes inside buffer were not set.\n");
+			ok = false;
+			break;
+		}
+	}
+	if (!ok)
+		printf("    (length was %ld)\n", length);
+	return ok;
+}
+
+/*	Tests every length from 0 through BUFFER_SIZE so that each of the
+	eight entry points into the unrolled loop is exercised (a single
+	length of 1000 is divisible by 8 and never takes the computed
+	branch). Returns 0 on success and 1 on the first failing length.
+*/
+int main() {
+	for (long length = 0; length <= BUFFER_SIZE; length++) {
+		if (!TestLength(length))
+			return 1;
+	}
+	return 0;
+}