most of the way there on precomputation.

2026-06-21 02:26:59 +08:00 · 2024-04-13 14:24:55 -05:00 · 2024-04-13 14:24:55 -05:00 · 054a5deaa5
commit 054a5deaa5
parent 32fe3b3b94
5 changed files with 406 additions and 0 deletions
--- a/section_3/precomputation/.vscode/settings.json
+++ b/section_3/precomputation/.vscode/settings.json
@ -0,0 +1,7 @@
+{
+    "cSpell.words": [
+        "ifact",
+        "pfact",
+        "rfact"
+    ]
+}
--- a/section_3/precomputation/README.md
+++ b/section_3/precomputation/README.md
@ -23,3 +23,88 @@ Certainly, for the purposes of this demonstration, it is not necessary
 to implement both iterative and recursive methods. We do so for fun and
 for any lessons the reader can glean.

+## C Driver
+
+[Here](./main.c), you will find a version written in C. We will
+repurpose `main()` to drive versions in assembly language.
+
+## Iterative
+
+```c
+long Iterative(long n) {
+    long retval = 1;
+    for (long i = 1; i <= n; i++) {
+        retval *= i;
+    }
+    return retval;
+}
+```
+
+First, notice that this algorithm's work increases linearly with the
+parameter n. Therefore this algorithm is O(n).
+
+We translated this function into assembly language to produce the code
+provided below. This code is *condensed*. To see the original code with
+comments, please see [here](./asm.S).
+
+```asm
+ifact:
+        mov         x2, x0
+        mov         x0, 1       // equivalent to retval = 1
+        mov         x1, 1       // equivalent to i = 1
+
+        // The loop below is five instructions (20 bytes) and executes
+        // once per value of i, so the work grows as O(n).
+
+10:     cmp         x1, x2
+        bgt         99f
+        mul         x0, x0, x1
+        add         x1, x1, 1
+        b           10b
+
+99: 
+        ret
+```
+
+Reminder, the above code is *condensed*. You will note that the code
+that performs the calculation is 5 instructions (or 20 bytes) long. This
+isn't much but, again, the algorithm runs in O(n) time.
+
+## Recursive
+
+```c
+long Recursive(long n) {
+    long retval;
+    if (n <= 1)
+        retval = 1;
+    else
+        retval = n * Recursive(n - 1);
+        
+    return retval;
+}
+```
+
+The code below is *condensed*. The original code, with comments, can be
+found [here](./asm.S).
+
+```asm
+rfact:
+        PUSH_P      x29, x30
+        mov         x29, sp
+
+        cmp         x0, 1
+        bgt         10f
+        mov         x0, 1       // ensure x0 is 1 - it could be less.
+        b           99f
+
+10:     // If we get here, n must be more than 1. Recursion is needed.
+
+        PUSH_R      x0          // save the current n
+        sub         x0, x0, 1   // prepare for recursion
+        bl          rfact       // 
+        POP_R       x1          // restore the current n
+        mul         x0, x0, x1  // multiply it by recursive return
+
+99:     POP_P       x29, x30
+        ret
+```
--- a/section_3/precomputation/apple-linux-convergence.S
+++ b/section_3/precomputation/apple-linux-convergence.S
@ -0,0 +1,156 @@
+/*  Macros to permit the "same" assembly language to build on ARM64
+    Linux systems as well as Apple Silicon systems.
+
+    See the fuller documentation at:
+    https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
+
+    Perry Kivolowitz
+    A Gentle Introduction to Assembly Language
+*/
+
+.macro  GLD_PTR     xreg, label     // Load xreg by way of a global label
+#if defined(__APPLE__)
+        adrp	    \xreg, _\label@GOTPAGE              // page holding the GOT entry of _label
+        ldr	        \xreg, [\xreg, _\label@GOTPAGEOFF]  // xreg = contents of that GOT entry
+#else
+        ldr         \xreg, =\label      // xreg = address of label, taken from the literal pool
+        ldr         \xreg, [\xreg]      // xreg = what is stored at label. NOTE(review): one load on Apple, two here - confirm both leave the same thing in xreg
+#endif
+.endm
+
+.macro  GLD_ADDR    xreg, label     // Get a global address
+#if defined(__APPLE__)
+        // On Apple the address of a global is stored in its GOT entry, so
+        // it must be loaded from there. @GOTPAGEOFF is only meaningful as
+        // the offset of a load; it cannot be used as an add immediate.
+        adrp        \xreg, _\label@GOTPAGE
+        ldr         \xreg, [\xreg, _\label@GOTPAGEOFF]
+#else
+        ldr         \xreg, =\label  // address taken from the literal pool
+#endif
+.endm
+
+.macro  LLD_ADDR xreg, label         // xreg = address of label (no GOT access, no underscore added)
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE              // page containing label
+        add     \xreg, \xreg, \label@PAGEOFF    // plus label's offset within that page
+#else
+        ldr         \xreg, =\label              // address taken from the literal pool
+#endif
+.endm
+
+.macro  LLD_DBL xreg, dreg, label    // dreg = value stored at label; xreg is left holding label's address
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE              // page containing label
+        add     \xreg, \xreg, \label@PAGEOFF    // xreg = address of label
+        ldur    \dreg, [\xreg]                  // load through the address just formed
+//      fmov    \dreg, \xreg
+#else
+        ldr     \xreg, =\label                  // xreg = address of label (literal pool)
+        ldur    \dreg, [\xreg]                  // load through the address just formed
+#endif
+.endm
+
+.macro  LLD_FLT xreg, sreg, label    // sreg = value stored at label; xreg is left holding label's address
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE              // page containing label
+        add     \xreg, \xreg, \label@PAGEOFF    // xreg = address of label
+        ldur    \sreg, [\xreg]                  // load through the address just formed
+#else
+        ldr     \xreg, =\label                  // xreg = address of label (literal pool)
+        ldur    \sreg, [\xreg]                  // load through the address just formed
+#endif
+.endm
+
+.macro GLABEL label                 // Export label; it does not define the label itself
+#if defined(__APPLE__)
+        .global _\label             // Apple builds export the name with a leading underscore
+#else
+        .global \label
+#endif
+.endm
+
+.macro MAIN                         // Define the main entry label under its platform-specific spelling
+#if defined(__APPLE__)
+_main:
+#else
+main:
+#endif
+.endm
+
+/*  Fetching the address of the externally defined errno is quite
+    different on Apple and Linux. This macro leaves the address of
+    errno in x0. It is implemented as a function call (bl), so the
+    link register x30 is overwritten.
+*/
+.macro  ERRNO_ADDR
+#if defined(__APPLE__)
+        bl      ___error            // Apple's accessor
+#else
+        bl      __errno_location    // accessor used on the non-Apple build
+#endif
+.endm
+
+.macro  CRT label                   // bl to label, adding the leading underscore on Apple; overwrites x30
+#if defined(__APPLE__)
+        bl  _\label
+#else
+        bl  \label
+#endif
+.endm
+
+.macro  START_PROC          // place directly after a function's entry label
+        .cfi_startproc      // opens the function's call-frame-information region
+.endm
+
+.macro  END_PROC            // place directly after the function's final ret
+        .cfi_endproc        // closes the region opened by .cfi_startproc
+.endm
+
+.macro  PUSH_P  a, b                // Push the register pair a, b
+        stp     \a, \b, [sp, -16]!  // pre-decrement sp by 16, then store both registers
+.endm
+
+.macro  PUSH_R  a                   // Push the single register a
+        str     \a, [sp, -16]!      // a full 16 bytes are taken so that sp stays 16-byte aligned
+.endm
+
+.macro  POP_P   a, b                // Pop a register pair; undoes PUSH_P
+        ldp     \a, \b, [sp], 16    // load both registers, then post-increment sp by 16
+.endm
+
+.macro  POP_R   a                   // Pop a single register; undoes PUSH_R
+        ldr     \a, [sp], 16        // load, then release the whole 16-byte slot
+.endm
+
+/*  The smaller of src_a and src_b, compared as signed values, is put
+    into dest. The macro performs the cmp itself, so no prior compare is
+    needed and the condition flags are overwritten. This macro makes it
+    easy to remember which register does what in the csel.
+    
+    Thank you to u/TNorthover for nudge to add the cmp.
+*/
+
+.macro  MIN     src_a, src_b, dest
+        cmp     \src_a, \src_b
+        csel    \dest, \src_a, \src_b, LT   // dest = src_a when src_a < src_b, otherwise src_b
+.endm
+
+/*  The larger of src_a and src_b, compared as signed values, is put
+    into dest. The macro performs the cmp itself, so no prior compare is
+    needed and the condition flags are overwritten. This macro makes it
+    easy to remember which register does what in the csel.
+
+    Thank you to u/TNorthover for nudge to add the cmp.
+*/
+
+.macro  MAX     src_a, src_b, dest
+        cmp     \src_a, \src_b
+        csel    \dest, \src_a, \src_b, GT   // dest = src_a when src_a > src_b, otherwise src_b
+.endm
+
+.macro  AASCIZ      label, string   // Define label as a NUL-terminated copy of string
+        .p2align    2               // start the string on a 4-byte boundary
+\label: .asciz      "\string"
+.endm
+
+.macro  MOD         src_a, src_b, dest, scratch     // dest = signed remainder of src_a / src_b; scratch is overwritten and must differ from src_a and src_b
+        sdiv        \scratch, \src_a, \src_b        // scratch = src_a / src_b (signed quotient)
+        msub        \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b
+.endm
--- a/section_3/precomputation/asm.S
+++ b/section_3/precomputation/asm.S
@ -0,0 +1,151 @@
+#include "apple-linux-convergence.S"
+
+        .text
+        .p2align    2
+        GLABEL      ifact
+        GLABEL      rfact
+        GLABEL      pfact
+
+#if defined(__APPLE__)
+_ifact:
+#else
+ifact:
+#endif
+        START_PROC
+
+        // long ifact(long n): iterative factorial. n arrives in x0 and
+        // the result is returned in x0, so n is first copied to x2.
+        // x1 plays the role of i in the C version. Any n below 1
+        // yields 1. Registers x1 and x2 and the flags are overwritten.
+        //
+        // A case can be made for counting down to 1 as this would
+        // require 1 fewer register at the expense of a cmp to initially
+        // vet the value.
+        //
+        // Finally, notice we did not back up and restore x29 and x30.
+        // We can get away with this because we know this function calls
+        // no other functions. Therefore, the value of the link
+        // register, x30, remains undisturbed.
+
+        mov         x2, x0
+        mov         x0, 1       // equivalent to retval = 1
+        mov         x1, 1       // equivalent to i = 1
+
+        // For loops are typically implemented differently from what one
+        // would imagine to save an instruction inside the loop. Here,
+        // however, we implement it in the way a programmer coming from
+        // C would expect.
+
+        // The loop below is five instructions (20 bytes) and executes
+        // once per value of i, so the work grows as O(n).
+
+10:     cmp         x1, x2      // i <= n ?
+        bgt         99f         // no (signed compare) - we are done
+        mul         x0, x0, x1  // retval *= i
+        add         x1, x1, 1   // i++
+        b           10b
+
+99: 
+        ret
+        END_PROC
+
+#if defined(__APPLE__)
+_rfact:
+#else
+rfact:
+#endif
+        START_PROC
+        PUSH_P      x29, x30    // the bl below overwrites x30, so save it
+        mov         x29, sp
+
+        // long rfact(long n): recursive factorial. n arrives in x0 and
+        // is passed by value. That is, the n in C is a local variable.
+        // We need to keep the present copy of n around while we call
+        // ourselves recursively with n - 1.
+        //
+        // Think stack when you think local variable. The argument can
+        // be made to keep the local copy in a durable register but this
+        // will mean a stack push anyway so instead, we choose to make
+        // the stack push explicit by placing it at the recursive call.
+
+        cmp         x0, 1
+        bgt         10f         // n > 1 (signed compare): recurse
+        mov         x0, 1       // ensure x0 is 1 - it could be less.
+        b           99f
+
+10:     // If we get here, n must be more than 1. Recursion is needed.
+
+        // The recursive step is five instructions (20 bytes). It runs
+        // once per level, so work is O(n) and each level touches RAM.
+
+        PUSH_R      x0          // save the current n
+        sub         x0, x0, 1   // prepare for recursion
+        CRT         rfact       // bl to rfact (_rfact on Apple) via the macro
+        POP_R       x1          // restore the current n
+        mul         x0, x0, x1  // multiply it by recursive return
+
+99:     POP_P       x29, x30
+        ret
+        END_PROC
+
+#if defined(__APPLE__)
+_pfact:
+#else
+pfact:
+#endif
+        START_PROC
+
+        // long pfact(long n): factorial by table lookup.
+        //
+        //   In:        x0 = n
+        //   Out:       x0 = n! when 1 <= n <= 15, otherwise 1
+        //   Overwrites x1 and the flags
+        //
+        // The parameter is used to form an address, so it is vetted
+        // first: anything outside 1 through 15 inclusive skips the
+        // lookup and returns the default of 1.
+        //
+        // For an accepted n, the base address of the precomputed
+        // factorials is loaded into x0. n, scaled by 8 (lsl 3), is
+        // added to it as part of the load, which fetches the factorial
+        // into x0 for return.
+        //
+        // Like ifact, this function calls no other function, so x30 is
+        // never disturbed and no stack frame is saved or restored.
+        //
+        // The table's 16 entries of 8 bytes each occupy 128 bytes of
+        // data, more than either of the other two methods needs. In
+        // exchange, the work done does not depend on n: O(1).
+
+        mov         x1, x0
+        mov         x0, 1       // default result for an out-of-range n
+        cmp         x1, xzr
+        ble         99f         // n <= 0
+        cmp         x1, 15
+        bgt         99f         // n > 15 is beyond the end of the table
+
+        LLD_ADDR    x0, fv      // x0 = base address of the table
+        ldr         x0, [x0, x1, lsl 3] // x0 = fv[n]
+
+99:     ret
+        END_PROC
+
+        .data
+
+        .p2align    3
+        
+fv:     .dword      1                   //  0!  (pfact returns before the lookup when n <= 0)
+        .dword      1                   //  1!
+        .dword      2                   //  2!
+        .dword      6                   //  3!
+        .dword      24                  //  4!
+        .dword      120                 //  5!
+        .dword      720                 //  6!
+        .dword      5040                //  7!
+        .dword      40320               //  8!
+        .dword      362880              //  9!
+        .dword      3628800             // 10!
+        .dword      39916800            // 11!
+        .dword      479001600           // 12!
+        .dword      6227020800          // 13!
+        .dword      87178291200         // 14!
+        .dword      1307674368000       // 15!  (largest n that pfact looks up)
+
+        .end
--- a/section_3/precomputation/main.c
+++ b/section_3/precomputation/main.c
@ -7,11 +7,18 @@ long Iterative(long n);
 long Recursive(long n);
 long Precomputed(long n);

+extern long ifact(long);
+extern long rfact(long);
+extern long pfact(long);
+
 int main() {
    long (*func[])(long) = {
        Iterative,
        Recursive,
        Precomputed,
+        ifact,
+        rfact,
+        pfact,
        NULL
    };