diff --git a/section_3/precomputation/.vscode/settings.json b/section_3/precomputation/.vscode/settings.json new file mode 100644 index 0000000..b7fef2e --- /dev/null +++ b/section_3/precomputation/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "cSpell.words": [ + "ifact", + "pfact", + "rfact" + ] +} \ No newline at end of file diff --git a/section_3/precomputation/README.md b/section_3/precomputation/README.md index 1940201..c092be2 100644 --- a/section_3/precomputation/README.md +++ b/section_3/precomputation/README.md @@ -23,3 +23,88 @@ Certainly, for the purposes of this demonstration, it is not necessary to implement both iterative and recursive methods. We do so for fun and for any lessons the reader can glean. +## C Driver + +[Here](./main.c), you will find a version written in C. We will +repurpose `main()` to drive versions in assembly language. + +## Iterative + +```c +long Iterative(long n) { + long retval = 1; + for (long i = 1; i <= n; i++) { + retval *= i; + } + return retval; +} +``` + +First, notice that this algorithm's work increases linearly with the +parameter n. Therefore this algorithm is O(n). + +We translated this function into assembly language to produce the code +provided below. This code is *condensed*. To see the original code with +comments, please see [here](./asm.S). + +```asm +ifact: + mov x2, x0 + mov x0, 1 // equivalent to retval = 1 + mov x1, 1 // equivalent to i = 1 + + // This has five instructions (20 bytes) in the inner loop which + // increases in work by O(n). + +10: cmp x1, x2 + bgt 99f + mul x0, x0, x1 + add x1, x1, 1 + b 10b + +99: + ret +``` + +Reminder, the above code is *condensed*. You will note that the code that +performs the calculation is 5 instructions (or 20 bytes) long. This +isn't much but again, the algorithm runs in O(n) time. 
+ +## Recursive + +```c +long Recursive(long n) { + long retval; + if (n <= 1) + retval = 1; + else + retval = n * Recursive(n - 1); + + return retval; +} +``` + +The code below is *condensed*. The original code, with comments, can be +found [here](./asm.S). + +```asm +rfact: + PUSH_P x29, x30 + mov x29, sp + + cmp x0, 1 + bgt 10f + mov x0, 1 // ensure x0 is 1 - it could be less. + b 99f + +10: // If we get here, n must be more than 1. Recursion is needed. + + PUSH_R x0 // save the current n + sub x0, x0, 1 // prepare for recursion + bl rfact // recursive call with n - 1 + POP_R x1 // restore the current n + mul x0, x0, x1 // multiply it by recursive return + +99: POP_P x29, x30 + ret +``` diff --git a/section_3/precomputation/apple-linux-convergence.S b/section_3/precomputation/apple-linux-convergence.S new file mode 100644 index 0000000..8827423 --- /dev/null +++ b/section_3/precomputation/apple-linux-convergence.S @@ -0,0 +1,156 @@ +/* Macros to permit the "same" assembly language to build on ARM64 + Linux systems as well as Apple Silicon systems. 
+ + See the fuller documentation at: + https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md + + Perry Kivolowitz + A Gentle Introduction to Assembly Language +*/ + +.macro GLD_PTR xreg, label // Load through a global label: GOT entry on Apple, [label] on Linux +#if defined(__APPLE__) + adrp \xreg, _\label@GOTPAGE + ldr \xreg, [\xreg, _\label@GOTPAGEOFF] +#else + ldr \xreg, =\label + ldr \xreg, [\xreg] +#endif +.endm + +.macro GLD_ADDR xreg, label // Get a global address +#if defined(__APPLE__) + adrp \xreg, _\label@GOTPAGE + add \xreg, \xreg, _\label@GOTPAGEOFF +#else + ldr \xreg, =\label +#endif +.endm + +.macro LLD_ADDR xreg, label // Get the address of label directly (no GOT, no underscore prefix) +#if defined(__APPLE__) + adrp \xreg, \label@PAGE + add \xreg, \xreg, \label@PAGEOFF +#else + ldr \xreg, =\label +#endif +.endm + +.macro LLD_DBL xreg, dreg, label // Load the value at label into dreg; xreg is left holding the address +#if defined(__APPLE__) + adrp \xreg, \label@PAGE + add \xreg, \xreg, \label@PAGEOFF + ldur \dreg, [\xreg] +// fmov \dreg, \xreg +#else + ldr \xreg, =\label + ldur \dreg, [\xreg] +#endif +.endm + +.macro LLD_FLT xreg, sreg, label // Load the value at label into sreg; xreg is left holding the address +#if defined(__APPLE__) + adrp \xreg, \label@PAGE + add \xreg, \xreg, \label@PAGEOFF + ldur \sreg, [\xreg] +#else + ldr \xreg, =\label + ldur \sreg, [\xreg] +#endif +.endm + +.macro GLABEL label // Make label global (underscore-prefixed on Apple) +#if defined(__APPLE__) + .global _\label +#else + .global \label +#endif +.endm + +.macro MAIN // Emit the entry label: _main on Apple, main otherwise +#if defined(__APPLE__) +_main: +#else +main: +#endif +.endm + +/* Fetching the address of the externally defined errno is quite + different on Apple and Linux. This macro leaves the address of + errno in x0. +*/ +.macro ERRNO_ADDR +#if defined(__APPLE__) + bl ___error +#else + bl __errno_location +#endif +.endm + +.macro CRT label // bl to label (underscore-prefixed on Apple) +#if defined(__APPLE__) + bl _\label +#else + bl \label +#endif +.endm + +.macro START_PROC // after starting label + .cfi_startproc +.endm + +.macro END_PROC // after the return + .cfi_endproc +.endm + +.macro PUSH_P a, b // Pre-decrement sp by 16, then store the register pair + stp \a, \b, [sp, -16]! +.endm + +.macro PUSH_R a // Pre-decrement sp by 16, then store one register + str \a, [sp, -16]! 
+.endm + +.macro POP_P a, b // Load the register pair, then add 16 to sp + ldp \a, \b, [sp], 16 +.endm + +.macro POP_R a // Load one register, then add 16 to sp + ldr \a, [sp], 16 +.endm + +/* The smaller of src_a and src_b is put into dest. The macro issues + its own cmp (signed comparison), so no flag-setting instruction is + needed beforehand; the condition flags are overwritten. This macro + makes it easy to remember which register does what in the csel. + + Thank you to u/TNorthover for nudge to add the cmp. +*/ + +.macro MIN src_a, src_b, dest + cmp \src_a, \src_b + csel \dest, \src_a, \src_b, LT +.endm + +/* The larger of src_a and src_b is put into dest. The macro issues + its own cmp (signed comparison), so no flag-setting instruction is + needed beforehand; the condition flags are overwritten. This macro + makes it easy to remember which register does what in the csel. + + Thank you to u/TNorthover for nudge to add the cmp. +*/ + +.macro MAX src_a, src_b, dest + cmp \src_a, \src_b + csel \dest, \src_a, \src_b, GT +.endm + +.macro AASCIZ label, string // 4-byte aligned, NUL-terminated string at label + .p2align 2 +\label: .asciz "\string" +.endm + +.macro MOD src_a, src_b, dest, scratch // dest = src_a - (src_a / src_b) * src_b; scratch is overwritten + sdiv \scratch, \src_a, \src_b + msub \dest, \scratch, \src_b, \src_a +.endm diff --git a/section_3/precomputation/asm.S b/section_3/precomputation/asm.S new file mode 100644 index 0000000..14cd8bc --- /dev/null +++ b/section_3/precomputation/asm.S @@ -0,0 +1,151 @@ +#include "apple-linux-convergence.S" + + .text + .p2align 2 + GLABEL ifact + GLABEL rfact + GLABEL pfact + +#if defined(__APPLE__) +_ifact: +#else +ifact: +#endif + START_PROC + + // The parameter n comes to us in register x0. We want to + // return the result in x0 so copy x0 to x2 for processing. + // Then the calculation is straightforward with x1 serving as + // the equivalent of i in the assembly language. + // + // A case can be made for counting down to 1 as this would + // require 1 fewer register at the expense of a cmp to initially + // vet the value. + // + // Finally, notice we did not back up and restore x29 and x30. We + // can get away with this because we know this function calls no + // other functions. 
Therefore, the value of the link register, + // x30, remains undisturbed. + + mov x2, x0 + mov x0, 1 // equivalent to retval = 1 + mov x1, 1 // equivalent to i = 1 + + // For loops are typically implemented differently from what one + // would imagine to save an instruction inside the loop. Here, + // however, we implement it in the way a programmer coming from + // C would expect. + + // The loop below is five instructions (20 bytes); the work it + // performs grows as O(n). + +10: cmp x1, x2 // compare i with n + bgt 99f // i > n means the loop is done + mul x0, x0, x1 // retval *= i + add x1, x1, 1 // i++ + b 10b // back to the loop test + +99: + ret + END_PROC + +#if defined(__APPLE__) +_rfact: +#else +rfact: +#endif + START_PROC + PUSH_P x29, x30 + mov x29, sp + + // The parameter n comes to us in x0 but it is passed by value. + // That is, the n in C is a local variable. We need to keep the + // present copy of n around when we recursively call ourselves + // with n - 1. + // + // Think stack when you think local variable. The argument can + // be made to keep the local copy in a durable register but this + // will mean a stack push anyway so instead, we choose to make + // the stack push explicit by placing it at the recursive call. + + cmp x0, 1 + bgt 10f + mov x0, 1 // ensure x0 is 1 - it could be less. + b 99f + +10: // If we get here, n must be more than 1. Recursion is needed. + + // The recursive step has five instructions (20 bytes); the work + // grows as O(n) and each level also touches the stack in RAM. + + PUSH_R x0 // save the current n + sub x0, x0, 1 // prepare for recursion + CRT rfact // borrow the macro to be x-compatible. + POP_R x1 // restore the current n + mul x0, x0, x1 // multiply it by recursive return + +99: POP_P x29, x30 + ret + END_PROC + +#if defined(__APPLE__) +_pfact: +#else +pfact: +#endif + START_PROC + PUSH_P x29, x30 + mov x29, sp + + // The parameter comes to us in x0. Since we're using it to + // create an address, we better vet the value to be between 1 + // and 15 inclusive. 
+ // + // After that, the base address of the precomputed factorials + // is loaded into x0. Then, the parameter n (having been + // multiplied by 8) is added to the base address to form the + // address of the precomputed factorial that we're after. Its + // value is loaded into x0 for return. + + // Two instructions (8 bytes) are needed to form the correct + // address. 16 * 8 bytes hold the precomputed values 0! to 15!. + // This sums to 136 bytes of RAM needed, more than the previous + // two methods. But, execution time is constant so is O(1). Far + // faster. + + mov x1, x0 // keep n in x1; x0 becomes the return value + mov x0, 1 // value returned when n fails the range check + cmp x1, xzr + ble 99f // n <= 0 + cmp x1, 15 + bgt 99f // n > 15 + + LLD_ADDR x0, fv + ldr x0, [x0, x1, lsl 3] // x0 = fv[n], entries are 8 bytes each + +99: POP_P x29, x30 + ret + END_PROC + + .data + + .p2align 3 + +fv: .dword 1 // fv[n] holds n! for n = 0 through 15 + .dword 1 + .dword 2 + .dword 6 + .dword 24 + .dword 120 + .dword 720 + .dword 5040 + .dword 40320 + .dword 362880 + .dword 3628800 + .dword 39916800 + .dword 479001600 + .dword 6227020800 + .dword 87178291200 + .dword 1307674368000 + + .end diff --git a/section_3/precomputation/main.c b/section_3/precomputation/main.c index a0c4152..328592d 100644 --- a/section_3/precomputation/main.c +++ b/section_3/precomputation/main.c @@ -7,11 +7,18 @@ long Iterative(long n); long Recursive(long n); long Precomputed(long n); +extern long ifact(long); +extern long rfact(long); +extern long pfact(long); + int main() { long (*func[])(long) = { Iterative, Recursive, Precomputed, + ifact, + rfact, + pfact, NULL };