most of the way there on precomputation.

This commit is contained in:
Perry Kivolowitz 2024-04-13 14:24:55 -05:00
parent 32fe3b3b94
commit 054a5deaa5
5 changed files with 406 additions and 0 deletions

View file

@ -0,0 +1,7 @@
{
"cSpell.words": [
"ifact",
"pfact",
"rfact"
]
}

View file

@ -23,3 +23,88 @@ Certainly, for the purposes of this demonstration, it is not necessary
to implement both iterative and recursive methods. We do so for fun and
for any lessons the reader can glean.
## C Driver
[Here](./main.c), you will find a version written in C. We will
repurpose `main()` to drive versions in assembly language.
## Iterative
```c
/* Computes n! by multiplying retval by every i from 1 through n.
   When n is less than 1 the loop body never runs and 1 is returned.
   Assuming a 64-bit long, the product overflows once n exceeds 20. */
long Iterative(long n) {
long retval = 1;
for (long i = 1; i <= n; i++) {
retval *= i;
}
return retval;
}
```
First, notice that this algorithm's work increases linearly with the
parameter n. Therefore this algorithm is O(n).
We translated this function into assembly language to produce the code
provided below. This code is *condensed*. To see the original code with
comments, please see [here](./asm.S).
```asm
// long ifact(long n): n arrives in x0 and n! is returned in x0.
ifact:
mov x2, x0 // x2 = n, freeing x0 to hold the result
mov x0, 1 // equivalent to retval = 1
mov x1, 1 // equivalent to i = 1
// The loop below is five instructions (20 bytes) and runs once per
// value of i, so the work grows as O(n).
10: cmp x1, x2
bgt 99f // leave the loop once i > n
mul x0, x0, x1 // retval *= i
add x1, x1, 1 // i++
b 10b
99:
ret
```
Reminder, the above code is *condensed*. You will note that the code that
performs the calculation is 5 instructions (or 20 bytes) long. This
isn't much but again, the algorithm runs in O(n) time.
## Recursive
```c
/* Computes n! recursively as n * (n - 1)!.
   Any n of 1 or less ends the recursion and yields 1.
   Assuming a 64-bit long, the product overflows once n exceeds 20. */
long Recursive(long n) {
long retval;
if (n <= 1)
retval = 1;
else
retval = n * Recursive(n - 1);
return retval;
}
```
The code below is *condensed*. The original code, with comments, can be
found [here](./asm.S).
```asm
// long rfact(long n): n arrives in x0 and n! is returned in x0.
rfact:
PUSH_P x29, x30 // rfact calls itself, so x30 must be preserved
mov x29, sp
cmp x0, 1
bgt 10f
mov x0, 1 // ensure x0 is 1 - it could be less.
b 99f
10: // If we get here, n must be more than 1. Recursion is needed.
PUSH_R x0 // save the current n
sub x0, x0, 1 // prepare for recursion
bl rfact // x0 = rfact(n - 1)
POP_R x1 // restore the current n
mul x0, x0, x1 // multiply it by recursive return
99: POP_P x29, x30
ret
```

View file

@ -0,0 +1,156 @@
/* Macros to permit the "same" assembly language to build on ARM64
Linux systems as well as Apple Silicon systems.
See the fuller documentation at:
https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
Perry Kivolowitz
A Gentle Introduction to Assembly Language
*/
/* GLD_PTR xreg, label
   Loads a value associated with the global symbol label into xreg.
   Apple: adrp + ldr through the GOT entry of _label (a leading
          underscore is added to the symbol name).
   Other: the address of label is obtained with ldr =label and xreg
          is then loaded from that address.
   Only xreg is modified.
   NOTE(review): the Apple path performs one load and the other path
   performs two - confirm both leave the same thing in xreg.
*/
.macro GLD_PTR xreg, label
#if defined(__APPLE__)
adrp \xreg, _\label@GOTPAGE
ldr \xreg, [\xreg, _\label@GOTPAGEOFF]
#else
ldr \xreg, =\label
ldr \xreg, [\xreg]
#endif
.endm
/* GLD_ADDR xreg, label
   Puts an address for the global symbol label into xreg.
   Apple: adrp to the GOT page of _label, then add of the GOT page
          offset.
   Other: ldr =label.
   Only xreg is modified.
   NOTE(review): the Apple path adds the GOT page offset rather than
   loading through the GOT entry as GLD_PTR does - confirm this yields
   the address of the symbol and not the address of its GOT slot.
*/
.macro GLD_ADDR xreg, label // Get a global address
#if defined(__APPLE__)
adrp \xreg, _\label@GOTPAGE
add \xreg, \xreg, _\label@GOTPAGEOFF
#else
ldr \xreg, =\label
#endif
.endm
/* LLD_ADDR xreg, label
   Puts the address of label into xreg. Unlike the GLD_ macros, no
   underscore is prepended to the label on Apple.
   Apple: adrp to the page holding label, then add of the offset
          within that page.
   Other: ldr =label.
   Only xreg is modified.
*/
.macro LLD_ADDR xreg, label
#if defined(__APPLE__)
adrp \xreg, \label@PAGE
add \xreg, \xreg, \label@PAGEOFF
#else
ldr \xreg, =\label
#endif
.endm
/* LLD_DBL xreg, dreg, label
   Loads the value stored at label into dreg. The address of label is
   formed in xreg first (the same way LLD_ADDR does it) and is still
   in xreg afterwards, so xreg is clobbered.
*/
.macro LLD_DBL xreg, dreg, label
#if defined(__APPLE__)
adrp \xreg, \label@PAGE
add \xreg, \xreg, \label@PAGEOFF
ldur \dreg, [\xreg]
#else
ldr \xreg, =\label
ldur \dreg, [\xreg]
#endif
.endm
/* LLD_FLT xreg, sreg, label
   Loads the value stored at label into sreg. The address of label is
   formed in xreg first (the same way LLD_ADDR does it) and is still
   in xreg afterwards, so xreg is clobbered.
*/
.macro LLD_FLT xreg, sreg, label
#if defined(__APPLE__)
adrp \xreg, \label@PAGE
add \xreg, \xreg, \label@PAGEOFF
ldur \sreg, [\xreg]
#else
ldr \xreg, =\label
ldur \sreg, [\xreg]
#endif
.endm
/* GLABEL label
   Exports label with .global. On Apple the exported name carries a
   leading underscore; elsewhere the name is used unchanged. The macro
   only declares the symbol global - it does not define the label.
*/
.macro GLABEL label
#if defined(__APPLE__)
.global _\label
#else
.global \label
#endif
.endm
/* MAIN
   Defines the program entry label: _main on Apple, main elsewhere.
   Only the label is emitted; the macro contains no .global, so the
   symbol must be exported separately (for example with GLABEL main).
*/
.macro MAIN
#if defined(__APPLE__)
_main:
#else
main:
#endif
.endm
/* ERRNO_ADDR
   Fetching the address of the externally defined errno is quite
   different on Apple and Linux. This macro leaves the address of
   errno in x0.
   It does so by calling ___error (Apple) or __errno_location (other)
   with bl, so x30 is overwritten by the call.
*/
.macro ERRNO_ADDR
#if defined(__APPLE__)
bl ___error
#else
bl __errno_location
#endif
.endm
/* CRT label
   Calls the function named label with bl. On Apple a leading
   underscore is prepended to the name; elsewhere it is used as given.
   Being a bl, the call overwrites x30.
*/
.macro CRT label
#if defined(__APPLE__)
bl _\label
#else
bl \label
#endif
.endm
/* START_PROC
   Emits .cfi_startproc. Place it directly after a function's
   starting label and pair it with END_PROC.
*/
.macro START_PROC // after starting label
.cfi_startproc
.endm
/* END_PROC
   Emits .cfi_endproc. Place it after a function's final ret to close
   the region opened by START_PROC.
*/
.macro END_PROC // after the return
.cfi_endproc
.endm
/* PUSH_P a, b
   Pushes the register pair a, b: sp is first lowered by 16 and the
   two registers are then stored at the new sp. Undo with POP_P.
*/
.macro PUSH_P a, b
stp \a, \b, [sp, -16]!
.endm
/* PUSH_R a
   Pushes the single register a: sp is first lowered by 16 and the
   register is then stored at the new sp. A full 16 bytes are used for
   the one register so that sp keeps its 16-byte alignment. Undo with
   POP_R.
*/
.macro PUSH_R a
str \a, [sp, -16]!
.endm
/* POP_P a, b
   Pops the register pair a, b: both are loaded from sp and sp is then
   raised by 16. The inverse of PUSH_P.
*/
.macro POP_P a, b
ldp \a, \b, [sp], 16
.endm
/* POP_R a
   Pops the single register a: it is loaded from sp and sp is then
   raised by 16. The inverse of PUSH_R.
*/
.macro POP_R a
ldr \a, [sp], 16
.endm
/* MIN src_a, src_b, dest
   The smaller of src_a and src_b is put into dest. The comparison is
   signed (LT). The macro performs the cmp itself, so no flag-setting
   instruction is needed beforehand; the condition flags are
   overwritten.
   This macro makes it easy to remember which register does what in the
   csel.
   Thank you to u/TNorthover for nudge to add the cmp.
*/
.macro MIN src_a, src_b, dest
cmp \src_a, \src_b
csel \dest, \src_a, \src_b, LT
.endm
/* MAX src_a, src_b, dest
   The larger of src_a and src_b is put into dest. The comparison is
   signed (GT). The macro performs the cmp itself, so no flag-setting
   instruction is needed beforehand; the condition flags are
   overwritten.
   This macro makes it easy to remember which register does what in the
   csel.
   Thank you to u/TNorthover for nudge to add the cmp.
*/
.macro MAX src_a, src_b, dest
cmp \src_a, \src_b
csel \dest, \src_a, \src_b, GT
.endm
/* AASCIZ label, string
   Defines label as a NUL-terminated string (.asciz) holding string.
   The definition is preceded by .p2align 2, so the string starts on a
   4-byte boundary. The macro supplies the surrounding double quotes.
*/
.macro AASCIZ label, string
.p2align 2
\label: .asciz "\string"
.endm
/* MOD src_a, src_b, dest, scratch
   Signed remainder: dest = src_a - (src_a / src_b) * src_b.
   The quotient is left in scratch, so scratch is clobbered. scratch
   must be a different register from src_a and src_b, because both
   are read again by the msub after the sdiv has written scratch.
*/
.macro MOD src_a, src_b, dest, scratch
sdiv \scratch, \src_a, \src_b
msub \dest, \scratch, \src_b, \src_a
.endm

View file

@ -0,0 +1,151 @@
#include "apple-linux-convergence.S"
// Everything up to the .data directive is code.
.text
.p2align 2 // start the code on a 4-byte boundary
// Export the three factorial entry points; the C driver declares them
// as extern long ifact(long), rfact(long) and pfact(long).
GLABEL ifact
GLABEL rfact
GLABEL pfact
#if defined(__APPLE__)
_ifact:
#else
ifact:
#endif
START_PROC
// long ifact(long n) - iterative factorial.
// In: x0 = n
// Out: x0 = n! (1 when n is less than 1, as the loop body never runs)
// Uses x1 (the loop counter i) and x2 (a copy of n). Flags are
// modified.
//
// The parameter n comes to us in register x0. We want to
// return the result in x0 so copy x0 to x2 for processing.
// Then the calculation is straightforward with x1 serving as
// the equivalent of i in the C version.
//
// A case can be made for counting down to 1 as this would
// require one fewer register at the expense of a cmp to initially
// vet the value.
//
// Finally, notice we did not back up and restore x29 and x30. We
// can get away with this because we know this function calls no
// other functions. Therefore, the value of the link register,
// x30, remains undisturbed.
mov x2, x0
mov x0, 1 // equivalent to retval = 1
mov x1, 1 // equivalent to i = 1
// For loops are typically implemented differently from what one
// would imagine to save an instruction inside the loop. Here,
// however, we implement it in the way a programmer coming from
// C would expect.
// The loop is five instructions (20 bytes) and runs once per value
// of i, so the work grows as O(n).
10: cmp x1, x2
bgt 99f // leave the loop once i > n
mul x0, x0, x1 // retval *= i
add x1, x1, 1 // i++
b 10b
99:
ret
END_PROC
#if defined(__APPLE__)
_rfact:
#else
rfact:
#endif
START_PROC
// long rfact(long n) - recursive factorial.
// In: x0 = n
// Out: x0 = n! (1 when n is 1 or less)
// Uses x1. Flags are modified. Each level of recursion uses 32 bytes
// of stack: 16 for the x29/x30 pair and 16 for the saved n.
PUSH_P x29, x30 // rfact calls itself, so x30 must be preserved
mov x29, sp
// The parameter n comes to us in x0 but it is passed by value.
// That is, the n in C is a local variable. We need to keep the
// present copy of n around when we recursively call ourselves
// with n - 1.
//
// Think stack when you think local variable. The argument can
// be made to keep the local copy in a durable register but this
// will mean a stack push anyway so instead, we choose to make
// the stack push explicit by placing it at the recursive call.
cmp x0, 1
bgt 10f
mov x0, 1 // ensure x0 is 1 - it could be less.
b 99f
10: // If we get here, n must be more than 1. Recursion is needed.
// The recursive step is five instructions (20 bytes). It runs once
// per level, so the work grows as O(n), and every level also incurs
// references to RAM for the stack pushes and pops.
PUSH_R x0 // save the current n
sub x0, x0, 1 // prepare for recursion
CRT rfact // borrow the macro to be x-compatible.
POP_R x1 // restore the current n
mul x0, x0, x1 // multiply it by recursive return
99: POP_P x29, x30
ret
END_PROC
#if defined(__APPLE__)
_pfact:
#else
pfact:
#endif
        START_PROC
        // long pfact(long n) - factorial by table lookup.
        // In:  x0 = n
        // Out: x0 = n! for n from 1 through 15, otherwise 1
        // Uses x1 (a copy of n). Flags are modified.
        //
        // The parameter is used to form an address, so it is vetted
        // first. Any value outside 1 through 15 skips the lookup and
        // returns the 1 already placed in x0.
        //
        // This is a leaf function: it calls nothing, so x30 is never
        // disturbed and no frame is pushed. Leaving the frame out also
        // keeps stack traffic off the constant-time path.
        //
        // The lookup is the address of fv followed by a single load of
        // entry n, with n scaled by 8 (lsl 3) because each entry is 8
        // bytes wide. fv holds 16 entries (0! through 15!), which is
        // 128 bytes of table - more memory than the other two methods
        // need. But execution time is constant, so this is O(1). Far
        // faster.
        mov     x1, x0                  // keep n; x0 becomes the result
        mov     x0, 1                   // result for an out-of-range n
        cmp     x1, xzr
        ble     99f                     // n <= 0
        cmp     x1, 15
        bgt     99f                     // n > 15, past the end of fv
        LLD_ADDR x0, fv
        ldr     x0, [x0, x1, lsl 3]     // x0 = fv[n]
99:     ret
        END_PROC
// Precomputed factorials used by pfact: entry n holds n! for n from 0
// through 15, each stored as an 8-byte value (16 entries, 128 bytes).
// The table is only ever read, so it lives in a read-only section
// rather than in writable .data.
#if defined(__APPLE__)
        .section __TEXT,__const
#else
        .section .rodata
#endif
        .p2align 3
fv:     .dword  1                       //  0!
        .dword  1                       //  1!
        .dword  2                       //  2!
        .dword  6                       //  3!
        .dword  24                      //  4!
        .dword  120                     //  5!
        .dword  720                     //  6!
        .dword  5040                    //  7!
        .dword  40320                   //  8!
        .dword  362880                  //  9!
        .dword  3628800                 // 10!
        .dword  39916800                // 11!
        .dword  479001600               // 12!
        .dword  6227020800              // 13!
        .dword  87178291200             // 14!
        .dword  1307674368000           // 15!
        .end

View file

@ -7,11 +7,18 @@ long Iterative(long n);
long Recursive(long n);
long Precomputed(long n);
extern long ifact(long);
extern long rfact(long);
extern long pfact(long);
int main() {
long (*func[])(long) = {
Iterative,
Recursive,
Precomputed,
ifact,
rfact,
pfact,
NULL
};