mirror of
https://github.com/pkivolowitz/asm_book.git
synced 2026-06-23 15:36:45 +08:00
most of the way there on precomputation.
This commit is contained in:
parent
32fe3b3b94
commit
054a5deaa5
5 changed files with 406 additions and 0 deletions
7
section_3/precomputation/.vscode/settings.json
vendored
Normal file
7
section_3/precomputation/.vscode/settings.json
vendored
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
{
|
||||||
|
"cSpell.words": [
|
||||||
|
"ifact",
|
||||||
|
"pfact",
|
||||||
|
"rfact"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -23,3 +23,88 @@ Certainly, for the purposes of this demonstration, it is not necessary
|
||||||
to implement both iterative and recursive methods. We do so for fun and
|
to implement both iterative and recursive methods. We do so for fun and
|
||||||
for any lessons the reader can glean.
|
for any lessons the reader can glean.
|
||||||
|
|
||||||
|
## C Driver
|
||||||
|
|
||||||
|
[Here](./main.c), you will find a version written in C. We will
|
||||||
|
repurpose `main()` to drive versions in assembly language.
|
||||||
|
|
||||||
|
## Iterative
|
||||||
|
|
||||||
|
```c
|
||||||
|
long Iterative(long n) {
|
||||||
|
long retval = 1;
|
||||||
|
for (long i = 1; i <= n; i++) {
|
||||||
|
retval *= i;
|
||||||
|
}
|
||||||
|
return retval;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
First, notice that this algorithm's work increases linearly with the
|
||||||
|
parameter n. Therefore this algorithm is O(n).
|
||||||
|
|
||||||
|
We translated this function into assembly language to produce the code
|
||||||
|
provided below. This code is *condensed*. To see the original code with
|
||||||
|
comments, please see [here](./asm.S).
|
||||||
|
|
||||||
|
```asm
|
||||||
|
ifact:
|
||||||
|
mov x2, x0
|
||||||
|
mov x0, 1 // equivalent to retval = 1
|
||||||
|
mov x1, 1 // equivalent to i = 1
|
||||||
|
|
||||||
|
// This has five instructions (20 bytes) in the inner loop which
|
||||||
|
// increases in work by O(n).
|
||||||
|
|
||||||
|
10: cmp x1, x2
|
||||||
|
bgt 99f
|
||||||
|
mul x0, x0, x1
|
||||||
|
add x1, x1, 1
|
||||||
|
b 10b
|
||||||
|
|
||||||
|
99:
|
||||||
|
ret
|
||||||
|
```
|
||||||
|
|
||||||
|
Reminder, the above code is *condensed*. You will note that the code that
|
||||||
|
performs the calculation is 5 instructions (or 20 bytes) long. This
|
||||||
|
isn't much but again, the algorithm runs in O(n) time.
|
||||||
|
|
||||||
|
## Recursive
|
||||||
|
|
||||||
|
```c
|
||||||
|
long Recursive(long n) {
|
||||||
|
long retval;
|
||||||
|
if (n <= 1)
|
||||||
|
retval = 1;
|
||||||
|
else
|
||||||
|
retval = n * Recursive(n - 1);
|
||||||
|
|
||||||
|
return retval;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The code below is *condensed*. The original code, with comments, can be
|
||||||
|
found [here](./asm.S).
|
||||||
|
|
||||||
|
```asm
|
||||||
|
rfact:
|
||||||
|
PUSH_P x29, x30
|
||||||
|
mov x29, sp
|
||||||
|
|
||||||
|
cmp x0, 1
|
||||||
|
bgt 10f
|
||||||
|
mov x0, 1 // ensure x0 is 1 - it could be less.
|
||||||
|
b 99f
|
||||||
|
|
||||||
|
10: // If we get here, n must be more than 1. Recursion is needed.
|
||||||
|
|
||||||
|
PUSH_R x0 // save the current n
|
||||||
|
sub x0, x0, 1 // prepare for recursion
|
||||||
|
bl rfact //
|
||||||
|
POP_R x1 // restore the current n
|
||||||
|
mul x0, x0, x1 // multiply it by recursive return
|
||||||
|
|
||||||
|
99: POP_P x29, x30
|
||||||
|
ret
|
||||||
|
```
|
||||||
|
|
|
||||||
156
section_3/precomputation/apple-linux-convergence.S
Normal file
156
section_3/precomputation/apple-linux-convergence.S
Normal file
|
|
@ -0,0 +1,156 @@
|
||||||
|
/* Macros to permit the "same" assembly language to build on ARM64
|
||||||
|
Linux systems as well as Apple Silicon systems.
|
||||||
|
|
||||||
|
See the fuller documentation at:
|
||||||
|
https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
|
||||||
|
|
||||||
|
Perry Kivolowitz
|
||||||
|
A Gentle Introduction to Assembly Language
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro GLD_PTR xreg, label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, _\label@GOTPAGE
|
||||||
|
ldr \xreg, [\xreg, _\label@GOTPAGEOFF]
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
ldr \xreg, [\xreg]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro GLD_ADDR xreg, label // Get a global address
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, _\label@GOTPAGE
|
||||||
|
add \xreg, \xreg, _\label@GOTPAGEOFF
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro LLD_ADDR xreg, label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, \label@PAGE
|
||||||
|
add \xreg, \xreg, \label@PAGEOFF
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro LLD_DBL xreg, dreg, label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, \label@PAGE
|
||||||
|
add \xreg, \xreg, \label@PAGEOFF
|
||||||
|
ldur \dreg, [\xreg]
|
||||||
|
// fmov \dreg, \xreg
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
ldur \dreg, [\xreg]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro LLD_FLT xreg, sreg, label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, \label@PAGE
|
||||||
|
add \xreg, \xreg, \label@PAGEOFF
|
||||||
|
ldur \sreg, [\xreg]
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
ldur \sreg, [\xreg]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro GLABEL label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
.global _\label
|
||||||
|
#else
|
||||||
|
.global \label
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro MAIN
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
_main:
|
||||||
|
#else
|
||||||
|
main:
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Fetching the address of the externally defined errno is quite
|
||||||
|
different on Apple and Linux. This macro leaves the address of
|
||||||
|
errno in x0.
|
||||||
|
*/
|
||||||
|
.macro ERRNO_ADDR
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
bl ___error
|
||||||
|
#else
|
||||||
|
bl __errno_location
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CRT label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
bl _\label
|
||||||
|
#else
|
||||||
|
bl \label
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro START_PROC // after starting label
|
||||||
|
.cfi_startproc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro END_PROC // after the return
|
||||||
|
.cfi_endproc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro PUSH_P a, b
|
||||||
|
stp \a, \b, [sp, -16]!
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro PUSH_R a
|
||||||
|
str \a, [sp, -16]!
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro POP_P a, b
|
||||||
|
ldp \a, \b, [sp], 16
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro POP_R a
|
||||||
|
ldr \a, [sp], 16
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* The smaller of src_a and src_b is put into dest. The macro issues
|
||||||
|
the cmp itself, so the caller need not set the flags beforehand.
|
||||||
|
This macro makes it easy to remember which register does what in the
|
||||||
|
csel.
|
||||||
|
|
||||||
|
Thank you to u/TNorthover for nudge to add the cmp.
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro MIN src_a, src_b, dest
|
||||||
|
cmp \src_a, \src_b
|
||||||
|
csel \dest, \src_a, \src_b, LT
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* The larger of src_a and src_b is put into dest. The macro issues
|
||||||
|
the cmp itself, so the caller need not set the flags beforehand.
|
||||||
|
This macro makes it easy to remember which register does what in the
|
||||||
|
csel.
|
||||||
|
|
||||||
|
Thank you to u/TNorthover for nudge to add the cmp.
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro MAX src_a, src_b, dest
|
||||||
|
cmp \src_a, \src_b
|
||||||
|
csel \dest, \src_a, \src_b, GT
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro AASCIZ label, string
|
||||||
|
.p2align 2
|
||||||
|
\label: .asciz "\string"
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro MOD src_a, src_b, dest, scratch
|
||||||
|
sdiv \scratch, \src_a, \src_b
|
||||||
|
msub \dest, \scratch, \src_b, \src_a
|
||||||
|
.endm
|
||||||
151
section_3/precomputation/asm.S
Normal file
151
section_3/precomputation/asm.S
Normal file
|
|
@ -0,0 +1,151 @@
|
||||||
|
#include "apple-linux-convergence.S"
|
||||||
|
|
||||||
|
.text
|
||||||
|
.p2align 2
|
||||||
|
GLABEL ifact
|
||||||
|
GLABEL rfact
|
||||||
|
GLABEL pfact
|
||||||
|
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
_ifact:
|
||||||
|
#else
|
||||||
|
ifact:
|
||||||
|
#endif
|
||||||
|
START_PROC
|
||||||
|
|
||||||
|
// The parameter n comes to us in register x0. We want to
|
||||||
|
// return the result in x0 so copy x0 to x2 for processing.
|
||||||
|
// Then the calculation is straightforward with x1 serving as
|
||||||
|
// the equivalent of i in the assembly language.
|
||||||
|
//
|
||||||
|
// A case can be made for counting down to 1 as this would
|
||||||
|
// require 1 fewer register at the expense of a cmp to initially
|
||||||
|
// vet the value.
|
||||||
|
//
|
||||||
|
// Finally, notice we did not back up and restore x29 and x30. We
|
||||||
|
// can get away with this because we know this function calls no
|
||||||
|
// other functions. Therefore, the value of the link register,
|
||||||
|
// x30, remains undisturbed.
|
||||||
|
|
||||||
|
mov x2, x0
|
||||||
|
mov x0, 1 // equivalent to retval = 1
|
||||||
|
mov x1, 1 // equivalent to i = 1
|
||||||
|
|
||||||
|
// For loops are typically implemented differently from what one
|
||||||
|
// would imagine to save an instruction inside the loop. Here,
|
||||||
|
// however, we implement it in the way a programmer coming from
|
||||||
|
// C would expect.
|
||||||
|
|
||||||
|
// This has five instructions (20 bytes) in the inner loop which
|
||||||
|
// increases in work by O(n).
|
||||||
|
|
||||||
|
10: cmp x1, x2
|
||||||
|
bgt 99f
|
||||||
|
mul x0, x0, x1
|
||||||
|
add x1, x1, 1
|
||||||
|
b 10b
|
||||||
|
|
||||||
|
99:
|
||||||
|
ret
|
||||||
|
END_PROC
|
||||||
|
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
_rfact:
|
||||||
|
#else
|
||||||
|
rfact:
|
||||||
|
#endif
|
||||||
|
START_PROC
|
||||||
|
PUSH_P x29, x30
|
||||||
|
mov x29, sp
|
||||||
|
|
||||||
|
// The parameter n comes to us in x0 but it is passed by value.
|
||||||
|
// That is, the n in C is a local variable. We need to keep the
|
||||||
|
// present copy of n around when we recursively call ourselves
|
||||||
|
// with n - 1.
|
||||||
|
//
|
||||||
|
// Think stack when you think local variable. The argument can
|
||||||
|
// be made to keep the local copy in a durable register but this
|
||||||
|
// will mean a stack push anyway so instead, we choose to make
|
||||||
|
// the stack push explicit by placing it at the recursive call.
|
||||||
|
|
||||||
|
cmp x0, 1
|
||||||
|
bgt 10f
|
||||||
|
mov x0, 1 // ensure x0 is 1 - it could be less.
|
||||||
|
b 99f
|
||||||
|
|
||||||
|
10: // If we get here, n must be more than 1. Recursion is needed.
|
||||||
|
|
||||||
|
// The recursive step below is five instructions (20 bytes). Its work
|
||||||
|
// grows as O(n) and each level also incurs references to RAM.
|
||||||
|
|
||||||
|
PUSH_R x0 // save the current n
|
||||||
|
sub x0, x0, 1 // prepare for recursion
|
||||||
|
CRT rfact // borrow the macro to be x-compatible.
|
||||||
|
POP_R x1 // restore the current n
|
||||||
|
mul x0, x0, x1 // multiply it by recursive return
|
||||||
|
|
||||||
|
99: POP_P x29, x30
|
||||||
|
ret
|
||||||
|
END_PROC
|
||||||
|
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
_pfact:
|
||||||
|
#else
|
||||||
|
pfact:
|
||||||
|
#endif
|
||||||
|
START_PROC
|
||||||
|
PUSH_P x29, x30
|
||||||
|
mov x29, sp
|
||||||
|
|
||||||
|
// The parameter comes to us in x0. Since we're using it to
|
||||||
|
// create an address, we better vet the value to be between 1
|
||||||
|
// and 15 inclusive.
|
||||||
|
//
|
||||||
|
// After that, the base address of the precomputed factorials
|
||||||
|
// is loaded into x0. Then, the parameter n (having been
|
||||||
|
// multiplied by 8) is added to the base address to form the
|
||||||
|
// address of the precomputed factorial that we're after. Its
|
||||||
|
// value is loaded into x0 for return.
|
||||||
|
|
||||||
|
// Two instructions (8 bytes) are needed to form the correct
|
||||||
|
// address. 16 * 8 bytes are needed for the precomputed values
|
||||||
|
// (0! through 15!). This sums to 136 bytes, more than the previous
|
||||||
|
// two methods. But, execution time is constant so is O(1). Far
|
||||||
|
// faster.
|
||||||
|
|
||||||
|
mov x1, x0
|
||||||
|
mov x0, 1
|
||||||
|
cmp x1, xzr
|
||||||
|
ble 99f
|
||||||
|
cmp x1, 15
|
||||||
|
bgt 99f
|
||||||
|
|
||||||
|
LLD_ADDR x0, fv
|
||||||
|
ldr x0, [x0, x1, lsl 3]
|
||||||
|
|
||||||
|
99: POP_P x29, x30
|
||||||
|
ret
|
||||||
|
END_PROC
|
||||||
|
|
||||||
|
.data
|
||||||
|
|
||||||
|
.p2align 3
|
||||||
|
|
||||||
|
fv: .dword 1
|
||||||
|
.dword 1
|
||||||
|
.dword 2
|
||||||
|
.dword 6
|
||||||
|
.dword 24
|
||||||
|
.dword 120
|
||||||
|
.dword 720
|
||||||
|
.dword 5040
|
||||||
|
.dword 40320
|
||||||
|
.dword 362880
|
||||||
|
.dword 3628800
|
||||||
|
.dword 39916800
|
||||||
|
.dword 479001600
|
||||||
|
.dword 6227020800
|
||||||
|
.dword 87178291200
|
||||||
|
.dword 1307674368000
|
||||||
|
|
||||||
|
.end
|
||||||
|
|
@ -7,11 +7,18 @@ long Iterative(long n);
|
||||||
long Recursive(long n);
|
long Recursive(long n);
|
||||||
long Precomputed(long n);
|
long Precomputed(long n);
|
||||||
|
|
||||||
|
extern long ifact(long);
|
||||||
|
extern long rfact(long);
|
||||||
|
extern long pfact(long);
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
long (*func[])(long) = {
|
long (*func[])(long) = {
|
||||||
Iterative,
|
Iterative,
|
||||||
Recursive,
|
Recursive,
|
||||||
Precomputed,
|
Precomputed,
|
||||||
|
ifact,
|
||||||
|
rfact,
|
||||||
|
pfact,
|
||||||
NULL
|
NULL
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue