mirror of
https://github.com/pkivolowitz/asm_book.git
synced 2026-06-21 02:26:59 +08:00
most of the way there on precomputation.
This commit is contained in:
parent
32fe3b3b94
commit
054a5deaa5
5 changed files with 406 additions and 0 deletions
7
section_3/precomputation/.vscode/settings.json
vendored
Normal file
7
section_3/precomputation/.vscode/settings.json
vendored
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
{
|
||||
"cSpell.words": [
|
||||
"ifact",
|
||||
"pfact",
|
||||
"rfact"
|
||||
]
|
||||
}
|
||||
|
|
@ -23,3 +23,88 @@ Certainly, for the purposes of this demonstration, it is not necessary
|
|||
to implement both iterative and recursive methods. We do so for fun and
|
||||
for any lessons the reader can glean.
|
||||
|
||||
## C Driver
|
||||
|
||||
[Here](./main.c), you will find a version written in C. We will
|
||||
repurpose `main()` to drive versions in assembly language.
|
||||
|
||||
## Iterative
|
||||
|
||||
```c
|
||||
long Iterative(long n) {
|
||||
long retval = 1;
|
||||
for (long i = 1; i <= n; i++) {
|
||||
retval *= i;
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
```
|
||||
|
||||
First, notice that this algorithm's work increases linearly with the
|
||||
parameter n. Therefore this algorithm is O(n).
|
||||
|
||||
We translated this function into assembly language to produce the code
|
||||
provided below. This code is *condensed*. To see the original code with
|
||||
comments, please see [here](./asm.S).
|
||||
|
||||
```asm
|
||||
ifact:
|
||||
mov x2, x0
|
||||
mov x0, 1 // equivalent to retval = 1
|
||||
mov x1, 1 // equivalent to i = 1
|
||||
|
||||
// This has five instructions (20 bytes) in the inner loop which
|
||||
// increases in work by O(n).
|
||||
|
||||
10: cmp x1, x2
|
||||
bgt 99f
|
||||
mul x0, x0, x1
|
||||
add x1, x1, 1
|
||||
b 10b
|
||||
|
||||
99:
|
||||
ret
|
||||
```
|
||||
|
||||
Reminder, the above code is *condensed*. You will note that the code that
|
||||
performs the calculation is 5 instructions (or 20 bytes) long. This
|
||||
isn't much but again, the algorithm runs in O(n) time.
|
||||
|
||||
## Recursive
|
||||
|
||||
```c
|
||||
long Recursive(long n) {
|
||||
long retval;
|
||||
if (n <= 1)
|
||||
retval = 1;
|
||||
else
|
||||
retval = n * Recursive(n - 1);
|
||||
|
||||
return retval;
|
||||
}
|
||||
```
|
||||
|
||||
The code below is *condensed*. The original code, with comments, can be
|
||||
found [here](./asm.S).
|
||||
|
||||
```asm
|
||||
rfact:
|
||||
PUSH_P x29, x30
|
||||
mov x29, sp
|
||||
|
||||
cmp x0, 1
|
||||
bgt 10f
|
||||
mov x0, 1 // ensure x0 is 1 - it could be less.
|
||||
b 99f
|
||||
|
||||
10: // If we get here, n must be more than 1. Recursion is needed.
|
||||
|
||||
PUSH_R x0 // save the current n
|
||||
sub x0, x0, 1 // prepare for recursion
|
||||
bl rfact //
|
||||
POP_R x1 // restore the current n
|
||||
mul x0, x0, x1 // multiply it by recursive return
|
||||
|
||||
99: POP_P x29, x30
|
||||
ret
|
||||
```
|
||||
|
|
|
|||
156
section_3/precomputation/apple-linux-convergence.S
Normal file
156
section_3/precomputation/apple-linux-convergence.S
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
/* Macros to permit the "same" assembly language to build on ARM64
|
||||
Linux systems as well as Apple Silicon systems.
|
||||
|
||||
See the fuller documentation at:
|
||||
https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
|
||||
|
||||
Perry Kivolowitz
|
||||
A Gentle Introduction to Assembly Language
|
||||
*/
|
||||
|
||||
/*  GLD_PTR xreg, label

    Apple:  adrp to the GOT page of _label, then a 64-bit ldr of the
            GOT entry at that page offset.
    Linux:  literal-pool load of the address of label, followed by a
            64-bit ldr of whatever is stored at that address.

    xreg serves as both the working register and the result.

    NOTE(review): the Apple path ends with the GOT entry itself
    (presumably the address of label) while the Linux path ends with
    the contents of label - confirm callers expect this difference.
*/
.macro GLD_PTR xreg, label
#ifdef __APPLE__
        adrp    \xreg, _\label@GOTPAGE
        ldr     \xreg, [\xreg, _\label@GOTPAGEOFF]
#else
        ldr     \xreg, =\label
        ldr     \xreg, [\xreg]
#endif
.endm
|
||||
|
||||
/*  GLD_ADDR xreg, label - get a global address.

    Apple:  adrp to the GOT page of _label, then add the GOT page
            offset to it.
    Linux:  literal-pool load of the address of label.

    NOTE(review): on Apple this sequence forms the location of the GOT
    entry instead of loading from it (GLD_PTR uses ldr with the same
    operands) - confirm that this is the value callers want.
*/
.macro GLD_ADDR xreg, label
#ifdef __APPLE__
        adrp    \xreg, _\label@GOTPAGE
        add     \xreg, \xreg, _\label@GOTPAGEOFF
#else
        ldr     \xreg, =\label
#endif
.endm
|
||||
|
||||
/*  LLD_ADDR xreg, label - put the address of label into xreg.

    Apple:  page-relative pair: adrp to the page holding label, then
            add the offset of label within that page. The label is
            used as written, with no leading underscore.
    Linux:  literal-pool load of the address.
*/
.macro LLD_ADDR xreg, label
#ifdef __APPLE__
        adrp    \xreg, \label@PAGE
        add     \xreg, \xreg, \label@PAGEOFF
#else
        ldr     \xreg, =\label
#endif
.endm
|
||||
|
||||
/*  LLD_DBL xreg, dreg, label - load the value stored at label into dreg.

    xreg is scratch: it is first set to the address of label (page
    relative on Apple, literal pool on Linux) and is left holding that
    address. The final ldur is the same on both platforms.
*/
.macro LLD_DBL xreg, dreg, label
#ifdef __APPLE__
        adrp    \xreg, \label@PAGE
        add     \xreg, \xreg, \label@PAGEOFF
#else
        ldr     \xreg, =\label
#endif
        ldur    \dreg, [\xreg]
.endm
|
||||
|
||||
/*  LLD_FLT xreg, sreg, label - load the value stored at label into sreg.

    xreg is scratch: it is first set to the address of label (page
    relative on Apple, literal pool on Linux) and is left holding that
    address. The final ldur is the same on both platforms.
*/
.macro LLD_FLT xreg, sreg, label
#ifdef __APPLE__
        adrp    \xreg, \label@PAGE
        add     \xreg, \xreg, \label@PAGEOFF
#else
        ldr     \xreg, =\label
#endif
        ldur    \sreg, [\xreg]
.endm
|
||||
|
||||
/*  GLABEL label - export a symbol.

    Emits .global for the label, adding the leading underscore on
    Apple builds. It only declares the symbol; the label itself still
    has to be defined by the caller.
*/
.macro GLABEL label
#ifdef __APPLE__
        .global _\label
#else
        .global \label
#endif
.endm
|
||||
|
||||
/*  MAIN - define the program entry label.

    Expands to the label _main on Apple builds and main elsewhere.
    No .global directive is emitted here.
*/
.macro MAIN
#ifdef __APPLE__
_main:
#else
main:
#endif
.endm
|
||||
|
||||
/*  ERRNO_ADDR - leave the address of errno in x0.

    The externally defined errno is reached through a different helper
    on each platform: ___error on Apple, __errno_location on Linux.
    Because the macro expands to a bl, the link register x30 is
    overwritten and the usual rules for a function call apply.
*/
.macro ERRNO_ADDR
#ifdef __APPLE__
        bl      ___error
#else
        bl      __errno_location
#endif
.endm
|
||||
|
||||
/*  CRT label - call a function by its C name.

    Expands to a bl to the label, adding the leading underscore on
    Apple builds. As with any bl, x30 is overwritten.
*/
.macro CRT label
#ifdef __APPLE__
        bl      _\label
#else
        bl      \label
#endif
.endm
|
||||
|
||||
/*  START_PROC - place immediately after the starting label of a
    function. Emits .cfi_startproc; pair it with END_PROC.
*/
.macro START_PROC
        .cfi_startproc
.endm
|
||||
|
||||
/*  END_PROC - place after the return of a function opened with
    START_PROC. Emits .cfi_endproc.
*/
.macro END_PROC
        .cfi_endproc
.endm
|
||||
|
||||
/*  PUSH_P a, b - push a register pair.

    Pre-decrements sp by 16 and stores a at the new sp with b after
    it. Undo with POP_P using the same register order.
*/
.macro PUSH_P a, b
        stp     \a, \b, [sp, #-16]!
.endm
|
||||
|
||||
/*  PUSH_R a - push a single register.

    sp still moves down by a full 16 bytes even though only one
    register is stored, which keeps sp on a 16-byte boundary. Undo
    with POP_R.
*/
.macro PUSH_R a
        str     \a, [sp, #-16]!
.endm
|
||||
|
||||
/*  POP_P a, b - pop a register pair stored by PUSH_P.

    Loads a and b from the current sp, then post-increments sp by 16.
*/
.macro POP_P a, b
        ldp     \a, \b, [sp], #16
.endm
|
||||
|
||||
/*  POP_R a - pop a single register stored by PUSH_R.

    Loads a from the current sp, then post-increments sp by 16 to
    release the whole slot that PUSH_R reserved.
*/
.macro POP_R a
        ldr     \a, [sp], #16
.endm
|
||||
|
||||
/*  MIN src_a, src_b, dest

    The smaller of src_a and src_b is put into dest. The comparison is
    signed (condition LT). When the two are equal, dest receives src_b.

    The macro performs the cmp itself, so no flag-setting instruction
    is needed beforehand. The condition flags are overwritten.

    This macro makes it easy to remember which register does what in the
    csel.

    Thank you to u/TNorthover for nudge to add the cmp.
*/

.macro MIN src_a, src_b, dest
        cmp     \src_a, \src_b
        csel    \dest, \src_a, \src_b, LT
.endm
|
||||
|
||||
/*  MAX src_a, src_b, dest

    The larger of src_a and src_b is put into dest. The comparison is
    signed (condition GT). When the two are equal, dest receives src_b.

    The macro performs the cmp itself, so no flag-setting instruction
    is needed beforehand. The condition flags are overwritten.

    This macro makes it easy to remember which register does what in the
    csel.

    Thank you to u/TNorthover for nudge to add the cmp.
*/

.macro MAX src_a, src_b, dest
        cmp     \src_a, \src_b
        csel    \dest, \src_a, \src_b, GT
.endm
|
||||
|
||||
/*  AASCIZ label, string - define an aligned, NUL-terminated string.

    Aligns to a 4-byte boundary (.p2align 2), then defines label at
    that position followed by the text of string as an .asciz. Pass
    the text without surrounding quotes; the macro supplies them.
*/
.macro AASCIZ label, string
        .p2align 2
\label: .asciz  "\string"
.endm
|
||||
|
||||
/*  MOD src_a, src_b, dest, scratch - signed remainder.

    scratch = src_a / src_b                 (sdiv)
    dest    = src_a - scratch * src_b       (msub)

    scratch must be a different register from src_a and src_b, because
    both sources are read again after scratch has been written. dest
    may be the same register as any of the others.
*/
.macro MOD src_a, src_b, dest, scratch
        sdiv    \scratch, \src_a, \src_b
        msub    \dest, \scratch, \src_b, \src_a
.endm
|
||||
151
section_3/precomputation/asm.S
Normal file
151
section_3/precomputation/asm.S
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
#include "apple-linux-convergence.S"
|
||||
|
||||
.text
|
||||
.p2align 2
|
||||
GLABEL ifact
|
||||
GLABEL rfact
|
||||
GLABEL pfact
|
||||
|
||||
/*  long ifact(long n) - iterative factorial.

    In:   x0 = n
    Out:  x0 = 1 * 2 * ... * n, or 1 when n < 1 because the loop body
          never runs.
    Uses: x1 (the counter i), x2 (a copy of n) and the condition flags.

    Counting down to 1 instead would need one register fewer, at the
    price of a cmp to vet the value first.

    This is a leaf routine: it calls no other function, so the link
    register x30 is never disturbed and x29/x30 are not saved.
*/
#ifdef __APPLE__
_ifact:
#else
ifact:
#endif
        START_PROC

        mov     x2, x0                  // x2 = n, freeing x0 for the result
        mov     x0, #1                  // retval = 1
        mov     x1, #1                  // i = 1

        // The loop is tested at the top, the way a programmer coming
        // from C would expect, rather than in the usual form that saves
        // an instruction inside the loop. Five instructions (20 bytes)
        // run per pass, so the work grows as O(n).

1:      cmp     x1, x2                  // i <= n ?
        bgt     2f                      //   no (signed): finished
        mul     x0, x0, x1              // retval *= i
        add     x1, x1, #1              // i++
        b       1b

2:      ret
        END_PROC
|
||||
|
||||
/*  long rfact(long n) - recursive factorial.

    In:   x0 = n
    Out:  x0 = n * rfact(n - 1) when n > 1, otherwise 1.

    Stack: every call pushes a 16-byte x29/x30 frame record. A call
    with n > 1 pushes a further 16 bytes to hold n across the
    recursive call, so the depth of the recursion - and the stack
    used - grows linearly with n.
*/
#if defined(__APPLE__)
_rfact:
#else
rfact:
#endif
        START_PROC
        PUSH_P  x29, x30
        mov     x29, sp

        // The parameter n comes to us in x0 but it is passed by value.
        // That is, the n in C is a local variable. We need to keep the
        // present copy of n around when we recursively call ourselves
        // with n - 1.
        //
        // Think stack when you think local variable. The argument can
        // be made to keep the local copy in a durable register but this
        // will mean a stack push anyway so instead, we choose to make
        // the stack push explicit by placing it at the recursive call.

        cmp     x0, 1
        bgt     10f
        mov     x0, 1                   // ensure x0 is 1 - it could be less.
        b       99f

10:     // If we get here, n must be more than 1. Recursion is needed.

        // The recursive step below is five instructions (20 bytes).
        // It is executed once per level, so the work is O(n), and
        // every level also reads and writes the stack in RAM.

        PUSH_R  x0                      // save the current n
        sub     x0, x0, 1               // prepare for recursion
        CRT     rfact                   // borrow the macro to be x-compatible.
        POP_R   x1                      // restore the current n
        mul     x0, x0, x1              // multiply it by recursive return

99:     POP_P   x29, x30
        ret
        END_PROC
|
||||
|
||||
/*  long pfact(long n) - factorial by table lookup.

    In:   x0 = n
    Out:  x0 = fv[n] when 1 <= n <= 15, otherwise 1.
    Uses: x1 (a copy of n) and the condition flags.

    The parameter is used to create an address, so it is vetted first:
    an unchecked n would index outside the table.

    Cost: after the range check, the answer is one address formation
    (LLD_ADDR) and one scaled-index load away, so execution time is
    constant - O(1) - and far faster than the other two methods. The
    price is memory: fv holds 16 entries (0! through 15!) of 8 bytes
    each, 128 bytes of data that the other two methods do not need.

    This is a leaf routine: it calls no other function, so x30 is never
    disturbed and no x29/x30 frame is built, exactly as in ifact.
*/
#if defined(__APPLE__)
_pfact:
#else
pfact:
#endif
        START_PROC

        mov     x1, x0                  // x1 = n; x0 is needed for the result
        mov     x0, 1                   // answer for any n outside 1..15
        cmp     x1, xzr
        ble     99f                     // n <= 0 (signed)
        cmp     x1, 15
        bgt     99f                     // n > 15: beyond the table

        LLD_ADDR x0, fv                 // x0 = base address of the table
        ldr     x0, [x0, x1, lsl 3]     // x0 = fv[n]; lsl 3 scales n by 8

99:     ret
        END_PROC
|
||||
|
||||
.data
|
||||
|
||||
.p2align 3
|
||||
|
||||
fv: .dword 1
|
||||
.dword 1
|
||||
.dword 2
|
||||
.dword 6
|
||||
.dword 24
|
||||
.dword 120
|
||||
.dword 720
|
||||
.dword 5040
|
||||
.dword 40320
|
||||
.dword 362880
|
||||
.dword 3628800
|
||||
.dword 39916800
|
||||
.dword 479001600
|
||||
.dword 6227020800
|
||||
.dword 87178291200
|
||||
.dword 1307674368000
|
||||
|
||||
.end
|
||||
|
|
@ -7,11 +7,18 @@ long Iterative(long n);
|
|||
long Recursive(long n);
|
||||
long Precomputed(long n);
|
||||
|
||||
extern long ifact(long);
|
||||
extern long rfact(long);
|
||||
extern long pfact(long);
|
||||
|
||||
int main() {
|
||||
long (*func[])(long) = {
|
||||
Iterative,
|
||||
Recursive,
|
||||
Precomputed,
|
||||
ifact,
|
||||
rfact,
|
||||
pfact,
|
||||
NULL
|
||||
};
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue