mirror of
https://github.com/pkivolowitz/asm_book.git
synced 2026-06-23 04:36:47 +08:00
Merge branch 'main' of https://github.com/pkivolowitz/asm_book
ijoijoijoij
This commit is contained in:
commit
65931b77a0
29 changed files with 1125 additions and 84 deletions
|
|
@ -308,7 +308,11 @@ What would a book about assembly language be without bit bashing?
|
||||||
|
|
||||||
### Section 4 - More Stuff
|
### Section 4 - More Stuff
|
||||||
|
|
||||||
In this section, we present miscellaneous material.
|
In this section, we present miscellaneous material including our "world
|
||||||
|
famous lecture" on debugging. This lecture has been invited at several
|
||||||
|
colleges and universities. It is intended for audiences working with
|
||||||
|
languages like C, C++ and assembly language but some of the lessons
|
||||||
|
contained therein are applicable to all languages.
|
||||||
|
|
||||||
| Chapter | Markdown | PDF |
|
| Chapter | Markdown | PDF |
|
||||||
| ------- | -------- | --- |
|
| ------- | -------- | --- |
|
||||||
|
|
@ -319,6 +323,9 @@ In this section, we present miscellaneous material.
|
||||||
| 5 | [Determining string literal lengths for C functions](./more/strlen_for_c/README.md) | [Link](./more/strlen_for_c/README.pdf) |
|
| 5 | [Determining string literal lengths for C functions](./more/strlen_for_c/README.md) | [Link](./more/strlen_for_c/README.pdf) |
|
||||||
| 6 | [Calling Assembly Language From Python](./python/) | [Link](./python/README.pdf) |
|
| 6 | [Calling Assembly Language From Python](./python/) | [Link](./python/README.pdf) |
|
||||||
| 7 | [Atomic Operations](./more/atomics/README.md) | [Link](./more/atomics/README.pdf) |
|
| 7 | [Atomic Operations](./more/atomics/README.md) | [Link](./more/atomics/README.pdf) |
|
||||||
|
| 8 | [Jump Tables](./more/jump_tables/README.md) | [Link](./more/jump_tables/README.pdf) |
|
||||||
|
| 9 | [argv](./more/argv_example/jess1.S) | ASM CODE |
|
||||||
|
| - | [Debugging Lecture](./debugging/Discourses%20and%20Dialogs%20on%20Debugging.pptx) | PPTX |
|
||||||
|
|
||||||
## Macro Suite
|
## Macro Suite
|
||||||
|
|
||||||
|
|
|
||||||
BIN
README.pdf
BIN
README.pdf
Binary file not shown.
BIN
debugging/Discourses and Dialogs on Debugging.pptx
Normal file
BIN
debugging/Discourses and Dialogs on Debugging.pptx
Normal file
Binary file not shown.
|
|
@ -149,3 +149,8 @@ main:
|
||||||
.p2align 2
|
.p2align 2
|
||||||
\label: .asciz "\string"
|
\label: .asciz "\string"
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
.macro MOD src_a, src_b, dest, scratch // dest = src_a % src_b (signed); overwrites scratch
|
||||||
|
sdiv \scratch, \src_a, \src_b // scratch = src_a / src_b (quotient)
|
||||||
|
msub \dest, \scratch, \src_b, \src_a // dest = src_a - (scratch * src_b)
|
||||||
|
.endm
|
||||||
|
|
|
||||||
156
more/argv_example/apple-linux-convergence.S
Normal file
156
more/argv_example/apple-linux-convergence.S
Normal file
|
|
@ -0,0 +1,156 @@
|
||||||
|
/* Macros to permit the "same" assembly language to build on ARM64
|
||||||
|
Linux systems as well as Apple Silicon systems.
|
||||||
|
|
||||||
|
See the fuller documentation at:
|
||||||
|
https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
|
||||||
|
|
||||||
|
Perry Kivolowitz
|
||||||
|
A Gentle Introduction to Assembly Language
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro GLD_PTR xreg, label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, _\label@GOTPAGE
|
||||||
|
ldr \xreg, [\xreg, _\label@GOTPAGEOFF]
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
ldr \xreg, [\xreg]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro GLD_ADDR xreg, label // Get a global address
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, _\label@GOTPAGE
|
||||||
|
add \xreg, \xreg, _\label@GOTPAGEOFF
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro LLD_ADDR xreg, label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, \label@PAGE
|
||||||
|
add \xreg, \xreg, \label@PAGEOFF
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro LLD_DBL xreg, dreg, label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, \label@PAGE
|
||||||
|
add \xreg, \xreg, \label@PAGEOFF
|
||||||
|
ldur \dreg, [\xreg]
|
||||||
|
// fmov \dreg, \xreg
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
ldur \dreg, [\xreg]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro LLD_FLT xreg, sreg, label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, \label@PAGE
|
||||||
|
add \xreg, \xreg, \label@PAGEOFF
|
||||||
|
ldur \sreg, [\xreg]
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
ldur \sreg, [\xreg]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro GLABEL label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
.global _\label
|
||||||
|
#else
|
||||||
|
.global \label
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro MAIN
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
_main:
|
||||||
|
#else
|
||||||
|
main:
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Fetching the address of the externally defined errno is quite
|
||||||
|
different on Apple and Linux. This macro leaves the address of
|
||||||
|
errno in x0.
|
||||||
|
*/
|
||||||
|
.macro ERRNO_ADDR
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
bl ___error
|
||||||
|
#else
|
||||||
|
bl __errno_location
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CRT label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
bl _\label
|
||||||
|
#else
|
||||||
|
bl \label
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro START_PROC // after starting label
|
||||||
|
.cfi_startproc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro END_PROC // after the return
|
||||||
|
.cfi_endproc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro PUSH_P a, b
|
||||||
|
stp \a, \b, [sp, -16]!
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro PUSH_R a
|
||||||
|
str \a, [sp, -16]!
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro POP_P a, b
|
||||||
|
ldp \a, \b, [sp], 16
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro POP_R a
|
||||||
|
ldr \a, [sp], 16
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* The smaller of src_a and src_b (compared as signed values) is put
|
||||||
|
into dest. The macro performs the cmp itself, so the caller does not
|
||||||
|
need to set the flags first (note that the flags are overwritten).
|
||||||
|
This macro makes it easy to remember which register does what in the csel.
|
||||||
|
|
||||||
|
Thank you to u/TNorthover for nudge to add the cmp.
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro MIN src_a, src_b, dest
|
||||||
|
cmp \src_a, \src_b
|
||||||
|
csel \dest, \src_a, \src_b, LT
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* The larger of src_a and src_b (compared as signed values) is put
|
||||||
|
into dest. The macro performs the cmp itself, so the caller does not
|
||||||
|
need to set the flags first (note that the flags are overwritten).
|
||||||
|
This macro makes it easy to remember which register does what in the csel.
|
||||||
|
|
||||||
|
Thank you to u/TNorthover for nudge to add the cmp.
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro MAX src_a, src_b, dest
|
||||||
|
cmp \src_a, \src_b
|
||||||
|
csel \dest, \src_a, \src_b, GT
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro AASCIZ label, string
|
||||||
|
.p2align 2
|
||||||
|
\label: .asciz "\string"
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro MOD src_a, src_b, dest, scratch // dest = src_a % src_b (signed); overwrites scratch
|
||||||
|
sdiv \scratch, \src_a, \src_b // scratch = src_a / src_b (quotient)
|
||||||
|
msub \dest, \scratch, \src_b, \src_a // dest = src_a - (scratch * src_b)
|
||||||
|
.endm
|
||||||
111
more/argv_example/jess1.S
Normal file
111
more/argv_example/jess1.S
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
#include "apple-linux-convergence.S"
|
||||||
|
|
||||||
|
.p2align 2
|
||||||
|
.text
|
||||||
|
GLABEL main
|
||||||
|
|
||||||
|
/* This program will get a string followed by a double followed by an
|
||||||
|
integer from the command line demonstrating how each of these types
|
||||||
|
can be retrieved.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
./a.out test 29.3 29
|
||||||
|
*/
|
||||||
|
|
||||||
|
MAIN
|
||||||
|
PUSH_P x29, x30
|
||||||
|
mov x29, sp
|
||||||
|
|
||||||
|
// Check argc to see if it is 4. This is not the only way to
|
||||||
|
// validate command line arguments but it is an easy way.
|
||||||
|
cmp w0, 4
|
||||||
|
bne 99f // take branch if argc isn't "right".
|
||||||
|
|
||||||
|
// Skip past argv[0]
|
||||||
|
add x1, x1, 8
|
||||||
|
|
||||||
|
// Fetch argv[1] as a string.
|
||||||
|
// x1 is a pointer to a pointer to chars (i.e. the string).
|
||||||
|
// Being a pointer to a pointer, it must be dereferenced to
|
||||||
|
// make a pointer.
|
||||||
|
ldr x0, [x1] // dereference
|
||||||
|
// Now x0 contains a pointer to the command line argument.
|
||||||
|
// Print the string (as a string). But doing this causes a
|
||||||
|
// function call which will destroy x1. So, save x1 temporarily.
|
||||||
|
// This could be avoided if x1 were moved to a backed up x
|
||||||
|
// register (e.g. x20).
|
||||||
|
PUSH_R x1
|
||||||
|
CRT puts // ptr is in x0 where puts() needs it.
|
||||||
|
POP_R x1
|
||||||
|
|
||||||
|
// Advance x1 once again to get to argv[2] which can be done
|
||||||
|
// in the same instruction as dereferencing it by using a
|
||||||
|
// preincrement.
|
||||||
|
ldr x0, [x1, 8]! // dereference
|
||||||
|
|
||||||
|
// Now the string version of argv[2] is now pointed to by x0.
|
||||||
|
// This is exactly where atof would want it. We need atof
|
||||||
|
// because it turns strings into numbers. BUT, same as before,
|
||||||
|
// calling a function would destroy x1 so let's do the same
|
||||||
|
// trick of backing up x1 on the stack and then restoring after
|
||||||
|
// the function call.
|
||||||
|
PUSH_R x1
|
||||||
|
CRT atof // ptr is in x0 where atof() needs it.
|
||||||
|
POP_R x1
|
||||||
|
// The string value will be converted to a double left in d0.
|
||||||
|
// d0 is also a scratch register so for our next call to atoi,
|
||||||
|
// d0 will have to be preserved on the stack - alternatively,
|
||||||
|
// we could have used a high d register backed up and restored
|
||||||
|
// at the start and ending of main().
|
||||||
|
|
||||||
|
// Advance x1 once again to get to argv[3] which can be done
|
||||||
|
// in the same instruction as dereferencing it by using a
|
||||||
|
// preincrement.
|
||||||
|
ldr x0, [x1, 8]! // dereference
|
||||||
|
|
||||||
|
// Now the string version of argv[3] is now pointed to by x0.
|
||||||
|
// This is exactly where atoi would want it. We need atoi
|
||||||
|
// because it turns strings into numbers. BUT, same as before,
|
||||||
|
// calling a function would destroy x1 so let's do the same
|
||||||
|
// trick of backing up x1 on the stack and then restoring after
|
||||||
|
// the function call. We must also do the same for d0. Actually,
|
||||||
|
// we won't need argv after this so we will skip backing up x1.
|
||||||
|
|
||||||
|
PUSH_R d0
|
||||||
|
CRT atoi // ptr is in x0 where atoi() needs it.
|
||||||
|
POP_R d0
|
||||||
|
// d0 now contains the double.
|
||||||
|
// x0 now contains the integer.
|
||||||
|
// x0 must be copied to x1 because x0 must be a pointer to fmt
|
||||||
|
// for printf to work.
|
||||||
|
mov x1, x0
|
||||||
|
LLD_ADDR x0, fmt
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
sub sp, sp, 16
|
||||||
|
str x1, [sp, 8]
|
||||||
|
str d0, [sp]
|
||||||
|
CRT printf
|
||||||
|
add sp, sp, 16
|
||||||
|
#else
|
||||||
|
bl printf
|
||||||
|
#endif
|
||||||
|
|
||||||
|
99: POP_P x29, x30
|
||||||
|
mov w0, wzr
|
||||||
|
ret
|
||||||
|
|
||||||
|
/* What did we learn?
|
||||||
|
* x1 has argv when main begins.
|
||||||
|
* pointers to the arguments are the contents of argv NOT
|
||||||
|
the actual values. Therefore, x1, which is a pointer (to a pointer),
|
||||||
|
must be dereferenced to get to the actual pointer. In the code,
|
||||||
|
there are three lines with the comment "// dereference".
|
||||||
|
* all command line arguments are c-strings. If that's not what you
|
||||||
|
want, they must be converted - see the code for atoi and atof for
|
||||||
|
examples.
|
||||||
|
*/
|
||||||
|
.data
|
||||||
|
|
||||||
|
fmt: .asciz "double: %f integer: %d\n"
|
||||||
|
|
||||||
|
.end
|
||||||
6
more/jump_tables/.gdb_history
Normal file
6
more/jump_tables/.gdb_history
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
b MyMemSet
|
||||||
|
run
|
||||||
|
n
|
||||||
|
n
|
||||||
|
:q
|
||||||
|
q
|
||||||
297
more/jump_tables/README.md
Normal file
297
more/jump_tables/README.md
Normal file
|
|
@ -0,0 +1,297 @@
|
||||||
|
# Jump or Branch Tables
|
||||||
|
|
||||||
|
A jump or branch table is a powerful instruction saving technique that
|
||||||
|
can be used to switch between multiple single instructions or even
|
||||||
|
choose one of a series of functions to call (or branches to take).
|
||||||
|
|
||||||
|
This concept can be found as the implementation of some `switch`
|
||||||
|
statements and is found at the very very lowest end of an Operating
|
||||||
|
System (interrupt vectors, for example).
|
||||||
|
|
||||||
|
The sections below walk through several ways to build and use one.
|
||||||
|
|
||||||
|
## Single Instructions a la Duff's Device
|
||||||
|
|
||||||
|
[Duff's Device](https://en.wikipedia.org/wiki/Duff%27s_device)
|
||||||
|
shoehorned a jump table into the middle of a `while` loop. At the same
|
||||||
|
time, it also demonstrates a simple case of *loop unrolling*.
|
||||||
|
It's very creative.
|
||||||
|
|
||||||
|
Let's expand on Duff's Device.
|
||||||
|
|
||||||
|
The full source code for this example can be found
|
||||||
|
[here](./branch_table.S). It demonstrates a branch table consisting of
|
||||||
|
instructions which are meant to be executed in sequence after jumping
|
||||||
|
into the middle of the sequence.
|
||||||
|
|
||||||
|
Here:
|
||||||
|
|
||||||
|
```asm
|
||||||
|
mov x6, 8
|
||||||
|
MOD x2, x6, x4, x5 // x4 gets l % 8
|
||||||
|
cbz x4, 10f // Handle evenly divisible case.
|
||||||
|
sub x4, x6, x4 // Invert sense of x4 e.g. 3 becomes 5
|
||||||
|
```
|
||||||
|
|
||||||
|
we are performing this: *x4 is getting the result of modding the
|
||||||
|
number of times we want the instructions executed by the number of
|
||||||
|
times we unrolled the loop*.
|
||||||
|
|
||||||
|
Specifically, this example does `length % 8`. However, the AARCH64 ISA
|
||||||
|
does not include a *mod* instruction. The `MOD` macro used above is
|
||||||
|
defined as:
|
||||||
|
|
||||||
|
```asm
|
||||||
|
.macro MOD src_a, src_b, dest, scratch
|
||||||
|
sdiv \scratch, \src_a, \src_b
|
||||||
|
msub \dest, \scratch, \src_b, \src_a
|
||||||
|
.endm
|
||||||
|
```
|
||||||
|
|
||||||
|
`msub` is a cool instruction. It does this:
|
||||||
|
|
||||||
|
```d = c - (b * a)```
|
||||||
|
|
||||||
|
Example: 13 % 8 == 5. First the `sdiv`: 13 / 8 is 1. Then, the `msub`:
|
||||||
|
13 - (1 * 8) is 5.
|
||||||
|
|
||||||
|
Next:
|
||||||
|
|
||||||
|
```asm
|
||||||
|
cbz x4, 10f // Handle evenly divisible case.
|
||||||
|
sub x4, x6, x4 // Invert sense of x4 e.g. 5 becomes 3
|
||||||
|
```
|
||||||
|
|
||||||
|
This code is key.
|
||||||
|
|
||||||
|
If the result of the `mod` is 0, then the entire table must be executed.
|
||||||
|
This is implemented by the `cbz`.
|
||||||
|
|
||||||
|
If the result of the `mod` is not 0, then its value must be *flipped*.
|
||||||
|
The idea here is that if the result of the mod is 5, for example, we
|
||||||
|
have 5 stragglers. We want to execute 5 of the sequential instructions
|
||||||
|
below. So, we want to jump 3 instructions into the table. Notice that
|
||||||
|
3 is 8 - 5.
|
||||||
|
|
||||||
|
Finally, we have the computation of the address to where we jump into
|
||||||
|
the middle of the table.
|
||||||
|
|
||||||
|
```asm
|
||||||
|
LLD_ADDR x5, 10f
|
||||||
|
add x5, x5, x4, lsl 2
|
||||||
|
br x5
|
||||||
|
```
|
||||||
|
|
||||||
|
Each of the lines above bears description:
|
||||||
|
|
||||||
|
The `LLD_ADDR` is from the [*convergence
|
||||||
|
macros*](./apple-linux-convergence.S). It loads the address of the
|
||||||
|
beginning of the table.
|
||||||
|
|
||||||
|
Next, the `add` instruction multiplies the flipped result of the `mod`
|
||||||
|
by 4 (the length of one instruction) THEN adds it to the base address of
|
||||||
|
the table. We have calculated *instruction addresses* exactly the way we
|
||||||
|
would with array dereferences. Thank you John von Neumann.
|
||||||
|
|
||||||
|
Finally, we `br` which means branch to an address contained in a
|
||||||
|
register.
|
||||||
|
|
||||||
|
```asm
|
||||||
|
10: str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
// loop code not shown
|
||||||
|
```
|
||||||
|
|
||||||
|
## Performing Multiple Instructions
|
||||||
|
|
||||||
|
If you need to execute more than one instruction you have two choices:
|
||||||
|
|
||||||
|
### Multiple Instructions by Address Arithmetic
|
||||||
|
|
||||||
|
Suppose you needed two instructions in each step of the sequence.
|
||||||
|
Simply multiply the index by 8 instead of 4 (i.e. the length of two
|
||||||
|
instructions). The same technique works with a larger number. E.g.
|
||||||
|
you need three instructions per step: multiply by 12.
|
||||||
|
|
||||||
|
Suppose some steps need 3 instructions and some need 2. You must handle this
|
||||||
|
because using this technique requires that all steps in the sequence
|
||||||
|
of steps must be the same length so that the address arithmetic works.
|
||||||
|
|
||||||
|
To deal with some cases being shorter than others, insert the occasional
|
||||||
|
`nop` instruction in the indexes that are shorter than the others.
|
||||||
|
|
||||||
|
### Multiple Instructions by Branch / Branch
|
||||||
|
|
||||||
|
Here's another [example of code](./jmptbl.s) that implements a branch or
|
||||||
|
jump table:
|
||||||
|
|
||||||
|
```asm
|
||||||
|
jt: b 0f
|
||||||
|
b 1f
|
||||||
|
b 2f
|
||||||
|
b 3f
|
||||||
|
b 4f
|
||||||
|
b 5f
|
||||||
|
b 6f
|
||||||
|
b 7f
|
||||||
|
```
|
||||||
|
|
||||||
|
You jump into the middle of the table as per above and then immediately
|
||||||
|
jump some place else. This is like:
|
||||||
|
|
||||||
|
```c
|
||||||
|
if (index == 0) {
|
||||||
|
blah
|
||||||
|
} else if (index == 1) {
|
||||||
|
blah
|
||||||
|
} else if (index == 2) {
|
||||||
|
blah
|
||||||
|
} etc.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Multiple Instructions by Branch / Call
|
||||||
|
|
||||||
|
You can modify the above techniques to make something like:
|
||||||
|
|
||||||
|
```asm
|
||||||
|
jt: bl func_0
|
||||||
|
bl func_1
|
||||||
|
bl func_2
|
||||||
|
bl func_3
|
||||||
|
bl func_4
|
||||||
|
bl func_5
|
||||||
|
bl func_6
|
||||||
|
bl func_7
|
||||||
|
```
|
||||||
|
|
||||||
|
or to be more similar to a `break` statement coming after each case:
|
||||||
|
|
||||||
|
```asm
|
||||||
|
jt: bl func_0
|
||||||
|
b common_label
|
||||||
|
bl func_1
|
||||||
|
b common_label
|
||||||
|
bl func_2
|
||||||
|
b common_label
|
||||||
|
bl func_3
|
||||||
|
b common_label
|
||||||
|
bl func_4
|
||||||
|
b common_label
|
||||||
|
bl func_5
|
||||||
|
b common_label
|
||||||
|
bl func_6
|
||||||
|
b common_label
|
||||||
|
bl func_7
|
||||||
|
b common_label
|
||||||
|
|
||||||
|
// perhaps some loop control... if none, the preceding
|
||||||
|
// b can be removed since can fall through to the common
|
||||||
|
// label.
|
||||||
|
common_label:
|
||||||
|
```
|
||||||
|
|
||||||
|
## Small Gaps in Sequential Indexes
|
||||||
|
|
||||||
|
Suppose your range of indexes was 0 through 8 inclusive (notice there
|
||||||
|
are 9 integers in the range) but index 7 is skipped. That is, your
|
||||||
|
potential indexes are 0 through 6 inclusive and then 8 but never
|
||||||
|
7.
|
||||||
|
|
||||||
|
In a `switch` statement, this would look like:
|
||||||
|
|
||||||
|
```c++
|
||||||
|
/*
|
||||||
|
// Ensure index is a valid value before getting here. In this case the
|
||||||
|
// valid range is 0 through 8 inclusive (a range of 9 values). To fill
|
||||||
|
// out to the next power of 2 (which would be 16), one could put in
|
||||||
|
// empty cases plus a default.
|
||||||
|
*/
|
||||||
|
switch (index & 0xF) {
|
||||||
|
case 0: blah blah;
|
||||||
|
break;
|
||||||
|
case 1: blah blah;
|
||||||
|
break;
|
||||||
|
case 2: blah blah;
|
||||||
|
break;
|
||||||
|
case 3: blah blah;
|
||||||
|
break;
|
||||||
|
case 4: blah blah;
|
||||||
|
break;
|
||||||
|
case 5: blah blah;
|
||||||
|
break;
|
||||||
|
case 6: blah blah;
|
||||||
|
break;
|
||||||
|
case 8: blah blah;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Gaps in the potential indexes present a surmountable problem if the
|
||||||
|
gaps are few.
|
||||||
|
|
||||||
|
In the case where there are a small number of gaps, simply fill them
|
||||||
|
with a branch to a common, otherwise "do nothing", label. For example,
|
||||||
|
you might have:
|
||||||
|
|
||||||
|
```asm
|
||||||
|
b_table: b label0
|
||||||
|
b label1
|
||||||
|
b label2
|
||||||
|
b label3
|
||||||
|
b label4
|
||||||
|
b label5
|
||||||
|
b label6
|
||||||
|
b do_nothing
|
||||||
|
b label8
|
||||||
|
```
|
||||||
|
|
||||||
|
Or, in the style of Duff's Device where you are executing sequential single
|
||||||
|
instructions, it might look like this:
|
||||||
|
|
||||||
|
```asm
|
||||||
|
x_fer: str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
str w1, [x0], 1
|
||||||
|
nop
|
||||||
|
str w1, [x0], 1
|
||||||
|
```
|
||||||
|
|
||||||
|
Here, the `nop` instruction means "no operation". It does nothing but
|
||||||
|
is a valid instruction meant to take up space (and decades ago, take
|
||||||
|
up time).
|
||||||
|
|
||||||
|
In a high level language this might look like this:
|
||||||
|
|
||||||
|
```c
|
||||||
|
for (int i = 0; i <= 8; i++) {
|
||||||
|
if (i == 7)
|
||||||
|
continue;
|
||||||
|
blah blah
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## More about the `switch` statement
|
||||||
|
|
||||||
|
`switch` statements are optimized using many more techniques than suggested
|
||||||
|
here. In fact, the implementation of optimized `switch` statements is
|
||||||
|
fascinating. There might be:
|
||||||
|
|
||||||
|
* binary searches for large numbers of cases
|
||||||
|
|
||||||
|
* separation of ranges where each sub-range is optimized in a different
|
||||||
|
way
|
||||||
|
|
||||||
|
* degeneration into streams of if / else ifs
|
||||||
|
|
||||||
|
and other techniques. The people who work on the compilers we take for
|
||||||
|
granted really are due some respect and *free beer*.
|
||||||
BIN
more/jump_tables/README.pdf
Normal file
BIN
more/jump_tables/README.pdf
Normal file
Binary file not shown.
156
more/jump_tables/apple-linux-convergence.S
Normal file
156
more/jump_tables/apple-linux-convergence.S
Normal file
|
|
@ -0,0 +1,156 @@
|
||||||
|
/* Macros to permit the "same" assembly language to build on ARM64
|
||||||
|
Linux systems as well as Apple Silicon systems.
|
||||||
|
|
||||||
|
See the fuller documentation at:
|
||||||
|
https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
|
||||||
|
|
||||||
|
Perry Kivolowitz
|
||||||
|
A Gentle Introduction to Assembly Language
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro GLD_PTR xreg, label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, _\label@GOTPAGE
|
||||||
|
ldr \xreg, [\xreg, _\label@GOTPAGEOFF]
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
ldr \xreg, [\xreg]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro GLD_ADDR xreg, label // Get a global address
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, _\label@GOTPAGE
|
||||||
|
add \xreg, \xreg, _\label@GOTPAGEOFF
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro LLD_ADDR xreg, label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, \label@PAGE
|
||||||
|
add \xreg, \xreg, \label@PAGEOFF
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro LLD_DBL xreg, dreg, label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, \label@PAGE
|
||||||
|
add \xreg, \xreg, \label@PAGEOFF
|
||||||
|
ldur \dreg, [\xreg]
|
||||||
|
// fmov \dreg, \xreg
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
ldur \dreg, [\xreg]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro LLD_FLT xreg, sreg, label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
adrp \xreg, \label@PAGE
|
||||||
|
add \xreg, \xreg, \label@PAGEOFF
|
||||||
|
ldur \sreg, [\xreg]
|
||||||
|
#else
|
||||||
|
ldr \xreg, =\label
|
||||||
|
ldur \sreg, [\xreg]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro GLABEL label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
.global _\label
|
||||||
|
#else
|
||||||
|
.global \label
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro MAIN
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
_main:
|
||||||
|
#else
|
||||||
|
main:
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Fetching the address of the externally defined errno is quite
|
||||||
|
different on Apple and Linux. This macro leaves the address of
|
||||||
|
errno in x0.
|
||||||
|
*/
|
||||||
|
.macro ERRNO_ADDR
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
bl ___error
|
||||||
|
#else
|
||||||
|
bl __errno_location
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro CRT label
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
bl _\label
|
||||||
|
#else
|
||||||
|
bl \label
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro START_PROC // after starting label
|
||||||
|
.cfi_startproc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro END_PROC // after the return
|
||||||
|
.cfi_endproc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro PUSH_P a, b
|
||||||
|
stp \a, \b, [sp, -16]!
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro PUSH_R a
|
||||||
|
str \a, [sp, -16]!
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro POP_P a, b
|
||||||
|
ldp \a, \b, [sp], 16
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro POP_R a
|
||||||
|
ldr \a, [sp], 16
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* The smaller of src_a and src_b (compared as signed values) is put
|
||||||
|
into dest. The macro performs the cmp itself, so the caller does not
|
||||||
|
need to set the flags first (note that the flags are overwritten).
|
||||||
|
This macro makes it easy to remember which register does what in the csel.
|
||||||
|
|
||||||
|
Thank you to u/TNorthover for nudge to add the cmp.
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro MIN src_a, src_b, dest
|
||||||
|
cmp \src_a, \src_b
|
||||||
|
csel \dest, \src_a, \src_b, LT
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* The larger of src_a and src_b (compared as signed values) is put
|
||||||
|
into dest. The macro performs the cmp itself, so the caller does not
|
||||||
|
need to set the flags first (note that the flags are overwritten).
|
||||||
|
This macro makes it easy to remember which register does what in the csel.
|
||||||
|
|
||||||
|
Thank you to u/TNorthover for nudge to add the cmp.
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro MAX src_a, src_b, dest
|
||||||
|
cmp \src_a, \src_b
|
||||||
|
csel \dest, \src_a, \src_b, GT
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro AASCIZ label, string
|
||||||
|
.p2align 2
|
||||||
|
\label: .asciz "\string"
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro MOD src_a, src_b, dest, scratch // dest = src_a % src_b (signed); overwrites scratch
|
||||||
|
sdiv \scratch, \src_a, \src_b // scratch = src_a / src_b (quotient)
|
||||||
|
msub \dest, \scratch, \src_b, \src_a // dest = src_a - (scratch * src_b)
|
||||||
|
.endm
|
||||||
57
more/jump_tables/branch_table.S
Normal file
57
more/jump_tables/branch_table.S
Normal file
|
|
@ -0,0 +1,57 @@
|
||||||
|
#include "apple-linux-convergence.S"
|
||||||
|
|
||||||
|
.p2align 2
|
||||||
|
.text
|
||||||
|
GLABEL MyMemSet
|
||||||
|
|
||||||
|
/*  MyMemSet(unsigned char * b, unsigned char v, long l)
                             x0               w1      x2

    Fills l bytes starting at b with the low byte of v.

    The length is first checked against less than or equal to 0. If
    so, the body of the function is skipped.

    The loop is unrolled 8x. The length (x2) modulo 8 gets turned
    into the number of instructions to jump to or beyond the initial
    strb. A modulo of 0 is handled separately - it causes a branch to
    the initial strb.

    Register use:
        x0  write cursor, advanced one byte per store
        w1  fill value (only the low 8 bits are stored)
        x3  address one beyond the end of the buffer
        x4  l % 8, then the number of stores to skip on the first pass
        x5  scratch for MOD, then the computed branch target
        x6  the unroll factor (8)

    This code can be dramatically improved by copying more than one byte
    at a time. You will have to figure out how to do this optimally in
    P6 - MemCpy
*/
#if defined(__APPLE__)
_MyMemSet:
#else
MyMemSet:
#endif
        START_PROC
        PUSH_P  x29, x30
        mov     x29, sp
        cmp     x2, xzr             // Test for bad length.
        ble     99f                 // Take branch if 0 or less.

        add     x3, x2, x0          // x3 gets address of one beyond buffer
        mov     x6, 8
        MOD     x2, x6, x4, x5      // x4 gets l % 8
        cbz     x4, 10f             // Handle evenly divisible case.
        sub     x4, x6, x4          // Invert sense of x4 e.g. 3 becomes 5

        LLD_ADDR x5, 10f            // x5 gets address of the first store
        add     x5, x5, x4, lsl 2   // Skip x4 stores (4 bytes per instruction)
        br      x5

        // Each step must write exactly one byte because the cursor only
        // advances by one: strb, not str (a 32-bit str would overwrite the
        // next three bytes and run past the end of the buffer).
10:     strb    w1, [x0], 1
        strb    w1, [x0], 1
        strb    w1, [x0], 1
        strb    w1, [x0], 1
        strb    w1, [x0], 1
        strb    w1, [x0], 1
        strb    w1, [x0], 1
        strb    w1, [x0], 1
        cmp     x3, x0              // Addresses compare as unsigned values.
        bhi     10b                 // More to fill - run all 8 stores again.

99:     POP_P   x29, x30
        ret
        END_PROC
|
||||||
|
|
||||||
83
more/jump_tables/jmptbl.s
Normal file
83
more/jump_tables/jmptbl.s
Normal file
|
|
@ -0,0 +1,83 @@
|
||||||
|
.text
|
||||||
|
.align 4
|
||||||
|
.global main
|
||||||
|
|
||||||
|
main:   str     x30, [sp, -16]!         // keep the return address safe
        mov     x0, xzr                 // argument for time(nullptr)
        bl      time                    // result feeds straight into srand
        bl      srand                   // seed the generator
        bl      rand                    // x0 gets a random number
        and     x0, x0, 7               // clamp the index to 0 through 7
                                        // (x register used on purpose)
        ldr     x1, =jt                 // base address of the jump table
        add     x1, x1, x0, lsl 2       // base + index * 4 (one instruction each)
        br      x1                      // land on the chosen table entry

        // Every "case" below happens to be the same size (3 instructions).
        // When that is true this intermediate table is optional: dropping
        // it saves some space and a tiny amount of time. Without the table
        // you'd scale the index by 12 rather than 4, because each "case"
        // is 3 instructions long (3 x 4 == 12).

        // Question for you: with the table gone, what would the jump be
        // relative to (since "jt" would no longer exist)?

jt:     b       .Lcase0
        b       .Lcase1
        b       .Lcase2
        b       .Lcase3
        b       .Lcase4
        b       .Lcase5
        b       .Lcase6
        b       .Lcase7

.Lcase0:
        ldr     x0, =ZR
        bl      puts
        b       .Ldone

.Lcase1:
        ldr     x0, =ON
        bl      puts
        b       .Ldone

.Lcase2:
        ldr     x0, =TW
        bl      puts
        b       .Ldone

.Lcase3:
        ldr     x0, =TH
        bl      puts
        b       .Ldone

.Lcase4:
        ldr     x0, =FR
        bl      puts
        b       .Ldone

.Lcase5:
        ldr     x0, =FV
        bl      puts
        b       .Ldone

.Lcase6:
        ldr     x0, =SX
        bl      puts
        b       .Ldone

.Lcase7:
        ldr     x0, =SV
        bl      puts
        b       .Ldone

.Ldone: mov     w0, wzr                 // main returns 0
        ldr     x30, [sp], 16           // recover the return address
        ret
|
||||||
|
|
||||||
|
.data
|
||||||
|
.section .rodata
|
||||||
|
|
||||||
|
ZR: .asciz "0 returned"
|
||||||
|
ON: .asciz "1 returned"
|
||||||
|
TW: .asciz "2 returned"
|
||||||
|
TH: .asciz "3 returned"
|
||||||
|
FR: .asciz "4 returned"
|
||||||
|
FV: .asciz "5 returned"
|
||||||
|
SX: .asciz "6 returned"
|
||||||
|
SV: .asciz "7 returned"
|
||||||
|
|
||||||
|
.end
|
||||||
55
more/jump_tables/jt.c
Normal file
55
more/jump_tables/jt.c
Normal file
|
|
@ -0,0 +1,55 @@
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
|
/* This is the prototype for the assembly language version. You may
|
||||||
|
have always thought that switch statements are implemented as a long
|
||||||
|
chain of if / else. Well, sometimes they are. Sometimes they are
|
||||||
|
implemented using binary search and still other times they are
|
||||||
|
implemented as jump tables.
|
||||||
|
|
||||||
|
My assembly language version is found in jmptbl.s.
|
||||||
|
*/
|
||||||
|
|
||||||
|
int main()
{
    /* Seed from the clock so that successive runs pick different cases. */
    srand(time(0));

    /* Masking with 7 keeps only the low three bits, so the selector is
       always in 0..7 and every possible value has its own case below.
       A dense run of cases like this is what lets a compiler choose a
       jump table for the switch. */
    const int selector = rand() & 7;

    switch (selector) {
        case 0: puts("0 returned"); break;
        case 1: puts("1 returned"); break;
        case 2: puts("2 returned"); break;
        case 3: puts("3 returned"); break;
        case 4: puts("4 returned"); break;
        case 5: puts("5 returned"); break;
        case 6: puts("6 returned"); break;
        case 7: puts("7 returned"); break;
    }

    return 0;
}
|
||||||
31
more/jump_tables/test_interop.cpp
Normal file
31
more/jump_tables/test_interop.cpp
Normal file
|
|
@ -0,0 +1,31 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
extern "C" void MyMemSet(unsigned char *, unsigned char v, long length);
|
||||||
|
|
||||||
|
/* MyMemSet(unsigned char *, unsigned char v, long length);
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
void MyMemSet(unsigned char * b, unsigned char v, long l) {
|
||||||
|
for (long i = 0; i < l; i++) {
|
||||||
|
b[i] = v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
const long BUFFER_SIZE = 1000;
|
||||||
|
|
||||||
|
unsigned char buffer[BUFFER_SIZE];
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
unsigned char before = buffer[-1];
|
||||||
|
unsigned char after = buffer[BUFFER_SIZE];
|
||||||
|
|
||||||
|
MyMemSet(buffer, 0xF0, 3);
|
||||||
|
|
||||||
|
if (before != buffer[-1])
|
||||||
|
printf("Bytes prior to buffer are smashed.\n");
|
||||||
|
if (after != buffer[BUFFER_SIZE])
|
||||||
|
printf("Bytes after buffer are smashed.\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
@ -17,7 +17,9 @@ sin x = x - x^3/3! + x^5/5! - x^7/7! ...
|
||||||
|
|
||||||
Notice each term flips from addition to subtraction.
|
Notice each term flips from addition to subtraction.
|
||||||
|
|
||||||
Notice each term is based on the odd integers starting at 1.
|
Notice each term is based on the odd integers starting at 1. While the
|
||||||
|
"1" case might look different, it is the same as all the others since
|
||||||
|
1 is just 1 to the first power divided by 1 factorial.
|
||||||
|
|
||||||
## Command line
|
## Command line
|
||||||
|
|
||||||
|
|
@ -29,40 +31,76 @@ arguments are therefore required.
|
||||||
be a double.
|
be a double.
|
||||||
|
|
||||||
* The number of terms to evaluate. The number of terms must lie between
|
* The number of terms to evaluate. The number of terms must lie between
|
||||||
1 and 10 inclusive.
|
1 and 10 inclusive. Note the value of 10 as an upper bound is new. It
|
||||||
|
was 8.
|
||||||
|
|
||||||
## C version
|
## C version
|
||||||
|
|
||||||
To assist your efforts, [here](./c_version.c) is a version of this
|
To assist your efforts, [here](./c_version.c) is a version of this
|
||||||
project written in C.
|
project written in C. This has been updated to print nice debugging
|
||||||
|
output which is not part of the project.
|
||||||
|
|
||||||
## Errors to stderr
|
This C version also demonstrates a different way of calculating the
|
||||||
|
toggle. This version flips the sign of the toggle by multiplying by -1.
|
||||||
Error messages must be sent to `stderr`.
|
The previous version used odd and even values of the term.
|
||||||
|
|
||||||
If you are using the convergence macros to allow your program to build
|
|
||||||
on both Apple Silicon Mac OS and Linux, note the special casing needed
|
|
||||||
to deal with `stderr`. If this is you, compile the C version on Mac OS
|
|
||||||
with the `-S` compiler option to see the generated assembly language and
|
|
||||||
search for `stderr`.
|
|
||||||
|
|
||||||
## Sample executions
|
## Sample executions
|
||||||
|
|
||||||
```text
|
```text
|
||||||
SINE % ./a.out 0 8
|
pk_taylor_series > gcc main.S -o a
|
||||||
The sine of 0.00 degrees is 0.000000 in radians.
|
pk_taylor_series > ./a 0 10
|
||||||
SINE % ./a.out 90 8
|
The sine of 0.00 degrees is 0.00000000.
|
||||||
The sine of 90.00 degrees is 1.000000 in radians.
|
pk_taylor_series > ./a 30 10
|
||||||
SINE % ./a.out 180 8
|
The sine of 30.00 degrees is 0.50000000.
|
||||||
The sine of 180.00 degrees is -0.000001 in radians.
|
pk_taylor_series > ./a 45 10
|
||||||
SINE % ./a.out 180 82
|
The sine of 45.00 degrees is 0.70710678.
|
||||||
|
pk_taylor_series > ./a 90 10
|
||||||
|
The sine of 90.00 degrees is 1.00000000.
|
||||||
|
pk_taylor_series > ./a 180 10
|
||||||
|
The sine of 180.00 degrees is -0.00000000.
|
||||||
|
pk_taylor_series > ./a 360 10
|
||||||
|
The sine of 360.00 degrees is -0.00104818.
|
||||||
|
pk_taylor_series > ./a 360 100
|
||||||
Number of terms is out of range.
|
Number of terms is out of range.
|
||||||
SINE % ./a.out 180 -10
|
pk_taylor_series > ./a 360 -1
|
||||||
Number of terms is out of range.
|
Number of terms is out of range.
|
||||||
SINE % echo $?
|
pk_taylor_series >
|
||||||
1
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Floating point instructions I used
|
||||||
|
|
||||||
|
These are the floating point instructions I used in my implementation.
|
||||||
|
|
||||||
|
* fmov
|
||||||
|
|
||||||
|
* scvtf
|
||||||
|
|
||||||
|
* fmul
|
||||||
|
|
||||||
|
* fdiv
|
||||||
|
|
||||||
|
* fadd
|
||||||
|
|
||||||
|
## How I broke up the program
|
||||||
|
|
||||||
|
I have functions named:
|
||||||
|
|
||||||
|
* main
|
||||||
|
|
||||||
|
* HandleOptions
|
||||||
|
|
||||||
|
* Factorial
|
||||||
|
|
||||||
|
* IntegerPower - x to the nth power
|
||||||
|
|
||||||
|
* ComputeSine - The main calculation
|
||||||
|
|
||||||
|
* PrintAnswer
|
||||||
|
|
||||||
|
* ConvertTheta - Wrap D2R
|
||||||
|
|
||||||
|
* D2R - Degrees to radians
|
||||||
|
|
||||||
## CSC3510
|
## CSC3510
|
||||||
|
|
||||||
The following applies to Carthage College CSC3510 students.
|
The following applies to Carthage College CSC3510 students.
|
||||||
|
|
@ -74,4 +112,3 @@ Work is to be done solo.
|
||||||
### What to hand in
|
### What to hand in
|
||||||
|
|
||||||
Just the .S file. **Your name must be at the top of the file.**
|
Just the .S file. **Your name must be at the top of the file.**
|
||||||
|
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -1,13 +1,14 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
double pi = 3.14159265359;
|
double pi = 3.14159265358979323846;
|
||||||
|
|
||||||
double D2R(double d) {
|
double D2R(double d) {
|
||||||
return d * pi / 180.0;
|
return d * pi / 180.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
long Factorial(int n) {
|
double Factorial(int n) {
|
||||||
long retval = 1;
|
long retval = 1;
|
||||||
|
|
||||||
if (n > 0) {
|
if (n > 0) {
|
||||||
|
|
@ -15,7 +16,7 @@ long Factorial(int n) {
|
||||||
retval = retval * n--;
|
retval = retval * n--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return retval;
|
return (double) retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
double IntegerPower(double b, int e) {
|
double IntegerPower(double b, int e) {
|
||||||
|
|
@ -48,20 +49,20 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
double r_angle = D2R(angle);
|
double r_angle = D2R(angle);
|
||||||
|
|
||||||
|
double toggle = 1.0;
|
||||||
for (int term = 0, base = 1; term < terms; term++, base += 2) {
|
for (int term = 0, base = 1; term < terms; term++, base += 2) {
|
||||||
double toggle = (term & 1) ? -1.0 : 1.0;
|
if (toggle > 0) {
|
||||||
|
printf("%+03.8e + %+03.8e / %+03.8e [term %2d is %+03.8e]\n", sin, IntegerPower(r_angle, base),
|
||||||
|
Factorial(base), term + 1, toggle * IntegerPower(r_angle, base) / Factorial(base));
|
||||||
|
} else {
|
||||||
|
printf("%+03.8e - %+03.8e / %+03.8e [term %2d is %+03.8e]\n", sin, IntegerPower(r_angle, base),
|
||||||
|
Factorial(base), term + 1, toggle * IntegerPower(r_angle, base) / Factorial(base));
|
||||||
|
}
|
||||||
sin += toggle *
|
sin += toggle *
|
||||||
IntegerPower(r_angle, base) / Factorial(base);
|
IntegerPower(r_angle, base) / Factorial(base);
|
||||||
/*
|
toggle = toggle * -1;
|
||||||
if (toggle > 0) {
|
|
||||||
printf("adding %d p/b intermediate: %f\n", base, sin);
|
|
||||||
} else {
|
|
||||||
printf("subtracting %d p/b intermediate: %f\n", base, sin);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
printf("The sine of %.2f degrees is %f in radians.\n", angle, sin);
|
printf("The sine of %0.4f degrees is %0.10f.\n", angle, sin);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
@ -149,3 +149,8 @@ main:
|
||||||
.p2align 2
|
.p2align 2
|
||||||
\label: .asciz "\string"
|
\label: .asciz "\string"
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
// MOD: dest = src_a % src_b, computed as src_a - (src_a / src_b) * src_b.
// The divide is sdiv, so both operands are treated as signed integers.
// scratch is overwritten with the quotient. It must not be the same
// register as src_a or src_b: msub reads both of them after sdiv has
// already written scratch.
.macro MOD src_a, src_b, dest, scratch
|
||||||
|
sdiv \scratch, \src_a, \src_b // scratch = src_a / src_b (signed quotient)
|
||||||
|
msub \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b
|
||||||
|
.endm
|
||||||
|
|
|
||||||
|
|
@ -149,3 +149,8 @@ main:
|
||||||
.p2align 2
|
.p2align 2
|
||||||
\label: .asciz "\string"
|
\label: .asciz "\string"
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
// MOD: dest = src_a % src_b, computed as src_a - (src_a / src_b) * src_b.
// The divide is sdiv, so both operands are treated as signed integers.
// scratch is overwritten with the quotient. It must not be the same
// register as src_a or src_b: msub reads both of them after sdiv has
// already written scratch.
.macro MOD src_a, src_b, dest, scratch
|
||||||
|
sdiv \scratch, \src_a, \src_b // scratch = src_a / src_b (signed quotient)
|
||||||
|
msub \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b
|
||||||
|
.endm
|
||||||
|
|
|
||||||
|
|
@ -149,3 +149,8 @@ main:
|
||||||
.p2align 2
|
.p2align 2
|
||||||
\label: .asciz "\string"
|
\label: .asciz "\string"
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
// MOD: dest = src_a % src_b, computed as src_a - (src_a / src_b) * src_b.
// The divide is sdiv, so both operands are treated as signed integers.
// scratch is overwritten with the quotient. It must not be the same
// register as src_a or src_b: msub reads both of them after sdiv has
// already written scratch.
.macro MOD src_a, src_b, dest, scratch
|
||||||
|
sdiv \scratch, \src_a, \src_b // scratch = src_a / src_b (signed quotient)
|
||||||
|
msub \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b
|
||||||
|
.endm
|
||||||
|
|
|
||||||
|
|
@ -149,3 +149,8 @@ main:
|
||||||
.p2align 2
|
.p2align 2
|
||||||
\label: .asciz "\string"
|
\label: .asciz "\string"
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
// MOD: dest = src_a % src_b, computed as src_a - (src_a / src_b) * src_b.
// The divide is sdiv, so both operands are treated as signed integers.
// scratch is overwritten with the quotient. It must not be the same
// register as src_a or src_b: msub reads both of them after sdiv has
// already written scratch.
.macro MOD src_a, src_b, dest, scratch
|
||||||
|
sdiv \scratch, \src_a, \src_b // scratch = src_a / src_b (signed quotient)
|
||||||
|
msub \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b
|
||||||
|
.endm
|
||||||
|
|
|
||||||
|
|
@ -149,3 +149,8 @@ main:
|
||||||
.p2align 2
|
.p2align 2
|
||||||
\label: .asciz "\string"
|
\label: .asciz "\string"
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
// MOD: dest = src_a % src_b, computed as src_a - (src_a / src_b) * src_b.
// The divide is sdiv, so both operands are treated as signed integers.
// scratch is overwritten with the quotient. It must not be the same
// register as src_a or src_b: msub reads both of them after sdiv has
// already written scratch.
.macro MOD src_a, src_b, dest, scratch
|
||||||
|
sdiv \scratch, \src_a, \src_b // scratch = src_a / src_b (signed quotient)
|
||||||
|
msub \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b
|
||||||
|
.endm
|
||||||
|
|
|
||||||
|
|
@ -2,27 +2,23 @@
|
||||||
|
|
||||||
The `fmov` instruction is used to move floating point values in and out
|
The `fmov` instruction is used to move floating point values in and out
|
||||||
of floating point registers and to some degree, moving data between
|
of floating point registers and to some degree, moving data between
|
||||||
integer and floating point registers.
|
integer and floating point registers.
|
||||||
|
|
||||||
## Loading Floating Point Numbers as Immediate Values
|
## Loading Floating Point Numbers as Immediate Values
|
||||||
|
|
||||||
Just as we saw with integer
|
Just as we saw with integer registers, some values can be used as
|
||||||
registers, some values can be used as immediate values and some cannot.
|
immediate values and some cannot. It comes down to how many bits are
|
||||||
|
necessary to encode the value. Too many bits... not enough room to fit
|
||||||
|
in a 4 byte instruction along with the opcode.
|
||||||
|
|
||||||
For example, this works:
|
For example, this works:
|
||||||
|
|
||||||
`mov x0, 65536`
|
`mov x0, 65535`
|
||||||
|
|
||||||
but this does not:
|
but this does not:
|
||||||
|
|
||||||
`mov x0, 65537`
|
`mov x0, 65537`
|
||||||
|
|
||||||
The reason is that all AARCH64 instructions must fit within a 32 bit
|
|
||||||
instruction that must hold the instruction's op code, its flags and
|
|
||||||
other bits and bobs plus any immediate value. In the above example we
|
|
||||||
can see that the `mov` instruction provides up to 16 bits for an
|
|
||||||
immediate value.
|
|
||||||
|
|
||||||
The constraints placed on immediate values for `fmov` are much tighter
|
The constraints placed on immediate values for `fmov` are much tighter
|
||||||
because floating point numbers are far more complex than integers.
|
because floating point numbers are far more complex than integers.
|
||||||
|
|
||||||
|
|
@ -40,7 +36,7 @@ Let's take a look at some code:
|
||||||
fmov d0, 1.96875 // Zoinks!
|
fmov d0, 1.96875 // Zoinks!
|
||||||
```
|
```
|
||||||
|
|
||||||
From this we can see that an immediate value for an `fmov` seems to have
|
From this we can see that an immediate value for an `fmov` has
|
||||||
4 bits available for the mantissa. In fact, the only values that work
|
4 bits available for the mantissa. In fact, the only values that work
|
||||||
as immediate values will be those floating point values whose fractional
|
as immediate values will be those floating point values whose fractional
|
||||||
values are combinations of:
|
values are combinations of:
|
||||||
|
|
@ -56,6 +52,9 @@ values are combinations of:
|
||||||
As far as exponents go, `fmov` can accommodate 3 bits. So, exponents of
|
As far as exponents go, `fmov` can accommodate 3 bits. So, exponents of
|
||||||
-3 through 4 (scale factors of 2**-3 through 2**4) can be used.
|
-3 through 4 (scale factors of 2**-3 through 2**4) can be used.
|
||||||
|
|
||||||
|
A sign bit makes the total number of bits available for immediate moves
|
||||||
|
to be 8.
|
||||||
|
|
||||||
## Loading / Storing Floating Point Numbers in General
|
## Loading / Storing Floating Point Numbers in General
|
||||||
|
|
||||||
When in doubt, load fixed floating point numbers from memory. This is
|
When in doubt, load fixed floating point numbers from memory. This is
|
||||||
|
|
@ -64,11 +63,16 @@ covered [in this chapter](./literals.md).
|
||||||
## SIMD
|
## SIMD
|
||||||
|
|
||||||
`fmov` can also deal with the more complicated special cases induced by
|
`fmov` can also deal with the more complicated special cases induced by
|
||||||
SIMD instructions.
|
SIMD instructions. `fmov` is able to move values between the various
|
||||||
|
register widths such as single precision to double precision. **However,
|
||||||
|
no conversion of value is performed - `fmov` just copies bits.**
|
||||||
|
|
||||||
|
If you need to change the precision of a floating point value, the
|
||||||
|
`fcvt` family of instructions must be used instead.
|
||||||
|
|
||||||
## Movement To / From Integer Registers
|
## Movement To / From Integer Registers
|
||||||
|
|
||||||
`fmov` can *bits* between the integer and floating point registers. We
|
`fmov` can copy *bits* between the integer and floating point registers.
|
||||||
emphasize the *bits*. No conversions are done using `fmov`. There exist
|
We emphasize the *bits*. No conversions are done using `fmov`. There
|
||||||
other instructions for that. See [this chapter](./rounding.md) for more
|
exist other instructions for that. See [this chapter](./rounding.md) for
|
||||||
information.
|
more information.
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -20,30 +20,32 @@ To load a `float`, you could translate the value to binary and do
|
||||||
as the following:
|
as the following:
|
||||||
|
|
||||||
```asm
|
```asm
|
||||||
.text // 1
|
.text // 1
|
||||||
.global main // 2
|
.global main // 2
|
||||||
.align 2 // 3
|
.align 2 // 3
|
||||||
// 4
|
// 4
|
||||||
main: str x30, [sp, -16]! // 5
|
main: str x30, [sp, -16]! // 5
|
||||||
ldr s0, =0x3fc00000 // 6
|
ldr s0, =0x3fc00000 // 6
|
||||||
fcvt d0, s0 // 7
|
fcvt d0, s0 // 7
|
||||||
ldr x0, =fmt // 8
|
ldr x0, =fmt // 8
|
||||||
bl printf // 9
|
bl printf // 9
|
||||||
ldr x30, [sp], 16 // 10
|
ldr x30, [sp], 16 // 10
|
||||||
mov w0, wzr // 11
|
mov w0, wzr // 11
|
||||||
ret // 12
|
ret // 12
|
||||||
// 13
|
// 13
|
||||||
.data // 14
|
.data // 14
|
||||||
fmt: .asciz "%f\n" // 15
|
fmt: .asciz "%f\n" // 15
|
||||||
.end // 16
|
.end // 16
|
||||||
```
|
```
|
||||||
|
|
||||||
The above code is found [here](./t.s).
|
The above code is kind of found [here](./t.s) - the file is used
|
||||||
|
for miscellaneous testing.
|
||||||
|
|
||||||
`Line 6` puts the translated value of 1.5 into `s0` (since the value
|
`Line 6` puts the translated value of 1.5 into `s0` (since we are
|
||||||
is a `float` it goes in an `s` register). The assembler performs some
|
thinking of the value as a `float` it goes in an `s` register). The
|
||||||
magic getting a 32 bit value seemingly fit into a 32 bit instruction.
|
assembler performs some magic getting a 32 bit value to seemingly fit into
|
||||||
See [below](./literals.md#fitting-32-bits-into-a-32-bit-bag).
|
a 32 bit instruction. See
|
||||||
|
[below](./literals.md#fitting-32-bits-into-a-32-bit-bag).
|
||||||
|
|
||||||
`Line 7` converts the single precision number into a double precision
|
`Line 7` converts the single precision number into a double precision
|
||||||
number for printing.
|
number for printing.
|
||||||
|
|
@ -136,6 +138,9 @@ Cool huh?
|
||||||
|
|
||||||
## Fitting 32 bits into a 32 bit bag
|
## Fitting 32 bits into a 32 bit bag
|
||||||
|
|
||||||
|
**This section is currently LINUX-centric - in the future it will
|
||||||
|
address both native Apple and Linux equally.**
|
||||||
|
|
||||||
AARCH64 instructions are 32 bits in width. Yet, `line 6` from
|
AARCH64 instructions are 32 bits in width. Yet, `line 6` from
|
||||||
[this](./t.s) program reads:
|
[this](./t.s) program reads:
|
||||||
|
|
||||||
|
|
@ -195,15 +200,16 @@ Scan downward to find `0x7a0`:
|
||||||
0x7a0 <main+32> .inst 0x3fc00000 ; undefined
|
0x7a0 <main+32> .inst 0x3fc00000 ; undefined
|
||||||
```
|
```
|
||||||
|
|
||||||
Hey look! Here's our literal float. The `.inst` is an ARM
|
Hey look! Here's our literal float. The `.inst` is an ARM specific GNU
|
||||||
specific GNU assembler directive what allows the programmer
|
assembler directive that says: `¯\_(-)_/¯`.
|
||||||
to encode their own instruction. Note, the encoded instruction does not
|
|
||||||
have to make any sense - instead the compiler has emitted a make believe
|
Note, the encoded "instruction" does not have to make any sense -
|
||||||
instruction that happens to have the value of our literal.
|
instead the compiler has emitted a make believe instruction that happens
|
||||||
|
to have the value of our literal.
|
||||||
|
|
||||||
What we're seeing the actual `line 6` doing is reaching ahead a short
|
What we're seeing the actual `line 6` doing is reaching ahead a short
|
||||||
distance to load the value of another "instruction" when really it is
|
distance to load the value of another location in memory where our
|
||||||
our constant.
|
constant is really found.
|
||||||
|
|
||||||
Let us take this explanation further. Notice we see:
|
Let us take this explanation further. Notice we see:
|
||||||
|
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -1,12 +1,16 @@
|
||||||
.text
|
.text
|
||||||
.global main
|
.global _main
|
||||||
.align 2
|
.align 2
|
||||||
|
|
||||||
main: str x30, [sp, -16]!
|
_main:
|
||||||
|
str x30, [sp, -16]!
|
||||||
|
mov x0, 0xFFFFFFFF
|
||||||
|
/*
|
||||||
ldr s0, =0x3fc00000
|
ldr s0, =0x3fc00000
|
||||||
fcvt d0, s0
|
fcvt d0, s0
|
||||||
ldr x0, =fmt
|
ldr x0, =fmt
|
||||||
bl printf
|
bl printf
|
||||||
|
*/
|
||||||
ldr x30, [sp], 16
|
ldr x30, [sp], 16
|
||||||
mov w0, wzr
|
mov w0, wzr
|
||||||
ret
|
ret
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue