ijoijoijoij
This commit is contained in:
Perry Kivolowitz 2023-04-15 22:22:49 -05:00
commit 65931b77a0
29 changed files with 1125 additions and 84 deletions

View file

@ -308,7 +308,11 @@ What would a book about assembly language be without bit bashing?
### Section 4 - More Stuff
In this section, we present miscellaneous material.
In this section, we present miscellaneous material including our "world
famous lecture" on debugging. This lecture has been invited at several
colleges and universities. It is intended for audiences working with
languages like C, C++ and assembly language but some of the lessons
contained therein are applicable to all languages.
| Chapter | Markdown | PDF |
| ------- | -------- | --- |
@ -319,6 +323,9 @@ In this section, we present miscellaneous material.
| 5 | [Determining string literal lengths for C functions](./more/strlen_for_c/README.md) | [Link](./more/strlen_for_c/README.pdf) |
| 6 | [Calling Assembly Language From Python](./python/) | [Link](./python/README.pdf) |
| 7 | [Atomic Operations](./more/atomics/README.md) | [Link](./more/atomics/README.pdf) |
| 8 | [Jump Tables](./more/jump_tables/README.md) | [Link](./more/jump_tables/README.pdf) |
| 9 | [argv](./more/argv_example/jess1.S) | ASM CODE |
| - | [Debugging Lecture](./debugging/Discourses%20and%20Dialogs%20on%20Debugging.pptx) | PPTX |
## Macro Suite

Binary file not shown.

Binary file not shown.

View file

@ -149,3 +149,8 @@ main:
.p2align 2
\label: .asciz "\string"
.endm
/* MOD computes the signed remainder dest = src_a % src_b.
sdiv leaves the quotient in scratch and msub then forms
src_a - (scratch * src_b). scratch is overwritten and must not be the
same register as src_a or src_b because msub reads both of them after
sdiv has written scratch.
*/
.macro MOD src_a, src_b, dest, scratch
sdiv \scratch, \src_a, \src_b
msub \dest, \scratch, \src_b, \src_a
.endm

View file

@ -0,0 +1,156 @@
/* Macros to permit the "same" assembly language to build on ARM64
Linux systems as well as Apple Silicon systems.
See the fuller documentation at:
https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
Perry Kivolowitz
A Gentle Introduction to Assembly Language
*/
/* GLD_PTR loads into xreg by way of a global symbol.
Apple: adrp finds the GOT page for _label and ldr reads the 64 bit
slot at its GOT page offset.
Linux: the address of label is loaded with the ldr = pseudo
instruction and then the 64 bit value stored at label is loaded.
NOTE(review): the Apple path performs one load and the Linux path
performs two - confirm both leave the same thing in xreg.
*/
.macro GLD_PTR xreg, label
#if defined(__APPLE__)
adrp \xreg, _\label@GOTPAGE
ldr \xreg, [\xreg, _\label@GOTPAGEOFF]
#else
ldr \xreg, =\label
ldr \xreg, [\xreg]
#endif
.endm
/* GLD_ADDR puts the address of a global symbol into xreg.
Linux: the address is loaded with the ldr = pseudo instruction.
Apple: adrp followed by add, using the GOTPAGE and GOTPAGEOFF
operators on _label.
NOTE(review): GLD_PTR consumes GOTPAGEOFF with an ldr while this macro
uses an add - confirm that add is what is intended here.
*/
.macro GLD_ADDR xreg, label // Get a global address
#if defined(__APPLE__)
adrp \xreg, _\label@GOTPAGE
add \xreg, \xreg, _\label@GOTPAGEOFF
#else
ldr \xreg, =\label
#endif
.endm
/* LLD_ADDR puts the address of label into xreg.
Apple: adrp / add with the PAGE and PAGEOFF operators.
Linux: the ldr = pseudo instruction.
No underscore is prepended to label on either platform (presumably
because the label is local to the current file - confirm against
macros/README.md).
*/
.macro LLD_ADDR xreg, label
#if defined(__APPLE__)
adrp \xreg, \label@PAGE
add \xreg, \xreg, \label@PAGEOFF
#else
ldr \xreg, =\label
#endif
.endm
/* LLD_DBL loads the value stored at label into dreg (named for a
double precision d register).
xreg is used as a temporary: it is overwritten and left holding the
address of label. The commented out fmov is not assembled.
*/
.macro LLD_DBL xreg, dreg, label
#if defined(__APPLE__)
adrp \xreg, \label@PAGE
add \xreg, \xreg, \label@PAGEOFF
ldur \dreg, [\xreg]
// fmov \dreg, \xreg
#else
ldr \xreg, =\label
ldur \dreg, [\xreg]
#endif
.endm
/* LLD_FLT loads the value stored at label into sreg (named for a
single precision s register).
xreg is used as a temporary: it is overwritten and left holding the
address of label.
*/
.macro LLD_FLT xreg, sreg, label
#if defined(__APPLE__)
adrp \xreg, \label@PAGE
add \xreg, \xreg, \label@PAGEOFF
ldur \sreg, [\xreg]
#else
ldr \xreg, =\label
ldur \sreg, [\xreg]
#endif
.endm
/* GLABEL exports a symbol with .global. On Apple an underscore is
prepended to the symbol name; elsewhere the name is used as given.
*/
.macro GLABEL label
#if defined(__APPLE__)
.global _\label
#else
.global \label
#endif
.endm
/* MAIN defines the program entry label: _main on Apple and main
elsewhere. It only defines the label; exporting it is done
separately (for example with GLABEL main).
*/
.macro MAIN
#if defined(__APPLE__)
_main:
#else
main:
#endif
.endm
/* Fetching the address of the externally defined errno is quite
different on Apple and Linux. This macro leaves the address of
errno in x0.
Apple calls ___error and Linux calls __errno_location. Because the
call is made with bl, x30 is overwritten, as are any registers the
called function does not preserve.
*/
.macro ERRNO_ADDR
#if defined(__APPLE__)
bl ___error
#else
bl __errno_location
#endif
.endm
/* CRT calls the function named label with bl, prepending an
underscore to the name on Apple. As with any bl, x30 is overwritten,
so the caller must have saved it if it is still needed.
*/
.macro CRT label
#if defined(__APPLE__)
bl _\label
#else
bl \label
#endif
.endm
/* START_PROC and END_PROC are thin wrappers around the .cfi_startproc
and .cfi_endproc directives. Use START_PROC directly after the label
that begins a function and END_PROC after its final ret.
*/
.macro START_PROC // after starting label
.cfi_startproc
.endm
.macro END_PROC // after the return
.cfi_endproc
.endm
/* Stack helpers. Every push lowers sp by 16 bytes before storing and
every pop raises sp by 16 bytes after loading, so sp changes in
multiples of 16.
PUSH_P / POP_P store / load a pair of registers.
PUSH_R / POP_R store / load a single register but still move sp by 16.
Pops must mirror pushes in reverse order.
*/
.macro PUSH_P a, b
stp \a, \b, [sp, -16]!
.endm
.macro PUSH_R a
str \a, [sp, -16]!
.endm
.macro POP_P a, b
ldp \a, \b, [sp], 16
.endm
.macro POP_R a
ldr \a, [sp], 16
.endm
/* The smaller of src_a and src_b is put into dest, using a signed
comparison (LT). The macro performs the cmp itself, so no flag
setting instruction is needed beforehand. The condition flags are
overwritten.
This macro makes it easy to remember which register does what in the
csel.
Thank you to u/TNorthover for nudge to add the cmp.
*/
.macro MIN src_a, src_b, dest
cmp \src_a, \src_b
csel \dest, \src_a, \src_b, LT
.endm
/* The larger of src_a and src_b is put into dest, using a signed
comparison (GT). The macro performs the cmp itself, so no flag
setting instruction is needed beforehand. The condition flags are
overwritten.
This macro makes it easy to remember which register does what in the
csel.
Thank you to u/TNorthover for nudge to add the cmp.
*/
.macro MAX src_a, src_b, dest
cmp \src_a, \src_b
csel \dest, \src_a, \src_b, GT
.endm
/* AASCIZ emits a NUL terminated string (.asciz) named label, first
aligning to a 4 byte boundary with .p2align 2.
*/
.macro AASCIZ label, string
.p2align 2
\label: .asciz "\string"
.endm
/* MOD computes the signed remainder dest = src_a % src_b.
sdiv leaves the quotient in scratch and msub then forms
src_a - (scratch * src_b). scratch is overwritten and must not be the
same register as src_a or src_b because msub reads both of them after
sdiv has written scratch.
*/
.macro MOD src_a, src_b, dest, scratch
sdiv \scratch, \src_a, \src_b
msub \dest, \scratch, \src_b, \src_a
.endm

111
more/argv_example/jess1.S Normal file
View file

@ -0,0 +1,111 @@
#include "apple-linux-convergence.S"
.p2align 2
.text
GLABEL main
/* This program will get a string followed by a double followed by an
integer from the command line demonstrating how each of these types
can be retrieved.
Example:
./a.out test 29.3 29
Register use on entry: w0 holds argc and x1 holds argv. x1 is not
preserved across function calls, so it is pushed before and popped
after each call for which argv is still needed. The same is done with
d0 around the call to atoi.
The program returns 0 in all cases, including when argc is not 4.
*/
MAIN
PUSH_P x29, x30
mov x29, sp
// Check argc to see if it is 4. This is not the only way to
// validate command line arguments but it is an easy way.
cmp w0, 4
bne 99f // take branch if argc isn't "right".
// Skip past argv[0]
add x1, x1, 8
// Fetch argv[1] as a string.
// x1 is a pointer to a pointer to chars (i.e. the string).
// Being a pointer to a pointer, it must be dereferenced to
// make a pointer.
ldr x0, [x1] // dereference
// Now x0 contains a pointer to the command line argument.
// Print the string (as a string). But doing this causes a
// function call which will destroy x1. So, save x1 temporarily.
// This could be avoided if x1 were moved to a backed up x
// register (e.g. x20).
PUSH_R x1
CRT puts // ptr is in x0 where puts() needs it.
POP_R x1
// Advance x1 once again to get to argv[2] which can be done
// in the same instruction as dereferencing it by using a
// pre-indexed load (x1 is advanced by 8 before the load).
ldr x0, [x1, 8]! // dereference
// The string version of argv[2] is now pointed to by x0.
// This is exactly where atof would want it. We need atof
// because it turns strings into numbers. BUT, same as before,
// calling a function would destroy x1 so let's do the same
// trick of backing up x1 on the stack and then restoring after
// the function call.
PUSH_R x1
CRT atof // ptr is in x0 where atof() needs it.
POP_R x1
// The string value will be converted to a double left in d0.
// d0 is also a scratch register so for our next call to atoi,
// d0 will have to be preserved on the stack - alternatively,
// we could have used a high d register backed up and restored
// at the start and ending of main().
// Advance x1 once again to get to argv[3] which can be done
// in the same instruction as dereferencing it by using a
// pre-indexed load.
ldr x0, [x1, 8]! // dereference
// The string version of argv[3] is now pointed to by x0.
// This is exactly where atoi would want it. We need atoi
// because it turns strings into numbers. BUT, same as before,
// calling a function would destroy x1 so let's do the same
// trick of backing up x1 on the stack and then restoring after
// the function call. We must also do the same for d0. Actually,
// we won't need argv after this so we will skip backing up x1.
PUSH_R d0
CRT atoi // ptr is in x0 where atoi() needs it.
POP_R d0
// d0 now contains the double.
// x0 now contains the integer.
// x0 must be copied to x1 because x0 must be a pointer to fmt
// for printf to work.
mov x1, x0
LLD_ADDR x0, fmt
// The Apple path below stores both values on the stack before the
// call: the double at [sp] and the integer at [sp, 8], matching
// the order of %f and %d in fmt (presumably because Apple passes
// variadic arguments on the stack - confirm). The other path
// leaves them in d0 and x1.
#if defined(__APPLE__)
sub sp, sp, 16
str x1, [sp, 8]
str d0, [sp]
CRT printf
add sp, sp, 16
#else
bl printf
#endif
// Common exit, also reached directly when argc is wrong.
99: POP_P x29, x30
mov w0, wzr
ret
/* What did we learn?
* x1 has argv when main begins.
* pointers to the arguments are the contents of argv NOT
the actual values. Therefore, x1, which is a pointer (to a pointer),
must be dereferenced to get to the actual pointer. In the code,
there are three lines with the comment "// dereference".
* all command line arguments are c-strings. If that's not what you
want, they must be converted - see the code for atoi and atof for
examples.
*/
.data
fmt: .asciz "double: %f integer: %d\n"
.end

View file

@ -0,0 +1,6 @@
b MyMemSet
run
n
n
:q
q

297
more/jump_tables/README.md Normal file
View file

@ -0,0 +1,297 @@
# Jump or Branch Tables
A jump or branch table is a powerful instruction saving technique that
can be used to switch between multiple single instructions or even
choose one of a series of functions to call (or branches to take).
This concept can be found as the implementation of some `switch`
statements and is found at the very very lowest end of an Operating
System (interrupt vectors, for example).
The sections below work through several variations of the technique.
## Single Instructions a la Duff's Device
[Duff's Device](https://en.wikipedia.org/wiki/Duff%27s_device)
shoehorned a jump table into the middle of a `while` loop. At the same
time, it also demonstrates a simple case of *loop unrolling*.
It's very creative.
Let's expand on Duff's Device.
The full source code for this example can be found
[here](./branch_table.S). It demonstrates a branch table consisting of
instructions which are meant to be executed in sequence after jumping
into the middle of the sequence.
Here:
```asm
mov x6, 8
MOD x2, x6, x4, x5 // x4 gets l % 8
cbz x4, 10f // Handle evenly divisible case.
sub x4, x6, x4 // Invert sense of x4 e.g. 3 becomes 5
```
we are performing this: *x4 is getting the result of modding the
number of times we want the instructions executed by the number of
times we unrolled the loop*.
Specifically, this example does `length % 8`. However, the AARCH64 ISA
does not include a *mod* instruction. The `MOD` macro used above is
defined as:
```asm
.macro MOD src_a, src_b, dest, scratch
sdiv \scratch, \src_a, \src_b
msub \dest, \scratch, \src_b, \src_a
.endm
```
`msub` is a cool instruction. It does this:
```d = c - (b * a)```
Example: 13 % 8 == 5. First the `sdiv`: 13 / 8 is 1. Then, the `msub`:
13 - (1 * 8) is 5.
Next:
```asm
cbz x4, 10f // Handle evenly divisible case.
sub x4, x6, x4 // Invert sense of x4 e.g. 5 becomes 3
```
This code is key.
If the result of the `mod` is 0, then the entire table must be executed.
This is implemented by the `cbz`.
If the result of the `mod` is not 0, then its value must be *flipped*.
The idea here is that if the result of the mod is 5, for example, we
have 5 stragglers. We want to execute 5 of the sequential instructions
below. So, we want to jump 3 instructions into the table. Notice that
3 is 8 - 5.
Finally, we have the computation of the address to where we jump into
the middle of the table.
```asm
LLD_ADDR x5, 10f
add x5, x5, x4, lsl 2
br x5
```
Each of the lines above bears description:
The `LLD_ADDR` is from the [*convergence
macros*](./apple-linux-convergence.S). It loads the address of the
beginning of the table.
Next, the `add` instruction multiplies the flipped result of the `mod`
by 4 (the length of one instruction) THEN adds it to the base address of
the table. We have calculated *instruction addresses* exactly the way we
would with array dereferences. Thank you John von Neumann.
Finally, we `br` which means branch to an address contained in a
register.
```asm
10: str w1, [x0], 1
str w1, [x0], 1
str w1, [x0], 1
str w1, [x0], 1
str w1, [x0], 1
str w1, [x0], 1
str w1, [x0], 1
str w1, [x0], 1
// loop code not shown
```
## Performing Multiple Instructions
If you need to execute more than one instruction you have two choices:
### Multiple Instructions by Address Arithmetic
Suppose you needed two instructions in each step of the sequence.
Simply multiply the index by 8 instead of 4 (i.e. the length of two
instructions). The same technique works with a larger number. E.g.
you need three instructions per step: multiply by 12.
Suppose some steps need 3 instructions and some need 2. You must handle
this because this technique requires that all steps in the sequence be
the same length so that the address arithmetic works.
To deal with some cases being shorter than others, insert the occasional
`nop` instruction in the indexes that are shorter than the others.
### Multiple Instructions by Branch / Branch
Here's another [example of code](./jmptbl.s) that implements a branch or
jump table:
```asm
jt: b 0f
b 1f
b 2f
b 3f
b 4f
b 5f
b 6f
b 7f
```
You jump into the middle of the table as per above and then immediately
jump some place else. This is like:
```c
if (index == 0) {
blah
} else if (index == 1) {
blah
} else if (index == 2) {
blah
} etc.
```
### Multiple Instructions by Branch / Call
You can modify the above techniques to make something like:
```asm
jt: bl func_0
bl func_1
bl func_2
bl func_3
bl func_4
bl func_5
bl func_6
bl func_7
```
or to be more similar to a `break` statement coming after each case:
```asm
jt: bl func_0
b common_label
bl func_1
b common_label
bl func_2
b common_label
bl func_3
b common_label
bl func_4
b common_label
bl func_5
b common_label
bl func_6
b common_label
bl func_7
b common_label
// perhaps some loop control... if none, the preceding
// b can be removed since can fall through to the common
// label.
common_label:
```
## Small Gaps in Sequential Indexes
Suppose your range of indexes was 0 through 8 inclusive (notice there
are 9 integers in the range) but index 7 is skipped. That is, your
potential indexes are 0 through 6 inclusive and then 8 but never
7.
In a `switch` statement, this would look like:
```c++
/*
// Ensure index is a valid value before getting here. In this case the
// valid range is 0 through 8 inclusive (a range of 9 values). To fill
// out to the next power of 2 (which would be 16), one could put in
// empty cases plus a default.
*/
switch (index & 0xF) {
case 0: blah blah;
break;
case 1: blah blah;
break;
case 2: blah blah;
break;
case 3: blah blah;
break;
case 4: blah blah;
break;
case 5: blah blah;
break;
case 6: blah blah;
break;
case 8: blah blah;
break;
}
```
Gaps in the potential indexes present a surmountable problem if the
gaps are few.
In the case where there are a small number of gaps, simply fill them
with a branch to a common, otherwise "do nothing", label. For example,
you might have:
```asm
b_table: b label0
b label1
b label2
b label3
b label4
b label5
b label6
b do_nothing
b label8
```
In the style of Duff's Device, where you are executing sequential single
instructions, it might look like this:
```asm
x_fer: str w1, [x0], 1
str w1, [x0], 1
str w1, [x0], 1
str w1, [x0], 1
str w1, [x0], 1
str w1, [x0], 1
str w1, [x0], 1
nop
str w1, [x0], 1
```
Here, the `nop` instruction means "no operation". It does nothing but
is a valid instruction meant to take up space (and decades ago, take
up time).
In a high level language this might look like this:
```c
for (int i = 0; i <= 8; i++) {
if (i == 7)
continue;
blah blah
}
```
## More about the `switch` statement
`switch` statements are optimized using many more techniques than
suggested here. In fact, the implementation of optimized `switch`
statements is fascinating. There might be:
* binary searches for large numbers of cases
* separation of ranges where each sub-range is optimized in a different
way
* degeneration into streams of if / else ifs
and other techniques. The people who work on the compilers we take for
granted really are due some respect and *free beer*.

BIN
more/jump_tables/README.pdf Normal file

Binary file not shown.

View file

@ -0,0 +1,156 @@
/* Macros to permit the "same" assembly language to build on ARM64
Linux systems as well as Apple Silicon systems.
See the fuller documentation at:
https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
Perry Kivolowitz
A Gentle Introduction to Assembly Language
*/
/* GLD_PTR loads into xreg by way of a global symbol.
Apple: adrp finds the GOT page for _label and ldr reads the 64 bit
slot at its GOT page offset.
Linux: the address of label is loaded with the ldr = pseudo
instruction and then the 64 bit value stored at label is loaded.
NOTE(review): the Apple path performs one load and the Linux path
performs two - confirm both leave the same thing in xreg.
*/
.macro GLD_PTR xreg, label
#if defined(__APPLE__)
adrp \xreg, _\label@GOTPAGE
ldr \xreg, [\xreg, _\label@GOTPAGEOFF]
#else
ldr \xreg, =\label
ldr \xreg, [\xreg]
#endif
.endm
/* GLD_ADDR puts the address of a global symbol into xreg.
Linux: the address is loaded with the ldr = pseudo instruction.
Apple: adrp followed by add, using the GOTPAGE and GOTPAGEOFF
operators on _label.
NOTE(review): GLD_PTR consumes GOTPAGEOFF with an ldr while this macro
uses an add - confirm that add is what is intended here.
*/
.macro GLD_ADDR xreg, label // Get a global address
#if defined(__APPLE__)
adrp \xreg, _\label@GOTPAGE
add \xreg, \xreg, _\label@GOTPAGEOFF
#else
ldr \xreg, =\label
#endif
.endm
/* LLD_ADDR puts the address of label into xreg.
Apple: adrp / add with the PAGE and PAGEOFF operators.
Linux: the ldr = pseudo instruction.
No underscore is prepended to label on either platform (presumably
because the label is local to the current file - confirm against
macros/README.md).
*/
.macro LLD_ADDR xreg, label
#if defined(__APPLE__)
adrp \xreg, \label@PAGE
add \xreg, \xreg, \label@PAGEOFF
#else
ldr \xreg, =\label
#endif
.endm
/* LLD_DBL loads the value stored at label into dreg (named for a
double precision d register).
xreg is used as a temporary: it is overwritten and left holding the
address of label. The commented out fmov is not assembled.
*/
.macro LLD_DBL xreg, dreg, label
#if defined(__APPLE__)
adrp \xreg, \label@PAGE
add \xreg, \xreg, \label@PAGEOFF
ldur \dreg, [\xreg]
// fmov \dreg, \xreg
#else
ldr \xreg, =\label
ldur \dreg, [\xreg]
#endif
.endm
/* LLD_FLT loads the value stored at label into sreg (named for a
single precision s register).
xreg is used as a temporary: it is overwritten and left holding the
address of label.
*/
.macro LLD_FLT xreg, sreg, label
#if defined(__APPLE__)
adrp \xreg, \label@PAGE
add \xreg, \xreg, \label@PAGEOFF
ldur \sreg, [\xreg]
#else
ldr \xreg, =\label
ldur \sreg, [\xreg]
#endif
.endm
/* GLABEL exports a symbol with .global. On Apple an underscore is
prepended to the symbol name; elsewhere the name is used as given.
*/
.macro GLABEL label
#if defined(__APPLE__)
.global _\label
#else
.global \label
#endif
.endm
/* MAIN defines the program entry label: _main on Apple and main
elsewhere. It only defines the label; exporting it is done
separately (for example with GLABEL main).
*/
.macro MAIN
#if defined(__APPLE__)
_main:
#else
main:
#endif
.endm
/* Fetching the address of the externally defined errno is quite
different on Apple and Linux. This macro leaves the address of
errno in x0.
Apple calls ___error and Linux calls __errno_location. Because the
call is made with bl, x30 is overwritten, as are any registers the
called function does not preserve.
*/
.macro ERRNO_ADDR
#if defined(__APPLE__)
bl ___error
#else
bl __errno_location
#endif
.endm
/* CRT calls the function named label with bl, prepending an
underscore to the name on Apple. As with any bl, x30 is overwritten,
so the caller must have saved it if it is still needed.
*/
.macro CRT label
#if defined(__APPLE__)
bl _\label
#else
bl \label
#endif
.endm
/* START_PROC and END_PROC are thin wrappers around the .cfi_startproc
and .cfi_endproc directives. Use START_PROC directly after the label
that begins a function and END_PROC after its final ret.
*/
.macro START_PROC // after starting label
.cfi_startproc
.endm
.macro END_PROC // after the return
.cfi_endproc
.endm
/* Stack helpers. Every push lowers sp by 16 bytes before storing and
every pop raises sp by 16 bytes after loading, so sp changes in
multiples of 16.
PUSH_P / POP_P store / load a pair of registers.
PUSH_R / POP_R store / load a single register but still move sp by 16.
Pops must mirror pushes in reverse order.
*/
.macro PUSH_P a, b
stp \a, \b, [sp, -16]!
.endm
.macro PUSH_R a
str \a, [sp, -16]!
.endm
.macro POP_P a, b
ldp \a, \b, [sp], 16
.endm
.macro POP_R a
ldr \a, [sp], 16
.endm
/* The smaller of src_a and src_b is put into dest, using a signed
comparison (LT). The macro performs the cmp itself, so no flag
setting instruction is needed beforehand. The condition flags are
overwritten.
This macro makes it easy to remember which register does what in the
csel.
Thank you to u/TNorthover for nudge to add the cmp.
*/
.macro MIN src_a, src_b, dest
cmp \src_a, \src_b
csel \dest, \src_a, \src_b, LT
.endm
/* The larger of src_a and src_b is put into dest, using a signed
comparison (GT). The macro performs the cmp itself, so no flag
setting instruction is needed beforehand. The condition flags are
overwritten.
This macro makes it easy to remember which register does what in the
csel.
Thank you to u/TNorthover for nudge to add the cmp.
*/
.macro MAX src_a, src_b, dest
cmp \src_a, \src_b
csel \dest, \src_a, \src_b, GT
.endm
/* AASCIZ emits a NUL terminated string (.asciz) named label, first
aligning to a 4 byte boundary with .p2align 2.
*/
.macro AASCIZ label, string
.p2align 2
\label: .asciz "\string"
.endm
/* MOD computes the signed remainder dest = src_a % src_b.
sdiv leaves the quotient in scratch and msub then forms
src_a - (scratch * src_b). scratch is overwritten and must not be the
same register as src_a or src_b because msub reads both of them after
sdiv has written scratch.
*/
.macro MOD src_a, src_b, dest, scratch
sdiv \scratch, \src_a, \src_b
msub \dest, \scratch, \src_b, \src_a
.endm

View file

@ -0,0 +1,57 @@
#include "apple-linux-convergence.S"
.p2align 2
.text
GLABEL MyMemSet
/* MyMemSet(unsigned char * b, unsigned char v, long l)
                            x0               w1      x2

    Sets l bytes starting at b to the low 8 bits of v.

    The length is first checked against less than or equal to 0. If
    so, the body of the function is skipped.

    The loop is unrolled 8x. The length (x2) modulo 8 gets turned
    into the number of instructions to jump to or beyond the initial
    strb. A modulo of 0 is handled separately - it causes a branch to
    the initial strb.

    Each step MUST store exactly one byte (strb). A word sized str
    would write four bytes per step and run up to three bytes past the
    end of the buffer. Each step must also remain exactly one 4 byte
    instruction because the entry point is computed as
    10f + 4 * (number of steps to skip).

    Registers written: x0 (advanced to one beyond the buffer), x3, x4,
    x5, x6 and the condition flags. x29 and x30 are saved and restored.

    This code can be dramatically improved by copying more than one byte
    at a time. You will have to figure out how to do this optimally in
    P6 - MemCpy
*/
#if defined(__APPLE__)
_MyMemSet:
#else
MyMemSet:
#endif
        START_PROC
        PUSH_P  x29, x30
        mov     x29, sp
        cmp     x2, xzr                 // Test for bad length.
        ble     99f                     // Take branch if 0 or less.
        add     x3, x2, x0              // x3 gets address of one beyond buffer
        mov     x6, 8
        MOD     x2, x6, x4, x5          // x4 gets l % 8
        cbz     x4, 10f                 // Handle evenly divisible case.
        sub     x4, x6, x4              // Invert sense of x4 e.g. 3 becomes 5
        LLD_ADDR x5, 10f                // x5 gets address of the first strb
        add     x5, x5, x4, lsl 2       // Skip x4 instructions of 4 bytes each
        br      x5

10:     strb    w1, [x0], 1             // One byte per step, then advance x0
        strb    w1, [x0], 1
        strb    w1, [x0], 1
        strb    w1, [x0], 1
        strb    w1, [x0], 1
        strb    w1, [x0], 1
        strb    w1, [x0], 1
        strb    w1, [x0], 1
        cmp     x3, x0                  // More to do while end > current.
        bhi     10b                     // Addresses are unsigned: use hi, not gt.

99:     POP_P   x29, x30
        ret
        END_PROC

83
more/jump_tables/jmptbl.s Normal file
View file

@ -0,0 +1,83 @@
.text
.align 4
.global main
/* main seeds the C random number generator with the current time,
draws a random index in the range 0 to 7 and dispatches through the
branch table at jt so that the matching "<index> returned" string is
printed with puts.
x30 is saved on entry and restored at label 99 because each bl
overwrites it. w0 is always 0 on return.
The dispatch relies on every entry of jt being exactly one 4 byte
instruction, in index order. Do not insert anything between entries.
*/
main: str x30, [sp, -16]!
mov x0, xzr // set up call to time(nullptr)
bl time // call time setting up srand
bl srand // call srand setting up rand
bl rand // get a random number
and x0, x0, 7 // ensure its range is 0 to 7
// note use of x register is on purpose
lsl x0, x0, 2 // multiply by 4 (the size of one table entry)
ldr x1, =jt // load base address of jump table
add x1, x1, x0 // add offset to base address
br x1 // branch to the selected entry of jt
// If, as in this case, all the "cases" have the same number of
// instructions then this intermediate jump table can be omitted saving
// some space and a tiny amount of time. To omit the intermediate jump
// table, you'd multiply by 12 above and not 4. Twelve because each
// "case" has 3 instructions (3 x 4 == 12).
// Question for you: If you did omit the jump table, relative to what
// would you jump (since "jt" would be gone).
jt: b 0f
b 1f
b 2f
b 3f
b 4f
b 5f
b 6f
b 7f
// Each case below loads the address of its string, prints it with
// puts and then branches to the common exit at 99.
0: ldr x0, =ZR
bl puts
b 99f
1: ldr x0, =ON
bl puts
b 99f
2: ldr x0, =TW
bl puts
b 99f
3: ldr x0, =TH
bl puts
b 99f
4: ldr x0, =FR
bl puts
b 99f
5: ldr x0, =FV
bl puts
b 99f
6: ldr x0, =SX
bl puts
b 99f
7: ldr x0, =SV
bl puts
b 99f
// Common exit: return 0 after restoring the saved x30.
99: mov w0, wzr
ldr x30, [sp], 16
ret
.data
.section .rodata
ZR: .asciz "0 returned"
ON: .asciz "1 returned"
TW: .asciz "2 returned"
TH: .asciz "3 returned"
FR: .asciz "4 returned"
FV: .asciz "5 returned"
SX: .asciz "6 returned"
SV: .asciz "7 returned"
.end

55
more/jump_tables/jt.c Normal file
View file

@ -0,0 +1,55 @@
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
/* This is the prototype for the assembly language version. You may
have always thought that switch statements are implemented as a long
chain of if / else. Well, sometimes they are. Sometimes they are
implemented using binary search and still other times they are
implemented as jump tables.
My assembly language version is found in jmptbl.s.
*/
int main()
{
    /* Pick a pseudo random selector in the range 0 through 7. The
       generator is seeded from the clock so each run may differ.
    */
    int selector;

    srand(time(0));
    selector = rand() & 7;

    /* One dense run of cases (0 through 7) - the kind of switch the
       comment at the top of this file discusses. Each case prints its
       own number.
    */
    switch (selector) {
        case 0: puts("0 returned"); break;
        case 1: puts("1 returned"); break;
        case 2: puts("2 returned"); break;
        case 3: puts("3 returned"); break;
        case 4: puts("4 returned"); break;
        case 5: puts("5 returned"); break;
        case 6: puts("6 returned"); break;
        case 7: puts("7 returned"); break;
    }

    return 0;
}

View file

@ -0,0 +1,31 @@
#include <stdio.h>
extern "C" void MyMemSet(unsigned char *, unsigned char v, long length);
/* MyMemSet(unsigned char *, unsigned char v, long length);
*/
/*
void MyMemSet(unsigned char * b, unsigned char v, long l) {
for (long i = 0; i < l; i++) {
b[i] = v;
}
}
*/
const long BUFFER_SIZE = 1000;
unsigned char buffer[BUFFER_SIZE];
int main() {
unsigned char before = buffer[-1];
unsigned char after = buffer[BUFFER_SIZE];
MyMemSet(buffer, 0xF0, 3);
if (before != buffer[-1])
printf("Bytes prior to buffer are smashed.\n");
if (after != buffer[BUFFER_SIZE])
printf("Bytes after buffer are smashed.\n");
return 0;
}

View file

@ -17,7 +17,9 @@ sin x = x - x^3/3! + x^5/5! - x^7/7! ...
Notice each term flips from addition to subtraction.
Notice each term is based on the odd integers starting at 1.
Notice each term is based on the odd integers starting at 1. While the
"1" case might look different, it is the same as all the others since
1 is just 1 to the first power divided by 1 factorial.
## Command line
@ -29,40 +31,76 @@ arguments are therefore required.
be a double.
* The number of terms to evaluate. The number of terms must lie between
1 and 10 inclusive.
1 and 10 inclusive. Note that the upper bound of 10 is new. It
was 8.
## C version
To assist your efforts, [here](./c_version.c) is a version of this
project written in C.
project written in C. This has been updated to print nice debugging
output which is not part of the project.
## Errors to stderr
Error messages must be sent to `stderr`.
If you are using the convergence macros to allow your program to build
on both Apple Silicon Mac OS and Linux, note the special casing needed
to deal with `stderr`. If this is you, compile the C version on Mac OS
with the `-S` compiler option to see the generated assembly language and
search for `stderr`.
This C version also demonstrates a different way of calculating the
toggle. This version flips the sign of the toggle by multiplying by -1.
The previous version used odd and even values of the term.
## Sample executions
```text
SINE % ./a.out 0 8
The sine of 0.00 degrees is 0.000000 in radians.
SINE % ./a.out 90 8
The sine of 90.00 degrees is 1.000000 in radians.
SINE % ./a.out 180 8
The sine of 180.00 degrees is -0.000001 in radians.
SINE % ./a.out 180 82
pk_taylor_series > gcc main.S -o a
pk_taylor_series > ./a 0 10
The sine of 0.00 degrees is 0.00000000.
pk_taylor_series > ./a 30 10
The sine of 30.00 degrees is 0.50000000.
pk_taylor_series > ./a 45 10
The sine of 45.00 degrees is 0.70710678.
pk_taylor_series > ./a 90 10
The sine of 90.00 degrees is 1.00000000.
pk_taylor_series > ./a 180 10
The sine of 180.00 degrees is -0.00000000.
pk_taylor_series > ./a 360 10
The sine of 360.00 degrees is -0.00104818.
pk_taylor_series > ./a 360 100
Number of terms is out of range.
SINE % ./a.out 180 -10
pk_taylor_series > ./a 360 -1
Number of terms is out of range.
SINE % echo $?
1
pk_taylor_series >
```
## Floating point instructions I used
These are the floating point instructions I used in my implementation.
* fmov
* scvtf
* fmul
* fdiv
* fadd
## How I broke up the program
I have functions named:
* main
* HandleOptions
* Factorial
* IntegerPower - x to the nth power
* ComputeSine - The main calculation
* PrintAnswer
* ConvertTheta - Wrap D2R
* D2R - Degrees to radians
## CSC3510
The following applies to Carthage College CSC3510 students.
@ -74,4 +112,3 @@ Work is to be done solo.
### What to hand in
Just the .S file. **Your name must be at the top of the file.**

Binary file not shown.

View file

@ -1,13 +1,14 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
double pi = 3.14159265359;
double pi = 3.14159265358979323846;
double D2R(double d) {
return d * pi / 180.0;
}
long Factorial(int n) {
double Factorial(int n) {
long retval = 1;
if (n > 0) {
@ -15,7 +16,7 @@ long Factorial(int n) {
retval = retval * n--;
}
}
return retval;
return (double) retval;
}
double IntegerPower(double b, int e) {
@ -48,20 +49,20 @@ int main(int argc, char ** argv) {
double r_angle = D2R(angle);
double toggle = 1.0;
for (int term = 0, base = 1; term < terms; term++, base += 2) {
double toggle = (term & 1) ? -1.0 : 1.0;
if (toggle > 0) {
printf("%+03.8e + %+03.8e / %+03.8e [term %2d is %+03.8e]\n", sin, IntegerPower(r_angle, base),
Factorial(base), term + 1, toggle * IntegerPower(r_angle, base) / Factorial(base));
} else {
printf("%+03.8e - %+03.8e / %+03.8e [term %2d is %+03.8e]\n", sin, IntegerPower(r_angle, base),
Factorial(base), term + 1, toggle * IntegerPower(r_angle, base) / Factorial(base));
}
sin += toggle *
IntegerPower(r_angle, base) / Factorial(base);
/*
if (toggle > 0) {
printf("adding %d p/b intermediate: %f\n", base, sin);
} else {
printf("subtracting %d p/b intermediate: %f\n", base, sin);
}
*/
toggle = toggle * -1;
}
printf("The sine of %.2f degrees is %f in radians.\n", angle, sin);
printf("The sine of %0.4f degrees is %0.10f.\n", angle, sin);
return 0;
}

View file

@ -149,3 +149,8 @@ main:
.p2align 2
\label: .asciz "\string"
.endm
/* MOD computes the signed remainder dest = src_a % src_b.
sdiv leaves the quotient in scratch and msub then forms
src_a - (scratch * src_b). scratch is overwritten and must not be the
same register as src_a or src_b because msub reads both of them after
sdiv has written scratch.
*/
.macro MOD src_a, src_b, dest, scratch
sdiv \scratch, \src_a, \src_b
msub \dest, \scratch, \src_b, \src_a
.endm

View file

@ -149,3 +149,8 @@ main:
.p2align 2
\label: .asciz "\string"
.endm
/* MOD computes the signed remainder dest = src_a % src_b.
sdiv leaves the quotient in scratch and msub then forms
src_a - (scratch * src_b). scratch is overwritten and must not be the
same register as src_a or src_b because msub reads both of them after
sdiv has written scratch.
*/
.macro MOD src_a, src_b, dest, scratch
sdiv \scratch, \src_a, \src_b
msub \dest, \scratch, \src_b, \src_a
.endm

View file

@ -149,3 +149,8 @@ main:
.p2align 2
\label: .asciz "\string"
.endm
/* MOD computes the signed remainder dest = src_a % src_b.
sdiv leaves the quotient in scratch and msub then forms
src_a - (scratch * src_b). scratch is overwritten and must not be the
same register as src_a or src_b because msub reads both of them after
sdiv has written scratch.
*/
.macro MOD src_a, src_b, dest, scratch
sdiv \scratch, \src_a, \src_b
msub \dest, \scratch, \src_b, \src_a
.endm

View file

@ -149,3 +149,8 @@ main:
.p2align 2
\label: .asciz "\string"
.endm
/* MOD computes the signed remainder dest = src_a % src_b.
sdiv leaves the quotient in scratch and msub then forms
src_a - (scratch * src_b). scratch is overwritten and must not be the
same register as src_a or src_b because msub reads both of them after
sdiv has written scratch.
*/
.macro MOD src_a, src_b, dest, scratch
sdiv \scratch, \src_a, \src_b
msub \dest, \scratch, \src_b, \src_a
.endm

View file

@ -149,3 +149,8 @@ main:
.p2align 2
\label: .asciz "\string"
.endm
/* MOD computes the signed remainder dest = src_a % src_b.
sdiv leaves the quotient in scratch and msub then forms
src_a - (scratch * src_b). scratch is overwritten and must not be the
same register as src_a or src_b because msub reads both of them after
sdiv has written scratch.
*/
.macro MOD src_a, src_b, dest, scratch
sdiv \scratch, \src_a, \src_b
msub \dest, \scratch, \src_b, \src_a
.endm

View file

@ -2,27 +2,23 @@
The `fmov` instruction is used to move floating point values in and out
of floating point registers and to some degree, moving data between
integer and floating point registers.
integer and floating point registers.
## Loading Floating Point Numbers as Immediate Values
Just as we saw with integer
registers, some values can be used as immediate values and some cannot.
Just as we saw with integer registers, some values can be used as
immediate values and some cannot. It comes down to how many bits are
necessary to encode the value. Too many bits... not enough room to fit
in a 4 byte instruction plus the opcode.
For example, this works:
`mov x0, 65536`
`mov x0, 65535`
but this does not:
`mov x0, 65537`
The reason is that all AARCH64 instructions must fit within a 32 bit
instruction that must hold the instruction's op code, its flags and
other bits and bobs plus any immediate value. In the above example we
can see that the `mov` instruction provides up to 16 bits for an
immediate value.
The constraints placed on immediate values for `fmov` are much tighter
because floating point numbers are far more complex than integers.
@ -40,7 +36,7 @@ Let's take a look at some code:
fmov d0, 1.96875 // Zoinks!
```
From this we can see that an immediate value for an `fmov` seems to have
From this we can see that an immediate value for an `fmov` has
4 bits available for the mantissa. In fact, the only values that work
as immediate values will be those floating point values whose fractional
values are combinations of:
@ -56,6 +52,9 @@ values are combinations of:
As far as exponents go, `fmov` can accommodate 3 bits. So, exponents
from -3 through 4 (scale factors of 2**-3 through 2**4) can be used.
A sign bit makes the total number of bits available for immediate moves
to be 8.
## Loading / Storing Floating Point Numbers in General
When in doubt, load fixed floating point numbers from memory. This is
@ -64,11 +63,16 @@ covered [in this chapter](./literals.md).
## SIMD
`fmov` can also deal with the more complicated special cases induced by
SIMD instructions.
SIMD instructions. `fmov` is able to move values between the various
register widths such as single precision to double precision. **However,
no conversion of value is performed - `fmov` just copies bits.**
If you need to change the precision of a floating point value, the
`fcvt` family of instructions must be used instead.
## Movement To / From Integer Registers
`fmov` can *bits* between the integer and floating point registers. We
emphasize the *bits*. No conversions are done using `fmov`. There exist
other instructions for that. See [this chapter](./rounding.md) for more
information.
`fmov` can copy *bits* between the integer and floating point registers.
We emphasize the *bits*. No conversions are done using `fmov`. There
exist other instructions for that. See [this chapter](./rounding.md) for
more information.

Binary file not shown.

View file

@ -20,30 +20,32 @@ To load a `float`, you could translate the value to binary and do
as the following:
```asm
.text // 1
.global main // 2
.align 2 // 3
// 4
main: str x30, [sp, -16]! // 5
ldr s0, =0x3fc00000 // 6
fcvt d0, s0 // 7
ldr x0, =fmt // 8
bl printf // 9
ldr x30, [sp], 16 // 10
mov w0, wzr // 11
ret // 12
// 13
.data // 14
fmt: .asciz "%f\n" // 15
.end // 16
.text // 1
.global main // 2
.align 2 // 3
// 4
main: str x30, [sp, -16]! // 5
ldr s0, =0x3fc00000 // 6
fcvt d0, s0 // 7
ldr x0, =fmt // 8
bl printf // 9
ldr x30, [sp], 16 // 10
mov w0, wzr // 11
ret // 12
// 13
.data // 14
fmt: .asciz "%f\n" // 15
.end // 16
```
The above code is found [here](./t.s).
The above code is kind of found [here](./t.s) - the file is used
for miscellaneous testing.
`Line 6` puts the translated value of 1.5 into `s0` (since the value
is a `float` it goes in an `s` register). The assembler performs some
magic getting a 32 bit value seemingly fit into a 32 bit instruction.
See [below](./literals.md#fitting-32-bits-into-a-32-bit-bag).
`Line 6` puts the translated value of 1.5 into `s0` (since we are
thinking of the value as a `float` it goes in an `s` register). The
assembler performs some magic getting a 32 bit value seemingly fit into
a 32 bit instruction. See
[below](./literals.md#fitting-32-bits-into-a-32-bit-bag).
`Line 7` converts the single precision number into a double precision
number for printing.
@ -136,6 +138,9 @@ Cool huh?
## Fitting 32 bits into a 32 bit bag
**This section is currently LINUX-centric - in the future it will
address both native Apple and Linux equally.**
AARCH64 instructions are 32 bits in width. Yet, `line 6` from
[this](./t.s) program reads:
@ -195,15 +200,16 @@ Scan downward to find `0x7a0`:
0x7a0 <main+32> .inst 0x3fc00000 ; undefined
```
Hey look! Here's our literal float. The `.inst` is an ARM
specific GNU assembler directive what allows the programmer
to encode their own instruction. Note, the encoded instruction does not
have to make any sense - instead the compiler has emitted a make believe
instruction that happens to have the value of our literal.
Hey look! Here's our literal float. The `.inst` is an ARM specific GNU
assembler directive that says: `¯\_(-)_/¯`.
Note, the encoded "instruction" does not have to make any sense -
instead the compiler has emitted a make believe instruction that happens
to have the value of our literal.
What we're seeing the actual `line 6` doing is reaching ahead a short
distance to load the value of another "instruction" when really it is
our constant.
distance to load the value of another location in memory where our
constant is really found.
Let us take this explanation further. Notice we see:

Binary file not shown.

View file

@ -1,12 +1,16 @@
.text
.global main
.align 2
.global _main
.align 2
main: str x30, [sp, -16]!
_main:
str x30, [sp, -16]!
mov x0, 0xFFFFFFFF
/*
ldr s0, =0x3fc00000
fcvt d0, s0
ldr x0, =fmt
bl printf
*/
ldr x30, [sp], 16
mov w0, wzr
ret