mirror of
https://github.com/pkivolowitz/asm_book.git
synced 2026-06-21 04:46:47 +08:00
57 lines
1.7 KiB
ArmAsm
57 lines
1.7 KiB
ArmAsm
#include "apple-linux-convergence.S"
|
|
|
|
.p2align 2
|
|
.text
|
|
GLABEL MyMemSet
|
|
|
|
/*  MyMemSet(unsigned char * b, unsigned char v, long l)
                             x0               w1      x2

    Fills l bytes starting at b with the byte value v (low 8 bits of w1).

    If l is less than or equal to 0 the body of the function is skipped
    and nothing is written.

    The store loop is unrolled 8x. To cope with lengths that are not a
    multiple of 8, the first pass through the loop is entered part way
    down: with r = l % 8 (r != 0), the code jumps over the first 8 - r
    stores so that exactly r bytes are written on the first pass. After
    that the remaining length is a multiple of 8 and every later pass
    runs all eight stores. When r is 0 the loop is entered at the top.

    The computed jump relies on every unrolled store being exactly one
    4-byte instruction (hence the "lsl 2"). Do not insert anything
    between the stores.

    Each store must be a BYTE store (strb). A word store (str w1) with a
    post-increment of 1 writes four bytes per step and therefore scribbles
    three bytes past the end of the caller's buffer.

    Registers used inside the function:
        x0  write cursor; on return it is one past the last byte written
            (unchanged when l <= 0)
        x3  address one beyond the end of the buffer
        x4  l % 8, then the number of stores to skip
        x5  computed branch target
        x6  the constant 8
    NOTE(review): PUSH_P, POP_P, LLD_ADDR, START_PROC and END_PROC come
    from apple-linux-convergence.S; they are assumed not to disturb any
    register other than the ones named in their arguments - confirm.

    This code can be dramatically improved by storing more than one byte
    at a time. You will have to figure out how to do this optimally in
    P6 - MemCpy.
*/
#if defined(__APPLE__)
_MyMemSet:
#else
MyMemSet:
#endif
        START_PROC
        PUSH_P      x29, x30
        mov         x29, sp

        cmp         x2, xzr             // l is a signed long ...
        b.le        99f                 // ... nothing to do if l <= 0.

        add         x3, x0, x2          // x3 = b + l (one beyond buffer)
        and         x4, x2, 7           // x4 = l % 8; a mask is enough
                                        // because l > 0 at this point.
        cbz         x4, 10f             // Multiple of 8: run full passes.
        mov         x6, 8
        sub         x4, x6, x4          // Stores to skip, e.g. 3 becomes 5.

        LLD_ADDR    x5, 10f             // x5 = address of the first store.
        add         x5, x5, x4, lsl 2   // 4 bytes per skipped instruction.
        br          x5                  // Enter the loop part way down.

10:     strb        w1, [x0], 1         // *b++ = v, eight times over.
        strb        w1, [x0], 1
        strb        w1, [x0], 1
        strb        w1, [x0], 1
        strb        w1, [x0], 1
        strb        w1, [x0], 1
        strb        w1, [x0], 1
        strb        w1, [x0], 1
        cmp         x3, x0              // Addresses are unsigned, so ...
        b.hi        10b                 // ... loop while end > cursor.

99:     POP_P       x29, x30
        ret
        END_PROC
|
|
|