// asm_book/more/jump_tables/branch_table.S

#include    "apple-linux-convergence.S"

        .text
        .p2align    2               // Applied after .text so it aligns the code section.
        GLABEL      MyMemSet

/*  MyMemSet(unsigned char * b, unsigned char v, long l)
             x0                 w1               x2

    Stores the low byte of v into each of the l bytes starting at b.

    The length is first checked for being less than or equal to 0. If
    so, the body of the function is skipped.

    The loop is unrolled 8x. The remainder (x2 modulo 8) is turned into
    the number of instructions to skip past the first strb, so that the
    first, partial pass through the unrolled body performs exactly
    l % 8 stores. A remainder of 0 is handled separately - it branches
    straight to the first strb for a full pass of 8 stores.

    Every store must be strb (one byte). A word-sized str with a
    post-increment of 1 writes four bytes per store, so the final store
    would write three bytes beyond the end of the buffer.

    Registers written by this function:
        x0  running store pointer (post-incremented by each strb)
        x3  address of one beyond the buffer
        x4  l % 8, then the number of stores to skip
        x5  computed branch target (also handed to MOD, presumably as
            scratch - confirm against the macro definition)
        x6  the constant 8

    PUSH_P, POP_P, MOD, LLD_ADDR, START_PROC and END_PROC are macros
    from apple-linux-convergence.S; their expansions are not shown here.

    This code can be dramatically improved by copying more than one byte
    at a time. You will have to figure out how to do this optimally in
    P6 - MemCpy
*/
#if defined(__APPLE__)
_MyMemSet:
#else
MyMemSet:
#endif
        START_PROC
        PUSH_P      x29, x30
        mov         x29, sp
        cmp         x2, xzr         // Test for bad length.
        ble         99f             // Take branch if 0 or less (l is signed).

        add         x3, x2, x0      // x3 gets address of one beyond buffer
        mov         x6, 8
        MOD         x2, x6, x4, x5  // x4 gets l % 8
        cbz         x4, 10f         // Handle evenly divisible case.
        sub         x4, x6, x4      // Invert sense of x4 e.g. 3 becomes 5

        LLD_ADDR    x5, 10f         // x5 gets address of the first strb
        add         x5, x5, x4, lsl 2   // Skip x4 instructions of 4 bytes each.
        br          x5

10:     strb        w1, [x0], 1     // Store one byte, then advance x0 by 1.
        strb        w1, [x0], 1
        strb        w1, [x0], 1
        strb        w1, [x0], 1
        strb        w1, [x0], 1
        strb        w1, [x0], 1
        strb        w1, [x0], 1
        strb        w1, [x0], 1
        cmp         x3, x0          // More bytes left before the end?
        bhi         10b             // Unsigned compare - these are addresses.

99:     POP_P       x29, x30
        ret
        END_PROC