spin-lock added

2026-06-21 02:26:59 +08:00 · 2023-04-20 11:03:26 -05:00 · 2023-04-20 11:03:26 -05:00 · dc1fef201c
commit dc1fef201c
parent e149b1ad41
5 changed files with 407 additions and 0 deletions
--- a/more/spin-lock/README.md
+++ b/more/spin-lock/README.md
@ -0,0 +1,163 @@
+# Another use for the instructions used in **atomics**
+
+In the section on **atomics** we saw how the ARM V8 load linked / store
+conditional instructions can be used to create atomic operations on
+variables in memory.
+
+Here, for review, we present an atomic increment:
+
+```text
+        .text                                                     // 1 
+        .p2align    2                                             // 2 
+                                                                  // 3 
+#if defined(__APPLE__)                                            // 4 
+        .global     _LoadLinkedStoreConditional                   // 5 
+_LoadLinkedStoreConditional:                                      // 6 
+#else                                                             // 7 
+        .global     LoadLinkedStoreConditional                    // 8 
+LoadLinkedStoreConditional:                                       // 9 
+#endif                                                            // 10 
+1:      ldaxr       w1, [x0]                                      // 11 
+        add         w1, w1, 1                                     // 12 
+        stlxr       w2, w1, [x0]                                  // 13 
+        cbnz        w2, 1b                                        // 14 
+        ret                                                       // 15 
+```
+
+The preprocessor directives on lines 4 through 10 declare the label in a
+way that is compatible with both Apple Silicon and Linux.
+
+The interesting part happens from line 11 through line 14. Line 11
+dereferences a pointer to an `int32_t` putting its current value into
+`w1`. Line 12 is the increment.
+
+Notice the dereference instruction is not the usual `ldr`. Instead it is
+`ldaxr`, which is a dereference that marks the memory location whose
+address is in `x0` as one for which we're hoping for exclusivity. Hoping.
+
+We don't actually know if we had exclusive access to the memory location
+until the `stlxr` reports success by putting 0 in its status register
+(`w2` here), meaning no one else has attempted to change the value at
+the location.
+
+If `stlxr` doesn't return 0, then the value WE have is stale. So, we try
+again.
+
+## Making a spin-lock
+
+When one has a shared resource used by more than one thread it must be
+protected. This is the nugget to be aware of when working with threads.
+
+Take a look at this thread worker:
+
+```text
+void Worker(int32_t id) {                                           // 1 
+    int32_t counter = 0;                                            // 2 
+    while (counter < 4) {                                           // 3 
+        Lock(&lock_variable);                                       // 4 
+        counter++;                                                  // 5 
+        cout << "thread: " << id << " counter: " << counter << endl;// 6 
+        std::this_thread::sleep_for(chrono::milliseconds(5));       // 7 
+        Unlock(&lock_variable);                                     // 8 
+        sched_yield();                                              // 9 
+    }                                                               // 10 
+}
+```
+
+The purpose of the worker is to print something to the console 4 times
+then exit. The shared resource is the console itself. Without protecting
+the console, threads will step over each other trying to print to it.
+
+Here is a sample of what could happen without our spin-lock:
+
+```text
+thread: 0thread: 3 counter: 1
+thread: 7 counter: 1 counter: thread: 
+thread: thread: 10thread: 5 counter: 1
+thread:  counter: thread: 121 counter: 
+thread: 8 counter: 113
+thread: thread: 2thread:  counter: 151 counter:
+```
+
+With our spin-lock, here's what we might get:
+
+```text
+thread: 12 counter: 3
+thread: 4 counter: 2
+thread: 7 counter: 4
+thread: 3 counter: 2
+thread: 1 counter: 4
+thread: 2 counter: 4
+thread: 13 counter: 3
+thread: 12 counter: 4
+```
+
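+Line 7 stresses the lock: the thread sleeps while still holding it, so
+every other thread that wants the lock has to spin in the meantime.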
+
+Line 9 causes the currently running thread to voluntarily deschedule.
+This makes the output more interesting. Without it, after unlocking,
+the same thread may regain the lock immediately.
+
+Now let's look at the spin-lock. But first, a spin-lock is called a
+spin-lock because a thread that doesn't get the lock will `spin` trying
+to get it. This wastes time and generates heat, using electricity.
+Bummer.
+
+Here is the source code to the spin-lock for ARM V8.
+
+```text
+#if defined(__APPLE__)                                            // 1 
+_Lock:                                                            // 2 
+#else                                                             // 3 
+Lock:                                                             // 4 
+#endif                                                            // 5 
+        START_PROC                                                // 6 
+1:      ldaxr       w1, [x0]                                      // 7 
+        cbnz        w1, 1b          // lock taken - spin.         // 8 
+        add         w1, w1, 1                                     // 9 
+        stlxr       w2, w1, [x0]                                  // 10 
+        cbnz        w2, 1b          // shucks - somebody meddled. // 11 
+        // considered using dmb here                              // 12 
+        ret                                                       // 13 
+        END_PROC                                                  // 14
+```
+
+Once again, line 7 does a `ldaxr`, dereferencing the lock itself (once
+again an `int32_t`) and marking the location of the lock as being,
+hopefully, exclusive.
+
+Having gotten the value of the lock, on line 8, its value is inspected
+and if found to be non-zero, we branch back to attempting to get it
+again - this is the spin.
+
+If the contents of the lock are 0, the value in `w1` is changed to
+non-zero on line 9. Note, this could be made a bit better if a value of
+1 was stored in another `w` register and simply used directly on line 10.
+
+Line 10 conditionally stores the changed value back to the location of
+the lock. If the `stlxr` returns 0, we got the lock. If not, we start
+over - somebody else got in there ahead of us. Perhaps this happened
+because we were descheduled. Perhaps we lost the lock to another thread
+running on a different core.
+
+The unlock looks like this:
+
+```text
+#if defined(__APPLE__)                                            // 1 
+_Unlock:                                                          // 2 
+#else                                                             // 3 
+Unlock:                                                           // 4 
+#endif                                                            // 5 
+        START_PROC                                                // 6 
+        str         wzr, [x0]                                     // 7 
+        // considered using dmb here                              // 8 
+        ret                                                       // 9 
+        END_PROC                                                  // 10
+```
+
+All it does is set the value of the lock to zero. The correct operation
+of the lock requires that no bad actor simply stomps on the lock by
+calling `Unlock` without first owning the lock. Just say no to lock
+stompers.
+
+Please see the source code located [here](./spin_lock.S) for some
+additional comments regarding the implementation.
--- a/more/spin-lock/README.pdf
+++ b/more/spin-lock/README.pdf
--- a/more/spin-lock/apple-linux-convergence.S
+++ b/more/spin-lock/apple-linux-convergence.S
@ -0,0 +1,156 @@
+/*  Macros to permit the "same" assembly language to build on ARM64
+    Linux systems as well as Apple Silicon systems.
+
+    See the fuller documentation at:
+    https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
+
+    Perry Kivolowitz
+    A Gentle Introduction to Assembly Language
+*/
+
+.macro  GLD_PTR     xreg, label     // Load into xreg by way of a global label
+#if defined(__APPLE__)
+        adrp	    \xreg, _\label@GOTPAGE                  // page holding the GOT slot for _label
+        ldr	        \xreg, [\xreg, _\label@GOTPAGEOFF]      // load the contents of that GOT slot
+#else
+        ldr         \xreg, =\label                          // address of label (literal pool load)
+        ldr         \xreg, [\xreg]                          // NOTE(review): Linux path dereferences label, Apple path stops at the GOT load - confirm
+#endif
+.endm
+
+.macro  GLD_ADDR    xreg, label     // Get a global address into xreg
+#if defined(__APPLE__)
+        adrp	    \xreg, _\label@GOTPAGE                  // page part, symbol gets the _ prefix
+        add         \xreg, \xreg, _\label@GOTPAGEOFF        // NOTE(review): GLD_PTR pairs @GOTPAGEOFF with ldr, this uses add - confirm intended
+#else
+        ldr         \xreg, =\label                          // address of label (literal pool load)
+#endif
+.endm
+
+.macro  LLD_ADDR xreg, label         // Address of label into xreg (no _ prefix, no GOT)
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE                  // page containing label
+        add     \xreg, \xreg, \label@PAGEOFF        // plus the offset of label within the page
+#else
+        ldr         \xreg, =\label                  // address of label (literal pool load)
+#endif
+.endm
+
+.macro  LLD_DBL xreg, dreg, label    // Load what is stored at label into dreg
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE                  // page containing label
+        add     \xreg, \xreg, \label@PAGEOFF        // xreg now holds the address of label
+        ldur    \dreg, [\xreg]                      // fetch the value; xreg is left overwritten
+//      fmov    \dreg, \xreg
+#else
+        ldr     \xreg, =\label                      // xreg now holds the address of label
+        ldur    \dreg, [\xreg]                      // fetch the value; xreg is left overwritten
+#endif
+.endm
+
+.macro  LLD_FLT xreg, sreg, label    // Load what is stored at label into sreg
+#if defined(__APPLE__)
+        adrp    \xreg, \label@PAGE                  // page containing label
+        add     \xreg, \xreg, \label@PAGEOFF        // xreg now holds the address of label
+        ldur    \sreg, [\xreg]                      // fetch the value; xreg is left overwritten
+#else
+        ldr     \xreg, =\label                      // xreg now holds the address of label
+        ldur    \sreg, [\xreg]                      // fetch the value; xreg is left overwritten
+#endif
+.endm
+
+.macro GLABEL label                  // Mark label as .global on either platform
+#if defined(__APPLE__)
+        .global _\label              // Apple build: the symbol gets a leading underscore
+#else
+        .global \label               // otherwise the name is used as written
+#endif
+.endm
+
+.macro MAIN                          // Emit the label for main (definition only, no .global)
+#if defined(__APPLE__)
+_main:                               // Apple build: underscore-prefixed name
+#else
+main:                                // otherwise the plain name
+#endif
+.endm
+
+/*  Fetching the address of the externally defined errno is quite
+    different on Apple and Linux. This macro leaves the address of
+    errno in x0.
+*/
+.macro  ERRNO_ADDR                   // Call the platform routine that returns the errno address
+#if defined(__APPLE__)
+        bl      ___error             // Apple build; bl overwrites the link register x30
+#else
+        bl      __errno_location     // non-Apple build; bl overwrites the link register x30
+#endif
+.endm
+
+.macro  CRT label                    // Branch-and-link to an external function by name
+#if defined(__APPLE__)
+        bl  _\label                  // Apple build: underscore-prefixed symbol
+#else
+        bl  \label                   // otherwise the name is used as written
+#endif
+.endm
+
+.macro  START_PROC          // place right after the procedure's starting label
+        .cfi_startproc      // opens the CFI region that END_PROC closes
+.endm
+
+.macro  END_PROC            // place after the procedure's final return
+        .cfi_endproc        // closes the CFI region opened by START_PROC
+.endm
+
+.macro  PUSH_P  a, b                 // Push a pair of registers
+        stp     \a, \b, [sp, -16]!   // pre-decrement sp by 16, then store a and b there
+.endm
+
+.macro  PUSH_R  a                    // Push a single register
+        str     \a, [sp, -16]!       // sp still moves by a full 16 bytes for the one register
+.endm
+
+.macro  POP_P   a, b                 // Pop a pair of registers (inverse of PUSH_P)
+        ldp     \a, \b, [sp], 16     // load a and b from sp, then add 16 to sp
+.endm
+
+.macro  POP_R   a                    // Pop a single register (inverse of PUSH_R)
+        ldr     \a, [sp], 16         // load a from sp, then add 16 to sp
+.endm
+
+/*  The smaller of src_a and src_b is put into dest. The macro issues
+    its own cmp, so no prior compare is needed and the condition flags
+    are overwritten. The comparison is signed (LT). This macro makes it
+    easy to remember which register does what in the csel.
+    
+    Thank you to u/TNorthover for nudge to add the cmp.
+*/
+
+.macro  MIN     src_a, src_b, dest
+        cmp     \src_a, \src_b               // sets the flags used by the csel below
+        csel    \dest, \src_a, \src_b, LT    // dest = src_a if src_a < src_b, else src_b
+.endm
+
+/*  The larger of src_a and src_b is put into dest. The macro issues
+    its own cmp, so no prior compare is needed and the condition flags
+    are overwritten. The comparison is signed (GT). This macro makes it
+    easy to remember which register does what in the csel.
+
+    Thank you to u/TNorthover for nudge to add the cmp.
+*/
+
+.macro  MAX     src_a, src_b, dest
+        cmp     \src_a, \src_b               // sets the flags used by the csel below
+        csel    \dest, \src_a, \src_b, GT    // dest = src_a if src_a > src_b, else src_b
+.endm
+
+.macro  AASCIZ      label, string   // Define an aligned, NUL-terminated string at label
+        .p2align    2               // align to 2^2 = 4 bytes
+\label: .asciz      "\string"
+.endm
+
+.macro  MOD         src_a, src_b, dest, scratch     // dest = src_a - (src_a / src_b) * src_b
+        sdiv        \scratch, \src_a, \src_b        // signed quotient; scratch must not be src_a or src_b
+        msub        \dest, \scratch, \src_b, \src_a // dest = src_a - scratch * src_b
+.endm
--- a/more/spin-lock/main.cpp
+++ b/more/spin-lock/main.cpp
@ -0,0 +1,37 @@
+#include <sched.h>
+
+#include <chrono>
+#include <cstdint>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+using namespace std;
+
+extern "C" void Lock(int32_t *);      // assembly routine: spins until the lock word is acquired
+extern "C" void Unlock(int32_t *);    // assembly routine: stores zero to the lock word
+
+int32_t lock_variable = 0;            // the single lock word shared by every Worker; 0 means free
+const uint32_t NUM_THREADS = 16;      // how many Worker threads main starts
+
+// Thread body: prints four numbered lines for thread `id`, taking the
+// spin-lock around each print so output from different threads does not
+// interleave on the console.
+void Worker(int32_t id) {
+    for (int32_t counter = 1; counter <= 4; ++counter) {
+        Lock(&lock_variable);
+        cout << "thread: " << id << " counter: " << counter << endl;
+        // Sleep while still holding the lock so the other threads are
+        // forced to contend for it.
+        std::this_thread::sleep_for(chrono::milliseconds(5));
+        Unlock(&lock_variable);
+        // Give up the CPU so another thread has a chance at the lock.
+        sched_yield();
+    }
+}
+
+// Starts NUM_THREADS workers, each given its index as an id, and waits
+// for all of them to finish. Returns 0.
+int main() {
+    // Hold the threads by value: the vector owns them, so nothing is
+    // allocated with `new` and nothing is left undeleted at exit.
+    vector<thread> threads;
+    threads.reserve(NUM_THREADS);
+
+    for (uint32_t i = 0; i < NUM_THREADS; i++) {
+        threads.emplace_back(Worker, i);
+    }
+
+    // Every thread must be joined before its std::thread is destroyed.
+    for (auto & t : threads)
+        t.join();
+
+    return 0;
+}
--- a/more/spin-lock/spin_lock.S
+++ b/more/spin-lock/spin_lock.S
@ -0,0 +1,51 @@
+#include "apple-linux-convergence.S"
+
+        .p2align    2
+        .text
+
+/*  Demonstration  of use  of  load-linked  and  store-conditional doing
+    something  interesting.  In this case, creating a spin lock.  A spin
+    lock is a simple  but  grossly inefficient  form of a mutex.  If the
+    "lock" is found to be owned (non-zero) by someone else,  the calling
+    thread spins - checking the ownership of the "lock" in a tight loop.
+
+    The spinning uses up  time.  A  better  mutex would add some kind of
+    queuing for threads that don't own the lock. And, some kind of waking
+    would also be needed. Threads on the queue would be "asleep."
+*/
+
+        // Export both entry points. GLABEL (from the included convergence
+        // macros) adds the leading underscore on Apple builds.
+        GLABEL      Lock
+        GLABEL      Unlock
+
+#if defined(__APPLE__)
+_Lock:
+#else
+Lock:
+#endif
+        START_PROC                  // x0 = address of the lock word; 0 means free. Uses w1, w2, w3.
+        mov         w3, 1           // the value stored to mark the lock as taken
+1:      ldaxr       w1, [x0]        // load-acquire exclusive of the current lock value
+        cbnz        w1, 1b          // lock taken - spin.
+        stlxr       w2, w3, [x0]    // try to store 1; w2 receives 0 only if the store succeeded
+        cbnz        w2, 1b          // shucks - somebody meddled.
+        // considered using dmb here
+        ret
+        END_PROC
+
+/*  Unlock - release the spin lock whose address is in x0 by storing
+    zero to it. The caller is expected to own the lock.
+
+    The store is a store-release (stlr) rather than a plain str. A plain
+    store places no ordering on the loads and stores made inside the
+    critical section, so they could become visible to other cores after
+    the lock already appears free. The release store orders them ahead
+    of the unlock, so no separate dmb is needed.
+*/
+#if defined(__APPLE__)
+_Unlock:
+#else
+Unlock:
+#endif
+        START_PROC
+        stlr        wzr, [x0]       // store-release of 0: lock is free
+        ret
+        END_PROC
+
+        .end