mirror of
https://github.com/pkivolowitz/asm_book.git
synced 2026-06-21 02:26:59 +08:00
spin-lock added
This commit is contained in:
parent
e149b1ad41
commit
dc1fef201c
5 changed files with 407 additions and 0 deletions
163
more/spin-lock/README.md
Normal file
163
more/spin-lock/README.md
Normal file
|
|
@ -0,0 +1,163 @@
|
|||
# Another use for the instructions used in **atomics**
|
||||
|
||||
In the section on **atomics** we saw how the ARM V8 load linked / store
|
||||
conditional instructions can be used to create atomic operations on
|
||||
variables in memory.
|
||||
|
||||
Here, for review, we present an atomic increment:
|
||||
|
||||
```text
|
||||
.text // 1
|
||||
.p2align 2 // 2
|
||||
// 3
|
||||
#if defined(__APPLE__) // 4
|
||||
.global _LoadLinkedStoreConditional // 5
|
||||
_LoadLinkedStoreConditional: // 6
|
||||
#else // 7
|
||||
.global LoadLinkedStoreConditional // 8
|
||||
LoadLinkedStoreConditional: // 9
|
||||
#endif // 10
|
||||
1: ldaxr w1, [x0] // 11
|
||||
add w1, w1, 1 // 12
|
||||
stlxr w2, w1, [x0] // 13
|
||||
cbnz w2, 1b // 14
|
||||
ret // 15
|
||||
```
|
||||
|
||||
The nonsense between lines 4 and 10 declares the label in ways compatible
|
||||
with both Apple M and Linux.
|
||||
|
||||
The interesting part happens from line 11 through line 14. Line 11
|
||||
dereferences a pointer to an `int32_t` putting its current value into
|
||||
`w1`. Line 12 is the increment.
|
||||
|
||||
Notice the dereference instruction is not the usual `ldr`. Instead it is
|
||||
`ldaxr`, which is a dereference that marks the memory location addressed by `x0` as
|
||||
a load for which we're hoping for exclusivity. Hoping.
|
||||
|
||||
We don't actually know if we had exclusive access to the memory location
|
||||
until the `stlxr` returns 0, meaning no one else has attempted to change
|
||||
the value at the location.
|
||||
|
||||
If `stlxr` doesn't return 0, then the value WE have is stale. So, we try
|
||||
again.
|
||||
|
||||
## Making a spin-lock
|
||||
|
||||
When one has a shared resource used by more than one thread it must be
|
||||
protected. This is the nugget to be aware of when working with threads.
|
||||
|
||||
Take a look at this thread worker:
|
||||
|
||||
```text
|
||||
void Worker(int32_t id) { // 1
|
||||
int32_t counter = 0; // 2
|
||||
while (counter < 4) { // 3
|
||||
Lock(&lock_variable); // 4
|
||||
counter++; // 5
|
||||
cout << "thread: " << id << " counter: " << counter << endl;// 6
|
||||
std::this_thread::sleep_for(chrono::milliseconds(5)); // 7
|
||||
Unlock(&lock_variable); // 8
|
||||
sched_yield(); // 9
|
||||
} // 10
|
||||
}
|
||||
```
|
||||
|
||||
The purpose of the worker is to print something to the console 4 times
|
||||
then exit. The shared resource is the console itself. Without protecting
|
||||
the console, threads will step over each other trying to print to it.
|
||||
|
||||
Here is a sample of what could happen without our spin-lock:
|
||||
|
||||
```text
|
||||
thread: 0thread: 3 counter: 1
|
||||
thread: 7 counter: 1 counter: thread:
|
||||
thread: thread: 10thread: 5 counter: 1
|
||||
thread: counter: thread: 121 counter:
|
||||
thread: 8 counter: 113
|
||||
thread: thread: 2thread: counter: 151 counter:
|
||||
```
|
||||
|
||||
With our spin-lock, here's what we might get:
|
||||
|
||||
```text
|
||||
thread: 12 counter: 3
|
||||
thread: 4 counter: 2
|
||||
thread: 7 counter: 4
|
||||
thread: 3 counter: 2
|
||||
thread: 1 counter: 4
|
||||
thread: 2 counter: 4
|
||||
thread: 13 counter: 3
|
||||
thread: 12 counter: 4
|
||||
```
|
||||
|
||||
Line 7 stresses the lock.
|
||||
|
||||
Line 9 causes the currently running thread to voluntarily deschedule.
|
||||
This makes the output more interesting. Without it, after unlocking,
|
||||
the same thread may regain the lock immediately.
|
||||
|
||||
Now let's look at the spin-lock. But first, a spin-lock is called a
|
||||
spin-lock because a thread that doesn't get the lock will `spin` trying
|
||||
to get it. This wastes time and generates heat, using electricity.
|
||||
Bummer.
|
||||
|
||||
Here is the source code to the spin-lock for ARM V8.
|
||||
|
||||
```text
|
||||
#if defined(__APPLE__) // 1
|
||||
_Lock: // 2
|
||||
#else // 3
|
||||
Lock: // 4
|
||||
#endif // 5
|
||||
START_PROC // 6
|
||||
1: ldaxr w1, [x0] // 7
|
||||
cbnz w1, 1b // lock taken - spin. // 8
|
||||
add w1, w1, 1 // 9
|
||||
stlxr w2, w1, [x0] // 10
|
||||
cbnz w2, 1b // shucks - somebody meddled. // 11
|
||||
// considered using dmb here // 12
|
||||
ret // 13
|
||||
END_PROC // 14
|
||||
```
|
||||
|
||||
Once again, line 7 does a `ldaxr` dereferencing the lock itself (once
|
||||
again an `int32_t`) and marks the location of the lock as being
|
||||
hopefully, exclusive.
|
||||
|
||||
Having gotten the value of the lock, on line 8, its value is inspected
|
||||
and if found to be non-zero, we branch back to attempting to get it
|
||||
again - this is the spin.
|
||||
|
||||
If the lock holds 0, its value in `w1` is changed to
|
||||
non-zero. Note, this could be made a bit better if a value of 1 was
|
||||
stored in another `w` register and simply used directly on line 10.
|
||||
|
||||
Line 10 conditionally stores the changed value back to the location of
|
||||
the lock. If the `stlxr` returns 0, we got the lock. If not, we start
|
||||
over - somebody else got in there ahead of us. Perhaps this happened
|
||||
because we were descheduled. Perhaps we lost the lock to another thread
|
||||
running on a different core.
|
||||
|
||||
The unlock looks like this:
|
||||
|
||||
```text
|
||||
#if defined(__APPLE__) // 1
|
||||
_Unlock: // 2
|
||||
#else // 3
|
||||
Unlock: // 4
|
||||
#endif // 5
|
||||
START_PROC // 6
|
||||
str wzr, [x0] // 7
|
||||
// considered using dmb here // 8
|
||||
ret // 9
|
||||
END_PROC // 10
|
||||
```
|
||||
|
||||
All it does is set the value of the lock to zero. The correct operation
|
||||
of the lock requires that no bad actor simply stomps on the lock by
|
||||
calling `Unlock` without first owning the lock. Just say no to lock
|
||||
stompers.
|
||||
|
||||
Please see the source code located [here](./spin_lock.S) for some
|
||||
additional comments regarding the implementation.
|
||||
BIN
more/spin-lock/README.pdf
Normal file
BIN
more/spin-lock/README.pdf
Normal file
Binary file not shown.
156
more/spin-lock/apple-linux-convergence.S
Normal file
156
more/spin-lock/apple-linux-convergence.S
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
/* Macros to permit the "same" assembly language to build on ARM64
|
||||
Linux systems as well as Apple Silicon systems.
|
||||
|
||||
See the fuller documentation at:
|
||||
https://github.com/pkivolowitz/asm_book/blob/main/macros/README.md
|
||||
|
||||
Perry Kivolowitz
|
||||
A Gentle Introduction to Assembly Language
|
||||
*/
|
||||
|
||||
/*  GLD_PTR xreg, label

    Load into xreg the 64-bit value stored at the global symbol label
    (get the address of label, then dereference it).

    On Apple the address of a global is fetched from the GOT, so a
    second load is needed to read what the symbol holds. Without it the
    Apple build returned the address of label while the Linux build
    returned the value stored there.
*/
.macro  GLD_PTR     xreg, label
#if defined(__APPLE__)
        adrp    \xreg, _\label@GOTPAGE
        ldr     \xreg, [\xreg, _\label@GOTPAGEOFF]  // address of label, read from the GOT
        ldr     \xreg, [\xreg]                      // value stored at label
#else
        ldr     \xreg, =\label                      // address of label
        ldr     \xreg, [\xreg]                      // value stored at label
#endif
.endm
|
||||
|
||||
/*  GLD_ADDR xreg, label

    Put the address of the global symbol label into xreg.

    On Apple the GOT slot for the symbol holds its address, so the slot
    must be read with ldr. Adding the GOTPAGEOFF offset to the page
    address only produces the address of the GOT slot itself, not the
    address of label.
*/
.macro  GLD_ADDR    xreg, label     // Get a global address
#if defined(__APPLE__)
        adrp    \xreg, _\label@GOTPAGE
        ldr     \xreg, [\xreg, _\label@GOTPAGEOFF]  // address of label, read from the GOT
#else
        ldr     \xreg, =\label
#endif
.endm
|
||||
|
||||
/*  LLD_ADDR xreg, label

    Put the address of the local label (one defined in this
    translation unit) into xreg. Apple uses a page / page-offset pair,
    Linux uses a literal-pool load.
*/
.macro  LLD_ADDR    xreg, label
#ifdef __APPLE__
        adrp    \xreg, \label@PAGE                  // 4K page containing label
        add     \xreg, \xreg, \label@PAGEOFF        // plus offset within the page
#else
        ldr     \xreg, =\label
#endif
.endm
|
||||
|
||||
/*  LLD_DBL xreg, dreg, label

    Load the value stored at the local label into dreg (a d register,
    going by the parameter name). xreg is used as scratch and is left
    holding the address of label.
*/
.macro  LLD_DBL     xreg, dreg, label
#ifdef __APPLE__
        adrp    \xreg, \label@PAGE
        add     \xreg, \xreg, \label@PAGEOFF
#else
        ldr     \xreg, =\label
#endif
        ldur    \dreg, [\xreg]                      // same load on both platforms
.endm
|
||||
|
||||
/*  LLD_FLT xreg, sreg, label

    Load the value stored at the local label into sreg (an s register,
    going by the parameter name). xreg is used as scratch and is left
    holding the address of label.
*/
.macro  LLD_FLT     xreg, sreg, label
#ifdef __APPLE__
        adrp    \xreg, \label@PAGE
        add     \xreg, \xreg, \label@PAGEOFF
#else
        ldr     \xreg, =\label
#endif
        ldur    \sreg, [\xreg]                      // same load on both platforms
.endm
|
||||
|
||||
/*  GLABEL label

    Export label as a global symbol. Apple symbol names carry a leading
    underscore, Linux names do not.
*/
.macro  GLABEL      label
#ifdef __APPLE__
        .global     _\label
#else
        .global     \label
#endif
.endm
|
||||
|
||||
/*  MAIN

    Emit the entry-point label: _main on Apple, main elsewhere. This
    only defines the label; exporting it is a separate step.
*/
.macro  MAIN
#ifdef __APPLE__
_main:
#else
main:
#endif
.endm
|
||||
|
||||
/*  ERRNO_ADDR

    Apple and Linux expose errno through differently named helper
    functions. This macro calls the right one; the address of errno is
    left in x0. Because it expands to a bl, the link register x30 is
    overwritten, so the caller must already have saved its own return
    address.
*/
.macro  ERRNO_ADDR
#ifdef __APPLE__
        bl      ___error
#else
        bl      __errno_location
#endif
.endm
|
||||
|
||||
/*  CRT label

    Call (bl) the function named label, adding the leading underscore
    that Apple symbol names require. As with any bl, x30 is overwritten.
*/
.macro  CRT         label
#ifdef __APPLE__
        bl      _\label
#else
        bl      \label
#endif
.endm
|
||||
|
||||
/*  START_PROC

    Open a CFI (call frame information) region. Place it immediately
    after the label that starts a function.
*/
.macro  START_PROC
        .cfi_startproc
.endm
|
||||
|
||||
/*  END_PROC

    Close the CFI region opened by START_PROC. Place it after the final
    ret of the function.
*/
.macro  END_PROC
        .cfi_endproc
.endm
|
||||
|
||||
/*  PUSH_P a, b

    Push the register pair a, b: sp is lowered by 16 first, then both
    registers are stored at the new sp.
*/
.macro  PUSH_P      a, b
        stp     \a, \b, [sp, -16]!
.endm
|
||||
|
||||
/*  PUSH_R a

    Push the single register a. sp still moves by a full 16 bytes, the
    same step PUSH_P uses, so the two can be mixed freely.
*/
.macro  PUSH_R      a
        str     \a, [sp, -16]!
.endm
|
||||
|
||||
/*  POP_P a, b

    Pop the register pair a, b: both are loaded from sp, then sp is
    raised by 16. Undoes PUSH_P.
*/
.macro  POP_P       a, b
        ldp     \a, \b, [sp], 16
.endm
|
||||
|
||||
/*  POP_R a

    Pop the single register a: it is loaded from sp, then sp is raised
    by 16. Undoes PUSH_R.
*/
.macro  POP_R       a
        ldr     \a, [sp], 16
.endm
|
||||
|
||||
/*  MIN src_a, src_b, dest

    Put the smaller of src_a and src_b into dest, comparing them as
    signed values. The macro issues the cmp itself, so no flag-setting
    instruction is needed beforehand; the condition flags are
    overwritten as a side effect.

    This macro makes it easy to remember which register does what in
    the csel.

    Thank you to u/TNorthover for nudge to add the cmp.
*/
.macro  MIN         src_a, src_b, dest
        cmp     \src_a, \src_b
        csel    \dest, \src_b, \src_a, GE           // a >= b picks b, otherwise a
.endm
|
||||
|
||||
/*  MAX src_a, src_b, dest

    Put the larger of src_a and src_b into dest, comparing them as
    signed values. The macro issues the cmp itself, so no flag-setting
    instruction is needed beforehand; the condition flags are
    overwritten as a side effect.

    This macro makes it easy to remember which register does what in
    the csel.

    Thank you to u/TNorthover for nudge to add the cmp.
*/
.macro  MAX         src_a, src_b, dest
        cmp     \src_a, \src_b
        csel    \dest, \src_b, \src_a, LE           // a <= b picks b, otherwise a
.endm
|
||||
|
||||
/*  AASCIZ label, string

    Emit a NUL-terminated string at label, aligned to a 4-byte
    boundary.
*/
.macro  AASCIZ      label, string
        .p2align    2
\label:
        .asciz      "\string"
.endm
|
||||
|
||||
/*  MOD src_a, src_b, dest, scratch

    Signed remainder: dest = src_a - (src_a / src_b) * src_b.

    scratch receives the quotient and is overwritten. It must not be
    the same register as src_a or src_b, because both are read again
    after scratch has been written.
*/
.macro  MOD         src_a, src_b, dest, scratch
        sdiv    \scratch, \src_a, \src_b            // quotient
        msub    \dest, \scratch, \src_b, \src_a     // src_a - quotient * src_b
.endm
|
||||
37
more/spin-lock/main.cpp
Normal file
37
more/spin-lock/main.cpp
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
#include <iostream>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
|
||||
using namespace std;
|
||||
|
||||
extern "C" void Lock(int32_t *);
|
||||
extern "C" void Unlock(int32_t *);
|
||||
|
||||
int32_t lock_variable = 0;
|
||||
const uint32_t NUM_THREADS = 16;
|
||||
|
||||
void Worker(int32_t id) {
|
||||
int32_t counter = 0;
|
||||
while (counter < 4) {
|
||||
Lock(&lock_variable);
|
||||
counter++;
|
||||
cout << "thread: " << id << " counter: " << counter << endl;
|
||||
std::this_thread::sleep_for(chrono::milliseconds(5));
|
||||
Unlock(&lock_variable);
|
||||
sched_yield();
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
vector<thread *> threads;
|
||||
|
||||
for (uint32_t i = 0; i < NUM_THREADS; i++) {
|
||||
threads.push_back(new thread(Worker, i));
|
||||
}
|
||||
|
||||
for (auto & t : threads)
|
||||
t->join();
|
||||
|
||||
return 0;
|
||||
}
|
||||
51
more/spin-lock/spin_lock.S
Normal file
51
more/spin-lock/spin_lock.S
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
#include "apple-linux-convergence.S"
|
||||
|
||||
.p2align 2
|
||||
.text
|
||||
|
||||
/* Demonstration of use of load-linked and store-conditional doing
|
||||
something interesting. In this case, creating a spin lock. A spin
|
||||
lock is a simple but grossly inefficient form of a mutex. If the
|
||||
"lock" is found to be owned (non-zero) by someone else, the calling
|
||||
thread spins - checking the ownership of the "lock" in a tight loop.
|
||||
|
||||
The spinning uses up time. A better mutex would add some kind of
|
||||
queueing for threads that don't own the lock. And, some kind of waking
|
||||
would also be needed. Threads on the queue would be "asleep."
|
||||
*/
|
||||
|
||||
/*  Export both entry points. GLABEL (from apple-linux-convergence.S)
    adds the leading underscore that Apple symbol names require.
*/
        GLABEL  Lock
        GLABEL  Unlock
|
||||
|
||||
/*  Lock - acquire the spin lock, spinning until it is free.

    x0 : address of the int32_t lock word; 0 means free, non-zero means
         held.

    Uses w1, w2 and w3 as scratch. Returns only once this caller has
    stored 1 into the lock word with a successful exclusive store.
*/
#ifdef __APPLE__
_Lock:
#else
Lock:
#endif
        START_PROC
        mov     w3, 1                   // value written to claim the lock
1:
        ldaxr   w1, [x0]                // exclusive load-acquire of the lock word
        cbnz    w1, 1b                  // held by someone else - spin
        stlxr   w2, w3, [x0]            // try to claim it; w2 is 0 on success
        cbnz    w2, 1b                  // exclusive store failed - start over
        // considered using dmb here
        ret
        END_PROC
|
||||
|
||||
/*  Unlock - release the spin lock.

    x0 : address of the int32_t lock word; it is set to 0 (free).

    The lock word is cleared with stlr (store-release) rather than a
    plain str. A plain store carries no ordering, so writes made inside
    the critical section could become visible to other cores after the
    lock already appears free. Store-release makes every earlier load
    and store complete first, and it pairs with the ldaxr
    (load-acquire) in Lock, so no separate dmb is needed.

    The caller must own the lock; nothing here checks that.
*/
#if defined(__APPLE__)
_Unlock:
#else
Unlock:
#endif
        START_PROC
        stlr    wzr, [x0]               // release: publish the critical section, then free
        ret
        END_PROC
|
||||
|
||||
.end
|
||||
Loading…
Reference in a new issue