finished chapter on atomic operations

This commit is contained in:
Perry Kivolowitz 2023-03-01 16:49:55 -06:00
parent 48eae14088
commit 8e745367c5
9 changed files with 129 additions and 81 deletions

View file

@ -319,6 +319,7 @@ In this section, we present miscellaneous material.
| 4 | [Under the hood: System Calls](./more/system_calls/README.md) | [Link](./more/system_calls/README.pdf) |
| 5 | [Determining string literal lengths for C functions](./more/strlen_for_c/README.md) | [Link](./more/strlen_for_c/README.pdf) |
| 6 | [Calling Assembly Language From Python](./python/) | [Link](./python/README.pdf) |
| 7 | [Atomic Operations](./more/atomics/README.md) | [Link](./more/atomics/README.pdf) |
## Macro Suite

Binary file not shown.

View file

@ -45,62 +45,87 @@ const uint32_t NUM_THREADS = 16; // 13
/* volatile is necessary if any use of the optimizer // 15
is to be made. // 16
*/ // 17
volatile uint32_t naked_int = 0; // 18
volatile uint32_t naked_int; // 18
atomic<uint32_t> atomic_integer(0); // 19
// 20
// Worker for the unsynchronized demo: adds one to the plain global
// integer MAX_LOOPS times, as a separate read followed by a write.
void NakedWorker() { // 21
    extern volatile uint32_t naked_int; // 22
// 23
    for (uint32_t left = MAX_LOOPS; left > 0; left--) { // 24
        naked_int = naked_int + 1; // 25
    } // 26
} // 27
// 28
// Worker for the atomic demo: adds one to the global atomic counter
// MAX_LOOPS times.
void AtomicWorker() { // 29
    extern atomic<uint32_t> atomic_integer; // 30
// 31
    for (uint32_t left = MAX_LOOPS; left > 0; left--) { // 32
        atomic_integer.fetch_add(1); // 33
    } // 34
} // 35
// 36
void DoNaked() { // 37
vector<thread *> threads; // 38
// 39
for (uint32_t i = 0; i < NUM_THREADS; i++) { // 40
threads.push_back(new thread(NakedWorker)); // 41
} // 42
// 43
for (auto &t : threads) { // 44
t->join(); // 45
} // 46
} // 47
// 48
void DoAtomic() { // 49
vector<thread *> threads; // 50
// 51
for (uint32_t i = 0; i < NUM_THREADS; i++) { // 52
threads.push_back(new thread(AtomicWorker)); // 53
} // 54
// 55
for (auto &t : threads) { // 56
t->join(); // 57
} // 58
} // 59
// 60
// Runs the unsynchronized and the atomic experiment, then prints the
// expected total next to the total each experiment produced.
int main() { // 61
// 62
    DoNaked(); // 63
    DoAtomic(); // 64
// 65
    const auto expected = NUM_THREADS * MAX_LOOPS; // 66
    cout << "Correct sum is: " << expected << endl; // 67
    cout << "Naked sum: " << naked_int << endl; // 68
    cout << "Atomic sum: " << atomic_integer.load() << endl; // 69
// 70
    return 0; // 71
} // 72
perrykivolowitz@DAEDALUS atomics %
extern "C" void LoadLinkedStoreConditional(uint32_t * value); // 21
// 22
// Worker for the LL/SC demo: calls the assembly routine MAX_LOOPS times
// on the global integer. The routine is declared as taking a plain
// uint32_t *, so volatile is dropped with an explicit const_cast
// instead of a C-style cast that hides which qualifier is removed.
void LLSCWorker() { // 23
    extern volatile uint32_t naked_int; // 24
// 25
    for (uint32_t i = 0; i < MAX_LOOPS; i++) { // 26
        LoadLinkedStoreConditional(const_cast<uint32_t *>(&naked_int)); // 27
    } // 28
} // 29
// 30
// Worker for the unsynchronized demo: adds one to the plain global
// integer MAX_LOOPS times, as a separate read followed by a write.
void NakedWorker() { // 31
    extern volatile uint32_t naked_int; // 32
// 33
    for (uint32_t left = MAX_LOOPS; left > 0; left--) { // 34
        naked_int = naked_int + 1; // 35
    } // 36
} // 37
// 38
// Worker for the atomic demo: adds one to the global atomic counter
// MAX_LOOPS times.
void AtomicWorker() { // 39
    extern atomic<uint32_t> atomic_integer; // 40
// 41
    for (uint32_t left = MAX_LOOPS; left > 0; left--) { // 42
        atomic_integer.fetch_add(1); // 43
    } // 44
} // 45
// 46
void DoNaked() { // 47
vector<thread *> threads; // 48
// 49
naked_int = 0; // 50
for (uint32_t i = 0; i < NUM_THREADS; i++) { // 51
threads.push_back(new thread(NakedWorker)); // 52
} // 53
// 54
for (auto &t : threads) { // 55
t->join(); // 56
} // 57
} // 58
// 59
void DoLLSC() { // 60
vector<thread *> threads; // 61
naked_int = 0; // 62
// 63
for (uint32_t i = 0; i < NUM_THREADS; i++) { // 64
threads.push_back(new thread(LLSCWorker)); // 65
} // 66
// 67
for (auto &t : threads) { // 68
t->join(); // 69
} // 70
} // 71
// 72
void DoAtomic() { // 73
vector<thread *> threads; // 74
// 75
for (uint32_t i = 0; i < NUM_THREADS; i++) { // 76
threads.push_back(new thread(AtomicWorker)); // 77
} // 78
// 79
for (auto &t : threads) { // 80
t->join(); // 81
} // 82
} // 83
// 84
// Runs the unsynchronized and atomic experiments and prints their
// totals next to the expected one, then runs the LL/SC experiment.
// The naked total is printed before DoLLSC() because DoLLSC() resets
// and reuses naked_int.
int main() { // 85
// 86
    DoNaked(); // 87
    DoAtomic(); // 88
// 89
    const auto expected = NUM_THREADS * MAX_LOOPS; // 90
    cout << "Correct sum is: " << expected << endl; // 91
    cout << "Naked sum: " << naked_int << endl; // 92
    cout << "Atomic sum: " << atomic_integer.load() << endl; // 93
// 94
    DoLLSC(); // 95
    cout << "LLSC sum: " << naked_int << endl; // 96
    return 0; // 97
} // 98
```
This program will spawn 16 threads which will each loop 10,000 times,
@ -163,7 +188,7 @@ described below.
For ARMv8 and for later ARM versions (to perform operations other than
those listed above), there is a general solution that isn't pretty. It
is an example of Load Locked / Store Conditional. It isn't pretty
is an example of Load Linked / Store Conditional. It isn't pretty
because it involves a loop.
```asm
@ -171,11 +196,11 @@ because it involves a loop.
.p2align 2 // 2
// 3
#if defined(__APPLE__) // 4
.global _LoadLockedStoreConditional // 5
_LoadLockedStoreConditional: // 6
.global _LoadLinkedStoreConditional // 5
_LoadLinkedStoreConditional: // 6
#else // 7
.global LoadLockedStoreConditional // 8
LoadLockedStoreConditional: // 9
.global LoadLinkedStoreConditional // 8
LoadLinkedStoreConditional: // 9
#endif // 10
1: ldaxr w1, [x0] // 11
add w1, w1, 1 // 12
@ -187,7 +212,7 @@ LoadLockedStoreConditional: // 9
Lines 1 and 2 are boilerplate.
The conditional assembly block from line 4 through line 10 declares the
label `LoadLockedStoreConditional` as global for both Linux and Apple
label `LoadLinkedStoreConditional` as global for both Linux and Apple
assemblers. The label itself is also stated.
It is worth explaining that labels marked as global must have an
@ -201,27 +226,49 @@ the address as needing watching (by the hardware).
Line 12 can be expanded and / or replaced with whatever operation needed
to be done on the value.
Line 13 puts the potato on the fork. It is a store condition with
release. The release means that after the instruction finishes, the
previously marked address will no longer be marked. The value returned
in `w2` will be 0 of the store actually took place.
Line 13 puts the potato on the fork. It is a store conditional which may
or may not actually write anything to memory.
Here's the cool bit (literally): If `w2` contains a 1 it means that
some executing agent (most likely another thread in the same process)
has attempted to change the location in memory since this thread marked
the location, the store did NOT actually take place.
To understand this instruction, [Kristien et
al.](../../reference_material/USENIX2020.pdf) provide this
amazingly helpful picture:
![llsc](./llsc.png)
The value returned in `w2` will be 0 if the store actually took place
and will have the value of 1 if the store was rejected.
Imagine the following:
| T1 | T2 |
| -- | -- |
| Executes line 11. Gets value 10. Location is marked by T1. | |
| Executes line 12. `w1` goes up to 11. | |
| Yanked from CPU | |
| | Executes line 11. Gets value 10. Location is marked by T2. |
| | `w1` goes up to 11. |
| | |
| Executes line 11. Gets value *N*. Location is marked. | |
| T1 is descheduled. | |
| | Executes line 11. Gets value *N*. Location is marked again. |
| | `w1` goes up to *N + 1* on line 12. |
| | Line 13 succeeds in storing *N + 1* to memory and <br/> the location marking is cleared |
| | T2 is descheduled. |
| T1 is scheduled - recall it has stale values. | |
| Executes line 12 making *N + 1* which is now wrong. | |
| Executes line 13 which fails because the marking is now gone. | |
| Loops around, this time picking up *N + 1* | |
| Correctly makes *N + 2* | |
## Implementation of ARMv8.1A and Newer
Implementation of operations on atomic variables
Implementations of operations on atomic variables were improved in the
second version of ARMv8, called ARMv8.1. The load linked and store
conditional instructions are still available but several new
instructions were added which perform certain operations such as
addition, subtraction and various bitwise operations in a single atomic
instruction.
For example:
```asm
mov w1, 1
ldaddal w1, w0, [x0]
```
does the same work of atomically adding one to the value in memory
pointed to by `x0`.

BIN
more/atomics/README.pdf Normal file

Binary file not shown.

View file

Before

Width:  |  Height:  |  Size: 43 KiB

After

Width:  |  Height:  |  Size: 43 KiB

View file

@ -18,13 +18,13 @@ const uint32_t NUM_THREADS = 16;
volatile uint32_t naked_int;
atomic<uint32_t> atomic_integer(0);
extern "C" void LoadLockedStoreConditional(uint32_t * value);
extern "C" void LoadLinkedStoreConditional(uint32_t * value);
void LLSCWorker() {
extern volatile uint32_t naked_int;
for (uint32_t i = 0; i < MAX_LOOPS; i++) {
LoadLockedStoreConditional((uint32_t *) &naked_int);
LoadLinkedStoreConditional((uint32_t *)&naked_int);
}
}

BIN
more/atomics/llsc.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 104 KiB

View file

@ -2,11 +2,11 @@
.p2align 2
#if defined(__APPLE__)
.global _LoadLockedStoreConditional
_LoadLockedStoreConditional:
.global _LoadLinkedStoreConditional
_LoadLinkedStoreConditional:
#else
.global LoadLockedStoreConditional
LoadLockedStoreConditional:
.global LoadLinkedStoreConditional
LoadLinkedStoreConditional:
#endif
1: ldaxr w1, [x0]
add w1, w1, 1

Binary file not shown.