mirror of
https://github.com/pkivolowitz/asm_book.git
synced 2026-06-22 23:36:47 +08:00
finished chapter on atomic operations
This commit is contained in:
parent
48eae14088
commit
8e745367c5
9 changed files with 129 additions and 81 deletions
|
|
@ -319,6 +319,7 @@ In this section, we present miscellaneous material.
|
||||||
| 4 | [Under the hood: System Calls](./more/system_calls/README.md) | [Link](./more/system_calls/README.pdf) |
|
| 4 | [Under the hood: System Calls](./more/system_calls/README.md) | [Link](./more/system_calls/README.pdf) |
|
||||||
| 5 | [Determining string literal lengths for C functions](./more/strlen_for_c/README.md) | [Link](./more/strlen_for_c/README.pdf) |
|
| 5 | [Determining string literal lengths for C functions](./more/strlen_for_c/README.md) | [Link](./more/strlen_for_c/README.pdf) |
|
||||||
| 6 | [Calling Assembly Language From Python](./python/) | [Link](./python/README.pdf) |
|
| 6 | [Calling Assembly Language From Python](./python/) | [Link](./python/README.pdf) |
|
||||||
|
| 7 | [Atomic Operations](./more/atomics/README.md) | [Link](./more/atomics/README.pdf) |
|
||||||
|
|
||||||
## Macro Suite
|
## Macro Suite
|
||||||
|
|
||||||
|
|
|
||||||
BIN
README.pdf
BIN
README.pdf
Binary file not shown.
|
|
@ -45,62 +45,87 @@ const uint32_t NUM_THREADS = 16; // 13
|
||||||
/* volatile is necessary if any use of the optimizer // 15
|
/* volatile is necessary if any use of the optimizer // 15
|
||||||
is to be made. // 16
|
is to be made. // 16
|
||||||
*/ // 17
|
*/ // 17
|
||||||
volatile uint32_t naked_int = 0; // 18
|
volatile uint32_t naked_int; // 18
|
||||||
atomic<uint32_t> atomic_integer(0); // 19
|
atomic<uint32_t> atomic_integer(0); // 19
|
||||||
// 20
|
// 20
|
||||||
void NakedWorker() { // 21
|
extern "C" void LoadLinkedStoreConditional(uint32_t * value); // 21
|
||||||
extern volatile uint32_t naked_int; // 22
|
// 22
|
||||||
// 23
|
void LLSCWorker() { // 23
|
||||||
for (uint32_t i = 0; i < MAX_LOOPS; i++) { // 24
|
extern volatile uint32_t naked_int; // 24
|
||||||
naked_int++; // 25
|
// 25
|
||||||
} // 26
|
for (uint32_t i = 0; i < MAX_LOOPS; i++) { // 26
|
||||||
} // 27
|
LoadLinkedStoreConditional((uint32_t *) &naked_int); // 27
|
||||||
// 28
|
} // 28
|
||||||
void AtomicWorker() { // 29
|
} // 29
|
||||||
extern atomic<uint32_t> atomic_integer; // 30
|
// 30
|
||||||
// 31
|
void NakedWorker() { // 31
|
||||||
for (uint32_t i = 0; i < MAX_LOOPS; i++) { // 32
|
extern volatile uint32_t naked_int; // 32
|
||||||
atomic_integer++; // 33
|
// 33
|
||||||
} // 34
|
for (uint32_t i = 0; i < MAX_LOOPS; i++) { // 34
|
||||||
} // 35
|
naked_int++; // 35
|
||||||
// 36
|
} // 36
|
||||||
void DoNaked() { // 37
|
} // 37
|
||||||
vector<thread *> threads; // 38
|
// 38
|
||||||
// 39
|
void AtomicWorker() { // 39
|
||||||
for (uint32_t i = 0; i < NUM_THREADS; i++) { // 40
|
extern atomic<uint32_t> atomic_integer; // 40
|
||||||
threads.push_back(new thread(NakedWorker)); // 41
|
// 41
|
||||||
} // 42
|
for (uint32_t i = 0; i < MAX_LOOPS; i++) { // 42
|
||||||
// 43
|
atomic_integer++; // 43
|
||||||
for (auto &t : threads) { // 44
|
} // 44
|
||||||
t->join(); // 45
|
} // 45
|
||||||
} // 46
|
// 46
|
||||||
} // 47
|
void DoNaked() { // 47
|
||||||
// 48
|
vector<thread *> threads; // 48
|
||||||
void DoAtomic() { // 49
|
// 49
|
||||||
vector<thread *> threads; // 50
|
naked_int = 0; // 50
|
||||||
// 51
|
for (uint32_t i = 0; i < NUM_THREADS; i++) { // 51
|
||||||
for (uint32_t i = 0; i < NUM_THREADS; i++) { // 52
|
threads.push_back(new thread(NakedWorker)); // 52
|
||||||
threads.push_back(new thread(AtomicWorker)); // 53
|
} // 53
|
||||||
} // 54
|
// 54
|
||||||
// 55
|
for (auto &t : threads) { // 55
|
||||||
for (auto &t : threads) { // 56
|
t->join(); // 56
|
||||||
t->join(); // 57
|
} // 57
|
||||||
} // 58
|
} // 58
|
||||||
} // 59
|
// 59
|
||||||
// 60
|
void DoLLSC() { // 60
|
||||||
int main() { // 61
|
vector<thread *> threads; // 61
|
||||||
// 62
|
naked_int = 0; // 62
|
||||||
DoNaked(); // 63
|
// 63
|
||||||
DoAtomic(); // 64
|
for (uint32_t i = 0; i < NUM_THREADS; i++) { // 64
|
||||||
// 65
|
threads.push_back(new thread(LLSCWorker)); // 65
|
||||||
cout << "Correct sum is: "; // 66
|
} // 66
|
||||||
cout << NUM_THREADS * MAX_LOOPS << endl; // 67
|
// 67
|
||||||
cout << "Naked sum: " << naked_int << endl; // 68
|
for (auto &t : threads) { // 68
|
||||||
cout << "Atomic sum: " << atomic_integer << endl; // 69
|
t->join(); // 69
|
||||||
// 70
|
} // 70
|
||||||
return 0; // 71
|
} // 71
|
||||||
} // 72
|
// 72
|
||||||
perrykivolowitz@DAEDALUS atomics %
|
void DoAtomic() { // 73
|
||||||
|
vector<thread *> threads; // 74
|
||||||
|
// 75
|
||||||
|
for (uint32_t i = 0; i < NUM_THREADS; i++) { // 76
|
||||||
|
threads.push_back(new thread(AtomicWorker)); // 77
|
||||||
|
} // 78
|
||||||
|
// 79
|
||||||
|
for (auto &t : threads) { // 80
|
||||||
|
t->join(); // 81
|
||||||
|
} // 82
|
||||||
|
} // 83
|
||||||
|
// 84
|
||||||
|
int main() { // 85
|
||||||
|
// 86
|
||||||
|
DoNaked(); // 87
|
||||||
|
DoAtomic(); // 88
|
||||||
|
// 89
|
||||||
|
cout << "Correct sum is: "; // 90
|
||||||
|
cout << NUM_THREADS * MAX_LOOPS << endl; // 91
|
||||||
|
cout << "Naked sum: " << naked_int << endl; // 92
|
||||||
|
cout << "Atomic sum: " << atomic_integer << endl; // 93
|
||||||
|
// 94
|
||||||
|
DoLLSC(); // 95
|
||||||
|
cout << "LLSC sum: " << naked_int << endl; // 96
|
||||||
|
return 0; // 97
|
||||||
|
} // 98
|
||||||
```
|
```
|
||||||
|
|
||||||
This program will spawn 16 threads which will each loop 10,000 times,
|
This program will spawn 16 threads which will each loop 10,000 times,
|
||||||
|
|
@ -163,7 +188,7 @@ described below.
|
||||||
|
|
||||||
For ARMv8 and for later ARM versions (to perform operations other than
|
For ARMv8 and for later ARM versions (to perform operations other than
|
||||||
those listed above), there is a general solution that isn't pretty. It
|
those listed above), there is a general solution that isn't pretty. It
|
||||||
is an example of Load Locked / Store Conditional. It isn't pretty
|
is an example of Load Linked / Store Conditional. It isn't pretty
|
||||||
because it involves a loop.
|
because it involves a loop.
|
||||||
|
|
||||||
```asm
|
```asm
|
||||||
|
|
@ -171,11 +196,11 @@ because it involves a loop.
|
||||||
.p2align 2 // 2
|
.p2align 2 // 2
|
||||||
// 3
|
// 3
|
||||||
#if defined(__APPLE__) // 4
|
#if defined(__APPLE__) // 4
|
||||||
.global _LoadLockedStoreConditional // 5
|
.global _LoadLinkedStoreConditional // 5
|
||||||
_LoadLockedStoreConditional: // 6
|
_LoadLinkedStoreConditional: // 6
|
||||||
#else // 7
|
#else // 7
|
||||||
.global LoadLockedStoreConditional // 8
|
.global LoadLinkedStoreConditional // 8
|
||||||
LoadLockedStoreConditional: // 9
|
LoadLinkedStoreConditional: // 9
|
||||||
#endif // 10
|
#endif // 10
|
||||||
1: ldaxr w1, [x0] // 11
|
1: ldaxr w1, [x0] // 11
|
||||||
add w1, w1, 1 // 12
|
add w1, w1, 1 // 12
|
||||||
|
|
@ -187,7 +212,7 @@ LoadLockedStoreConditional: // 9
|
||||||
Lines 1 and 2 are boilerplate.
|
Lines 1 and 2 are boilerplate.
|
||||||
|
|
||||||
The conditional assembly block from line 4 through line 10 declares the
|
The conditional assembly block from line 4 through line 10 declares the
|
||||||
label `LoadLockedStoreConditional` as global for both Linux and Apple
|
label `LoadLinkedStoreConditional` as global for both Linux and Apple
|
||||||
assemblers. The label itself is also stated.
|
assemblers. The label itself is also stated.
|
||||||
|
|
||||||
It is worth explaining that labels marked as global must have an
|
It is worth explaining that labels marked as global must have an
|
||||||
|
|
@ -201,27 +226,49 @@ the address as needing watching (by the hardware).
|
||||||
Line 12 can be expanded and / or replaced with whatever operation needs
|
Line 12 can be expanded and / or replaced with whatever operation needs
|
||||||
to be done on the value.
|
to be done on the value.
|
||||||
|
|
||||||
Line 13 puts the potato on the fork. It is a store condition with
|
Line 13 puts the potato on the fork. It is a store conditional which may
|
||||||
release. The release means that after the instruction finishes, the
|
or may not actually write anything to memory.
|
||||||
previously marked address will no longer be marked. The value returned
|
|
||||||
in `w2` will be 0 of the store actually took place.
|
|
||||||
|
|
||||||
Here's the cool bit (literally): If `w2` contains a 1 it means that
|
To understand this instruction, [Kristien et
|
||||||
some executing agent (most likely another thread in the same process)
|
al](../../reference_material/USENIX2020.pdf) provide this
|
||||||
has attempted to change the location in memory since this thread marked
|
amazingly helpful picture:
|
||||||
the location, the store did NOT actually take place.
|
|
||||||
|

|
||||||
|
|
||||||
|
The value returned in `w2` will be 0 if the store actually took place
|
||||||
|
and will have the value of 1 if the store was rejected.
|
||||||
|
|
||||||
Imagine the following:
|
Imagine the following:
|
||||||
|
|
||||||
| T1 | T2 |
|
| T1 | T2 |
|
||||||
| -- | -- |
|
| -- | -- |
|
||||||
| Executes line 11. Gets value 10. Location is marked by T1. | |
|
| Executes line 11. Gets value *N*. Location is marked. | |
|
||||||
| Executes line 12. `w1` goes up to 11. | |
|
| T1 is descheduled. | |
|
||||||
| Yanked from CPU | |
|
| | Executes line 11. Gets value *N*. Location is marked again. |
|
||||||
| | Executes line 11. Gets value 10. Location is marked by T2. |
|
| | `w1` goes up to *N + 1* on line 12. |
|
||||||
| | `w1` goes up to 11. |
|
| | Line 13 succeeds in storing *N + 1* to memory and <br/> the location marking is cleared |
|
||||||
| | |
|
| | T2 is descheduled. |
|
||||||
|
| T1 is scheduled - recall it has stale values. | |
|
||||||
|
| Executes line 12 making *N + 1* which is now wrong. | |
|
||||||
|
| Executes line 13 which fails because the marking is now gone. | |
|
||||||
|
| Loops around, this time picking up *N + 1* | |
|
||||||
|
| Correctly makes *N + 2* | |
|
||||||
|
|
||||||
## Implementation of ARMv8.1A and Newer
|
## Implementation of ARMv8.1A and Newer
|
||||||
|
|
||||||
Implementation of operations on atomic variables
|
Implementations of operations on atomic variables were improved in the
|
||||||
|
second version of ARMv8, called ARMv8.1. The load linked and store
|
||||||
|
conditional instructions are still available but several new
|
||||||
|
instructions were added which perform certain operations such as
|
||||||
|
addition, subtraction and various bitwise operations in a single atomic
|
||||||
|
instruction.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
```asm
|
||||||
|
mov w1, 1
|
||||||
|
ldaddal w1, w0, [x0]
|
||||||
|
```
|
||||||
|
|
||||||
|
does the same work of atomically adding one to the value in memory
|
||||||
|
pointed to by `x0`.
|
||||||
BIN
more/atomics/README.pdf
Normal file
BIN
more/atomics/README.pdf
Normal file
Binary file not shown.
|
Before Width: | Height: | Size: 43 KiB After Width: | Height: | Size: 43 KiB |
|
|
@ -18,13 +18,13 @@ const uint32_t NUM_THREADS = 16;
|
||||||
volatile uint32_t naked_int;
|
volatile uint32_t naked_int;
|
||||||
atomic<uint32_t> atomic_integer(0);
|
atomic<uint32_t> atomic_integer(0);
|
||||||
|
|
||||||
extern "C" void LoadLockedStoreConditional(uint32_t * value);
|
extern "C" void LoadLinkedStoreConditional(uint32_t * value);
|
||||||
|
|
||||||
void LLSCWorker() {
|
void LLSCWorker() {
|
||||||
extern volatile uint32_t naked_int;
|
extern volatile uint32_t naked_int;
|
||||||
|
|
||||||
for (uint32_t i = 0; i < MAX_LOOPS; i++) {
|
for (uint32_t i = 0; i < MAX_LOOPS; i++) {
|
||||||
LoadLockedStoreConditional((uint32_t *) &naked_int);
|
LoadLinkedStoreConditional((uint32_t *)&naked_int);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
BIN
more/atomics/llsc.png
Normal file
BIN
more/atomics/llsc.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 104 KiB |
|
|
@ -2,11 +2,11 @@
|
||||||
.p2align 2
|
.p2align 2
|
||||||
|
|
||||||
#if defined(__APPLE__)
|
#if defined(__APPLE__)
|
||||||
.global _LoadLockedStoreConditional
|
.global _LoadLinkedStoreConditional
|
||||||
_LoadLockedStoreConditional:
|
_LoadLinkedStoreConditional:
|
||||||
#else
|
#else
|
||||||
.global LoadLockedStoreConditional
|
.global LoadLinkedStoreConditional
|
||||||
LoadLockedStoreConditional:
|
LoadLinkedStoreConditional:
|
||||||
#endif
|
#endif
|
||||||
1: ldaxr w1, [x0]
|
1: ldaxr w1, [x0]
|
||||||
add w1, w1, 1
|
add w1, w1, 1
|
||||||
Binary file not shown.
Loading…
Reference in a new issue