diff --git a/README.md b/README.md index 00e72b3..35a3431 100644 --- a/README.md +++ b/README.md @@ -319,6 +319,7 @@ In this section, we present miscellaneous material. | 4 | [Under the hood: System Calls](./more/system_calls/README.md) | [Link](./more/system_calls/README.pdf) | | 5 | [Determining string literal lengths for C functions](./more/strlen_for_c/README.md) | [Link](./more/strlen_for_c/README.pdf) | | 6 | [Calling Assembly Language From Python](./python/) | [Link](./python/README.pdf) | +| 7 | [Atomic Operations](./more/atomics/README.md) | [Link](./more/atomics/README.pdf) | ## Macro Suite diff --git a/README.pdf b/README.pdf index 5d73fee..6d0e532 100644 Binary files a/README.pdf and b/README.pdf differ diff --git a/section_3/atomics/README.md b/more/atomics/README.md similarity index 72% rename from section_3/atomics/README.md rename to more/atomics/README.md index 47752e3..03a5eeb 100644 --- a/section_3/atomics/README.md +++ b/more/atomics/README.md @@ -45,62 +45,87 @@ const uint32_t NUM_THREADS = 16; // 13 /* volatile is necessary if any use of the optimizer // 15 is to be made. 
// 16 */ // 17 -volatile uint32_t naked_int = 0; // 18 +volatile uint32_t naked_int; // 18 atomic atomic_integer(0); // 19 // 20 -void NakedWorker() { // 21 - extern volatile uint32_t naked_int; // 22 - // 23 - for (uint32_t i = 0; i < MAX_LOOPS; i++) { // 24 - naked_int++; // 25 - } // 26 -} // 27 - // 28 -void AtomicWorker() { // 29 - extern atomic atomic_integer; // 30 - // 31 - for (uint32_t i = 0; i < MAX_LOOPS; i++) { // 32 - atomic_integer++; // 33 - } // 34 -} // 35 - // 36 -void DoNaked() { // 37 - vector threads; // 38 - // 39 - for (uint32_t i = 0; i < NUM_THREADS; i++) { // 40 - threads.push_back(new thread(NakedWorker)); // 41 - } // 42 - // 43 - for (auto &t : threads) { // 44 - t->join(); // 45 - } // 46 -} // 47 - // 48 -void DoAtomic() { // 49 - vector threads; // 50 - // 51 - for (uint32_t i = 0; i < NUM_THREADS; i++) { // 52 - threads.push_back(new thread(AtomicWorker)); // 53 - } // 54 - // 55 - for (auto &t : threads) { // 56 - t->join(); // 57 - } // 58 -} // 59 - // 60 -int main() { // 61 - // 62 - DoNaked(); // 63 - DoAtomic(); // 64 - // 65 - cout << "Correct sum is: "; // 66 - cout << NUM_THREADS * MAX_LOOPS << endl; // 67 - cout << "Naked sum: " << naked_int << endl; // 68 - cout << "Atomic sum: " << atomic_integer << endl; // 69 - // 70 - return 0; // 71 -} // 72 -perrykivolowitz@DAEDALUS atomics % +extern "C" void LoadLinkedStoreConditional(uint32_t * value); // 21 + // 22 +void LLSCWorker() { // 23 + extern volatile uint32_t naked_int; // 24 + // 25 + for (uint32_t i = 0; i < MAX_LOOPS; i++) { // 26 + LoadLinkedStoreConditional((uint32_t *) &naked_int); // 27 + } // 28 +} // 29 + // 30 +void NakedWorker() { // 31 + extern volatile uint32_t naked_int; // 32 + // 33 + for (uint32_t i = 0; i < MAX_LOOPS; i++) { // 34 + naked_int++; // 35 + } // 36 +} // 37 + // 38 +void AtomicWorker() { // 39 + extern atomic atomic_integer; // 40 + // 41 + for (uint32_t i = 0; i < MAX_LOOPS; i++) { // 42 + atomic_integer++; // 43 + } // 44 +} // 45 + // 
46 +void DoNaked() { // 47 + vector threads; // 48 + // 49 + naked_int = 0; // 50 + for (uint32_t i = 0; i < NUM_THREADS; i++) { // 51 + threads.push_back(new thread(NakedWorker)); // 52 + } // 53 + // 54 + for (auto &t : threads) { // 55 + t->join(); // 56 + } // 57 +} // 58 + // 59 +void DoLLSC() { // 60 + vector threads; // 61 + naked_int = 0; // 62 + // 63 + for (uint32_t i = 0; i < NUM_THREADS; i++) { // 64 + threads.push_back(new thread(LLSCWorker)); // 65 + } // 66 + // 67 + for (auto &t : threads) { // 68 + t->join(); // 69 + } // 70 +} // 71 + // 72 +void DoAtomic() { // 73 + vector threads; // 74 + // 75 + for (uint32_t i = 0; i < NUM_THREADS; i++) { // 76 + threads.push_back(new thread(AtomicWorker)); // 77 + } // 78 + // 79 + for (auto &t : threads) { // 80 + t->join(); // 81 + } // 82 +} // 83 + // 84 +int main() { // 85 + // 86 + DoNaked(); // 87 + DoAtomic(); // 88 + // 89 + cout << "Correct sum is: "; // 90 + cout << NUM_THREADS * MAX_LOOPS << endl; // 91 + cout << "Naked sum: " << naked_int << endl; // 92 + cout << "Atomic sum: " << atomic_integer << endl; // 93 + // 94 + DoLLSC(); // 95 + cout << "LLSC sum: " << naked_int << endl; // 96 + return 0; // 97 +} // 98 ``` This program will spawn 16 threads which will each loop 10,000 times, @@ -163,7 +188,7 @@ described below. For ARMv8 and for later ARM versions (to perform operations other than those listed above), there is a general solution that isn't pretty. It -is an example of Load Locked / Store Conditional. It isn't pretty +is an example of Load Linked / Store Conditional. It isn't pretty because it involves a loop. ```asm @@ -171,11 +196,11 @@ because it involves a loop. 
.p2align 2 // 2 // 3 #if defined(__APPLE__) // 4 - .global _LoadLockedStoreConditional // 5 -_LoadLockedStoreConditional: // 6 + .global _LoadLinkedStoreConditional // 5 +_LoadLinkedStoreConditional: // 6 #else // 7 - .global LoadLockedStoreConditional // 8 -LoadLockedStoreConditional: // 9 + .global LoadLinkedStoreConditional // 8 +LoadLinkedStoreConditional: // 9 #endif // 10 1: ldaxr w1, [x0] // 11 add w1, w1, 1 // 12 @@ -187,7 +212,7 @@ LoadLockedStoreConditional: // 9 Lines 1 and 2 are boilerplate. The conditional assembly block from line 4 through line 10 declare the -label `LoadLockedStoreConditional` as global for both Linux and Apple +label `LoadLinkedStoreConditional` as global for both Linux and Apple assemblers. The label itself is also stated. It is worth explaining that labels marked as global must have an @@ -201,27 +226,49 @@ the address as needing watching (by the hardware). Line 12 can be expanded and / or replaced with whatever operation needed to be done on the value. -Line 13 puts the potato on the fork. It is a store condition with -release. The release means that after the instruction finishes, the -previously marked address will no longer be marked. The value returned -in `w2` will be 0 of the store actually took place. +Line 13 puts the potato on the fork. It is a store conditional which may +or may not actually write anything to memory. -Here's the cool bit (literally): If `w2` contains a 1 it means that -some executing agent (most likely another thread in the same process) -has attempted to change the location in memory since this thread marked -the location, the store did NOT actually take place. +To understand this instruction, [Kristien et +al.](../../reference_material/USENIX2020.pdf) provide this +amazingly helpful picture: + +![llsc](./llsc.png) + +The value returned in `w2` will be 0 if the store actually took place +and will have the value of 1 if the store was rejected.
Imagine the following: | T1 | T2 | | -- | -- | -| Executes line 11. Gets value 10. Location is marked by T1. | | -| Executes line 12. `w1` goes up to 11. | | -| Yanked from CPU | | -| | Executes line 11. Gets value 10. Location is marked by T2. | -| | `w1` goes up to 11. | -| | | +| Executes line 11. Gets value *N*. Location is marked. | | +| T1 is descheduled. | | +| | Executes line 11. Gets value *N*. Location is marked again. | +| | `w1` goes up to *N + 1* on line 12. | +| | Line 13 succeeds in storing *N + 1* to memory and
the location marking is cleared | +| | T2 is descheduled. | +| T1 is scheduled - recall it has stale values. | | +| Executes line 12 making *N + 1* which is now wrong. | | +| Executes line 13 which fails because the marking is now gone. | | +| Loops around, this time picking up *N + 1* | | +| Correctly makes *N + 2* | | ## Implementation of ARMv8.1A and Newer -Implementation of operations on atomic variables \ No newline at end of file +Implementations of operations on atomic variables were improved in the +second version of ARMv8, called ARMv8.1. The load linked and store +conditional instructions are still available but several new +instructions were added which perform certain operations such as +addition, subtraction and various bitwise operations in a single atomic +instruction. + +For example: + +```asm + mov w1, 1 + ldaddal w1, w0, [x0] +``` + +does the same work of atomically adding one to the value in memory +pointed to by `x0`. diff --git a/more/atomics/README.pdf b/more/atomics/README.pdf new file mode 100644 index 0000000..8e5312b Binary files /dev/null and b/more/atomics/README.pdf differ diff --git a/section_3/atomics/battle_pug.jpeg b/more/atomics/battle_pug.jpeg similarity index 100% rename from section_3/atomics/battle_pug.jpeg rename to more/atomics/battle_pug.jpeg diff --git a/section_3/atomics/highlevel.cpp b/more/atomics/highlevel.cpp similarity index 93% rename from section_3/atomics/highlevel.cpp rename to more/atomics/highlevel.cpp index 8b1cc08..3884ace 100644 --- a/section_3/atomics/highlevel.cpp +++ b/more/atomics/highlevel.cpp @@ -18,13 +18,13 @@ const uint32_t NUM_THREADS = 16; volatile uint32_t naked_int; atomic atomic_integer(0); -extern "C" void LoadLockedStoreConditional(uint32_t * value); +extern "C" void LoadLinkedStoreConditional(uint32_t * value); void LLSCWorker() { extern volatile uint32_t naked_int; for (uint32_t i = 0; i < MAX_LOOPS; i++) { - LoadLockedStoreConditional((uint32_t *) &naked_int); + 
LoadLinkedStoreConditional((uint32_t *)&naked_int); } } diff --git a/more/atomics/llsc.png b/more/atomics/llsc.png new file mode 100644 index 0000000..05c0897 Binary files /dev/null and b/more/atomics/llsc.png differ diff --git a/section_3/atomics/loadlocked.S b/more/atomics/loadlocked.S similarity index 57% rename from section_3/atomics/loadlocked.S rename to more/atomics/loadlocked.S index 58755ce..506ab8a 100644 --- a/section_3/atomics/loadlocked.S +++ b/more/atomics/loadlocked.S @@ -2,11 +2,11 @@ .p2align 2 #if defined(__APPLE__) - .global _LoadLockedStoreConditional -_LoadLockedStoreConditional: + .global _LoadLinkedStoreConditional +_LoadLinkedStoreConditional: #else - .global LoadLockedStoreConditional -LoadLockedStoreConditional: + .global LoadLinkedStoreConditional +LoadLinkedStoreConditional: #endif 1: ldaxr w1, [x0] add w1, w1, 1 diff --git a/section_3/atomics/README.pdf b/section_3/atomics/README.pdf deleted file mode 100644 index 0cbfb3a..0000000 Binary files a/section_3/atomics/README.pdf and /dev/null differ