moved atomics over to section 3 and added to the chapter

2026-06-22 23:16:48 +08:00 · 2023-03-01 14:22:05 -06:00 · 2023-03-01 14:22:05 -06:00 · ade18c3324
commit ade18c3324
parent 7909ede0ee
5 changed files with 340 additions and 0 deletions
--- a/section_3/atomics/README.md
+++ b/section_3/atomics/README.md
@ -0,0 +1,227 @@
 # Section 3 / Atomics
 ![Gurney Hallek](./battle_pug.jpeg)
 ## Threads
 Suppose you run two copies of the same program at the same time. They
 both have a variable named `i`. Can a change to `i` made by one copy of
 the program impact the value of `i` in the other running copy of the
 program? Of course not.
 Think of running two copies of a program at the same time as having
 two identical, but distinct, homes. What happens inside one house does
 not impact what happens inside the house next door.
 Threads are a different way of getting more than one "copy" of a program
 to run at the same time. Threads are different, however, in that they
 all live within the same household. All of the housemates share the
 living space and all housemates have access to any global or shared
 resource. This makes for great gains in performance for a broad class
 of problems but also introduces great hazards.
 Suppose you buy a carton of milk and place it in the fridge. If you were
 the only member of the household you would expect that when you next went
 to the fridge your milk would still be there, right? If you share the
 household with other people, this might not be the case.
 Consider the following program:
 ```c++
 #include <iostream>                                           // 1 
 #include <thread>                                             // 2 
 #include <atomic>                                             // 3 
 #include <vector>                                             // 4 
                                                              // 5 
 using std::cout;                                              // 6 
 using std::endl;                                              // 7 
 using std::atomic;                                            // 8 
 using std::vector;                                            // 9 
 using std::thread;                                            // 10 
                                                              // 11 
 const uint32_t MAX_LOOPS = 10000;                             // 12 
 const uint32_t NUM_THREADS = 16;                              // 13 
                                                              // 14 
 /*  volatile is necessary if any use of the optimizer         // 15 
    is to be made.                                            // 16 
 */                                                            // 17 
 volatile uint32_t naked_int = 0;                              // 18 
 atomic<uint32_t> atomic_integer(0);                           // 19 
                                                              // 20 
 void NakedWorker() {                                          // 21 
    extern volatile uint32_t naked_int;                       // 22 
                                                              // 23 
    for (uint32_t i = 0; i < MAX_LOOPS; i++) {                // 24 
        naked_int++;                                          // 25 
    }                                                         // 26 
 }                                                             // 27 
                                                              // 28 
 void AtomicWorker() {                                         // 29 
    extern atomic<uint32_t> atomic_integer;                   // 30 
                                                              // 31 
    for (uint32_t i = 0; i < MAX_LOOPS; i++) {                // 32 
        atomic_integer++;                                     // 33 
    }                                                         // 34 
 }                                                             // 35 
                                                              // 36 
 void DoNaked() {                                              // 37 
    vector<thread *> threads;                                 // 38 
                                                              // 39 
    for (uint32_t i = 0; i < NUM_THREADS; i++) {              // 40 
        threads.push_back(new thread(NakedWorker));           // 41 
    }                                                         // 42 
                                                              // 43 
    for (auto &t : threads) {                                 // 44 
        t->join();                                            // 45 
    }                                                         // 46 
 }                                                             // 47 
                                                              // 48 
 void DoAtomic() {                                             // 49 
    vector<thread *> threads;                                 // 50 
                                                              // 51 
    for (uint32_t i = 0; i < NUM_THREADS; i++) {              // 52 
        threads.push_back(new thread(AtomicWorker));          // 53 
    }                                                         // 54 
                                                              // 55 
    for (auto &t : threads) {                                 // 56 
        t->join();                                            // 57 
    }                                                         // 58 
 }                                                             // 59 
                                                              // 60 
 int main() {                                                  // 61 
                                                              // 62 
    DoNaked();                                                // 63 
    DoAtomic();                                               // 64 
                                                              // 65 
    cout << "Correct sum is: ";                               // 66 
    cout << NUM_THREADS * MAX_LOOPS << endl;                  // 67 
    cout << "Naked sum: " << naked_int << endl;               // 68 
    cout << "Atomic sum: " << atomic_integer << endl;         // 69 
                                                              // 70 
    return 0;                                                 // 71 
 }                                                             // 72 
 ```
 This program will spawn 16 threads which will each loop 10,000 times,
 adding one to a zero-initialized integer each loop. At the end, when all
 the threads complete, the integer should have the value 160,000.
 Alas, this is an example of the "Hidden Update" class of bug. The shared
 resource, the integer, will get clobbered in unpredictable ways.
 For example, multiple runs might produce (snipped to show only the
 output from `NakedWorker()`):
 - Naked sum: 74291
 - Naked sum: 79390
 - Naked sum: 89115
 - etc
 Not only are the results wrong, they are wrong in a different way each
 time.
 ## Serializing Access to Integer Types
 C++11 introduced the notion of *atomic integers*. These do not glow.
 Rather, access to them is guaranteed to be atomic... as in, cannot be
 broken down.
 The hidden update problem's root cause is that adding (for example) to a
 value in memory involves three instructions at the assembly language
 level. A load, an addition, and a store. A hidden update occurs when a
 thread is yanked from the CPU in the middle of these instructions. When
 the thread returns to the CPU, the store causes its stale data to
 overwrite (hide) correct data.
 There are many ways to avoid the hidden update problem including a large
 array of synchronization mechanisms. Alternatively, one can avoid the
 hidden update problem by ensuring the three instruction sequence isn't
 interrupted. This can be done using atomic integer types.
 ## Using Atomics
 First, make the appropriate include:
 `#include <atomic>`
 Next, make the appropriate declaration and initialize the variable.
 Here, replace "integral type" with some integer type:
 `atomic<integral type> atomic_integer(0);`
 Notice how the initial value is provided to the atomic variable's
 constructor.
 Finally, use the atomic variable as you would any other integer.
 ## A General Implementation
 The newer ARM architectures provide a single instruction solution for
 addition, subtraction and various bitwise operations. These will be
 described below.
 For ARMv8 and for later ARM versions (to perform operations other than
 those listed above), there is a general solution that isn't pretty. It
 is an example of Load Locked / Store Conditional. It isn't pretty
 because it involves a loop.
 ```asm
        .text                                                 // 1 
        .p2align    2                                         // 2 
                                                              // 3 
 #if defined(__APPLE__)                                        // 4 
        .global     _LoadLockedStoreConditional               // 5 
 _LoadLockedStoreConditional:                                  // 6 
 #else                                                         // 7 
        .global     LoadLockedStoreConditional                // 8 
 LoadLockedStoreConditional:                                   // 9 
 #endif                                                        // 10 
 1:      ldaxr       w1, [x0]                                  // 11 
        add         w1, w1, 1                                 // 12 
        stlxr       w2, w1, [x0]                              // 13 
        cbnz        w2, 1b                                    // 14 
        ret                                                   // 15 
 ```
 Lines 1 and 2 are boilerplate.
 The conditional assembly block from line 4 through line 10 declares the
 label `LoadLockedStoreConditional` as global for both Linux and Apple
 assemblers. The label itself is also stated.
 It is worth explaining that labels marked as global must have an
 underscore prefix for Apple assembly.
 This function is passed the address of a `uint32_t`.
 Line 11 loads the value found at that address into `w1` and also marks
 the address as needing watching (by the hardware).
 Line 12 can be expanded and / or replaced with whatever operation needs
 to be done on the value.
 Line 13 puts the potato on the fork. It is a store conditional with
 release. The release means that after the instruction finishes, the
 previously marked address will no longer be marked. The value returned
 in `w2` will be 0 if the store actually took place.
 Here's the cool bit (literally): If `w2` contains a 1 it means that
 some executing agent (most likely another thread in the same process)
 has attempted to change the location in memory since this thread marked
 the location; in that case the store did NOT actually take place.
 Imagine the following:
 | T1 | T2 |
 | -- | -- |
 | Executes line 11. Gets value 10. Location is marked by T1. | |
 | Executes line 12. `w1` goes up to 11. | |
 | Yanked from CPU | |
 | | Executes line 11. Gets value 10. Location is marked by T2. |
 | | `w1` goes up to 11. |
 | | |
 ## Implementation of ARMv8.1A and Newer
 Implementation of operations on atomic variables 
--- a/section_3/atomics/README.pdf
+++ b/section_3/atomics/README.pdf
--- a/section_3/atomics/battle_pug.jpeg
+++ b/section_3/atomics/battle_pug.jpeg
--- a/section_3/atomics/highlevel.cpp
+++ b/section_3/atomics/highlevel.cpp
@ -0,0 +1,98 @@
 #include <iostream>
 #include <thread>
 #include <atomic>
 #include <vector>
 using std::cout;
 using std::endl;
 using std::atomic;
 using std::vector;
 using std::thread;
 const uint32_t MAX_LOOPS = 10000;
 const uint32_t NUM_THREADS = 16;
 /*  volatile is necessary if any use of the optimizer 
    is to be made.
 */
 volatile uint32_t naked_int;
 atomic<uint32_t> atomic_integer(0);
 extern "C" void LoadLockedStoreConditional(uint32_t * value);
 /*  Worker for the LL/SC demonstration: bumps the shared counter
     MAX_LOOPS times, each bump performed by the assembly routine
     LoadLockedStoreConditional() rather than by a plain increment.
 */
 void LLSCWorker() {
     extern volatile uint32_t naked_int;
     // The routine is declared to take a plain uint32_t *, so the
     // volatile qualifier is cast away to hand it the counter address.
     uint32_t * const counter = const_cast<uint32_t *>(&naked_int);
     uint32_t remaining = MAX_LOOPS;
     while (remaining != 0) {
         LoadLockedStoreConditional(counter);
         --remaining;
     }
 }
 /*  Worker for the unsynchronized demonstration: adds one to the shared
     naked_int MAX_LOOPS times with an ordinary load / add / store.
     Nothing here serializes concurrent callers, so updates can be lost;
     that is deliberate, it is the behavior being demonstrated.
 */
 void NakedWorker() {
     extern volatile uint32_t naked_int;
     uint32_t done = 0;
     while (done < MAX_LOOPS) {
         // One volatile read followed by one volatile write.
         naked_int = naked_int + 1;
         ++done;
     }
 }
 /*  Worker for the std::atomic demonstration: performs MAX_LOOPS atomic
     increments of the shared atomic_integer.
 */
 void AtomicWorker() {
     extern std::atomic<uint32_t> atomic_integer;
     for (uint32_t pass = MAX_LOOPS; pass > 0; --pass) {
         // Same read-modify-write as the ++ operator on an atomic.
         atomic_integer.fetch_add(1);
     }
 }
 void DoNaked() {
 	vector<thread *> threads;
    naked_int = 0;
 	for (uint32_t i = 0; i < NUM_THREADS; i++) {
 		threads.push_back(new thread(NakedWorker));
 	}
 	for (auto &t : threads) {
 		t->join();
 	}
 }
 void DoLLSC() {
 	vector<thread *> threads;
    naked_int = 0;
 	for (uint32_t i = 0; i < NUM_THREADS; i++) {
 		threads.push_back(new thread(LLSCWorker));
 	}
 	for (auto &t : threads) {
 		t->join();
 	}
 }
 void DoAtomic() {
 	vector<thread *> threads;
 	for (uint32_t i = 0; i < NUM_THREADS; i++) {
 		threads.push_back(new thread(AtomicWorker));
 	}
 	for (auto &t : threads) {
 		t->join();
 	}
 }
 /*  Runs the three demonstrations and prints each total next to the
     value every increment should have produced.
 */
 int main() {
     DoNaked();
     DoAtomic();
     cout << "Correct sum is: " << NUM_THREADS * MAX_LOOPS << endl;
     cout << "Naked sum: " << naked_int << endl;
     cout << "Atomic sum: " << atomic_integer << endl;
     // DoLLSC() zeroes and reuses naked_int, so it runs only after the
     // naked total above has been printed.
     DoLLSC();
     cout << "LLSC sum: " << naked_int << endl;
     return 0;
 }
--- a/section_3/atomics/loadlocked.S
+++ b/section_3/atomics/loadlocked.S
@ -0,0 +1,15 @@
        .text
        .p2align    2
 // LoadLockedStoreConditional: adds 1 to the 32-bit word whose address
 // arrives in x0, retrying until the exclusive store succeeds.
 // Clobbers w1 and w2.
 // The symbol is exported with a leading underscore when __APPLE__ is
 // defined and without one otherwise, hence the two label spellings.
 #if defined(__APPLE__)
        .global     _LoadLockedStoreConditional
 _LoadLockedStoreConditional:
 #else
        .global     LoadLockedStoreConditional
 LoadLockedStoreConditional:
 #endif
 // Exclusive load (with acquire) of the current value into w1.
 1:      ldaxr       w1, [x0]
 // The operation applied to the value: plus one.
        add         w1, w1, 1
 // Exclusive store (with release) of w1 back to [x0]; w2 receives the
 // status: 0 if the store happened, nonzero if it did not.
        stlxr       w2, w1, [x0]
 // Store did not happen: go back to local label 1 and redo the sequence.
        cbnz        w2, 1b
        ret