summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavit Grigoryan <[email protected]>2024-10-07 20:57:40 -0700
committerDavit Grigoryan <[email protected]>2024-10-07 20:57:40 -0700
commit3bd354933a0adadacfdcdb87dae50c2222251c32 (patch)
tree5834f89ed1c9d891d0560d64151835c154e9f361
parent6278c41e2771c7b8d63a3cc5e1f8313fd52cdb87 (diff)
add some more notes on control flow
-rw-r--r--spec/notes.txt85
1 files changed, 85 insertions, 0 deletions
diff --git a/spec/notes.txt b/spec/notes.txt
index dc753db..fdfe01d 100644
--- a/spec/notes.txt
+++ b/spec/notes.txt
@@ -177,6 +177,91 @@
Similar when waiting for b1 barrier
```
+ 3) Custom Control Flow Unit (mix of two?):
+ branch => do another warp split
+ when barrier mask is empty / all threads are waiting at bsync ->
+ -> OR (|) together the masks of the warp split table entries whose PC is bsync+1,
+ add one new entry with the merged mask to the warp split table and remove the old ones
+
+ => no need for rPC column ??
+
+ Examples:
+
+ One branch:
+ `
+ A;
+ binit b0
+ P0 = (id % 2 == 0);
+ @P0 bra Y
+ X: B;
+ jmp Z;
+ Y: C;
+ Z: bsync b0
+ D;
+ `
+ b0 = mask 1 1 1 1 and pending mask 0 0 0 0
+ one entry w/ mask 1 1 1 1 (PC=binit thing)
+ after bra -> two entries 0 1 0 1 (PC=X) and 1 0 1 0 (PC=Y)
+ assume the PC=Y entry executes first and then reaches PC=Z (bsync) =>
+ => pending mask becomes 1 0 1 0 and that entry halts
+ PC=X then executes, jumps to Z and bsyncs b0 =>
+ => pending mask of b0 becomes 1 1 1 1
+ mask 1 1 1 1 == pending mask 1 1 1 1 =>
+ => all entries with PC=Z get merged into one with mask
+ 1 0 1 0 | 0 1 0 1 = 1 1 1 1
+
+ Nested branches:
+ `
+ A;
+ binit b0;
+ P0 = (id % 2 == 0);
+ @P0 bra Y;
+ X: B;
+ binit b1;
+ P0 = (id == 1);
+ @P0 bra J;
+ U: E;
+ jmp H;
+ J: F;
+ H: bsync b1;
+ jmp Z;
+ Y: C;
+ Z: bsync b0;
+ D;
+ `
+
+ While loop:
+ `
+ binit b0;
+ L: A;
+ P0 = sth;
+ @P0 bra L;
+ bsync b0;
+ `
+
+ results in many splits and many 'blocked' warp entries
+ but works
+
+ Spinlock:
+ `
+ mov mutex, 0
+ binit b0;
+ Y: yield;
+ P0 = atomic_take(mutex);
+ @!P0 bra Y;
+ X: A;
+ atomic_release(mutex);
+ bsync b0;
+ `
+
+ b0 -> initialized to 1 1 1 1
+ yield -> just chooses the same warp entry
+ for one thread -> P0 = 1 and for others P0 = 0
+ => two warp entries 0 1 0 0 (PC = X), 1 0 1 1 (PC = Y)
+ if PC=Y gets chosen, it will yield and become PC=Y+1
+ PC=X will do A, release the lock, and wait at bsync
+ eventually, all threads are waiting at bsync
+ note: if a branch targets a PC that an existing warp entry already has, just move the thread into that warp entry
* Warp Scheduling