add some more notes on control flow

author: Davit Grigoryan <[email protected]> 2024-10-07 20:57:40 -0700
committer: Davit Grigoryan <[email protected]> 2024-10-07 20:57:40 -0700
commit: 3bd354933a0adadacfdcdb87dae50c2222251c32 (patch)
tree: 5834f89ed1c9d891d0560d64151835c154e9f361
parent: 6278c41e2771c7b8d63a3cc5e1f8313fd52cdb87 (diff)
1 files changed, 85 insertions, 0 deletions
diff --git a/spec/notes.txt b/spec/notes.txt
index dc753db..fdfe01d 100644
--- a/spec/notes.txt
+++ b/spec/notes.txt
@@ -177,6 +177,91 @@
 
 			Similar when waiting for b1 barrier
 			```
+	3) Custom Control Flow Unit (mix of two?):
+			branch => do another warp split
+			when the barrier mask is empty / all threads have arrived at bsync ->
+			-> OR (|) together the masks of the warp split table entries that have the same PC (bsync+1),
+			add a new entry with the combined mask to the warp split table, and remove the old ones
+
+			=> no need for rPC column ??
+
+			Examples:
+
+		One branch:
+			`
+			A;
+			binit b0;
+			P0 = (id % 2 == 0);
+			@P0 bra Y;
+		X:	B;
+			jmp Z;
+		Y:	C;
+		Z:	bsync b0;
+			D;
+			`
+			b0 = mask 1 1 1 1 and pending mask 0 0 0 0
+			one entry w/ mask 1 1 1 1 (PC=binit thing)
+			after bra -> two entries 0 1 0 1 (PC=X) and 1 0 1 0 (PC=Y)
+			assume PC=Y executes and then executes PC=Z (bsync) =>
+			=> pending mask becomes 1 0 1 0 and it halts
+			PC=X then executes, jumps to Z and bsyncs b0 =>
+			=> pending mask of b0 becomes 1 1 1 1
+			mask 1 1 1 1 == pending mask 1 1 1 1 =>
+			=> all entries with PC=Z get merged into one with mask
+			1 0 1 0 | 0 1 0 1 = 1 1 1 1
+
+		Nested branches:
+			`
+			A;
+			binit b0;
+			P0 = (id % 2 == 0);
+			@P0 bra Y;
+		X:	B;
+			binit b1;
+			P0 = (id == 1);
+			@P0 bra J;
+		U:	E;
+			jmp H;
+		J:	F;
+		H:	bsync b1;
+			jmp Z;
+		Y:	C;
+		Z:	bsync b0;
+			D;
+			`
+
+		While loop:
+			`
+			binit b0;
+		L:	A;
+			P0 = sth;
+			@P0 bra L;
+			bsync b0;
+			`
+
+			results in many splits and many 'blocked' warp entries
+			but works
+
+		Spinlock:
+			`
+			mov mutex, 0;
+			binit b0;
+		Y:	yield;
+			P0 = atomic_take(mutex);
+			@!P0 bra Y;
+		X:	A;
+			atomic_release(mutex);
+			bsync b0;
+			`
+
+			b0 -> initialized to 1 1 1 1
+			yield -> just chooses the same warp entry
+			for one thread -> P0 = 1 and for others P0 = 0
+			=> two warp entries 0 1 0 0 (PC = X), 1 0 1 1 (PC = Y)
+			if PC=Y gets chosen, it will yield and become PC=Y+1
+			PC=X will do A, release the lock, and wait at bsync
+			eventually, all threads are waiting at bsync
+			note: if a branch targets a PC that an existing warp entry already has, just move the thread into that warp entry
 
 
 * Warp Scheduling
author	Davit Grigoryan <[email protected]>	2024-10-07 20:57:40 -0700
committer	Davit Grigoryan <[email protected]>	2024-10-07 20:57:40 -0700
commit	3bd354933a0adadacfdcdb87dae50c2222251c32 (patch)
tree	5834f89ed1c9d891d0560d64151835c154e9f361
parent	6278c41e2771c7b8d63a3cc5e1f8313fd52cdb87 (diff)