|
|
|
@ -169,16 +169,15 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
self.submodules += Loader_fsm |
|
|
|
|
|
|
|
|
|
self.Loader_Delay = Signal(32, reset_less=True) |
|
|
|
|
self.Loader_Active = Signal() |
|
|
|
|
self.Loader_Active = Signal() |
|
|
|
|
Loader_fsm.act("Loader_IDLE", |
|
|
|
|
If(self.LU_CacheValid & ~self.Loader_Active, # Enter if not active already |
|
|
|
|
NextValue(self.Loader_Active, True), # Loader up & running |
|
|
|
|
NextValue(self.Loader_Delay, 0), # Reset read delay timer |
|
|
|
|
NextValue(LoadUnit.b9Offset1.storage, LUCacheSize - 1), # Adjust offset to read sentinel |
|
|
|
|
NextValue(LoadUnit.b9Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array |
|
|
|
|
NextValue(LoadUnit.b9Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array |
|
|
|
|
NextValue(self.b16Result1.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.b16Result2.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.b16Status.storage[0], True), # Current status |
|
|
|
|
NextValue(self.b16Result2.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.b16Value1_1.storage, 0), # Nothing loaded so far ... |
|
|
|
|
NextValue(self.b16Value1_2.storage, 0), |
|
|
|
|
NextValue(self.b16Value2_1.storage, 0), |
|
|
|
@ -191,7 +190,7 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_LOAD1", |
|
|
|
|
NextValue(self.b16Status.storage[1], True), # Current status added |
|
|
|
|
NextValue(self.b16Status.storage[0], True), # Current status added |
|
|
|
|
If(LoadUnit.b32Data1.storage == self.b32Sentinel.storage, # Valid last entry? |
|
|
|
|
NextValue(LoadUnit.b9Offset1.storage, 0), # 1st value offset preparation |
|
|
|
|
NextState("Loader_LOAD2") |
|
|
|
@ -199,9 +198,11 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
NextState("Loader_IDLE") # Abort! |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
#-----> LOOP ENTRY ! (2nd loop onward: fs3 already prepared!) |
|
|
|
|
Loader_fsm.act("Loader_LOAD2", |
|
|
|
|
NextValue(self.b16Status.storage[2], True), # Current status added |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, |
|
|
|
|
NextValue(self.b16Status.storage[1], True), # Current status added |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, # Required only for 1st entry ... |
|
|
|
|
# FPU#1 |
|
|
|
|
NextValue(self.b16Value1_1.storage, LoadUnit.b32Data1.storage & 0xFFFF), # Pick 1st date |
|
|
|
|
NextValue(self.b16Value1_2.storage, LoadUnit.b32Data1.storage >> 16), # Pick 2nd date |
|
|
|
@ -215,56 +216,59 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[16:32])), |
|
|
|
|
NextValue(LoadUnit.b9Offset2.storage, LoadUnit.b9Offset2.storage + 1), # Move on to next entry |
|
|
|
|
|
|
|
|
|
#NextValue(self.Loader_Delay, 0), # Reset delay |
|
|
|
|
#NextState("Loader_LOAD3") |
|
|
|
|
NextState("Loader_EXEC1") |
|
|
|
|
).Else( # MEM wait cycles |
|
|
|
|
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
""" |
|
|
|
|
Loader_fsm.act("Loader_LOAD3", |
|
|
|
|
NextValue(self.b16Status.storage[3], True), # Current status added |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, |
|
|
|
|
NextValue(self.b16Value3.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 3rd date |
|
|
|
|
NextValue(self.b16Value4.storage, LoadUnit.b32Data.storage >> 16), # Pick 4th date |
|
|
|
|
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])), |
|
|
|
|
NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])), |
|
|
|
|
NextValue(LoadUnit.b9Offset.storage, 2), # 3rd value offset preparation |
|
|
|
|
NextValue(self.Loader_Delay, 0), # Reset delay |
|
|
|
|
NextState("Loader_LOAD4") |
|
|
|
|
).Else( # MEM wait cycles |
|
|
|
|
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_LOAD4", |
|
|
|
|
NextValue(self.b16Status.storage[4], True), # Current status added |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, |
|
|
|
|
NextValue(self.b16Value5.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 5th date |
|
|
|
|
NextValue(self.b16Value6.storage, LoadUnit.b32Data.storage >> 16), # Pick 6th date |
|
|
|
|
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])), |
|
|
|
|
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])), |
|
|
|
|
NextState("Loader_EXEC1") |
|
|
|
|
).Else( # MEM wait cycles |
|
|
|
|
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
""" |
|
|
|
|
) |
|
|
|
|
# State Loader_EXEC1: sets the FPU command request flags, clears fready on both FPUs and moves on to Loader_EXEC2. |
# NOTE(review): this span sets two status bits ([5] and [2]) and requests fadd as well as fmul/fmadd; |
# it looks like pre- and post-change lines of the diff are interleaved here - confirm which set is current. |
Loader_fsm.act("Loader_EXEC1", |
|
|
|
|
NextValue(self.b16Status.storage[5], True), # Current status added |
|
|
|
|
NextValue(fpu1.fadd, True), # 1st ADD requested |
|
|
|
|
NextValue(fpu2.fadd, True), |
|
|
|
|
NextValue(self.b16Status.storage[2], True), # Current status added |
|
|
|
|
# Only request a multiply while the first-array offset is still below the configured word length. |
If(LoadUnit.b9Offset1.storage < self.b9ArrayWordLen.storage, #(LUCacheSize >> 1), |
|
|
|
|
If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1! |
|
|
|
|
NextValue(fpu1.fmul, True), # 1st iteration: plain MUL requested (nothing accumulated yet) |
|
|
|
|
NextValue(fpu2.fmul, True), |
|
|
|
|
).Else( |
|
|
|
|
NextValue(fpu1.fmadd, True), # 2nd ... last MUL/ADD requested |
|
|
|
|
NextValue(fpu2.fmadd, True), |
|
|
|
|
) |
|
|
|
|
), |
|
|
|
|
NextValue(fpu1.fready, False), # Engage trigger FPU#1 |
|
|
|
|
NextValue(fpu2.fready, False), # Engage trigger FPU#2 |
|
|
|
|
NextState("Loader_EXEC2") |
|
|
|
|
) |
|
|
|
|
# State Loader_EXEC2: waits until both FPUs report fready, clears the command request flags, |
# feeds the upper 16 bits of each FPU result back into fs3, then either loops to Loader_LOAD2 |
# or hands both results to FPU#1 (fs1/fs2) and continues with Loader_EXEC3. |
# NOTE(review): two status bits ([6] and [3]) are set here; this looks like old and new diff lines |
# interleaved - confirm which one is current. |
Loader_fsm.act("Loader_EXEC2", |
|
|
|
|
NextValue(self.b16Status.storage[6], True), # Current status added |
|
|
|
|
NextValue(self.b16Status.storage[3], True), # Current status added |
|
|
|
|
# Nothing below happens until both FPUs have raised fready again. |
If(fpu1.fready & fpu2.fready, |
|
|
|
|
NextValue(fpu1.fadd, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(fpu2.fadd, False), # Clear command request FPU#2 |
|
|
|
|
# Same offset test as in Loader_EXEC1 selects which multiply request (fmul or fmadd) gets cleared. |
If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1! |
|
|
|
|
NextValue(fpu1.fmul, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(fpu2.fmul, False), # Clear command request FPU#2 |
|
|
|
|
).Else( |
|
|
|
|
NextValue(fpu1.fmadd, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(fpu2.fmadd, False), # Clear command request FPU#2 |
|
|
|
|
), |
|
|
|
|
# fs3 = 16 zero bits (low half) concatenated with fresult[16:32] (high half). |
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu1.fresult[16:32])), # Sum will be used for fmadd.s |
|
|
|
|
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu2.fresult[16:32])), # Sum will be used for fmadd.s |
|
|
|
|
# Loop back for the next entry while the offset is below the configured word length. |
If(LoadUnit.b9Offset1.storage < self.b9ArrayWordLen.storage, #(LUCacheSize >> 1), # Words 0 .. 255 |
|
|
|
|
NextState("Loader_LOAD2") |
|
|
|
|
).Else( # Finally prepare ADD both result sums (on FPU#1 only!) |
|
|
|
|
NextValue(fpu1.fs1, fpu1.fresult), |
|
|
|
|
NextValue(fpu1.fs2, fpu2.fresult), |
|
|
|
|
NextState("Loader_EXEC3") |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
# State Loader_EXEC3: sets status bit 4, requests an ADD on FPU#1 only, clears its fready flag |
# and moves on to Loader_EXEC4. The operands fs1/fs2 are loaded in the Else branch of Loader_EXEC2. |
Loader_fsm.act("Loader_EXEC3", |
|
|
|
|
NextValue(self.b16Status.storage[4], True), # Current status added |
|
|
|
|
NextValue(fpu1.fadd, True), # Final ADD requested |
|
|
|
|
NextValue(fpu1.fready, False), # Engage trigger FPU#1 (only!) |
|
|
|
|
NextState("Loader_EXEC4") |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_EXEC4", |
|
|
|
|
NextValue(self.b16Status.storage[5], True), # Current status added |
|
|
|
|
If(fpu1.fready, |
|
|
|
|
NextValue(fpu1.fadd, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(self.b16Result1.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!) |
|
|
|
|
NextValue(self.b16Result2.storage, fpu2.fresult[16:32]), # Pick result (little endian, high word!) |
|
|
|
|
NextValue(self.b16Result2.storage, fpu2.fresult[16:32]), # Useless (control only ...) |
|
|
|
|
NextValue(self.b16Status.storage[15], True), # Indicate readyness ... |
|
|
|
|
NextValue(self.bReady, True), # Indicate readyness (LED on!) (TODO: Remove!) |
|
|
|
|
NextState("Loader_IDLE") |
|
|
|
|