|
|
|
@ -98,11 +98,31 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
description=""" |
|
|
|
|
Float register 3 |
|
|
|
|
""") |
|
|
|
|
self.b16Result = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Result", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
self.b16Value4 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Float register 4 |
|
|
|
|
""") |
|
|
|
|
self.b16Value5 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Processing result |
|
|
|
|
Float register 5 |
|
|
|
|
""") |
|
|
|
|
self.b16Value6 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Float register 6 |
|
|
|
|
""") |
|
|
|
|
self.b16Result1 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Result1", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Processing result 1 |
|
|
|
|
""") |
|
|
|
|
self.b16Result2 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Result2", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Processing result 2 |
|
|
|
|
""") |
|
|
|
|
self.bReady = Signal() # To be wired to data pin ... ;) |
|
|
|
|
|
|
|
|
|
# Local vars. |
|
|
|
@ -144,6 +164,7 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
|
|
|
|
|
#---------------- bfloat16 FPUs ------------------------------------------------------------- |
|
|
|
|
self.submodules.fpu1 = fpu1 = bfloat16Processor() # Integrate bfloat16 FPU |
|
|
|
|
self.submodules.fpu2 = fpu2 = bfloat16Processor() # Integrate another one! |
|
|
|
|
|
|
|
|
|
#---------------- Loaded data testing -------------------------------------------------- |
|
|
|
|
Loader_fsm = FSM(reset_state="Loader_IDLE") # FSM starts idling ... |
|
|
|
@ -156,7 +177,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
NextValue(self.Loader_Active, True), # Loader up & running |
|
|
|
|
NextValue(self.Loader_Delay, 0), # Reset read delay timer |
|
|
|
|
NextValue(LoadUnit.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel |
|
|
|
|
NextValue(self.b16Result.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.b16Result1.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.b16Result2.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.b16Status.storage[0], True), # Current status |
|
|
|
|
NextValue(self.b16Value1.storage, 0), # Nothing loaded so far ... |
|
|
|
|
NextValue(self.b16Value2.storage, 0), # Nothing loaded so far ... |
|
|
|
@ -195,7 +217,23 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
NextValue(self.b16Status.storage[3], True), # Current status added |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, |
|
|
|
|
NextValue(self.b16Value3.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 3rd datum (low 16 bits of the loaded word) |
|
|
|
|
NextValue(self.b16Value4.storage, LoadUnit.b32Data.storage >> 16), # Pick 4th datum (bits 16 and above of the loaded word) |
|
|
|
|
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])), |
|
|
|
|
NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])), |
|
|
|
|
NextValue(LoadUnit.b9Offset.storage, 2), # 3rd value offset preparation |
|
|
|
|
NextValue(self.Loader_Delay, 0), # Reset delay |
|
|
|
|
NextState("Loader_LOAD4") |
|
|
|
|
).Else( # MEM wait cycles |
|
|
|
|
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_LOAD4", |
|
|
|
|
NextValue(self.b16Status.storage[4], True), # Current status added |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, |
|
|
|
|
NextValue(self.b16Value5.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 5th datum (low 16 bits of the loaded word) |
|
|
|
|
NextValue(self.b16Value6.storage, LoadUnit.b32Data.storage >> 16), # Pick 6th datum (bits 16 and above of the loaded word) |
|
|
|
|
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])), |
|
|
|
|
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])), |
|
|
|
|
NextState("Loader_EXEC1") |
|
|
|
|
).Else( # MEM wait cycles |
|
|
|
|
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment |
|
|
|
@ -203,15 +241,19 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_EXEC1", |
|
|
|
|
NextValue(self.b16Status.storage[5], True), # Current status added |
|
|
|
|
NextValue(fpu1.fnmsub, True), # This command requested |
|
|
|
|
NextValue(fpu1.fready, False), # Engage trigger |
|
|
|
|
NextValue(fpu1.fadd, True), # This command requested |
|
|
|
|
NextValue(fpu2.fnmsub, True), # This command requested |
|
|
|
|
NextValue(fpu1.fready, False), # Engage trigger FPU#1 |
|
|
|
|
NextValue(fpu2.fready, False), # Engage trigger FPU#2 |
|
|
|
|
NextState("Loader_EXEC2") |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_EXEC2", |
|
|
|
|
NextValue(self.b16Status.storage[6], True), # Current status added |
|
|
|
|
If(fpu1.fready, |
|
|
|
|
NextValue(fpu1.fnmsub, False), # Clear command request |
|
|
|
|
NextValue(self.b16Result.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!) |
|
|
|
|
If(fpu1.fready & fpu2.fready, |
|
|
|
|
NextValue(fpu1.fadd, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(fpu2.fnmsub, False), # Clear command request FPU#2 |
|
|
|
|
NextValue(self.b16Result1.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!) |
|
|
|
|
NextValue(self.b16Result2.storage, fpu2.fresult[16:32]), # Pick result (little endian, high word!) |
|
|
|
|
NextValue(self.b16Status.storage[15], True), # Indicate readiness ... |
|
|
|
|
NextValue(self.bReady, True), # Indicate readiness (LED on!) (TODO: Remove!) |
|
|
|
|
NextState("Loader_IDLE") |
|
|
|
|