|
|
|
@ -46,6 +46,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
|
|
|
|
|
:bEnable: To enable running (after data preparation) |
|
|
|
|
|
|
|
|
|
:b9ArrayWordLen: Number of words used for calculation of scalar (inner) product |
|
|
|
|
|
|
|
|
|
Outputs: |
|
|
|
|
######## |
|
|
|
|
|
|
|
|
@ -76,52 +78,47 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
description=""" |
|
|
|
|
Enable free run |
|
|
|
|
""") |
|
|
|
|
|
|
|
|
|
self.b9ArrayWordLen = CSRStorage(9, reset_less=False, |
|
|
|
|
fields=[CSRField("ArrayWordLen", size=9, description="*Field*: 9-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Word length of array used for calculation |
|
|
|
|
""") |
|
|
|
|
|
|
|
|
|
# Outputs |
|
|
|
|
self.b16Status = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Status", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Processing stati |
|
|
|
|
""") |
|
|
|
|
self.b16Value1 = CSRStorage(16, reset_less=False, |
|
|
|
|
self.b16Value1_1 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Float register 1 |
|
|
|
|
FPU#1 Float register 1 |
|
|
|
|
""") |
|
|
|
|
self.b16Value2 = CSRStorage(16, reset_less=False, |
|
|
|
|
self.b16Value1_2 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Float register 2 |
|
|
|
|
FPU#1 Float register 2 |
|
|
|
|
""") |
|
|
|
|
self.b16Value3 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Float register 3 |
|
|
|
|
""") |
|
|
|
|
self.b16Value4 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Float register 4 |
|
|
|
|
""") |
|
|
|
|
self.b16Value5 = CSRStorage(16, reset_less=False, |
|
|
|
|
self.b16Value2_1 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Float register 5 |
|
|
|
|
FPU#2 Float register 1 |
|
|
|
|
""") |
|
|
|
|
self.b16Value6 = CSRStorage(16, reset_less=False, |
|
|
|
|
self.b16Value2_2 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Float register 6 |
|
|
|
|
FPU#2 Float register 2 |
|
|
|
|
""") |
|
|
|
|
self.b16Result1 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Result1", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Processing result 1 |
|
|
|
|
FPU#1 Processing result |
|
|
|
|
""") |
|
|
|
|
self.b16Result2 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Result2", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Processing result 2 |
|
|
|
|
FPU#2 Processing result |
|
|
|
|
""") |
|
|
|
|
self.bReady = Signal() # To be wired to data pin ... ;) |
|
|
|
|
|
|
|
|
@ -163,7 +160,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
#---------------- bfloat16 FPUs ------------------------------------------------------------- |
|
|
|
|
self.submodules.fpu1 = fpu1 = bfloat16Processor() # Integrate bfloat16 FPU |
|
|
|
|
NFPUCORES=2 # No. of FPUs used |
|
|
|
|
self.submodules.fpu1 = fpu1 = bfloat16Processor() # Integrate bfloat16 FPU |
|
|
|
|
self.submodules.fpu2 = fpu2 = bfloat16Processor() # Integrate another one! |
|
|
|
|
|
|
|
|
|
#---------------- Loaded data testing -------------------------------------------------- |
|
|
|
@ -176,13 +174,15 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
If(self.LU_CacheValid & ~self.Loader_Active, # Enter if not active already |
|
|
|
|
NextValue(self.Loader_Active, True), # Loader up & running |
|
|
|
|
NextValue(self.Loader_Delay, 0), # Reset read delay timer |
|
|
|
|
NextValue(LoadUnit.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel |
|
|
|
|
NextValue(LoadUnit.b9Offset1.storage, LUCacheSize - 1), # Adjust offset to read sentinel |
|
|
|
|
NextValue(LoadUnit.b9Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array |
|
|
|
|
NextValue(self.b16Result1.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.b16Result2.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.b16Status.storage[0], True), # Current status |
|
|
|
|
NextValue(self.b16Value1.storage, 0), # Nothing loaded so far ... |
|
|
|
|
NextValue(self.b16Value2.storage, 0), # Nothing loaded so far ... |
|
|
|
|
NextValue(self.b16Value3.storage, 0), # Nothing loaded so far ... |
|
|
|
|
NextValue(self.b16Value1_1.storage, 0), # Nothing loaded so far ... |
|
|
|
|
NextValue(self.b16Value1_2.storage, 0), |
|
|
|
|
NextValue(self.b16Value2_1.storage, 0), |
|
|
|
|
NextValue(self.b16Value2_2.storage, 0), |
|
|
|
|
NextValue(self.bReady, False), # LED off! |
|
|
|
|
NextState("Loader_LOAD1") |
|
|
|
|
).Elif(~self.bEnable.storage, # Externally aborted? |
|
|
|
@ -192,8 +192,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_LOAD1", |
|
|
|
|
NextValue(self.b16Status.storage[1], True), # Current status added |
|
|
|
|
If(LoadUnit.b32Data.storage == self.b32Sentinel.storage, # Valid last entry? |
|
|
|
|
NextValue(LoadUnit.b9Offset.storage, 0), # 1st value offset preparation |
|
|
|
|
If(LoadUnit.b32Data1.storage == self.b32Sentinel.storage, # Valid last entry? |
|
|
|
|
NextValue(LoadUnit.b9Offset1.storage, 0), # 1st value offset preparation |
|
|
|
|
NextState("Loader_LOAD2") |
|
|
|
|
).Elif(~self.bEnable.storage, # Enable withdrawn? |
|
|
|
|
NextState("Loader_IDLE") # Abort! |
|
|
|
@ -202,19 +202,29 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
Loader_fsm.act("Loader_LOAD2", |
|
|
|
|
NextValue(self.b16Status.storage[2], True), # Current status added |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, |
|
|
|
|
NextValue(self.b16Value1.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 1st date |
|
|
|
|
NextValue(self.b16Value2.storage, LoadUnit.b32Data.storage >> 16), # Pick 2nd date |
|
|
|
|
NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])), |
|
|
|
|
NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])), |
|
|
|
|
NextValue(LoadUnit.b9Offset.storage, 1), # 2nd value offset preparation |
|
|
|
|
NextValue(self.Loader_Delay, 0), # Reset delay |
|
|
|
|
NextState("Loader_LOAD3") |
|
|
|
|
# FPU#1 |
|
|
|
|
NextValue(self.b16Value1_1.storage, LoadUnit.b32Data1.storage & 0xFFFF), # Pick 1st date |
|
|
|
|
NextValue(self.b16Value1_2.storage, LoadUnit.b32Data1.storage >> 16), # Pick 2nd date |
|
|
|
|
NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[0:16])), |
|
|
|
|
NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[16:32])), |
|
|
|
|
NextValue(LoadUnit.b9Offset1.storage, LoadUnit.b9Offset1.storage + 1), # Move on to next entry |
|
|
|
|
# FPU#2 |
|
|
|
|
NextValue(self.b16Value2_1.storage, LoadUnit.b32Data2.storage & 0xFFFF), # Pick 1st date |
|
|
|
|
NextValue(self.b16Value2_2.storage, LoadUnit.b32Data2.storage >> 16), # Pick 2nd date |
|
|
|
|
NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[0:16])), |
|
|
|
|
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[16:32])), |
|
|
|
|
NextValue(LoadUnit.b9Offset2.storage, LoadUnit.b9Offset2.storage + 1), # Move on to next entry |
|
|
|
|
|
|
|
|
|
#NextValue(self.Loader_Delay, 0), # Reset delay |
|
|
|
|
#NextState("Loader_LOAD3") |
|
|
|
|
NextState("Loader_EXEC1") |
|
|
|
|
).Else( # MEM wait cycles |
|
|
|
|
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_LOAD3", |
|
|
|
|
NextValue(self.b16Status.storage[3], True), # Current status added |
|
|
|
|
""" |
|
|
|
|
Loader_fsm.act("Loader_LOAD3", |
|
|
|
|
NextValue(self.b16Status.storage[3], True), # Current status added |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, |
|
|
|
|
NextValue(self.b16Value3.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 3rd date |
|
|
|
|
NextValue(self.b16Value4.storage, LoadUnit.b32Data.storage >> 16), # Pick 4th date |
|
|
|
@ -225,9 +235,9 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
NextState("Loader_LOAD4") |
|
|
|
|
).Else( # MEM wait cycles |
|
|
|
|
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_LOAD4", |
|
|
|
|
Loader_fsm.act("Loader_LOAD4", |
|
|
|
|
NextValue(self.b16Status.storage[4], True), # Current status added |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, |
|
|
|
|
NextValue(self.b16Value5.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 5th date |
|
|
|
@ -237,25 +247,26 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
NextState("Loader_EXEC1") |
|
|
|
|
).Else( # MEM wait cycles |
|
|
|
|
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_EXEC1", |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
""" |
|
|
|
|
Loader_fsm.act("Loader_EXEC1", |
|
|
|
|
NextValue(self.b16Status.storage[5], True), # Current status added |
|
|
|
|
NextValue(fpu1.fadd, True), # This command requested |
|
|
|
|
NextValue(fpu2.fnmsub, True), # This command requested |
|
|
|
|
NextValue(fpu1.fadd, True), # 1st ADD requested |
|
|
|
|
NextValue(fpu2.fadd, True), |
|
|
|
|
NextValue(fpu1.fready, False), # Engage trigger FPU#1 |
|
|
|
|
NextValue(fpu2.fready, False), # Engage trigger FPU#2 |
|
|
|
|
NextState("Loader_EXEC2") |
|
|
|
|
NextState("Loader_EXEC2") |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_EXEC2", |
|
|
|
|
Loader_fsm.act("Loader_EXEC2", |
|
|
|
|
NextValue(self.b16Status.storage[6], True), # Current status added |
|
|
|
|
If(fpu1.fready & fpu2.fready, |
|
|
|
|
NextValue(fpu1.fadd, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(fpu2.fnmsub, False), # Clear command request FPU#2 |
|
|
|
|
NextValue(fpu2.fadd, False), # Clear command request FPU#2 |
|
|
|
|
NextValue(self.b16Result1.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!) |
|
|
|
|
NextValue(self.b16Result2.storage, fpu2.fresult[16:32]), # Pick result (little endian, high word!) |
|
|
|
|
NextValue(self.b16Status.storage[15], True), # Indicate readyness ... |
|
|
|
|
NextValue(self.bReady, True), # Indicate readyness (LED on!) (TODO: Remove!) |
|
|
|
|
NextState("Loader_IDLE") |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|