|
|
|
@ -95,26 +95,28 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
description=""" |
|
|
|
|
FPU states: Low FPU#1, High FPU#2 |
|
|
|
|
""") |
|
|
|
|
""" TODO: Remove! |
|
|
|
|
self.b16Value1_1 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
description=" |
|
|
|
|
FPU#1 Float register 1 |
|
|
|
|
""") |
|
|
|
|
") |
|
|
|
|
self.b16Value1_2 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
FPU#1 Float register 2 |
|
|
|
|
""") |
|
|
|
|
description=" |
|
|
|
|
#FPU#1 Float register 2 |
|
|
|
|
") |
|
|
|
|
self.b16Value2_1 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
FPU#2 Float register 1 |
|
|
|
|
""") |
|
|
|
|
description=" |
|
|
|
|
#FPU#2 Float register 1 |
|
|
|
|
") |
|
|
|
|
self.b16Value2_2 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
FPU#2 Float register 2 |
|
|
|
|
""") |
|
|
|
|
description=" " " |
|
|
|
|
#FPU#2 Float register 2 |
|
|
|
|
" " ") |
|
|
|
|
""" |
|
|
|
|
self.b16Result1 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Result1", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
@ -183,10 +185,10 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
NextValue(LoadUnit.b9Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array |
|
|
|
|
NextValue(self.b16Result1.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.b16Result2.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.b16Value1_1.storage, 0), # Nothing loaded so far ... |
|
|
|
|
NextValue(self.b16Value1_2.storage, 0), |
|
|
|
|
NextValue(self.b16Value2_1.storage, 0), |
|
|
|
|
NextValue(self.b16Value2_2.storage, 0), |
|
|
|
|
#NextValue(self.b16Value1_1.storage, 0), # TODO: Remove! Nothing loaded so far ... |
|
|
|
|
#NextValue(self.b16Value1_2.storage, 0), |
|
|
|
|
#NextValue(self.b16Value2_1.storage, 0), |
|
|
|
|
#NextValue(self.b16Value2_2.storage, 0), |
|
|
|
|
NextValue(self.bReady, False), # LED off! |
|
|
|
|
NextState("Loader_LOAD1") |
|
|
|
|
).Elif(~self.bEnable.storage, # Externally aborted? |
|
|
|
@ -209,14 +211,14 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
NextValue(self.b16Status.storage[1], True), # Current status added |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, # Required only for 1st entry ... |
|
|
|
|
# FPU#1 |
|
|
|
|
NextValue(self.b16Value1_1.storage, LoadUnit.b32Data1.storage & 0xFFFF), # Pick 1st date |
|
|
|
|
NextValue(self.b16Value1_2.storage, LoadUnit.b32Data1.storage >> 16), # Pick 2nd date |
|
|
|
|
#NextValue(self.b16Value1_1.storage, LoadUnit.b32Data1.storage & 0xFFFF), # TODO: Remove! Pick 1st date |
|
|
|
|
#NextValue(self.b16Value1_2.storage, LoadUnit.b32Data1.storage >> 16), # TODO: Remove! Pick 2nd date |
|
|
|
|
NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[0:16])), |
|
|
|
|
NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[16:32])), |
|
|
|
|
NextValue(LoadUnit.b9Offset1.storage, LoadUnit.b9Offset1.storage + 1), # Move on to next entry |
|
|
|
|
# FPU#2 |
|
|
|
|
NextValue(self.b16Value2_1.storage, LoadUnit.b32Data2.storage & 0xFFFF), # Pick 1st date |
|
|
|
|
NextValue(self.b16Value2_2.storage, LoadUnit.b32Data2.storage >> 16), # Pick 2nd date |
|
|
|
|
#NextValue(self.b16Value2_1.storage, LoadUnit.b32Data2.storage & 0xFFFF), # TODO: Remove! Pick 1st date |
|
|
|
|
#NextValue(self.b16Value2_2.storage, LoadUnit.b32Data2.storage >> 16), # TODO: Remove! Pick 2nd date |
|
|
|
|
NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[0:16])), |
|
|
|
|
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[16:32])), |
|
|
|
|
NextValue(LoadUnit.b9Offset2.storage, LoadUnit.b9Offset2.storage + 1), # Move on to next entry |
|
|
|
@ -228,7 +230,6 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_EXEC1", |
|
|
|
|
NextValue(self.b16Status.storage[2], True), # Current status added |
|
|
|
|
#If(LoadUnit.b9Offset1.storage < self.b9ArrayWordLen.storage, #(LUCacheSize >> 1), |
|
|
|
|
If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1! |
|
|
|
|
NextValue(fpu1.fmul, True), # 1st ADD requested |
|
|
|
|
NextValue(fpu2.fmul, True), |
|
|
|
@ -237,8 +238,7 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
NextValue(fpu2.fmadd, True), |
|
|
|
|
), |
|
|
|
|
NextValue(fpu1.fready, False), # Engage trigger FPU#1 |
|
|
|
|
NextValue(fpu2.fready, False), # Engage trigger FPU#2 |
|
|
|
|
#), |
|
|
|
|
NextValue(fpu2.fready, False), # Engage trigger FPU#2 |
|
|
|
|
NextState("Loader_EXEC2") |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_EXEC2", |
|
|
|
@ -249,13 +249,13 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1! (Actually: Entry #0) |
|
|
|
|
NextValue(fpu1.fmul, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(fpu2.fmul, False), # Clear command request FPU#2 |
|
|
|
|
).Else( # Entries 1..len |
|
|
|
|
).Else( # Entries 1 .. (maxlen-1) |
|
|
|
|
NextValue(fpu1.fmadd, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(fpu2.fmadd, False), # Clear command request FPU#2 |
|
|
|
|
), |
|
|
|
|
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu1.fresult[16:32])), # Sum will be used for fmadd.s |
|
|
|
|
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu2.fresult[16:32])), # Sum will be used for fmadd.s |
|
|
|
|
If(LoadUnit.b9Offset1.storage < self.b9ArrayWordLen.storage, #(LUCacheSize >> 1), # Words 0 .. 255 |
|
|
|
|
If(LoadUnit.b9Offset1.storage < self.b9ArrayWordLen.storage, # Words 0 .. 255 |
|
|
|
|
NextState("Loader_LOAD2") |
|
|
|
|
).Else( # Finally prepare ADD both result sums (on FPU#1 only!) |
|
|
|
|
NextValue(fpu1.fs1, fpu1.fresult), |
|
|
|
|