|
|
|
@@ -42,11 +42,11 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
|
|
|
|
|
|
|
|
|
:b32DRAMAddress: New DRAM address from where to load into local memory |
|
|
|
|
|
|
|
|
|
:b32Sentinel: Write control word to last address (same as [b32DRAMAddress+511] value) |
|
|
|
|
:b32Sentinel: Write control word to last address (same as [b32DRAMAddress+LEN-1] value) |
|
|
|
|
|
|
|
|
|
:bEnable: To enable running (after data preparation) |
|
|
|
|
|
|
|
|
|
:b9ArrayWordLen: Number of words used for calculation of scalar (inner) product |
|
|
|
|
:b10ArrayWordLen: Number of words used for calculation of scalar (inner) product |
|
|
|
|
|
|
|
|
|
Outputs: |
|
|
|
|
######## |
|
|
|
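# A minimal host-side sketch of how the registers documented above might be
# driven over the LiteX bridge (litex_server + RemoteClient). The register
# accessor names are illustrative only, since the generated names depend on how
# this core is instantiated in the SoC, and the polled status bit is a
# placeholder because the completion indicator is not defined in this hunk.
from litex import RemoteClient

bus = RemoteClient()
bus.open()

DRAM_BASE   = 0x40000000      # assumed location of the prepared data block
ARRAY_WORDS = 512             # assumed number of 32-bit words per run
SENTINEL    = 0xCAFEF00D      # assumed control word, also stored at [DRAM_BASE+LEN-1]

bus.regs.nn_b32dramaddress.write(DRAM_BASE)     # where the load unit fetches from
bus.regs.nn_b32sentinel.write(SENTINEL)         # must match the block's last word
bus.regs.nn_b10arraywordlen.write(ARRAY_WORDS)  # words per scalar-product run
bus.regs.nn_benable.write(1)                    # start (after data preparation)

while bus.regs.nn_b16status.read() & 0x0100 == 0:   # poll; bit position assumed
    pass

result_bf16 = bus.regs.nn_b16result1.read()     # bfloat16 result pattern
bus.regs.nn_benable.write(0)
bus.close()
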
@@ -78,8 +78,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
|
|
|
|
description=""" |
|
|
|
|
Enable free run |
|
|
|
|
""") |
|
|
|
|
self.b9ArrayWordLen = CSRStorage(9, reset_less=False, |
|
|
|
|
fields=[CSRField("ArrayWordLen", size=9, description="*Field*: 9-Bit value")], |
|
|
|
|
self.b10ArrayWordLen = CSRStorage(10, reset_less=False, |
|
|
|
|
fields=[CSRField("ArrayWordLen", size=10, description="*Field*: 10-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Word length of array used for calculation |
|
|
|
|
""") |
|
|
|
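# A small sketch (assumption: LUCacheSize stays a power of two) of deriving the
# widths from LUCacheSize with migen's log2_int, instead of hand-editing 9-bit
# fields into 10-bit ones as this hunk does. The constants below could then be
# passed to CSRStorage()/CSRField()/Signal() in place of the literal widths.
from migen import log2_int

LUCacheSize = 1024                        # words, as implied by the 9 -> 10 bit changes
OFFSET_BITS = log2_int(LUCacheSize)       # 10, covers offsets 0 .. LUCacheSize-1
DELAY_BITS  = log2_int(2 * LUCacheSize)   # 11, matches the "2048 max." LU_CacheDelay comment

print(OFFSET_BITS, DELAY_BITS)            # 10 11
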
@@ -95,28 +95,6 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
|
|
|
|
description=""" |
|
|
|
|
FPU states: Low FPU#1, High FPU#2 |
|
|
|
|
""") |
|
|
|
|
""" TODO: Remove! |
|
|
|
|
self.b16Value1_1 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=" |
|
|
|
|
FPU#1 Float register 1 |
|
|
|
|
") |
|
|
|
|
self.b16Value1_2 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=" |
|
|
|
|
#FPU#1 Float register 2 |
|
|
|
|
") |
|
|
|
|
self.b16Value2_1 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=" |
|
|
|
|
#FPU#2 Float register 1 |
|
|
|
|
") |
|
|
|
|
self.b16Value2_2 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=" " " |
|
|
|
|
#FPU#2 Float register 2 |
|
|
|
|
" " ") |
|
|
|
|
""" |
|
|
|
|
self.b16Result1 = CSRStorage(16, reset_less=False, |
|
|
|
|
fields=[CSRField("Result1", size=16, description="*Field*: 16-Bit value")], |
|
|
|
|
description=""" |
|
|
|
@@ -130,13 +108,13 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
|
|
|
|
self.bReady = Signal() # To be wired to data pin ... ;) |
|
|
|
|
|
|
|
|
|
# Local vars. |
|
|
|
|
# - none yet - |
|
|
|
|
self.b10CurrentOffest = Signal(10, reset_less=True) |
|
|
|
|
|
|
|
|
|
#---------------- Load unit (LU) ------------------------------------------------------------- |
|
|
|
|
LU_fsm = FSM(reset_state="LU_IDLE") # FSM starts idling ... |
|
|
|
|
self.submodules += LU_fsm |
|
|
|
|
|
|
|
|
|
self.LU_CacheOffset = Signal(9, reset_less=True) # 0..511 log2_int(LUCacheSize, False)) # Cache reading offset (0..(Size-1))=>Bits) |
|
|
|
|
self.LU_CacheOffset = Signal(10, reset_less=True) # Cache reading offset, 0..1023 (width = log2_int(LUCacheSize))
|
|
|
|
self.LU_CacheValid = Signal() # Indicate loaded LU cache |
|
|
|
|
self.LU_CacheDelay = Signal(11, reset_less=True) # Evaluate load length in cycles (2048 max.) |
|
|
|
|
LU_fsm.act("LU_IDLE", # If cache not valid fill it! |
|
|
|
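# A minimal simulation sketch of the cache-valid handshake the LU FSM relies
# on. The DUT here is a stand-in that just raises `valid` a fixed number of
# cycles after `start`; it is not the real load unit, only a way to show how
# such a handshake can be unit-tested with migen's simulator.
from migen import Module, Signal, If
from migen.sim import run_simulation

class FakeCacheFill(Module):
    def __init__(self, fill_cycles=8):
        self.start = Signal()
        self.valid = Signal()
        counter = Signal(max=fill_cycles + 1)
        self.sync += If(self.start & ~self.valid,
            counter.eq(counter + 1),
            If(counter == fill_cycles - 1, self.valid.eq(1))
        )

def tb(dut):
    yield dut.start.eq(1)
    cycles = 0
    while not (yield dut.valid):
        yield
        cycles += 1
    print("valid after", cycles, "cycles")

dut = FakeCacheFill()
run_simulation(dut, tb(dut))
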
@@ -181,14 +159,11 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
|
|
|
|
If(self.LU_CacheValid & ~self.Loader_Active, # Enter if not active already |
|
|
|
|
NextValue(self.Loader_Active, True), # Loader up & running |
|
|
|
|
NextValue(self.Loader_Delay, 0), # Reset read delay timer |
|
|
|
|
NextValue(LoadUnit.b9Offset1.storage, LUCacheSize - 1), # Adjust offset to read sentinel |
|
|
|
|
NextValue(LoadUnit.b9Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array |
|
|
|
|
NextValue(self.b10CurrentOffest, 0), # Actual offset (=DRAM local offset) |
|
|
|
|
NextValue(LoadUnit.b10Offset1.storage, LUCacheSize - 1), # Adjust offset to read sentinel |
|
|
|
|
NextValue(LoadUnit.b10Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array |
|
|
|
|
NextValue(self.b16Result1.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.b16Result2.storage, 0), # Indicate # delays |
|
|
|
|
#NextValue(self.b16Value1_1.storage, 0), # TODO: Remove! Nothing loaded so far ... |
|
|
|
|
#NextValue(self.b16Value1_2.storage, 0), |
|
|
|
|
#NextValue(self.b16Value2_1.storage, 0), |
|
|
|
|
#NextValue(self.b16Value2_2.storage, 0), |
|
|
|
|
NextValue(self.bReady, False), # LED off! |
|
|
|
|
NextState("Loader_LOAD1") |
|
|
|
|
).Elif(~self.bEnable.storage, # Externally aborted? |
|
|
|
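# A host-side sketch of the DRAM block layout these offsets imply: each 32-bit
# word packs one bfloat16 pair (low half feeds fs1, high half feeds fs2),
# FPU#1 walks the first half of the block, FPU#2 the half starting at
# LUCacheSize >> 1, and the last word holds the sentinel checked in
# Loader_LOAD1. This is an interpretation of the offsets above, not code from
# the patch; names and sizes are assumptions.
import struct

LU_CACHE_SIZE = 1024            # 32-bit words per block (implied by the 10-bit offsets)
SENTINEL      = 0xCAFEF00D      # must equal the value written to b32Sentinel

def to_bf16(x):
    """Truncate an IEEE-754 float32 to bfloat16 (keep its upper 16 bits)."""
    return struct.unpack(">I", struct.pack(">f", x))[0] >> 16

def pack_pair(a, b):
    """One cache word: fs1 operand in bits 0..15, fs2 operand in bits 16..31."""
    return to_bf16(a) | (to_bf16(b) << 16)

block = [pack_pair(0.0, 0.0)] * LU_CACHE_SIZE
for i in range(LU_CACHE_SIZE // 2):                     # FPU#1's half
    block[i] = pack_pair(0.5, 2.0)                      # dummy operands
for i in range(LU_CACHE_SIZE // 2, LU_CACHE_SIZE - 1):  # FPU#2's half
    block[i] = pack_pair(1.5, 1.0)
block[LU_CACHE_SIZE - 1] = SENTINEL                     # sentinel at the last address

raw = struct.pack("<%dI" % LU_CACHE_SIZE, *block)       # image to place at b32DRAMAddress
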
@@ -197,9 +172,9 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_LOAD1", |
|
|
|
|
NextValue(self.b16Status.storage[0], True), # Current status added |
|
|
|
|
NextValue(self.b16Status.storage[0], True), # Current status added |
|
|
|
|
If(LoadUnit.b32Data1.storage == self.b32Sentinel.storage, # Valid last entry? |
|
|
|
|
NextValue(LoadUnit.b9Offset1.storage, 0), # 1st value offset preparation |
|
|
|
|
NextValue(LoadUnit.b10Offset1.storage, 0), # 1st value offset preparation |
|
|
|
|
NextState("Loader_LOAD2") |
|
|
|
|
).Elif(~self.bEnable.storage, # Enable withdrawn? |
|
|
|
|
NextState("Loader_IDLE") # Abort! |
|
|
|
@@ -210,18 +185,16 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
|
|
|
|
Loader_fsm.act("Loader_LOAD2", |
|
|
|
|
NextValue(self.b16Status.storage[1], True), # Current status added |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, # Required only for 1st entry ... |
|
|
|
|
NextValue(self.b10CurrentOffest, self.b10CurrentOffest + 1), # Increment (total) offset |
|
|
|
|
|
|
|
|
|
# FPU#1 |
|
|
|
|
#NextValue(self.b16Value1_1.storage, LoadUnit.b32Data1.storage & 0xFFFF), # TODO: Remove! Pick 1st date |
|
|
|
|
#NextValue(self.b16Value1_2.storage, LoadUnit.b32Data1.storage >> 16), # TODO: Remove! Pick 2nd date |
|
|
|
|
NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[0:16])), |
|
|
|
|
NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[16:32])), |
|
|
|
|
NextValue(LoadUnit.b9Offset1.storage, LoadUnit.b9Offset1.storage + 1), # Move on to next entry |
|
|
|
|
NextValue(LoadUnit.b10Offset1.storage, LoadUnit.b10Offset1.storage + 1), # Move on to next entry |
|
|
|
|
# FPU#2 |
|
|
|
|
#NextValue(self.b16Value2_1.storage, LoadUnit.b32Data2.storage & 0xFFFF), # TODO: Remove! Pick 1st date |
|
|
|
|
#NextValue(self.b16Value2_2.storage, LoadUnit.b32Data2.storage >> 16), # TODO: Remove! Pick 2nd date |
|
|
|
|
NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[0:16])), |
|
|
|
|
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[16:32])), |
|
|
|
|
NextValue(LoadUnit.b9Offset2.storage, LoadUnit.b9Offset2.storage + 1), # Move on to next entry |
|
|
|
|
NextValue(LoadUnit.b10Offset2.storage, LoadUnit.b10Offset2.storage + 1), # Move on to next entry |
|
|
|
|
|
|
|
|
|
NextState("Loader_EXEC1") |
|
|
|
|
).Else( # MEM wait cycles |
|
|
|
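# The sixteen literal zeros in the Cat(...) calls above zero-fill bits 0..15 of
# the 32-bit FPU operand, which places the bfloat16 value in bits 16..31; that
# bit pattern is exactly the float32 with the same sign, exponent and leading
# mantissa bits. (Constant(0, 16) as the first Cat element would be an
# equivalent, more compact zero-fill, but that is only a style suggestion.)
# A quick host-side check of the bfloat16/float32 relationship, in plain Python:
import struct

def bf16_to_f32(bf16):
    """Interpret a 16-bit bfloat16 pattern as the float32 it denotes."""
    return struct.unpack(">f", struct.pack(">I", (bf16 & 0xFFFF) << 16))[0]

def f32_to_bf16(x):
    """Truncate a float32 to bfloat16, i.e. keep the upper 16 bits (as fresult[16:32] does)."""
    return struct.unpack(">I", struct.pack(">f", x))[0] >> 16

assert bf16_to_f32(0x3F80) == 1.0
assert f32_to_bf16(1.0) == 0x3F80
assert bf16_to_f32(0xC000) == -2.0
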
@@ -230,7 +203,7 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_EXEC1", |
|
|
|
|
NextValue(self.b16Status.storage[2], True), # Current status added |
|
|
|
|
If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1! |
|
|
|
|
If(LoadUnit.b10Offset1.storage == 1, # As pointer already moved ahead 1! |
|
|
|
|
NextValue(fpu1.fmul, True), # 1st ADD requested |
|
|
|
|
NextValue(fpu2.fmul, True), |
|
|
|
|
).Else( |
|
|
|
@@ -246,7 +219,7 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
|
|
|
|
NextValue(self.b16Status.storage[8], fpu1.fready), # TODO: Remove! |
|
|
|
|
NextValue(self.b16Status.storage[9], fpu2.fready), # TODO: Remove! |
|
|
|
|
If(fpu1.fready & fpu2.fready, |
|
|
|
|
If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1! (Actually: Entry #0) |
|
|
|
|
If(LoadUnit.b10Offset1.storage == 1, # As pointer already moved ahead 1! (Actually: Entry #0) |
|
|
|
|
NextValue(fpu1.fmul, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(fpu2.fmul, False), # Clear command request FPU#2 |
|
|
|
|
).Else( # Entries 1 .. (maxlen-1) |
|
|
|
@@ -254,24 +227,58 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
|
|
|
|
NextValue(fpu2.fmadd, False), # Clear command request FPU#2 |
|
|
|
|
), |
|
|
|
|
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu1.fresult[16:32])), # Sum will be used for fmadd.s |
|
|
|
|
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu2.fresult[16:32])), # Sum will be used for fmadd.s |
|
|
|
|
If(LoadUnit.b9Offset1.storage < self.b9ArrayWordLen.storage, # Words 0 .. 255 |
|
|
|
|
NextState("Loader_LOAD2") |
|
|
|
|
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu2.fresult[16:32])), # Sum will be used for fmadd.s |
|
|
|
|
If(self.b10CurrentOffest < self.b10ArrayWordLen.storage, # Words 0 .. LEN-1 |
|
|
|
|
If(LoadUnit.b10Offset1.storage < LUCacheSize, # Cache not yet exhausted (words 0 .. LUCacheSize-1)
|
|
|
|
NextState("Loader_LOAD2") # Next value @offset |
|
|
|
|
).Else( # Cache empty ... |
|
|
|
|
NextValue(self.b32DRAMLoadAddress.storage, self.b32DRAMLoadAddress.storage + LUCacheSize), # Prepare DRAM address |
|
|
|
|
NextState("Loader_XLOAD0") # Fill cache again |
|
|
|
|
) |
|
|
|
|
).Else( # Finally prepare ADD both result sums (on FPU#1 only!) |
|
|
|
|
NextValue(fpu1.fs1, fpu1.fresult), |
|
|
|
|
NextValue(fpu1.fs2, fpu2.fresult), |
|
|
|
|
NextState("Loader_EXEC3") |
|
|
|
|
NextState("Loader_EXEC3") # -> Final ADD logic & finishing cleanup |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
# Extended (2nd) cache load |
|
|
|
|
Loader_fsm.act("Loader_XLOAD0", |
|
|
|
|
NextValue(self.b16Status.storage[4], True), # Current status added |
|
|
|
|
NextValue(self.LU_CacheValid, 0), # Engage refill (address safely adjusted by now ...) |
|
|
|
|
NextState("Loader_XLOAD1") |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_XLOAD1", # Extended load ... |
|
|
|
|
NextValue(self.b16Status.storage[5], True), # Current status added |
|
|
|
|
If(self.LU_CacheValid, # Wait until filled ... |
|
|
|
|
#NextValue(self.Loader_Delay, 0), # Reset read delay timer |
|
|
|
|
NextValue(LoadUnit.b10Offset1.storage, LUCacheSize - 1), # Adjust offset to read sentinel |
|
|
|
|
NextValue(LoadUnit.b10Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array |
|
|
|
|
NextState("Loader_XLOAD2") |
|
|
|
|
).Elif(~self.bEnable.storage, # Externally aborted? |
|
|
|
|
NextState("Loader_IDLE") # Abort! |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_XLOAD2", |
|
|
|
|
NextValue(self.b16Status.storage[6], True), # Current status added |
|
|
|
|
If(LoadUnit.b32Data1.storage == (self.b32Sentinel.storage + 1), # Valid last entry? +1!!! |
|
|
|
|
NextValue(LoadUnit.b10Offset1.storage, 0), # 1st value offset preparation |
|
|
|
|
NextState("Loader_LOAD2") # Continue w/ loop |
|
|
|
|
).Elif(~self.bEnable.storage, # Enable withdrawn? |
|
|
|
|
NextState("Loader_IDLE") # Abort! |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
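# Host-side sketch of the sentinel convention the refill path checks above: the
# first block ends with b32Sentinel and the refilled block with b32Sentinel + 1.
# Only one refill is shown in this patch, so extending the pattern to further
# blocks (sentinel + block index) is an assumption of this sketch.
def place_sentinels(blocks, sentinel):
    """blocks: list of LUCacheSize-long word lists whose last word is reserved."""
    for index, block in enumerate(blocks):
        block[-1] = (sentinel + index) & 0xFFFFFFFF
    return blocks
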
|
|
|
|
|
# Final ADD of results |
|
|
|
|
Loader_fsm.act("Loader_EXEC3", |
|
|
|
|
NextValue(self.b16Status.storage[4], True), # Current status added |
|
|
|
|
NextValue(self.b16Status.storage[7], True), # Current status added |
|
|
|
|
NextValue(fpu1.fadd, True), # Final ADD requested |
|
|
|
|
NextValue(fpu1.fready, False), # Engage trigger FPU#1 (only!) |
|
|
|
|
NextState("Loader_EXEC4") |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_EXEC4", |
|
|
|
|
NextValue(self.b16Status.storage[5], True), # Current status added |
|
|
|
|
NextValue(self.b16Status.storage[8], True), # Current status added |
|
|
|
|
If(fpu1.fready, |
|
|
|
|
NextValue(fpu1.fadd, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(self.b16Result1.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!) |
|
|
|
|
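# A plain-Python reference model of what the loader FSM computes, handy for
# cross-checking b16Result1 from the host. Interpretation, not guaranteed by
# this patch: every 32-bit word holds one bfloat16 pair, FPU#1 accumulates the
# products of the first half of the words (fmul first, then fmadd), FPU#2 the
# second half, and the final fadd combines the two partial sums. Rounding will
# differ slightly, because the hardware truncates the running sum to bfloat16
# while this model keeps full double precision.
import struct

def bf16_to_f32(bf16):
    return struct.unpack(">f", struct.pack(">I", (bf16 & 0xFFFF) << 16))[0]

def reference_result(words):
    """words: the packed 32-bit words fed to the core, sentinel excluded."""
    half = len(words) // 2

    def partial(chunk):
        acc = 0.0
        for w in chunk:
            acc = bf16_to_f32(w & 0xFFFF) * bf16_to_f32(w >> 16) + acc
        return acc

    return partial(words[:half]) + partial(words[half:])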