|
|
|
@ -29,7 +29,7 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
|
|
|
|
|
1. Freeze operations by setting ``bEnable`` to false (0) |
|
|
|
|
|
|
|
|
|
2. Load ``b32DRAMAddress`` with a 32-bit DRAM memory pointer. |
|
|
|
|
2. Load ``b32DRAMLoadAddress1`` with a 32-bit DRAM memory pointer. |
|
|
|
|
|
|
|
|
|
3. Finally, enable processing by setting ``bEnable`` to true (1). |
|
|
|
|
|
|
|
|
@ -40,34 +40,60 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
Inputs: |
|
|
|
|
####### |
|
|
|
|
|
|
|
|
|
:b32DRAMAddress: New DRAM address from where to load into local memory |
|
|
|
|
:b32DRAMLoadAddress1: 1st DRAM address from where to load into local memory (matrix row) |
|
|
|
|
|
|
|
|
|
:b32Sentinel: Write control word to last address (same as [b32DRAMAddress+LEN-1] value) |
|
|
|
|
:b32DRAMLoadAddress2: 2nd DRAM address from where to load into local memory (vector/column) |
|
|
|
|
|
|
|
|
|
:bEnable: To enable running (after data preparation) |
|
|
|
|
:b32Sentinel1: Write control word to last address (same as [b32DRAMLoadAddress1+LEN-1] value) |
|
|
|
|
|
|
|
|
|
:b32Sentinel2: Write control word to last address (same as [b32DRAMLoadAddress2+LEN-1] value) |
|
|
|
|
|
|
|
|
|
:b10ArrayWordLen: Number of words used for calculation of scalar (inner) product |
|
|
|
|
|
|
|
|
|
:bEnable: To enable running (after data preparation) |
|
|
|
|
|
|
|
|
|
:bReload1: Reload LU#1 |
|
|
|
|
|
|
|
|
|
:bReload2: Reload LU#2 |
|
|
|
|
|
|
|
|
|
:b10ArrayWordLen: Number of words used for calculation of scalar (inner) product |
|
|
|
|
|
|
|
|
|
Outputs: |
|
|
|
|
######## |
|
|
|
|
|
|
|
|
|
:b16Result: Processing result |
|
|
|
|
:b16Result1: Processing result FPU#1 & final result |
|
|
|
|
|
|
|
|
|
:b16Result2: Processing result FPU#2 |
|
|
|
|
|
|
|
|
|
:bReady: Ready indication (wire to LED ... ;) |
|
|
|
|
|
|
|
|
|
""" |
|
|
|
|
def __init__(self, RAMWaitTime=128, LUCacheSize=8, LoadUnit=None, StoreUnit=None): |
|
|
|
|
def __init__(self, RAMWaitTime=128, LUCacheSize=8, LoadUnit1=None, LoadUnit2=None): |
|
|
|
|
|
|
|
|
|
# Inputs |
|
|
|
|
self.b32DRAMLoadAddress = CSRStorage(32, reset_less=False, |
|
|
|
|
fields=[CSRField("LoadAddress", size=32, description="*Field*: 32-Bit value")], |
|
|
|
|
self.b32DRAMLoadAddress1 = CSRStorage(32, reset_less=False, |
|
|
|
|
fields=[CSRField("LoadAddress1", size=32, description="*Field*: 32-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Load value (32-bit DRAM address) for matrice/row |
|
|
|
|
""") |
|
|
|
|
self.b32DRAMLoadAddress2 = CSRStorage(32, reset_less=False, |
|
|
|
|
fields=[CSRField("LoadAddress2", size=32, description="*Field*: 32-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Load value (32-bit DRAM address) for vector |
|
|
|
|
""") |
|
|
|
|
self.b32Sentinel1 = CSRStorage(32, reset_less=False, |
|
|
|
|
fields=[CSRField("Sentinel1", size=32, description="*Field*: 32-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Load value (32-bit DRAM address). |
|
|
|
|
Control value #1 |
|
|
|
|
""") |
|
|
|
|
self.b32Sentinel = CSRStorage(32, reset_less=False, |
|
|
|
|
fields=[CSRField("Sentinel", size=32, description="*Field*: 32-Bit value")], |
|
|
|
|
self.b32Sentinel2 = CSRStorage(32, reset_less=False, |
|
|
|
|
fields=[CSRField("Sentinel2", size=32, description="*Field*: 32-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Control value #2 |
|
|
|
|
""") |
|
|
|
|
self.b10ArrayWordLen = CSRStorage(10, reset_less=False, |
|
|
|
|
fields=[CSRField("ArrayWordLen", size=10, description="*Field*: 10-Bit value")], |
|
|
|
|
description=""" |
|
|
|
|
Control value |
|
|
|
|
Word length of array used for calculation |
|
|
|
|
""") |
|
|
|
|
self.bEnable = CSRStorage(1, reset_less=False, |
|
|
|
|
fields=[CSRField("Enable", size=1, description="*Field*: bit", values=[ |
|
|
|
@ -78,11 +104,24 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
description=""" |
|
|
|
|
Enable free run |
|
|
|
|
""") |
|
|
|
|
self.b10ArrayWordLen = CSRStorage(10, reset_less=False, |
|
|
|
|
fields=[CSRField("ArrayWordLen", size=10, description="*Field*: 10-Bit value")], |
|
|
|
|
self.bReload1 = CSRStorage(1, reset_less=False, |
|
|
|
|
fields=[CSRField("Reload1", size=1, description="*Field*: bit", values=[ |
|
|
|
|
("0", "DISABLED", "-"), |
|
|
|
|
("1", "ENABLED", "Reload LU#1"), |
|
|
|
|
]) |
|
|
|
|
], |
|
|
|
|
description=""" |
|
|
|
|
Word length of array used for calculation |
|
|
|
|
""") |
|
|
|
|
Reload LU#1 |
|
|
|
|
""") |
|
|
|
|
self.bReload2 = CSRStorage(1, reset_less=False, |
|
|
|
|
fields=[CSRField("Reload2", size=1, description="*Field*: bit", values=[ |
|
|
|
|
("0", "DISABLED", "-"), |
|
|
|
|
("1", "ENABLED", "Reload LU#2"), |
|
|
|
|
]) |
|
|
|
|
], |
|
|
|
|
description=""" |
|
|
|
|
Reload LU#2 |
|
|
|
|
""") |
|
|
|
|
|
|
|
|
|
# Outputs |
|
|
|
|
self.b16Status = CSRStorage(16, reset_less=False, |
|
|
|
@ -108,37 +147,75 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
self.bReady = Signal() # To be wired to data pin ... ;) |
|
|
|
|
|
|
|
|
|
# Local vars. |
|
|
|
|
self.b10CurrentOffest = Signal(10, reset_less=True) |
|
|
|
|
self.b10CurrentOffest = Signal(10, reset_less=True) # Current offset in range 0..b10ArrayWordLen-1 |
|
|
|
|
|
|
|
|
|
#---------------- Load unit (LU) ------------------------------------------------------------- |
|
|
|
|
LU_fsm = FSM(reset_state="LU_IDLE") # FSM starts idling ... |
|
|
|
|
self.submodules += LU_fsm |
|
|
|
|
#---------------- Load unit (LU) #1 ------------------------------------------------------------- |
|
|
|
|
LU1_fsm = FSM(reset_state="LU1_IDLE") # FSM starts idling ... |
|
|
|
|
self.submodules += LU1_fsm |
|
|
|
|
|
|
|
|
|
self.LU_CacheOffset = Signal(10, reset_less=True) # 0..1023 log2_int(LUCacheSize, False)) # Cache reading offset (0..(Size-1))=>Bits) |
|
|
|
|
self.LU_CacheValid = Signal() # Indicate loaded LU cache |
|
|
|
|
self.LU_CacheDelay = Signal(11, reset_less=True) # Evaluate load length in cycles (2048 max.) |
|
|
|
|
LU_fsm.act("LU_IDLE", # If cache not valid fill it! |
|
|
|
|
If(~self.LU_CacheValid & self.bEnable.storage, # Invalid cache & run enabled ... |
|
|
|
|
NextValue(LoadUnit.b32Address.storage, self.b32DRAMLoadAddress.storage), |
|
|
|
|
NextValue(self.LU_CacheOffset, 0), # Adjust pointer (local reader), 4-byte width=32-bit |
|
|
|
|
NextValue(self.LU_CacheDelay, 2), # Reset load delay counter (but inkl. 1st & last cycle) |
|
|
|
|
NextState("LU_LOAD1") |
|
|
|
|
self.LU1_CacheOffset = Signal(10, reset_less=True) # 0..1023 log2_int(LUCacheSize, False)) # Cache reading offset (0..(Size-1))=>Bits) |
|
|
|
|
self.LU1_CacheValid = Signal() # Indicate loaded LU1 cache |
|
|
|
|
self.LU1_CacheDelay = Signal(11, reset_less=True) # Evaluate load length in cycles (2048 max.) |
|
|
|
|
self.LU1_Reload = Signal() # Actual trigger |
|
|
|
|
LU1_fsm.act("LU1_IDLE", # If cache not valid fill it! |
|
|
|
|
If(~self.LU1_CacheValid & self.LU1_Reload, # Invalid cache & load requested? |
|
|
|
|
NextValue(self.LU1_Reload, 0), # Clear trigger |
|
|
|
|
NextValue(LoadUnit1.b32Address.storage, self.b32DRAMLoadAddress1.storage), |
|
|
|
|
NextValue(self.LU1_CacheOffset, 0), # Adjust pointer (local reader), 4-byte width=32-bit |
|
|
|
|
NextValue(self.LU1_CacheDelay, 2), # Reset load delay counter (but inkl. 1st & last cycle) |
|
|
|
|
NextState("LU1_LOAD1") |
|
|
|
|
).Elif(~self.bEnable.storage, # Cleared enable? |
|
|
|
|
NextValue(self.LU_CacheValid, False), # Enforce cache invalidation! |
|
|
|
|
NextValue(self.LU1_CacheValid, False), # Enforce cache invalidation! |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
LU_fsm.act("LU_LOAD1", # Engage! |
|
|
|
|
NextValue(LoadUnit.bEnable.storage, 1), # Trigger DRAM transfer to cache |
|
|
|
|
NextState("LU_LOAD2") |
|
|
|
|
LU1_fsm.act("LU1_LOAD1", # Engage! |
|
|
|
|
NextValue(LoadUnit1.bEnable.storage, 1), # Trigger DRAM transfer to cache |
|
|
|
|
NextState("LU1_LOAD2") |
|
|
|
|
) |
|
|
|
|
LU_fsm.act("LU_LOAD2", # Wait for termination of transfer ... |
|
|
|
|
If(LoadUnit.bValid.storage, # Data avail.? |
|
|
|
|
NextValue(self.LU_CacheValid, 1), # Declare cache valid |
|
|
|
|
NextValue(LoadUnit.bEnable.storage, 0), # Stop DRAM transfer to cache |
|
|
|
|
NextState("LU_IDLE") # Yap! |
|
|
|
|
LU1_fsm.act("LU1_LOAD2", # Wait for termination of transfer ... |
|
|
|
|
If(LoadUnit1.bValid.storage, # Data avail.? |
|
|
|
|
NextValue(self.LU1_CacheValid, 1), # Declare cache valid |
|
|
|
|
NextValue(LoadUnit1.bEnable.storage, 0), # Stop DRAM transfer to cache |
|
|
|
|
NextState("LU1_IDLE") # Yap! |
|
|
|
|
).Else( |
|
|
|
|
If(self.LU_CacheDelay < 2047, # MAX-1! |
|
|
|
|
NextValue(self.LU_CacheDelay, self.LU_CacheDelay + 1), |
|
|
|
|
If(self.LU1_CacheDelay < 2047, # MAX-1! |
|
|
|
|
NextValue(self.LU1_CacheDelay, self.LU1_CacheDelay + 1), |
|
|
|
|
) |
|
|
|
|
# TODO: Permit timeout indication ... |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
#---------------- Load unit (LU) #2 ------------------------------------------------------------- |
|
|
|
|
LU2_fsm = FSM(reset_state="LU2_IDLE") # FSM starts idling ... |
|
|
|
|
self.submodules += LU2_fsm |
|
|
|
|
|
|
|
|
|
self.LU2_CacheOffset = Signal(10, reset_less=True) # 0..1023 log2_int(LUCacheSize, False)) # Cache reading offset (0..(Size-1))=>Bits) |
|
|
|
|
self.LU2_CacheValid = Signal() # Indicate loaded LU2 cache |
|
|
|
|
self.LU2_CacheDelay = Signal(11, reset_less=True) # Evaluate load length in cycles (2048 max.) |
|
|
|
|
self.LU2_Reload = Signal() # Actual trigger |
|
|
|
|
LU2_fsm.act("LU2_IDLE", # If cache not valid fill it! |
|
|
|
|
If(~self.LU2_CacheValid & self.LU2_Reload, # Invalid cache & load requested? |
|
|
|
|
NextValue(self.LU2_Reload, 0), # Clear trigger |
|
|
|
|
NextValue(LoadUnit2.b32Address.storage, self.b32DRAMLoadAddress2.storage), |
|
|
|
|
NextValue(self.LU2_CacheOffset, 0), # Adjust pointer (local reader), 4-byte width=32-bit |
|
|
|
|
NextValue(self.LU2_CacheDelay, 2), # Reset load delay counter (but inkl. 1st & last cycle) |
|
|
|
|
NextState("LU2_LOAD1") |
|
|
|
|
).Elif(~self.bEnable.storage, # Cleared enable? |
|
|
|
|
NextValue(self.LU2_CacheValid, False), # Enforce cache invalidation! |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
LU2_fsm.act("LU2_LOAD1", # Engage! |
|
|
|
|
NextValue(LoadUnit2.bEnable.storage, 1), # Trigger DRAM transfer to cache |
|
|
|
|
NextState("LU2_LOAD2") |
|
|
|
|
) |
|
|
|
|
LU2_fsm.act("LU2_LOAD2", # Wait for termination of transfer ... |
|
|
|
|
If(LoadUnit2.bValid.storage, # Data avail.? |
|
|
|
|
NextValue(self.LU2_CacheValid, 1), # Declare cache valid |
|
|
|
|
NextValue(LoadUnit2.bEnable.storage, 0), # Stop DRAM transfer to cache |
|
|
|
|
NextState("LU2_IDLE") # Yap! |
|
|
|
|
).Else( |
|
|
|
|
If(self.LU2_CacheDelay < 2047, # MAX-1! |
|
|
|
|
NextValue(self.LU2_CacheDelay, self.LU2_CacheDelay + 1), |
|
|
|
|
) |
|
|
|
|
# TODO: Permit timeout indication ... |
|
|
|
|
) |
|
|
|
@ -156,54 +233,72 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
self.Loader_Delay = Signal(32, reset_less=True) |
|
|
|
|
self.Loader_Active = Signal() |
|
|
|
|
Loader_fsm.act("Loader_IDLE", |
|
|
|
|
If(self.LU_CacheValid & ~self.Loader_Active, # Enter if not active already |
|
|
|
|
NextValue(self.Loader_Active, True), # Loader up & running |
|
|
|
|
NextValue(self.Loader_Delay, 0), # Reset read delay timer |
|
|
|
|
NextValue(self.b10CurrentOffest, 0), # Actual offset (=DRAM local offset) |
|
|
|
|
NextValue(LoadUnit.b10Offset1.storage, LUCacheSize - 1), # Adjust offset to read sentinel |
|
|
|
|
NextValue(LoadUnit.b10Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array |
|
|
|
|
NextValue(self.b16Result1.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.b16Result2.storage, 0), # Indicate # delays |
|
|
|
|
NextValue(self.bReady, False), # LED off! |
|
|
|
|
NextState("Loader_LOAD1") |
|
|
|
|
If(self.bEnable.storage & ~self.Loader_Active, # Started & not active already? |
|
|
|
|
NextValue(self.Loader_Active, True), # Loader up & running (block re-entry) |
|
|
|
|
If(self.bReload1.storage, # Load requested? |
|
|
|
|
NextValue(self.LU1_CacheValid, 0), # Invalidate cache |
|
|
|
|
NextValue(self.LU1_Reload, 1), # Load matrice row #1 |
|
|
|
|
NextValue(self.bReload1.storage, 0), # Clear request |
|
|
|
|
), |
|
|
|
|
If(self.bReload2.storage, # Load requested? |
|
|
|
|
NextValue(self.LU2_CacheValid, 0), # Invalidate cache |
|
|
|
|
NextValue(self.LU2_Reload, 1), # Load vector #2 |
|
|
|
|
NextValue(self.bReload2.storage, 0), # Clear request |
|
|
|
|
), |
|
|
|
|
NextValue(self.b16Result1.storage, 0), # Clear results |
|
|
|
|
NextValue(self.b16Result2.storage, 0), |
|
|
|
|
NextValue(self.bReady, False), # LED off! |
|
|
|
|
NextState("Loader_LOAD0") |
|
|
|
|
).Elif(~self.bEnable.storage, # Externally aborted? |
|
|
|
|
NextValue(self.b16Status.storage, 0), # Current status: inactive |
|
|
|
|
NextValue(self.Loader_Active, False), # Reset in sync w/ global activation |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_LOAD0", |
|
|
|
|
If(self.LU1_CacheValid & self.LU2_CacheValid, # Wait 'til caches start to fill ... |
|
|
|
|
NextValue(self.Loader_Delay, 0), # Reset read delay timer |
|
|
|
|
NextValue(self.b10CurrentOffest, 0), # Actual offset (=DRAM local offset) |
|
|
|
|
NextValue(LoadUnit1.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel #1 |
|
|
|
|
NextValue(LoadUnit2.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel #2 |
|
|
|
|
NextState("Loader_LOAD1") |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_LOAD1", |
|
|
|
|
NextValue(self.b16Status.storage[0], True), # Current status added |
|
|
|
|
If(LoadUnit.b32Data1.storage == self.b32Sentinel.storage, # Valid last entry? |
|
|
|
|
NextValue(LoadUnit.b10Offset1.storage, 0), # 1st value offset preparation |
|
|
|
|
NextState("Loader_LOAD2") |
|
|
|
|
).Elif(~self.bEnable.storage, # Enable withdrawn? |
|
|
|
|
NextValue(self.b16Status.storage[0], True), # Current status: Caches loaded |
|
|
|
|
If(~self.bEnable.storage, # Enable withdrawn? |
|
|
|
|
NextState("Loader_IDLE") # Abort! |
|
|
|
|
).Elif(LoadUnit1.b32Data.storage == self.b32Sentinel1.storage, # Valid last entry? |
|
|
|
|
NextValue(self.b16Status.storage[1], True), # Current status: 1st sentinel ok |
|
|
|
|
If(LoadUnit2.b32Data.storage == self.b32Sentinel2.storage, # Valid last entry? |
|
|
|
|
NextValue(self.b16Status.storage[2], True), # Current status: 2nd sentinel ok |
|
|
|
|
NextValue(LoadUnit1.b9Offset.storage, 0), # 1st value offset preparation, matrice/row |
|
|
|
|
NextValue(LoadUnit2.b9Offset.storage, 0), # 1st value offset preparation, vector |
|
|
|
|
NextState("Loader_LOAD2") |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
#-----> LOOP ENTRY ! (2nd loop onward: fs3 already prepared!) |
|
|
|
|
Loader_fsm.act("Loader_LOAD2", |
|
|
|
|
NextValue(self.b16Status.storage[1], True), # Current status added |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, # Required only for 1st entry ... |
|
|
|
|
NextValue(self.b10CurrentOffest, self.b10CurrentOffest + 1), # Increment (total) offset |
|
|
|
|
|
|
|
|
|
Loader_fsm.act("Loader_LOAD2", |
|
|
|
|
If(self.Loader_Delay > RAMWaitTime, # Required only for 1st entry ... |
|
|
|
|
# FPU#1 |
|
|
|
|
NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[0:16])), |
|
|
|
|
NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[16:32])), |
|
|
|
|
NextValue(LoadUnit.b10Offset1.storage, LoadUnit.b10Offset1.storage + 1), # Move on to next entry |
|
|
|
|
NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit1.b32Data.storage[0:16])), |
|
|
|
|
NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit2.b32Data.storage[0:16])), |
|
|
|
|
# FPU#2 |
|
|
|
|
NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[0:16])), |
|
|
|
|
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[16:32])), |
|
|
|
|
NextValue(LoadUnit.b10Offset2.storage, LoadUnit.b10Offset2.storage + 1), # Move on to next entry |
|
|
|
|
|
|
|
|
|
NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit1.b32Data.storage[16:32])), |
|
|
|
|
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit2.b32Data.storage[16:32])), |
|
|
|
|
# Prepare next load in advance ... |
|
|
|
|
NextValue(LoadUnit1.b9Offset.storage, LoadUnit1.b9Offset.storage + 1), # Move on to next entry |
|
|
|
|
NextValue(LoadUnit2.b9Offset.storage, LoadUnit2.b9Offset.storage + 1), |
|
|
|
|
NextValue(self.b10CurrentOffest, self.b10CurrentOffest + 1), # Increment (total) offset |
|
|
|
|
NextState("Loader_EXEC1") |
|
|
|
|
).Else( # MEM wait cycles |
|
|
|
|
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_EXEC1", |
|
|
|
|
NextValue(self.b16Status.storage[2], True), # Current status added |
|
|
|
|
If(LoadUnit.b10Offset1.storage == 1, # As pointer already moved ahead 1! |
|
|
|
|
Loader_fsm.act("Loader_EXEC1", |
|
|
|
|
# FIXME: Compare will fail for 2nd (extended) load ... |
|
|
|
|
If(LoadUnit1.b9Offset.storage == 1, # As pointer already moved ahead 1! |
|
|
|
|
NextValue(fpu1.fmul, True), # 1st ADD requested |
|
|
|
|
NextValue(fpu2.fmul, True), |
|
|
|
|
).Else( |
|
|
|
@ -216,29 +311,31 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_EXEC2", |
|
|
|
|
NextValue(self.b16Status.storage[3], True), # Current status added |
|
|
|
|
NextValue(self.b16Status.storage[8], fpu1.fready), # TODO: Remove! |
|
|
|
|
NextValue(self.b16Status.storage[9], fpu2.fready), # TODO: Remove! |
|
|
|
|
NextValue(self.b16Status.storage[13], fpu1.fready), # TODO: Remove! |
|
|
|
|
NextValue(self.b16Status.storage[14], fpu2.fready), # TODO: Remove! |
|
|
|
|
If(fpu1.fready & fpu2.fready, |
|
|
|
|
If(LoadUnit.b10Offset1.storage == 1, # As pointer already moved ahead 1! (Actually: Entry #0) |
|
|
|
|
# FIXME: Compare will fail for 2nd (extended) load ... |
|
|
|
|
If(LoadUnit1.b9Offset.storage == 1, # As pointer already moved ahead 1! (Actually: Entry #0) |
|
|
|
|
NextValue(fpu1.fmul, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(fpu2.fmul, False), # Clear command request FPU#2 |
|
|
|
|
).Else( # Entries 1 .. (maxlen-1) |
|
|
|
|
NextValue(fpu1.fmadd, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(fpu2.fmadd, False), # Clear command request FPU#2 |
|
|
|
|
), |
|
|
|
|
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu1.fresult[16:32])), # Sum will be used for fmadd.s |
|
|
|
|
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu2.fresult[16:32])), # Sum will be used for fmadd.s |
|
|
|
|
If(self.b10CurrentOffest < self.b10ArrayWordLen.storage, # Words 0 .. LEN-1 |
|
|
|
|
If(LoadUnit.b10Offset1.storage < LUCacheSize, # Words 0 .. Cachelen |
|
|
|
|
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu1.fresult[16:32])), # Sum will be used for fmadd.s |
|
|
|
|
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu2.fresult[16:32])), # Sum will be used for fmadd.s |
|
|
|
|
If(LoadUnit1.b9Offset.storage < LUCacheSize, # Words 0 .. Cachelen |
|
|
|
|
NextState("Loader_LOAD2") # Next value @offset |
|
|
|
|
).Else( # Cache empty ... |
|
|
|
|
NextValue(self.b32DRAMLoadAddress.storage, self.b32DRAMLoadAddress.storage + LUCacheSize), # Prepare DRAM address |
|
|
|
|
NextValue(self.b32DRAMLoadAddress1.storage, self.b32DRAMLoadAddress1.storage + LUCacheSize), # Prepare DRAM address |
|
|
|
|
NextValue(self.b32DRAMLoadAddress2.storage, self.b32DRAMLoadAddress2.storage + LUCacheSize), # Prepare DRAM address |
|
|
|
|
NextState("Loader_XLOAD0") # Fill cache again |
|
|
|
|
) |
|
|
|
|
).Else( # Finally prepare ADD both result sums (on FPU#1 only!) |
|
|
|
|
NextValue(fpu1.fs1, fpu1.fresult), |
|
|
|
|
NextValue(fpu1.fs2, fpu2.fresult), |
|
|
|
|
NextState("Loader_EXEC3") # -> Final ADD logic & finishing cleanup |
|
|
|
|
NextValue(fpu1.fs2, fpu2.fresult), |
|
|
|
|
NextState("Loader_EXEC3") # -> Final ADD logic & finishing cleanup |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
@ -246,39 +343,45 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): |
|
|
|
|
# Extended (2nd) cache load |
|
|
|
|
Loader_fsm.act("Loader_XLOAD0", |
|
|
|
|
NextValue(self.b16Status.storage[4], True), # Current status added |
|
|
|
|
NextValue(self.LU_CacheValid, 0), # Engage refill (address safely adjusted by now ...) |
|
|
|
|
NextValue(self.LU1_CacheValid, 0), # Engage refill (address safely adjusted by now ...) |
|
|
|
|
NextValue(self.LU1_Reload, 1), # Load vector #2 |
|
|
|
|
NextValue(self.LU2_CacheValid, 0), # Invalidate cache |
|
|
|
|
NextValue(self.LU2_Reload, 1), # Load vector #2 |
|
|
|
|
NextState("Loader_XLOAD1") |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_XLOAD1", # Extended load ... |
|
|
|
|
NextValue(self.b16Status.storage[5], True), # Current status added |
|
|
|
|
If(self.LU_CacheValid, # Wait until filled ... |
|
|
|
|
#NextValue(self.Loader_Delay, 0), # Reset read delay timer |
|
|
|
|
NextValue(LoadUnit.b10Offset1.storage, LUCacheSize - 1), # Adjust offset to read sentinel |
|
|
|
|
NextValue(LoadUnit.b10Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array |
|
|
|
|
NextState("Loader_XLOAD2") |
|
|
|
|
).Elif(~self.bEnable.storage, # Externally aborted? |
|
|
|
|
If(~self.bEnable.storage, # Externally aborted? |
|
|
|
|
NextState("Loader_IDLE") # Abort! |
|
|
|
|
) |
|
|
|
|
).Elif(self.LU1_CacheValid & self.LU2_CacheValid, # Wait until filled ... |
|
|
|
|
NextValue(LoadUnit1.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel |
|
|
|
|
NextValue(LoadUnit2.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel |
|
|
|
|
NextState("Loader_XLOAD2") |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_XLOAD2", |
|
|
|
|
NextValue(self.b16Status.storage[6], True), # Current status added |
|
|
|
|
If(LoadUnit.b32Data1.storage == (self.b32Sentinel.storage + 1), # Valid last entry? +1!!! |
|
|
|
|
NextValue(LoadUnit.b10Offset1.storage, 0), # 1st value offset preparation |
|
|
|
|
NextState("Loader_LOAD2") # Continue w/ loop |
|
|
|
|
).Elif(~self.bEnable.storage, # Enable withdrawn? |
|
|
|
|
Loader_fsm.act("Loader_XLOAD2", |
|
|
|
|
If(~self.bEnable.storage, # Enable withdrawn? |
|
|
|
|
NextState("Loader_IDLE") # Abort! |
|
|
|
|
).Elif(LoadUnit1.b32Data.storage == (self.b32Sentinel1.storage + 1), # Valid last entry? +1!!! |
|
|
|
|
NextValue(self.b16Status.storage[6], True), # Current status: Sentinel #1 good |
|
|
|
|
If(LoadUnit2.b32Data.storage == (self.b32Sentinel2.storage + 1), # Valid last entry? +1!!! |
|
|
|
|
NextValue(self.b16Status.storage[7], True), # Current status: Sentinel #2 good |
|
|
|
|
NextValue(LoadUnit1.b9Offset.storage, 0), # 1st value offset preparation |
|
|
|
|
NextValue(LoadUnit2.b9Offset.storage, 0), # 1st value offset preparation |
|
|
|
|
NextState("Loader_LOAD2") # Continue w/ loop |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
# Final ADD of results |
|
|
|
|
Loader_fsm.act("Loader_EXEC3", |
|
|
|
|
NextValue(self.b16Status.storage[7], True), # Current status added |
|
|
|
|
NextValue(self.b16Status.storage[8], True), # Current status added |
|
|
|
|
NextValue(fpu1.fadd, True), # Final ADD requested |
|
|
|
|
NextValue(fpu1.fready, False), # Engage trigger FPU#1 (only!) |
|
|
|
|
NextState("Loader_EXEC4") |
|
|
|
|
) |
|
|
|
|
Loader_fsm.act("Loader_EXEC4", |
|
|
|
|
NextValue(self.b16Status.storage[8], True), # Current status added |
|
|
|
|
NextValue(self.b16Status.storage[9], True), # Current status added |
|
|
|
|
If(fpu1.fready, |
|
|
|
|
NextValue(fpu1.fadd, False), # Clear command request FPU#1 |
|
|
|
|
NextValue(self.b16Result1.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!) |
|
|
|
|