#!/usr/bin/env python3
#
# bfloat16nncore.py
#
# bfloat16 neural network processing core logic
#
# History:
# --------
# 21.04.21/KQ Initial version
#

from migen import *
from migen.fhdl.specials import Memory

from litex.soc.interconnect.csr import AutoCSR, CSRStatus, CSRStorage, CSRField, CSRAccess
from litex.soc.integration.doc import AutoDoc, ModuleDoc
from litex.soc.interconnect.csr import *

from libmodules.dramtransfer import DRAM2FPGA, FPGA2DRAM
from libmodules.bfloat16processor import bfloat16Processor


class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
    """
    bfloat16NN core class:

    Usage:
    ######

    1. Freeze operations by setting ``bEnable`` to false (0)

    2. Load ``b32DRAMLoadAddress`` with a 32-bit DRAM memory pointer and
       ``b32Sentinel`` with the control word expected in the last cache entry.

    3. Finally, enable processing by setting ``bEnable`` to true (1).

    4. bfloat16NN processing will now run asynchronously to the CPU until reset
       or until frozen again as in step 1. From now on the CPU writes to DRAM
       only (yet, the L2 cache will have to be flushed); all data will be
       picked up by the FPGA automatically ...

    Inputs:
    #######

    :b32DRAMLoadAddress: DRAM address from where to load into local memory

    :b32Sentinel:        Control word; processing starts once the word read at
                         cache offset ``LUCacheSize - 1`` equals this value

    :bEnable:            To enable running (after data preparation)

    Outputs:
    ########

    :b16Status:          Progress flags: bit 0 loader started, bits 1..3 states
                         LOAD1..LOAD3 entered, bits 5..6 states EXEC1..EXEC2
                         entered, bit 15 result ready

    :b16Value1:          Low 16 bits of the word read at cache offset 0

    :b16Value2:          High 16 bits of the word read at cache offset 0

    :b16Value3:          Low 16 bits of the word read at cache offset 1

    :b16Result:          Processing result

    :bReady:             Ready indication (wire to LED ... ;)
    """
    def __init__(self, RAMWaitTime=128, LUCacheSize=8, LoadUnit=None, StoreUnit=None):
        """Build the CSR interface, the load-unit FSM and the loader/execute FSM.

        Parameters:
            RAMWaitTime: Wait threshold for cached-data reads; a value is sampled
                         once the loader's delay counter exceeds this number.
            LUCacheSize: Number of cache entries; the sentinel is read from
                         offset ``LUCacheSize - 1``.
            LoadUnit:    Transfer unit exposing ``b32Address``, ``bEnable``,
                         ``bValid``, ``b9Offset`` and ``b32Data`` (each with a
                         ``.storage`` signal). Presumably a ``DRAM2FPGA``
                         instance -- TODO confirm. Required despite the default.
            StoreUnit:   Currently unused.

        Raises:
            ValueError: If ``LoadUnit`` is not supplied.
        """
        # Every FSM below dereferences LoadUnit, so fail early with a clear
        # message instead of an AttributeError on NoneType during elaboration.
        if LoadUnit is None:
            raise ValueError("bfloat16NeuralNetworkCore requires a LoadUnit (got None)")

        # Inputs
        self.b32DRAMLoadAddress = CSRStorage(32, reset_less=False,
            fields=[CSRField("LoadAddress", size=32, description="*Field*: 32-Bit value")],
            description="""
            Load value (32-bit DRAM address).
            """)
        self.b32Sentinel = CSRStorage(32, reset_less=False,
            fields=[CSRField("Sentinel", size=32, description="*Field*: 32-Bit value")],
            description="""
            Control value
            """)
        self.bEnable = CSRStorage(1, reset_less=False,
            fields=[CSRField("Enable", size=1, description="*Field*: bit", values=[
                    ("0", "DISABLED", "bfloat16NN not active"),
                    ("1", "ENABLED", "bfloat16NN active"),
                ])
            ],
            description="""
            Enable free run
            """)

        # Outputs
        # NOTE(review): these are CSRStorage registers whose ``.storage`` is
        # written by the FSMs below; confirm this is the intended LiteX
        # mechanism for device-driven registers.
        self.b16Status = CSRStorage(16, reset_less=False,
            fields=[CSRField("Status", size=16, description="*Field*: 16-Bit value")],
            description="""
            Processing stati
            """)
        self.b16Value1 = CSRStorage(16, reset_less=False,
            fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
            description="""
            Float register 1
            """)
        self.b16Value2 = CSRStorage(16, reset_less=False,
            fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
            description="""
            Float register 2
            """)
        self.b16Value3 = CSRStorage(16, reset_less=False,
            fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
            description="""
            Float register 3
            """)
        self.b16Result = CSRStorage(16, reset_less=False,
            fields=[CSRField("Result", size=16, description="*Field*: 16-Bit value")],
            description="""
            Processing result
            """)
        self.bReady = Signal()  # To be wired to data pin ... ;)

        # Local vars.
        # - none yet -

        #---------------- Load unit (LU) -------------------------------------------------------------
        LU_fsm = FSM(reset_state="LU_IDLE")  # FSM starts idling ...
        self.submodules += LU_fsm

        self.LU_CacheOffset = Signal(9, reset_less=True)   # Cache reading offset, 9 bits (0..511)
        self.LU_CacheValid = Signal()                      # Indicates a loaded LU cache
        self.LU_CacheDelay = Signal(11, reset_less=True)   # Load duration in cycles (saturates at 2047)

        LU_fsm.act("LU_IDLE",  # If the cache is not valid, fill it!
            If(~self.LU_CacheValid & self.bEnable.storage,  # Invalid cache & run enabled ...
                NextValue(LoadUnit.b32Address.storage, self.b32DRAMLoadAddress.storage),
                NextValue(self.LU_CacheOffset, 0),  # Reset local read pointer
                NextValue(self.LU_CacheDelay, 2),   # Restart delay counter (start value accounts for 1st & last cycle)
                NextState("LU_LOAD1")
            ).Elif(~self.bEnable.storage,           # Enable cleared?
                NextValue(self.LU_CacheValid, False),  # Enforce cache invalidation!
            )
        )
        LU_fsm.act("LU_LOAD1",  # Engage!
            NextValue(LoadUnit.bEnable.storage, 1),  # Trigger DRAM transfer to cache
            NextState("LU_LOAD2")
        )
        LU_fsm.act("LU_LOAD2",  # Wait for termination of transfer ...
            If(LoadUnit.bValid.storage,                  # Data available?
                NextValue(self.LU_CacheValid, 1),        # Declare cache valid
                NextValue(LoadUnit.bEnable.storage, 0),  # Stop DRAM transfer to cache
                NextState("LU_IDLE")                     # Done
            ).Else(
                If(self.LU_CacheDelay < 2047,            # Saturate at the 11-bit maximum
                    NextValue(self.LU_CacheDelay, self.LU_CacheDelay + 1),
                )  # TODO: Permit timeout indication ...
            )
        )

        #---------------- bfloat16 FPU -------------------------------------------------------------
        self.submodules.fpu = fpu = bfloat16Processor()  # Integrate bfloat16 FPU

        #---------------- Loaded data testing --------------------------------------------------
        Loader_fsm = FSM(reset_state="Loader_IDLE")  # FSM starts idling ...
        self.submodules += Loader_fsm

        self.Loader_Delay = Signal(32, reset_less=True)  # Wait-cycle counter for cache reads
        self.Loader_Active = Signal()                    # Set while a run is in progress or completed

        Loader_fsm.act("Loader_IDLE",
            If(self.LU_CacheValid & ~self.Loader_Active,  # Enter if not active already
                NextValue(self.Loader_Active, True),      # Loader up & running
                NextValue(self.Loader_Delay, 0),          # Reset read delay timer
                NextValue(LoadUnit.b9Offset.storage, LUCacheSize - 1),  # Adjust offset to read sentinel
                NextValue(self.b16Result.storage, 0),     # Clear previous result
                NextValue(self.b16Status.storage[0], True),  # Status: loader started
                NextValue(self.b16Value1.storage, 0),     # Nothing loaded so far ...
                NextValue(self.b16Value2.storage, 0),     # Nothing loaded so far ...
                NextValue(self.b16Value3.storage, 0),     # Nothing loaded so far ...
                NextValue(self.bReady, False),            # LED off!
                NextState("Loader_LOAD1")
            ).Elif(~self.bEnable.storage,                 # Externally aborted?
                NextValue(self.b16Status.storage, 0),     # Current status: inactive
                NextValue(self.Loader_Active, False),     # Reset in sync w/ global activation
            )
        )
        Loader_fsm.act("Loader_LOAD1",
            NextValue(self.b16Status.storage[1], True),   # Status: LOAD1 entered
            If(LoadUnit.b32Data.storage == self.b32Sentinel.storage,  # Valid last entry?
                NextValue(LoadUnit.b9Offset.storage, 0),  # 1st value offset preparation
                NextState("Loader_LOAD2")
            ).Elif(~self.bEnable.storage,                 # Enable withdrawn?
                NextState("Loader_IDLE")                  # Abort!
            )
        )
        Loader_fsm.act("Loader_LOAD2",
            NextValue(self.b16Status.storage[2], True),   # Status: LOAD2 entered
            If(self.Loader_Delay > RAMWaitTime,
                NextValue(self.b16Value1.storage, LoadUnit.b32Data.storage & 0xFFFF),  # Pick 1st value (low half)
                NextValue(self.b16Value2.storage, LoadUnit.b32Data.storage >> 16),     # Pick 2nd value (high half)
                # Each 16-bit value goes into bits 16..31 of the FPU operand, low 16 bits zero
                NextValue(fpu.fs1, Cat(Constant(0, 16), LoadUnit.b32Data.storage[0:16])),
                NextValue(fpu.fs2, Cat(Constant(0, 16), LoadUnit.b32Data.storage[16:32])),
                NextValue(LoadUnit.b9Offset.storage, 1),  # 2nd word offset preparation
                NextValue(self.Loader_Delay, 0),          # Reset delay
                NextState("Loader_LOAD3")
            ).Else(  # MEM wait cycles
                NextValue(self.Loader_Delay, self.Loader_Delay + 1),  # Increment
            )
        )
        Loader_fsm.act("Loader_LOAD3",
            NextValue(self.b16Status.storage[3], True),   # Status: LOAD3 entered
            If(self.Loader_Delay > RAMWaitTime,
                NextValue(self.b16Value3.storage, LoadUnit.b32Data.storage & 0xFFFF),  # Pick 3rd value
                NextValue(fpu.fs3, Cat(Constant(0, 16), LoadUnit.b32Data.storage[0:16])),
                NextState("Loader_EXEC1")
            ).Else(  # MEM wait cycles
                NextValue(self.Loader_Delay, self.Loader_Delay + 1),  # Increment
            )
        )
        # Other FPU commands noted by the original author (not used here):
        #   fpu.fmadd
        #   fpu.fmsub
        #   fpu.fnmadd
        #   fpu.fnmsub
        Loader_fsm.act("Loader_EXEC1",
            NextValue(self.b16Status.storage[5], True),   # Status: EXEC1 entered
            NextValue(fpu.fmax, True),                    # This command requested
            NextValue(fpu.fready, False),                 # Engage trigger
            NextState("Loader_EXEC2")
        )
        Loader_fsm.act("Loader_EXEC2",
            NextValue(self.b16Status.storage[6], True),   # Status: EXEC2 entered
            If(fpu.fready,
                NextValue(fpu.fmax, False),               # Clear command request
                NextValue(self.b16Result.storage, fpu.fresult[16:32]),  # Pick result (bits 16..31, the high word)
                NextValue(self.b16Status.storage[15], True),  # Indicate readiness ...
                NextValue(self.bReady, True),             # Indicate readiness (LED on!)
                NextState("Loader_IDLE")
            )
        )