Working, but uses 2xcalclen?

master
kaqu 1 year ago
parent c0189b15c7
commit 9dc25b43d7
  1. 47
      bfloat16nn.py
  2. 295
      libmodules/bfloat16nncore.py
  3. 93
      libmodules/dramtransfer.py
  4. 238
      software/source/bfloat16nnlib.c

@ -183,38 +183,39 @@ class BaseSoC(SoCCore):
self.add_csr("systime")
# DRAM access section
MAXWORDS = 512 # Max. FIFO!
MAXWORDS = 400 # Max. FIFO=512 permissible!
# -------------- DMA block #1 ----------------------
# Load unit memory access
self.submodules.dma_reader1 = dma_reader1 = LiteDRAMDMAReader(self.sdram.crossbar.get_port(), fifo_depth=MAXWORDS, fifo_buffered=True)
dma_reader1.add_csr()
self.add_csr("dma_reader1")
# Load unit transfer
self.submodules.sync_fifo_in1 = sync_fifo_in1 = SyncFIFO([("data", 32)], MAXWORDS, True)
self.comb += dma_reader1.source.connect(sync_fifo_in1.sink) # Connect DMA-Reader.source -> FIFO.sink
# Load unit (LU)
self.submodules.dram2fpga1 = dram2fpga1 = DRAM2FPGA(maxwords=MAXWORDS, dma_reader=dma_reader1, sync_fifo=sync_fifo_in1)
self.add_csr("dram2fpga1")
# -------------- DMA block #2 ----------------------
# Load unit memory access
self.submodules.dma_reader = dma_reader = LiteDRAMDMAReader(self.sdram.crossbar.get_port(), fifo_depth=MAXWORDS, fifo_buffered=True)
dma_reader.add_csr()
self.add_csr("dma_reader")
self.submodules.dma_reader2 = dma_reader2 = LiteDRAMDMAReader(self.sdram.crossbar.get_port(), fifo_depth=MAXWORDS, fifo_buffered=True)
dma_reader2.add_csr()
self.add_csr("dma_reader2")
# Load unit transfer
self.submodules.sync_fifo_in = sync_fifo_in = SyncFIFO([("data", 32)], MAXWORDS, True)
self.comb += dma_reader.source.connect(sync_fifo_in.sink) # Connect DMA-Reader.source -> FIFO.sink
self.submodules.sync_fifo_in2 = sync_fifo_in2 = SyncFIFO([("data", 32)], MAXWORDS, True)
self.comb += dma_reader2.source.connect(sync_fifo_in2.sink) # Connect DMA-Reader.source -> FIFO.sink
# Load unit (LU)
self.submodules.dram2fpga = dram2fpga = DRAM2FPGA(maxwords=MAXWORDS, dma_reader=dma_reader, sync_fifo=sync_fifo_in)
self.add_csr("dram2fpga")
self.submodules.dram2fpga2 = dram2fpga2 = DRAM2FPGA(maxwords=MAXWORDS, dma_reader=dma_reader2, sync_fifo=sync_fifo_in2)
self.add_csr("dram2fpga2")
""" *** Not used currently ! ***
MAXWRITEWORDS = 1 # Transfer length 1 x 32-bit = 4 byte maximum (SU)
# Store unit memory access
self.submodules.dma_writer = dma_writer = LiteDRAMDMAWriter(self.sdram.crossbar.get_port(), fifo_depth=MAXWRITEWORDS, fifo_buffered=True)
dma_writer.add_csr()
self.add_csr("dma_writer")
# Store unit transfer
self.submodules.sync_fifo_out = sync_fifo_out = SyncFIFO([("data", 32)], MAXWRITEWORDS, True)
self.comb += sync_fifo_out.source.connect(dma_writer.sink) # Connect FIFO.source -> DMA-Writer.sink
# Store unit (SU)
self.submodules.fpga2dram = fpga2dram = FPGA2DRAM(dma_writer=dma_writer, sync_fifo=sync_fifo_out)
self.add_csr("fpga2dram")
"""
# Integrate bfloat16NN processor
RAMWAITTIME = 1 # Minimum wait!
self.submodules.bfloat16nn = bfloat16nn = bfloat16NeuralNetworkCore(
RAMWaitTime=RAMWAITTIME,
LUCacheSize=MAXWORDS,
LoadUnit=dram2fpga,
StoreUnit=None, # *** Not used currently: fpga2dram,
LoadUnit1=dram2fpga1,
LoadUnit2=dram2fpga2,
)
self.add_csr("bfloat16nn")

@ -29,7 +29,7 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
1. Freeze operations by setting ``bEnable`` to false (0)
2. Load ``b32DRAMAddress`` with a 32-bit DRAM memory pointer.
2. Load ``b32DRAMLoadAddress1`` with a 32-bit DRAM memory pointer.
3. Finally, enable processing by setting ``bEnable`` to true (1).
@ -40,34 +40,60 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
Inputs:
#######
:b32DRAMAddress: New DRAM address from where to load into local memory
:b32DRAMLoadAddress1: 1st DRAM address from where to load into local memory (matrix row)
:b32Sentinel: Write control word to last address (same as [b32DRAMAddress+LEN-1] value)
:b32DRAMLoadAddress2: 2nd DRAM address from where to load into local memory (vector/column)
:bEnable: To enable running (after data preparation)
:b32Sentinel1: Write control word to last address (same as [b32DRAMLoadAddress1+LEN-1] value)
:b32Sentinel2: Write control word to last address (same as [b32DRAMLoadAddress2+LEN-1] value)
:b10ArrayWordLen: Number of words used for calculation of scalar (inner) product
:bEnable: To enable running (after data preparation)
:bReload1: Reload LU#1
:bReload2: Reload LU#2
:b10ArrayWordLen: Number of words used for calculation of scalar (inner) product
Outputs:
########
:b16Result: Processing result
:b16Result1: Processing result FPU#1 & final result
:b16Result2: Processing result FPU#2
:bReady: Ready indication (wire to LED ... ;)
"""
def __init__(self, RAMWaitTime=128, LUCacheSize=8, LoadUnit=None, StoreUnit=None):
def __init__(self, RAMWaitTime=128, LUCacheSize=8, LoadUnit1=None, LoadUnit2=None):
# Inputs
self.b32DRAMLoadAddress = CSRStorage(32, reset_less=False,
fields=[CSRField("LoadAddress", size=32, description="*Field*: 32-Bit value")],
self.b32DRAMLoadAddress1 = CSRStorage(32, reset_less=False,
fields=[CSRField("LoadAddress1", size=32, description="*Field*: 32-Bit value")],
description="""
Load value (32-bit DRAM address) for matrice/row
""")
self.b32DRAMLoadAddress2 = CSRStorage(32, reset_less=False,
fields=[CSRField("LoadAddress2", size=32, description="*Field*: 32-Bit value")],
description="""
Load value (32-bit DRAM address) for vector
""")
self.b32Sentinel1 = CSRStorage(32, reset_less=False,
fields=[CSRField("Sentinel1", size=32, description="*Field*: 32-Bit value")],
description="""
Load value (32-bit DRAM address).
Control value #1
""")
self.b32Sentinel = CSRStorage(32, reset_less=False,
fields=[CSRField("Sentinel", size=32, description="*Field*: 32-Bit value")],
self.b32Sentinel2 = CSRStorage(32, reset_less=False,
fields=[CSRField("Sentinel2", size=32, description="*Field*: 32-Bit value")],
description="""
Control value #2
""")
self.b10ArrayWordLen = CSRStorage(10, reset_less=False,
fields=[CSRField("ArrayWordLen", size=10, description="*Field*: 10-Bit value")],
description="""
Control value
Word length of array used for calculation
""")
self.bEnable = CSRStorage(1, reset_less=False,
fields=[CSRField("Enable", size=1, description="*Field*: bit", values=[
@ -78,11 +104,24 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
description="""
Enable free run
""")
self.b10ArrayWordLen = CSRStorage(10, reset_less=False,
fields=[CSRField("ArrayWordLen", size=10, description="*Field*: 10-Bit value")],
self.bReload1 = CSRStorage(1, reset_less=False,
fields=[CSRField("Reload1", size=1, description="*Field*: bit", values=[
("0", "DISABLED", "-"),
("1", "ENABLED", "Reload LU#1"),
])
],
description="""
Word length of array used for calculation
""")
Reload LU#1
""")
self.bReload2 = CSRStorage(1, reset_less=False,
fields=[CSRField("Reload2", size=1, description="*Field*: bit", values=[
("0", "DISABLED", "-"),
("1", "ENABLED", "Reload LU#2"),
])
],
description="""
Reload LU#2
""")
# Outputs
self.b16Status = CSRStorage(16, reset_less=False,
@ -108,37 +147,75 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
self.bReady = Signal() # To be wired to data pin ... ;)
# Local vars.
self.b10CurrentOffest = Signal(10, reset_less=True)
self.b10CurrentOffest = Signal(10, reset_less=True) # Current offset in range 0..b10ArrayWordLen-1
#---------------- Load unit (LU) -------------------------------------------------------------
LU_fsm = FSM(reset_state="LU_IDLE") # FSM starts idling ...
self.submodules += LU_fsm
#---------------- Load unit (LU) #1 -------------------------------------------------------------
LU1_fsm = FSM(reset_state="LU1_IDLE") # FSM starts idling ...
self.submodules += LU1_fsm
self.LU_CacheOffset = Signal(10, reset_less=True) # 0..1023 log2_int(LUCacheSize, False)) # Cache reading offset (0..(Size-1))=>Bits)
self.LU_CacheValid = Signal() # Indicate loaded LU cache
self.LU_CacheDelay = Signal(11, reset_less=True) # Evaluate load length in cycles (2048 max.)
LU_fsm.act("LU_IDLE", # If cache not valid fill it!
If(~self.LU_CacheValid & self.bEnable.storage, # Invalid cache & run enabled ...
NextValue(LoadUnit.b32Address.storage, self.b32DRAMLoadAddress.storage),
NextValue(self.LU_CacheOffset, 0), # Adjust pointer (local reader), 4-byte width=32-bit
NextValue(self.LU_CacheDelay, 2), # Reset load delay counter (but inkl. 1st & last cycle)
NextState("LU_LOAD1")
self.LU1_CacheOffset = Signal(10, reset_less=True) # 0..1023 log2_int(LUCacheSize, False)) # Cache reading offset (0..(Size-1))=>Bits)
self.LU1_CacheValid = Signal() # Indicate loaded LU1 cache
self.LU1_CacheDelay = Signal(11, reset_less=True) # Evaluate load length in cycles (2048 max.)
self.LU1_Reload = Signal() # Actual trigger
LU1_fsm.act("LU1_IDLE", # If cache not valid fill it!
If(~self.LU1_CacheValid & self.LU1_Reload, # Invalid cache & load requested?
NextValue(self.LU1_Reload, 0), # Clear trigger
NextValue(LoadUnit1.b32Address.storage, self.b32DRAMLoadAddress1.storage),
NextValue(self.LU1_CacheOffset, 0), # Adjust pointer (local reader), 4-byte width=32-bit
NextValue(self.LU1_CacheDelay, 2), # Reset load delay counter (but inkl. 1st & last cycle)
NextState("LU1_LOAD1")
).Elif(~self.bEnable.storage, # Cleared enable?
NextValue(self.LU_CacheValid, False), # Enforce cache invalidation!
NextValue(self.LU1_CacheValid, False), # Enforce cache invalidation!
)
)
LU_fsm.act("LU_LOAD1", # Engage!
NextValue(LoadUnit.bEnable.storage, 1), # Trigger DRAM transfer to cache
NextState("LU_LOAD2")
LU1_fsm.act("LU1_LOAD1", # Engage!
NextValue(LoadUnit1.bEnable.storage, 1), # Trigger DRAM transfer to cache
NextState("LU1_LOAD2")
)
LU_fsm.act("LU_LOAD2", # Wait for termination of transfer ...
If(LoadUnit.bValid.storage, # Data avail.?
NextValue(self.LU_CacheValid, 1), # Declare cache valid
NextValue(LoadUnit.bEnable.storage, 0), # Stop DRAM transfer to cache
NextState("LU_IDLE") # Yap!
LU1_fsm.act("LU1_LOAD2", # Wait for termination of transfer ...
If(LoadUnit1.bValid.storage, # Data avail.?
NextValue(self.LU1_CacheValid, 1), # Declare cache valid
NextValue(LoadUnit1.bEnable.storage, 0), # Stop DRAM transfer to cache
NextState("LU1_IDLE") # Yap!
).Else(
If(self.LU_CacheDelay < 2047, # MAX-1!
NextValue(self.LU_CacheDelay, self.LU_CacheDelay + 1),
If(self.LU1_CacheDelay < 2047, # MAX-1!
NextValue(self.LU1_CacheDelay, self.LU1_CacheDelay + 1),
)
# TODO: Permit timeout indication ...
)
)
#---------------- Load unit (LU) #2 -------------------------------------------------------------
LU2_fsm = FSM(reset_state="LU2_IDLE") # FSM starts idling ...
self.submodules += LU2_fsm
self.LU2_CacheOffset = Signal(10, reset_less=True) # 0..1023 log2_int(LUCacheSize, False)) # Cache reading offset (0..(Size-1))=>Bits)
self.LU2_CacheValid = Signal() # Indicate loaded LU2 cache
self.LU2_CacheDelay = Signal(11, reset_less=True) # Evaluate load length in cycles (2048 max.)
self.LU2_Reload = Signal() # Actual trigger
LU2_fsm.act("LU2_IDLE", # If cache not valid fill it!
If(~self.LU2_CacheValid & self.LU2_Reload, # Invalid cache & load requested?
NextValue(self.LU2_Reload, 0), # Clear trigger
NextValue(LoadUnit2.b32Address.storage, self.b32DRAMLoadAddress2.storage),
NextValue(self.LU2_CacheOffset, 0), # Adjust pointer (local reader), 4-byte width=32-bit
NextValue(self.LU2_CacheDelay, 2), # Reset load delay counter (but inkl. 1st & last cycle)
NextState("LU2_LOAD1")
).Elif(~self.bEnable.storage, # Cleared enable?
NextValue(self.LU2_CacheValid, False), # Enforce cache invalidation!
)
)
LU2_fsm.act("LU2_LOAD1", # Engage!
NextValue(LoadUnit2.bEnable.storage, 1), # Trigger DRAM transfer to cache
NextState("LU2_LOAD2")
)
LU2_fsm.act("LU2_LOAD2", # Wait for termination of transfer ...
If(LoadUnit2.bValid.storage, # Data avail.?
NextValue(self.LU2_CacheValid, 1), # Declare cache valid
NextValue(LoadUnit2.bEnable.storage, 0), # Stop DRAM transfer to cache
NextState("LU2_IDLE") # Yap!
).Else(
If(self.LU2_CacheDelay < 2047, # MAX-1!
NextValue(self.LU2_CacheDelay, self.LU2_CacheDelay + 1),
)
# TODO: Permit timeout indication ...
)
@ -156,54 +233,72 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
self.Loader_Delay = Signal(32, reset_less=True)
self.Loader_Active = Signal()
Loader_fsm.act("Loader_IDLE",
If(self.LU_CacheValid & ~self.Loader_Active, # Enter if not active already
NextValue(self.Loader_Active, True), # Loader up & running
NextValue(self.Loader_Delay, 0), # Reset read delay timer
NextValue(self.b10CurrentOffest, 0), # Actual offset (=DRAM local offset)
NextValue(LoadUnit.b10Offset1.storage, LUCacheSize - 1), # Adjust offset to read sentinel
NextValue(LoadUnit.b10Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array
NextValue(self.b16Result1.storage, 0), # Indicate # delays
NextValue(self.b16Result2.storage, 0), # Indicate # delays
NextValue(self.bReady, False), # LED off!
NextState("Loader_LOAD1")
If(self.bEnable.storage & ~self.Loader_Active, # Started & not active already?
NextValue(self.Loader_Active, True), # Loader up & running (block re-entry)
If(self.bReload1.storage, # Load requested?
NextValue(self.LU1_CacheValid, 0), # Invalidate cache
NextValue(self.LU1_Reload, 1), # Load matrice row #1
NextValue(self.bReload1.storage, 0), # Clear request
),
If(self.bReload2.storage, # Load requested?
NextValue(self.LU2_CacheValid, 0), # Invalidate cache
NextValue(self.LU2_Reload, 1), # Load vector #2
NextValue(self.bReload2.storage, 0), # Clear request
),
NextValue(self.b16Result1.storage, 0), # Clear results
NextValue(self.b16Result2.storage, 0),
NextValue(self.bReady, False), # LED off!
NextState("Loader_LOAD0")
).Elif(~self.bEnable.storage, # Externally aborted?
NextValue(self.b16Status.storage, 0), # Current status: inactive
NextValue(self.Loader_Active, False), # Reset in sync w/ global activation
)
)
Loader_fsm.act("Loader_LOAD0",
If(self.LU1_CacheValid & self.LU2_CacheValid, # Wait 'til caches start to fill ...
NextValue(self.Loader_Delay, 0), # Reset read delay timer
NextValue(self.b10CurrentOffest, 0), # Actual offset (=DRAM local offset)
NextValue(LoadUnit1.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel #1
NextValue(LoadUnit2.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel #2
NextState("Loader_LOAD1")
)
)
Loader_fsm.act("Loader_LOAD1",
NextValue(self.b16Status.storage[0], True), # Current status added
If(LoadUnit.b32Data1.storage == self.b32Sentinel.storage, # Valid last entry?
NextValue(LoadUnit.b10Offset1.storage, 0), # 1st value offset preparation
NextState("Loader_LOAD2")
).Elif(~self.bEnable.storage, # Enable withdrawn?
NextValue(self.b16Status.storage[0], True), # Current status: Caches loaded
If(~self.bEnable.storage, # Enable withdrawn?
NextState("Loader_IDLE") # Abort!
).Elif(LoadUnit1.b32Data.storage == self.b32Sentinel1.storage, # Valid last entry?
NextValue(self.b16Status.storage[1], True), # Current status: 1st sentinel ok
If(LoadUnit2.b32Data.storage == self.b32Sentinel2.storage, # Valid last entry?
NextValue(self.b16Status.storage[2], True), # Current status: 2nd sentinel ok
NextValue(LoadUnit1.b9Offset.storage, 0), # 1st value offset preparation, matrice/row
NextValue(LoadUnit2.b9Offset.storage, 0), # 1st value offset preparation, vector
NextState("Loader_LOAD2")
)
)
)
#-----> LOOP ENTRY ! (2nd loop onward: fs3 already prepared!)
Loader_fsm.act("Loader_LOAD2",
NextValue(self.b16Status.storage[1], True), # Current status added
If(self.Loader_Delay > RAMWaitTime, # Required only for 1st entry ...
NextValue(self.b10CurrentOffest, self.b10CurrentOffest + 1), # Increment (total) offset
Loader_fsm.act("Loader_LOAD2",
If(self.Loader_Delay > RAMWaitTime, # Required only for 1st entry ...
# FPU#1
NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[0:16])),
NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[16:32])),
NextValue(LoadUnit.b10Offset1.storage, LoadUnit.b10Offset1.storage + 1), # Move on to next entry
NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit1.b32Data.storage[0:16])),
NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit2.b32Data.storage[0:16])),
# FPU#2
NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[0:16])),
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[16:32])),
NextValue(LoadUnit.b10Offset2.storage, LoadUnit.b10Offset2.storage + 1), # Move on to next entry
NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit1.b32Data.storage[16:32])),
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit2.b32Data.storage[16:32])),
# Prepare next load in advance ...
NextValue(LoadUnit1.b9Offset.storage, LoadUnit1.b9Offset.storage + 1), # Move on to next entry
NextValue(LoadUnit2.b9Offset.storage, LoadUnit2.b9Offset.storage + 1),
NextValue(self.b10CurrentOffest, self.b10CurrentOffest + 1), # Increment (total) offset
NextState("Loader_EXEC1")
).Else( # MEM wait cycles
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
)
)
Loader_fsm.act("Loader_EXEC1",
NextValue(self.b16Status.storage[2], True), # Current status added
If(LoadUnit.b10Offset1.storage == 1, # As pointer already moved ahead 1!
Loader_fsm.act("Loader_EXEC1",
# FIXME: Compare will fail for 2nd (extended) load ...
If(LoadUnit1.b9Offset.storage == 1, # As pointer already moved ahead 1!
NextValue(fpu1.fmul, True), # 1st ADD requested
NextValue(fpu2.fmul, True),
).Else(
@ -216,29 +311,31 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
)
Loader_fsm.act("Loader_EXEC2",
NextValue(self.b16Status.storage[3], True), # Current status added
NextValue(self.b16Status.storage[8], fpu1.fready), # TODO: Remove!
NextValue(self.b16Status.storage[9], fpu2.fready), # TODO: Remove!
NextValue(self.b16Status.storage[13], fpu1.fready), # TODO: Remove!
NextValue(self.b16Status.storage[14], fpu2.fready), # TODO: Remove!
If(fpu1.fready & fpu2.fready,
If(LoadUnit.b10Offset1.storage == 1, # As pointer already moved ahead 1! (Actually: Entry #0)
# FIXME: Compare will fail for 2nd (extended) load ...
If(LoadUnit1.b9Offset.storage == 1, # As pointer already moved ahead 1! (Actually: Entry #0)
NextValue(fpu1.fmul, False), # Clear command request FPU#1
NextValue(fpu2.fmul, False), # Clear command request FPU#2
).Else( # Entries 1 .. (maxlen-1)
NextValue(fpu1.fmadd, False), # Clear command request FPU#1
NextValue(fpu2.fmadd, False), # Clear command request FPU#2
),
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu1.fresult[16:32])), # Sum will be used for fmadd.s
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu2.fresult[16:32])), # Sum will be used for fmadd.s
If(self.b10CurrentOffest < self.b10ArrayWordLen.storage, # Words 0 .. LEN-1
If(LoadUnit.b10Offset1.storage < LUCacheSize, # Words 0 .. Cachelen
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu1.fresult[16:32])), # Sum will be used for fmadd.s
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu2.fresult[16:32])), # Sum will be used for fmadd.s
If(LoadUnit1.b9Offset.storage < LUCacheSize, # Words 0 .. Cachelen
NextState("Loader_LOAD2") # Next value @offset
).Else( # Cache empty ...
NextValue(self.b32DRAMLoadAddress.storage, self.b32DRAMLoadAddress.storage + LUCacheSize), # Prepare DRAM address
NextValue(self.b32DRAMLoadAddress1.storage, self.b32DRAMLoadAddress1.storage + LUCacheSize), # Prepare DRAM address
NextValue(self.b32DRAMLoadAddress2.storage, self.b32DRAMLoadAddress2.storage + LUCacheSize), # Prepare DRAM address
NextState("Loader_XLOAD0") # Fill cache again
)
).Else( # Finally prepare ADD both result sums (on FPU#1 only!)
NextValue(fpu1.fs1, fpu1.fresult),
NextValue(fpu1.fs2, fpu2.fresult),
NextState("Loader_EXEC3") # -> Final ADD logic & finishing cleanup
NextValue(fpu1.fs2, fpu2.fresult),
NextState("Loader_EXEC3") # -> Final ADD logic & finishing cleanup
)
)
)
@ -246,39 +343,45 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
# Extended (2nd) cache load
Loader_fsm.act("Loader_XLOAD0",
NextValue(self.b16Status.storage[4], True), # Current status added
NextValue(self.LU_CacheValid, 0), # Engage refill (address safely adjusted by now ...)
NextValue(self.LU1_CacheValid, 0), # Engage refill (address safely adjusted by now ...)
NextValue(self.LU1_Reload, 1), # Load vector #2
NextValue(self.LU2_CacheValid, 0), # Invalidate cache
NextValue(self.LU2_Reload, 1), # Load vector #2
NextState("Loader_XLOAD1")
)
Loader_fsm.act("Loader_XLOAD1", # Extended load ...
NextValue(self.b16Status.storage[5], True), # Current status added
If(self.LU_CacheValid, # Wait until filled ...
#NextValue(self.Loader_Delay, 0), # Reset read delay timer
NextValue(LoadUnit.b10Offset1.storage, LUCacheSize - 1), # Adjust offset to read sentinel
NextValue(LoadUnit.b10Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array
NextState("Loader_XLOAD2")
).Elif(~self.bEnable.storage, # Externally aborted?
If(~self.bEnable.storage, # Externally aborted?
NextState("Loader_IDLE") # Abort!
)
).Elif(self.LU1_CacheValid & self.LU2_CacheValid, # Wait until filled ...
NextValue(LoadUnit1.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel
NextValue(LoadUnit2.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel
NextState("Loader_XLOAD2")
)
)
Loader_fsm.act("Loader_XLOAD2",
NextValue(self.b16Status.storage[6], True), # Current status added
If(LoadUnit.b32Data1.storage == (self.b32Sentinel.storage + 1), # Valid last entry? +1!!!
NextValue(LoadUnit.b10Offset1.storage, 0), # 1st value offset preparation
NextState("Loader_LOAD2") # Continue w/ loop
).Elif(~self.bEnable.storage, # Enable withdrawn?
Loader_fsm.act("Loader_XLOAD2",
If(~self.bEnable.storage, # Enable withdrawn?
NextState("Loader_IDLE") # Abort!
).Elif(LoadUnit1.b32Data.storage == (self.b32Sentinel1.storage + 1), # Valid last entry? +1!!!
NextValue(self.b16Status.storage[6], True), # Current status: Sentinel #1 good
If(LoadUnit2.b32Data.storage == (self.b32Sentinel2.storage + 1), # Valid last entry? +1!!!
NextValue(self.b16Status.storage[7], True), # Current status: Sentinel #2 good
NextValue(LoadUnit1.b9Offset.storage, 0), # 1st value offset preparation
NextValue(LoadUnit2.b9Offset.storage, 0), # 1st value offset preparation
NextState("Loader_LOAD2") # Continue w/ loop
)
)
)
# Final ADD of results
Loader_fsm.act("Loader_EXEC3",
NextValue(self.b16Status.storage[7], True), # Current status added
NextValue(self.b16Status.storage[8], True), # Current status added
NextValue(fpu1.fadd, True), # Final ADD requested
NextValue(fpu1.fready, False), # Engage trigger FPU#1 (only!)
NextState("Loader_EXEC4")
)
Loader_fsm.act("Loader_EXEC4",
NextValue(self.b16Status.storage[8], True), # Current status added
NextValue(self.b16Status.storage[9], True), # Current status added
If(fpu1.fready,
NextValue(fpu1.fadd, False), # Clear command request FPU#1
NextValue(self.b16Result1.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!)

@ -10,7 +10,6 @@
# 21.12.20/KQ Initial test
# 30.12.20/KQ Working (renamed) version
# 22.04.21/KQ Inbound transfer renamed
# 06.05.21/KQ Support for 2 read ports added (for now ...)
#
from migen import *
@ -36,7 +35,7 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
#. Once ``bValid`` becomes true (1), FPGA local memory is loaded, deactivate ``bEnable``
#. To retrieve, load ``b10Offset`` with offset (from base address) to read from (0 .. 1023),
#. To retrieve, load ``b9Offset`` with offset (from base address) to read from (0 .. 511),
``b32Data`` will contain the 32-bit value (from local FPGA memory @offset)
Inputs:
@ -46,18 +45,14 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
:bEnable: To enable running (after initialization)
:b10Offset1: Offset #1 (0..1023) into local FPGA memory to read from
:b10Offset2: Offset #2 (0..1023) into local FPGA memory to read from
:b9Offset: Offset (0..511) into local FPGA memory to read from
Output:
#######
:bValid: Indicate validity of local FPGA memory, i.e. 'loaded'
:b32Data1: Local FPGA memory at b10Offset1
:b32Data2: Local FPGA memory at b10Offset2
:b32Data: Local FPGA memory at b9Offset
"""
def __init__(self, maxwords=8, dma_reader=None, sync_fifo=None):
@ -76,17 +71,12 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
description="""
Enable/disabling DRAM access
""")
self.b10Offset1 = CSRStorage(10, reset_less=True,
fields=[CSRField("Offset1", size=10, description="*Field*: 10-Bit value (0..1023)")],
description="""
Offset added to base address, port #1
""")
self.b10Offset2 = CSRStorage(10, reset_less=True,
fields=[CSRField("Offset2", size=10, description="*Field*: 10-Bit value (0..1023)")],
self.b9Offset = CSRStorage(9, reset_less=True,
fields=[CSRField("Offset", size=9, description="*Field*: 9-Bit value (0..511)")],
description="""
Offset added to base address, port #2
Offset added to base address
""")
# Outputs
self.bValid = CSRStorage(1, reset_less=True,
fields=[CSRField("Valid", size=1, description="*Field*: bit", values=[
@ -97,16 +87,11 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
description="""
Data valid indication
""")
self.b32Data1 = CSRStorage(32, reset_less=True,
fields=[CSRField("Data1", size=32, description="*Field*: 32-Bit value")],
self.b32Data = CSRStorage(32, reset_less=True,
fields=[CSRField("Data", size=32, description="*Field*: 32-Bit value")],
description="""
Actual value read #1
""")
self.b32Data2 = CSRStorage(32, reset_less=True,
fields=[CSRField("Data2", size=32, description="*Field*: 32-Bit value")],
description="""
Actual value read #2
""")
Actual value read
""")
self.b32RCount = CSRStorage(32, reset_less=True,
fields=[CSRField("RCount", size=32, description="*Field*: 32-Bit value")],
description="""
@ -115,10 +100,8 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
# Local 'wire' data
self.b32MemPt = Signal(32) # WRITE: Local FPGA memory offset pointer
self.b2Address1 = Signal(3) # READ: Adress conversion helper #1
self.b2Address2 = Signal(3) # READ: Adress conversion helper #2
self.bData1 = Signal(32) # READ: Helper output data #1
self.bData2 = Signal(32) # READ: Helper output data #2
self.b2Address = Signal(3) # READ: Adress conversion helper #1
self.bData = Signal(32) # READ: Helper output data #1
storage = Memory(32, maxwords) # Local FPGA memory
self.specials += storage
@ -185,43 +168,25 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
# --------------------------- Local (FPGA) memory retrieval access -----------------------------------------------
# FPGA local memory read port
rdport1 = storage.get_port()
self.specials += rdport1
rdport2 = storage.get_port()
self.specials += rdport2
rdport = storage.get_port()
self.specials += rdport
self.comb += [ # Read from (FPGA local) memory
self.b2Address1.eq(self.b10Offset1.storage[0:2]), # Filter bits 0..1 (range 0-3)
If(self.b10Offset1.storage < maxwords,
#rdport.adr.eq(self.b10Offset1.storage), # w/ translation!
If(self.b2Address1 == 0,
rdport1.adr.eq(self.b10Offset1.storage | 3) # 0->3
).Elif(self.b2Address1 == 1,
rdport1.adr.eq((self.b10Offset1.storage & 0x1FC) | 2) # 1->2
).Elif(self.b2Address1 == 2,
rdport1.adr.eq((self.b10Offset1.storage & 0x1FC) | 1) # 2->1
).Elif(self.b2Address1 == 3,
rdport1.adr.eq(self.b10Offset1.storage & 0x1FC) # 3->0
self.b2Address.eq(self.b9Offset.storage[0:2]), # Filter bits 0..1 (range 0-3)
If(self.b9Offset.storage < maxwords,
#rdport.adr.eq(self.b9Offset.storage), # w/ translation!
If(self.b2Address == 0,
rdport.adr.eq(self.b9Offset.storage | 3) # 0->3
).Elif(self.b2Address == 1,
rdport.adr.eq((self.b9Offset.storage & 0x1FC) | 2) # 1->2
).Elif(self.b2Address == 2,
rdport.adr.eq((self.b9Offset.storage & 0x1FC) | 1) # 2->1
).Elif(self.b2Address == 3,
rdport.adr.eq(self.b9Offset.storage & 0x1FC) # 3->0
),
self.bData1.eq(rdport1.dat_r) # Assign to external var. ...
),
self.b2Address2.eq(self.b10Offset2.storage[0:2]), # Filter bits 0..1 (range 0-3)
If(self.b10Offset2.storage < maxwords,
#rdport.adr.eq(self.b10Offset2.storage), # w/ translation!
If(self.b2Address2 == 0,
rdport2.adr.eq(self.b10Offset2.storage | 3) # 0->3
).Elif(self.b2Address2 == 1,
rdport2.adr.eq((self.b10Offset2.storage & 0x1FC) | 2) # 1->2
).Elif(self.b2Address2 == 2,
rdport2.adr.eq((self.b10Offset2.storage & 0x1FC) | 1) # 2->1
).Elif(self.b2Address2 == 3,
rdport2.adr.eq(self.b10Offset2.storage & 0x1FC) # 3->0
),
self.bData2.eq(rdport2.dat_r) # Assign to external var. ...
),
self.bData.eq(rdport.dat_r) # Assign to external var. ...
),
]
self.sync += self.b32Data1.storage.eq(self.bData1) # Assign to external var. ...
self.sync += self.b32Data2.storage.eq(self.bData2) # Assign to external var. ...
self.sync += self.b32Data.storage.eq(self.bData) # Assign to external var. ...
class FPGA2DRAM(Module, AutoCSR, AutoDoc, ModuleDoc):
"""

@ -44,36 +44,35 @@ extern void busy_wait(unsigned int ms); // Worx!
extern char kbhit(void);
extern int key_eval(void);
#define DRAMDATABASE 0x40190000
#define DRAMDATASIZE 1024 // 512 OK, 800 FAIL => 2 Load cycles (2*512)!
#define DRAMDATABASE1 0x40190000
#define DRAMDATASIZE1 400 // 512 OK => 2 Load cycles (2*512)!
#define DRAMDATABASE2 (DRAMDATABASE1 + (DRAMDATASIZE1*sizeof(int32_t)))
#define DRAMDATASIZE2 DRAMDATASIZE1
static uint32_t fpgastate, fpustates;
static int fpgaload(uint32_t *mempt, int16_t len, int16_t calclen)
static int fpgaload(uint32_t *mempt1, uint32_t *mempt2, int16_t calclen, int bReload1, int bReload2)
{
uint32_t *sentinel1 = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE/2 - 1) * sizeof(int32_t));
uint32_t *sentinel2 = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE - 1) * sizeof(int32_t));
uint32_t *sentinel1 = (uint32_t *)(DRAMDATABASE1 + ((DRAMDATASIZE1-1) * sizeof(uint32_t)));
uint32_t *sentinel2 = (uint32_t *)(DRAMDATABASE2 + ((DRAMDATASIZE2-1) * sizeof(uint32_t)));
static uint32_t seqno = 0x41434142; // Just some marker pattern ;)
if((len < 4) | (len > DRAMDATASIZE)) {
printf("*** fpgaload: len out of range!");
return -1; // Verify length of transfer was understood!
}
if((calclen < 2) | (calclen > len/2)) {
printf("*** fpgaload: calclen out of range!");
return -2; // Reasonable calc amount?
}
bfloat16nn_bEnable_write(0); // Disable transfer (if still active for some reason ...)
*sentinel1 = 0x41434142; // Just some marker pattern ;)
*sentinel2 = 0x41434142 + 1; // Just some marker pattern ;)
bfloat16nn_b32Sentinel_write(*sentinel1);
*sentinel1 = seqno++;
*sentinel2 = seqno++;
bfloat16nn_b32Sentinel1_write(*sentinel1);
bfloat16nn_b32Sentinel2_write(*sentinel2);
flush_l2_cache(); // Strictly nec. for longer transfers
bfloat16nn_b10ArrayWordLen_write(calclen); // Indicate array length for calc.
bfloat16nn_b32DRAMLoadAddress_write((uint32_t)mempt); // Indicate memory to load from
bfloat16nn_b32DRAMLoadAddress1_write((uint32_t)mempt1); // Indicate memory to load from
bfloat16nn_b32DRAMLoadAddress2_write((uint32_t)mempt2); // Indicate memory to load from
bfloat16nn_bReload1_write(bReload1 ? 1 : 0); // Reload mem#1
bfloat16nn_bReload2_write(bReload2 ? 1 : 0); // Reload mem#2
bfloat16nn_bEnable_write(1); // Finally: Engage!
for(int i=0;i<2000;i++) { // Max. 100ms delay
if(bfloat16nn_b16Status_read() & 0x8000) {
fpgastate = (uint32_t)bfloat16nn_b16Status_read();
fpustates = (uint32_t)bfloat16nn_b16FPUStates_read();
bfloat16nn_bEnable_write(0); // Disable transfer
fpgastate = 0;
fpustates = 0;
return 1; // Ok, ready!
}
else
@ -84,36 +83,7 @@ static int fpgaload(uint32_t *mempt, int16_t len, int16_t calclen)
bfloat16nn_bEnable_write(0); // Disable transfer
return 0; // Timeout
}
/*
static float fp1_1_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_1_read(); // Low-endian, high half word required
float *fpt = (float *)&v;
return *fpt;
}
static float fp1_2_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_2_read();
float *fpt = (float *)&v;
return *fpt;
}
static float fp2_1_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_1_read();
float *fpt = (float *)&v;
return *fpt;
}
static float fp2_2_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_2_read();
float *fpt = (float *)&v;
return *fpt;
}
*/
static float fpResult1_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
@ -142,147 +112,103 @@ void dumpfloat(float f)
int key_eval(void)
{
extern void printf1(const char *fmt, float f1);
static uint32_t *sentinel = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE - 1) * sizeof(int32_t));
uint32_t *ui32ptr;
uint16_t *ui16ptr1, *ui16ptr2;
int i;
//float fp1_1, fp1_2;
float fpResult1;
//float fp2_1, fp2_2;
float fpResult2;
extern void printf1(const char *fmt, float f1);
uint32_t *ui32ptr1, *ui32ptr2;
uint16_t *ui16ptr1, *ui16ptr2;
float fpResult1, fpResult2;
uint32_t starttime;
uint32_t deltatime;
int i;
#define MAXCALCLEN (284) //784 //16 OK
#define MAXCALCLEN 4 //784 //16 OK
switch(kbhit()) {
case 'r': // Reload
printf("\e[35;1m*** Reload ***\e[0m\n");
printf("Elements/FPU: %d\n", MAXCALCLEN);
for(i=0, ui32ptr = (uint32_t *)DRAMDATABASE;i<DRAMDATASIZE;i++) // Setup test data
*ui32ptr++ = 0; // Clear all memory ...
for(i=0, ui32ptr = (uint32_t *)DRAMDATABASE;i<DRAMDATASIZE;i++) // Setup test data
*ui32ptr++ = i+1; // Clear all memory ...
printf("DRAM1->%08Xh DRAM2->%08Xh\n", DRAMDATABASE1, DRAMDATABASE2);
/*
// TODO: Control procedure w/ regular code (matrix inner product)
float *floatptr = (float *)DRAMDATABASE;
float *floatptr2 = (float *)(DRAMDATABASE + (MAXCALCLEN/2) * sizeof(float)); // Absolute: bytes!
for(i=1;i<=MAXCALCLEN/2;i++) {
*floatptr++ = (1.0 * (float)i);
for(i=0, ui32ptr1 = (uint32_t *)DRAMDATABASE1;i<DRAMDATASIZE1;i++) // Setup test data
*ui32ptr1++ = 0; // Clear all memory ...
for(i=0, ui32ptr2 = (uint32_t *)DRAMDATABASE2;i<DRAMDATASIZE2;i++) // Setup test data
*ui32ptr2++ = 0; // Clear all memory ...
for(i=0, ui32ptr1 = (uint32_t *)DRAMDATABASE1;i<DRAMDATASIZE1;i++) // Setup test data
*ui32ptr1++ = i+1;
for(i=0, ui32ptr2 = (uint32_t *)DRAMDATABASE2;i<DRAMDATASIZE2;i++) // Setup test data
*ui32ptr2++ = i+1;
*/
// TODO: Control procedure w/ regular code (matrix inner product)
float *floatptr1 = (float *)DRAMDATABASE1;
float *floatptr2 = (float *)(DRAMDATABASE1 + (DRAMDATASIZE1 * sizeof(float)));
for(i=1;i<=MAXCALCLEN;i++) {
*floatptr1++ = (1.0 * (float)i);
*floatptr2++ = (1.0 * (float)i);
}
floatptr = (float *)DRAMDATABASE;
floatptr2 = (float *)(DRAMDATABASE + (MAXCALCLEN/2) * sizeof(float)); // Absolute: bytes!
floatptr1 = (float *)DRAMDATABASE1;
floatptr2 = (float *)(DRAMDATABASE1 + (DRAMDATASIZE1 * sizeof(float))); // Absolute: bytes!
starttime = systime(0);
float sum = 0.0;
for(i=1;i<=MAXCALCLEN/2;i++) {
sum += ((*floatptr++) * (*floatptr2++)); // 1*1+2*2+3*3+4*4 = 1+4+9+16 = 5+9+16 = 14+16=30
for(i=1;i<=MAXCALCLEN;i++) {
sum += ((*floatptr1++) * (*floatptr2++)); // 1*1+2*2+3*3+4*4 = 1+4+9+16 = 5+9+16 = 14+16=30
}
deltatime = systime(0)-starttime;
printf("S/W Delta t: %dms ", deltatime);
printf1("\t\t\tS/W SUM=%8.4f\n", sum);
// FPU#1
ui16ptr1 = (uint16_t *)(DRAMDATABASE + 0 * sizeof(uint32_t)); // Absolute: bytes!
for(i=1;i<=MAXCALCLEN;i++)
*ui16ptr1++ = f2ui16(1.0 * (float)i );
// FPU#2
ui16ptr2 = (uint16_t *)(DRAMDATABASE + (DRAMDATASIZE/2) * sizeof(uint32_t)); // Absolute: bytes!
for(i=1;i<=MAXCALCLEN;i++)
// NOTE: if the buffers are not cleared first, this fails (reason still unclear)
for(i=0, ui32ptr1 = (uint32_t *)DRAMDATABASE1;i<DRAMDATASIZE1;i++) // Setup test data
*ui32ptr1++ = 0; // Clear all memory ...
for(i=0, ui32ptr2 = (uint32_t *)DRAMDATABASE2;i<DRAMDATASIZE2;i++) // Setup test data
*ui32ptr2++ = 0; // Clear all memory ...
ui16ptr1 = (uint16_t *)DRAMDATABASE1; // Absolute: bytes! Matrice/row
for(i=1;i<=MAXCALCLEN+1;i++)
*ui16ptr1++ = f2ui16(1.0 * (float)i );
ui16ptr2 = (uint16_t *)DRAMDATABASE2; // Absolute: bytes! Vector
for(i=1;i<=MAXCALCLEN+1;i++)
*ui16ptr2++ = f2ui16(1.0 * (float)i );
*/
// BOTH: 1*1+2*2+3*3+4*4 (+5*5) = 1+4+9+16+25 = 5+9+16+25 = 14+16=30+25 = 55 WRONG!
// FPU#1: 1*1 +3*3 = 1+9 = 10 OK
// FPU#2: 2*2 +4*4 (+5*5)= 4+16(+5*5) = 20(+25) = 45 WRONG!
starttime = systime(0);
if(fpgaload((uint32_t *)DRAMDATABASE, DRAMDATASIZE, MAXCALCLEN/2)) { // 800*32-bit=3200 bytes, 400 Words/FPU to calc.
if(fpgaload((uint32_t *)DRAMDATABASE1, (uint32_t *)DRAMDATABASE2, MAXCALCLEN, 1, 1)) { // 800*32-bit=3200 bytes, 400 Words/FPU to calc.
deltatime = systime(0)-starttime;
printf("H/W Delta t: %dms ", deltatime);
/*fp1_1 = fp1_1_read();
fp1_2 = fp1_2_read();
fp2_1 = fp2_1_read();
fp2_2 = fp2_2_read();*/
fpResult1 = fpResult1_read();
fpResult2 = fpResult2_read();
printf("(S=%04Xh: FS=%04Xh)", fpgastate, fpustates);
/*printf1("V1_1=%6.3f ", fp1_1);
printf1("V1_2=%6.3f ", fp1_2);*/
printf1("\tS/W SUM=%8.4f\n", fpResult1);
/*printf1("V2_1=%6.3f ", fp2_1);
printf1("V2_2=%6.3f ", fp2_2);*/
//printf1("(RESULT2=%8.4f)", fpResult2);
/*
for(i=DRAMDATASIZE/2;i<DRAMDATASIZE/2+3;i++) {
dram2fpga_b10Offset2_write(i);
printf("%d: %d\n", i, dram2fpga_b32Data2_read());
}
dram2fpga_b10Offset2_write(DRAMDATASIZE - 1);
printf("%d: %d\n", DRAMDATASIZE - 1, dram2fpga_b32Data2_read());
*/
printf1("\tH/W SUM=%8.4f", fpResult1);
printf1("\t(FPU#2=%8.4f)\n", fpResult2);
}
else {
printf("CURRENT TIMEOUT: S=%04Xh: FS=%04Xh ", fpgastate, fpustates);
printf("Offset 1: %d (%d) ", (uint32_t)dram2fpga_b10Offset1_read(), dram2fpga_b32Data1_read());
printf("Offset 2: %d (%d)", (uint32_t)dram2fpga_b10Offset2_read(), dram2fpga_b32Data2_read());
printf("Sentinels: %08Xh %08Xh\n", bfloat16nn_b32Sentinel_read(), dram2fpga_b32Data1_read());
for(i=0;i<10;i++) {
dram2fpga_b10Offset1_write(i);
dram2fpga_b10Offset2_write(i);
printf("%d: %d=%d\n", i, dram2fpga_b32Data1_read(), dram2fpga_b32Data2_read());
}
for(i=DRAMDATASIZE/2 - 5;i<(DRAMDATASIZE/2 + 5);i++) {
dram2fpga_b10Offset1_write(i);
dram2fpga_b10Offset2_write(i);
printf("%d: %d=%d\n", i, dram2fpga_b32Data1_read(), dram2fpga_b32Data2_read());
}
for(i=512 - 5;i<(512 + 5);i++) {
dram2fpga_b10Offset1_write(i);
dram2fpga_b10Offset2_write(i);
printf("%d: %d=%d\n", i, dram2fpga_b32Data1_read(), dram2fpga_b32Data2_read());
}
for(i=DRAMDATASIZE-10;i<DRAMDATASIZE-1;i++) {
dram2fpga_b10Offset1_write(i);
dram2fpga_b10Offset2_write(i);
printf("%d: %d=%d\n", i, dram2fpga_b32Data1_read(), dram2fpga_b32Data2_read());
}
dram2fpga_b10Offset1_write(DRAMDATASIZE - 1);
dram2fpga_b10Offset2_write(DRAMDATASIZE - 1);
printf("%d:*%d=%d*\n", DRAMDATASIZE - 1, dram2fpga_b32Data1_read(), dram2fpga_b32Data2_read());
}
*sentinel = 0; // Invalidate data!
if(fpgaload((uint32_t *)DRAMDATABASE, DRAMDATASIZE, MAXCALCLEN/2)) {
/*fp1_1 = fp1_1_read();
fp1_2 = fp1_2_read();
fp2_1 = fp2_1_read();
fp2_2 = fp2_2_read();*/
fpResult1 = fpResult1_read();
fpResult2 = fpResult2_read();
printf("INVALIDATED: S=%04Xh: FS=%04Xh\n", fpgastate, fpustates);
/*printf1("V1_1=%6.3f ", fp1_1);
printf1("V1_2=%6.3f ", fp1_2);*/
//printf1("RESULT1=%8.4f\n", fpResult1);
/*printf1("V2_1=%6.3f ", fp2_1);
printf1("V2_2=%6.3f ", fp2_2);*/
//printf1("RESULT2=%8.4f\n", fpResult2);
printf("Offset 1: %d (%d) ", (uint32_t)dram2fpga1_b9Offset_read(), dram2fpga1_b32Data_read());
printf("Offset 2: %d (%d) ", (uint32_t)dram2fpga2_b9Offset_read(), dram2fpga2_b32Data_read());
printf("Sentinel 1: %08Xh=%08Xh ", bfloat16nn_b32Sentinel1_read(), dram2fpga1_b32Data_read());
printf("Sentinel 2: %08Xh=%08Xh ", bfloat16nn_b32Sentinel2_read(), dram2fpga2_b32Data_read());
}
else
printf("INVALIDATED TIMEOUT: S=%04Xh: FS=%04Xh\n", fpgastate, fpustates);
/*
ui32ptr1 = (uint32_t *)(DRAMDATABASE1 + (DRAMDATASIZE1-4)*sizeof(uint32_t));
ui32ptr2 = (uint32_t *)(DRAMDATABASE2 + (DRAMDATASIZE2-4)*sizeof(uint32_t));
for(i=DRAMDATASIZE1-4;i<(DRAMDATASIZE1-1);i++) {
dram2fpga1_b9Offset_write(i);
dram2fpga2_b9Offset_write(i);
printf("%d:\t1:%d/%d\t2:%d/%d\n", i, dram2fpga1_b32Data_read(), *ui32ptr1++, dram2fpga2_b32Data_read(), *ui32ptr2++);
}
dram2fpga1_b9Offset_write(i);
dram2fpga2_b9Offset_write(i);
printf("%d: 1:%08Xh/%08Xh", i, dram2fpga1_b32Data_read(), *ui32ptr1++);
printf("\t2:%08Xh/%08Xh\n", dram2fpga2_b32Data_read(), *ui32ptr2++);
*/
break;
case 's':
/*fp1_1 = fp1_1_read();
fp1_2 = fp1_2_read();
fp2_1 = fp2_1_read();
fp2_2 = fp2_2_read();*/
fpResult1 = fpResult1_read();
fpResult2 = fpResult2_read();
printf("REQUESTED: S=%04Xh: FS=%04Xh\n", (uint32_t)bfloat16nn_b16Status_read(), (uint32_t)bfloat16nn_b16FPUStates_read());
/*printf1("V1_1=%6.3f ", fp1_1);
printf1("V1_2=%6.3f ", fp1_2);*/
printf1("RESULT1=%8.4f\n", fpResult1);
/*printf1("V2_1=%6.3f ", fp2_1);
printf1("V2_2=%6.3f ", fp2_2);*/
printf1("RESULT2=%8.4f\n", fpResult2);
break;
case 'x': return 1; // Abort indication

Loading…
Cancel
Save