2 core dram access, matrix inner product started

master
kaqu 1 year ago
parent 87b041a371
commit 45fa4f4f12
  1. 4
      bfloat16nn.py
  2. 107
      libmodules/bfloat16nncore.py
  3. 18
      libmodules/bfloat16processor.py
  4. 101
      libmodules/dramtransfer.py
  5. 136
      software/source/bfloat16nnlib.c

@ -208,9 +208,9 @@ class BaseSoC(SoCCore):
self.submodules.fpga2dram = fpga2dram = FPGA2DRAM(dma_writer=dma_writer, sync_fifo=sync_fifo_out)
self.add_csr("fpga2dram")
"""
# Integrate bfloat16NN processor
# Integrate bfloat16NN processor
RAMWAITTIME=1 # Minimum wait!
self.submodules.bfloat16nn = bfloat16nn = bfloat16NeuralNetworkCore(
self.submodules.bfloat16nn = bfloat16nn = bfloat16NeuralNetworkCore(
RAMWaitTime=RAMWAITTIME,
LUCacheSize=MAXWORDS,
LoadUnit=dram2fpga,

@ -46,6 +46,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
:bEnable: To enable running (after data preparation)
:b9ArrayWordLen: Number of words used for calculation of scalar (inner) product
Outputs:
########
@ -76,52 +78,47 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
description="""
Enable free run
""")
self.b9ArrayWordLen = CSRStorage(9, reset_less=False,
fields=[CSRField("ArrayWordLen", size=9, description="*Field*: 9-Bit value")],
description="""
Word length of array used for calculation
""")
# Outputs
self.b16Status = CSRStorage(16, reset_less=False,
fields=[CSRField("Status", size=16, description="*Field*: 16-Bit value")],
description="""
Processing stati
""")
self.b16Value1 = CSRStorage(16, reset_less=False,
self.b16Value1_1 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
Float register 1
FPU#1 Float register 1
""")
self.b16Value2 = CSRStorage(16, reset_less=False,
self.b16Value1_2 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
Float register 2
FPU#1 Float register 2
""")
self.b16Value3 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
Float register 3
""")
self.b16Value4 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
Float register 4
""")
self.b16Value5 = CSRStorage(16, reset_less=False,
self.b16Value2_1 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
Float register 5
FPU#2 Float register 1
""")
self.b16Value6 = CSRStorage(16, reset_less=False,
self.b16Value2_2 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
Float register 6
FPU#2 Float register 2
""")
self.b16Result1 = CSRStorage(16, reset_less=False,
fields=[CSRField("Result1", size=16, description="*Field*: 16-Bit value")],
description="""
Processing result 1
FPU#1 Processing result
""")
self.b16Result2 = CSRStorage(16, reset_less=False,
fields=[CSRField("Result2", size=16, description="*Field*: 16-Bit value")],
description="""
Processing result 2
FPU#2 Processing result
""")
self.bReady = Signal() # To be wired to data pin ... ;)
@ -163,7 +160,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
)
#---------------- bfloat16 FPUs -------------------------------------------------------------
self.submodules.fpu1 = fpu1 = bfloat16Processor() # Integrate bfloat16 FPU
NFPUCORES=2 # No. of FPUs used
self.submodules.fpu1 = fpu1 = bfloat16Processor() # Integrate bfloat16 FPU
self.submodules.fpu2 = fpu2 = bfloat16Processor() # Integrate another one!
#---------------- Loaded data testing --------------------------------------------------
@ -176,13 +174,15 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
If(self.LU_CacheValid & ~self.Loader_Active, # Enter if not active already
NextValue(self.Loader_Active, True), # Loader up & running
NextValue(self.Loader_Delay, 0), # Reset read delay timer
NextValue(LoadUnit.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel
NextValue(LoadUnit.b9Offset1.storage, LUCacheSize - 1), # Adjust offset to read sentinel
NextValue(LoadUnit.b9Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array
NextValue(self.b16Result1.storage, 0), # Indicate # delays
NextValue(self.b16Result2.storage, 0), # Indicate # delays
NextValue(self.b16Status.storage[0], True), # Current status
NextValue(self.b16Value1.storage, 0), # Nothing loaded so far ...
NextValue(self.b16Value2.storage, 0), # Nothing loaded so far ...
NextValue(self.b16Value3.storage, 0), # Nothing loaded so far ...
NextValue(self.b16Value1_1.storage, 0), # Nothing loaded so far ...
NextValue(self.b16Value1_2.storage, 0),
NextValue(self.b16Value2_1.storage, 0),
NextValue(self.b16Value2_2.storage, 0),
NextValue(self.bReady, False), # LED off!
NextState("Loader_LOAD1")
).Elif(~self.bEnable.storage, # Externally aborted?
@ -192,8 +192,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
)
Loader_fsm.act("Loader_LOAD1",
NextValue(self.b16Status.storage[1], True), # Current status added
If(LoadUnit.b32Data.storage == self.b32Sentinel.storage, # Valid last entry?
NextValue(LoadUnit.b9Offset.storage, 0), # 1st value offset preparation
If(LoadUnit.b32Data1.storage == self.b32Sentinel.storage, # Valid last entry?
NextValue(LoadUnit.b9Offset1.storage, 0), # 1st value offset preparation
NextState("Loader_LOAD2")
).Elif(~self.bEnable.storage, # Enable withdrawn?
NextState("Loader_IDLE") # Abort!
@ -202,19 +202,29 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
Loader_fsm.act("Loader_LOAD2",
NextValue(self.b16Status.storage[2], True), # Current status added
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b16Value1.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 1st value (low 16 bits)
NextValue(self.b16Value2.storage, LoadUnit.b32Data.storage >> 16), # Pick 2nd value (high 16 bits)
NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])),
NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])),
NextValue(LoadUnit.b9Offset.storage, 1), # 2nd value offset preparation
NextValue(self.Loader_Delay, 0), # Reset delay
NextState("Loader_LOAD3")
# FPU#1
NextValue(self.b16Value1_1.storage, LoadUnit.b32Data1.storage & 0xFFFF), # Pick 1st value (low 16 bits)
NextValue(self.b16Value1_2.storage, LoadUnit.b32Data1.storage >> 16), # Pick 2nd value (high 16 bits)
NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[0:16])),
NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[16:32])),
NextValue(LoadUnit.b9Offset1.storage, LoadUnit.b9Offset1.storage + 1), # Move on to next entry
# FPU#2
NextValue(self.b16Value2_1.storage, LoadUnit.b32Data2.storage & 0xFFFF), # Pick 1st value (low 16 bits)
NextValue(self.b16Value2_2.storage, LoadUnit.b32Data2.storage >> 16), # Pick 2nd value (high 16 bits)
NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[0:16])),
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[16:32])),
NextValue(LoadUnit.b9Offset2.storage, LoadUnit.b9Offset2.storage + 1), # Move on to next entry
#NextValue(self.Loader_Delay, 0), # Reset delay
#NextState("Loader_LOAD3")
NextState("Loader_EXEC1")
).Else( # MEM wait cycles
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
)
)
Loader_fsm.act("Loader_LOAD3",
NextValue(self.b16Status.storage[3], True), # Current status added
"""
Loader_fsm.act("Loader_LOAD3",
NextValue(self.b16Status.storage[3], True), # Current status added
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b16Value3.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 3rd date
NextValue(self.b16Value4.storage, LoadUnit.b32Data.storage >> 16), # Pick 4th date
@ -225,9 +235,9 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
NextState("Loader_LOAD4")
).Else( # MEM wait cycles
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
)
)
)
Loader_fsm.act("Loader_LOAD4",
Loader_fsm.act("Loader_LOAD4",
NextValue(self.b16Status.storage[4], True), # Current status added
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b16Value5.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 5th date
@ -237,25 +247,26 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
NextState("Loader_EXEC1")
).Else( # MEM wait cycles
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
)
)
Loader_fsm.act("Loader_EXEC1",
)
)
"""
Loader_fsm.act("Loader_EXEC1",
NextValue(self.b16Status.storage[5], True), # Current status added
NextValue(fpu1.fadd, True), # This command requested
NextValue(fpu2.fnmsub, True), # This command requested
NextValue(fpu1.fadd, True), # 1st ADD requested
NextValue(fpu2.fadd, True),
NextValue(fpu1.fready, False), # Engage trigger FPU#1
NextValue(fpu2.fready, False), # Engage trigger FPU#2
NextState("Loader_EXEC2")
NextState("Loader_EXEC2")
)
Loader_fsm.act("Loader_EXEC2",
Loader_fsm.act("Loader_EXEC2",
NextValue(self.b16Status.storage[6], True), # Current status added
If(fpu1.fready & fpu2.fready,
NextValue(fpu1.fadd, False), # Clear command request FPU#1
NextValue(fpu2.fnmsub, False), # Clear command request FPU#2
NextValue(fpu2.fadd, False), # Clear command request FPU#2
NextValue(self.b16Result1.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!)
NextValue(self.b16Result2.storage, fpu2.fresult[16:32]), # Pick result (little endian, high word!)
NextValue(self.b16Status.storage[15], True), # Indicate readiness ...
NextValue(self.bReady, True), # Indicate readiness (LED on!) (TODO: Remove!)
NextState("Loader_IDLE")
)
)
)

@ -118,8 +118,7 @@ class bfloat16Processor(Module):
).Elif(self.e2 == -1, # Infinity
NextValue(self.fresult, self.fs2), # Return infinity
NextValue(self.fready, 1),
NextState("FPU_IDLE")
# TODO: VERIFY -> risq5!
NextState("FPU_IDLE")
).Elif(self.fs1[0:31] == 0, # 0+x: Nothing to add? (w/o sign!)
If(self.fsub, # Subtract yields negative result!
NextValue(self.fresult, self.fs2 ^ 0x80000000), # Invert sign
@ -131,8 +130,7 @@ class bfloat16Processor(Module):
NextValue(self.fresult, self.fs2), # Ready!
),
NextValue(self.fready, 1),
NextState("FPU_IDLE")
# FIXME: VERIFY! -->risq5!
NextState("FPU_IDLE")
).Elif(self.fs2[0:31] == 0, # x+0: Nothing to add? (w/o sign!)
If(self.fnmadd | self.fnmsub,
NextValue(self.fresult, self.fs1 ^ 0x80000000), # Ready!
@ -141,15 +139,6 @@ class bfloat16Processor(Module):
),
NextValue(self.fready, 1),
NextState("FPU_IDLE")
#).Elif((self.fadd | self.fsub) & (self.fs2[0:31] == 0), # Nothing to add? (w/o sign!)
# NextValue(self.fresult, self.fs1), # Ready!
# NextValue(self.fready, 1),
# NextState("FPU_IDLE")
#).Elif((self.fmadd | self.fmsub | self.fnmadd | self.fnmsub) & ((self.e2 == 0) & (self.m2 == 0)), # Nothing to add (w/o sign!)
# If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT
# NextValue(self.sign3, ~self.sign3) # Invert result finally
# ),
# NextState("FRESULT") # Just supply (normalized finally!) result from multiplication!
).Else( # Ok, valid floats supplied ...
NextValue(self.s_bit, 0),
NextValue(self.branch1, 0), # Reset helpers
@ -286,8 +275,7 @@ class bfloat16Processor(Module):
).Elif(self.e2 == -1, # Infinity
NextValue(self.fresult, self.fs2), # Return infinity
NextValue(self.fready, 1),
NextState("FPU_IDLE")
# FIXME: Verify -> risq5!
NextState("FPU_IDLE")
).Elif((self.fs1[0:31] == 0) | (self.fs2[0:31] == 0), # Nothing to multiply? (w/o sign!)
If(self.fmul, # Single instruction? Straight return.
NextValue(self.fresult, 0), # Result will be zero ...

@ -9,7 +9,8 @@
# --------
# 21.12.20/KQ Initial test
# 30.12.20/KQ Working (renamed) version
# 22.04.21/KQ In transfer renamed
# 22.04.21/KQ Inbound transfer renamed
# 06.05.21/KQ Support for 2 read ports added (for now ...)
#
from migen import *
@ -30,10 +31,7 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
######
#. Load ``b32Address`` with base address of range to read from (DRAM: >= 0x40000000)
#. Indicate length of range to read by setting up ``b8Len`` (not used currently;
always ``maxwords`` * 4 bytes will be loaded, i.e. words of 32 bits each).
#. Finally, enable processing by setting ``bEnable`` to true (1).
#. Once ``bValid`` becomes true (1), FPGA local memory is loaded, deactivate ``bEnable``
@ -44,20 +42,22 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
Inputs:
#######
:b32Address: Base DRAM Address to load from
:b8Len: Length (0..255) of range to read from (i.e. # of bytes - not used currently - always reading 32 bytes!)
:b32Address: Base DRAM Address to load from
:bEnable: To enable running (after initialization)
:b9Offset: Offset (0..511) into local FPGA memory to read from
:b9Offset1: Offset #1 (0..511) into local FPGA memory to read from
:b9Offset2: Offset #2 (0..511) into local FPGA memory to read from
Output:
#######
:bValid: Indicate validity of local FPGA memory, i.e. 'loaded'
:b32Data: Local FPGA memory at b9Offset
:b32Data1: Local FPGA memory at b9Offset1
:b32Data2: Local FPGA memory at b9Offset2
"""
def __init__(self, maxwords=8, dma_reader=None, sync_fifo=None):
@ -67,11 +67,6 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
description="""
Base DRAM address, to load from
""")
self.b8Len = CSRStorage(8, reset_less=True,
fields=[CSRField("Len", size=8, description="*Field*: 8-Bit value (0..max)")],
description="""
Length of range to load, currently not used (allways 4 assumed)
""")
self.bEnable = CSRStorage(1, reset_less=True,
fields=[CSRField("Enable", size=1, description="*Field*: bit", values=[
("0", "DISABLED", "Loading enabled"),
@ -81,11 +76,15 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
description="""
Enable/disabling DRAM access
""")
self.b9Offset = CSRStorage(9, reset_less=True,
#fields=[CSRField("Offset", size=12, description="*Field*: 9-Bit value (0..511)")],
fields=[CSRField("Offset", size=9, description="*Field*: 9-Bit value (0..511)")],
self.b9Offset1 = CSRStorage(9, reset_less=True,
fields=[CSRField("Offset1", size=9, description="*Field*: 9-Bit value (0..511)")],
description="""
Offset added to base address.
Offset added to base address, port #1
""")
self.b9Offset2 = CSRStorage(9, reset_less=True,
fields=[CSRField("Offset2", size=9, description="*Field*: 9-Bit value (0..511)")],
description="""
Offset added to base address, port #2
""")
# Outputs
@ -98,10 +97,15 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
description="""
Data valid indication
""")
self.b32Data = CSRStorage(32, reset_less=True,
fields=[CSRField("Data", size=32, description="*Field*: 32-Bit value")],
self.b32Data1 = CSRStorage(32, reset_less=True,
fields=[CSRField("Data1", size=32, description="*Field*: 32-Bit value")],
description="""
Actual value read #1
""")
self.b32Data2 = CSRStorage(32, reset_less=True,
fields=[CSRField("Data2", size=32, description="*Field*: 32-Bit value")],
description="""
Actual value read
Actual value read #2
""")
self.b32RCount = CSRStorage(32, reset_less=True,
fields=[CSRField("RCount", size=32, description="*Field*: 32-Bit value")],
@ -111,8 +115,10 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
# Local 'wire' data
self.b32MemPt = Signal(32) # WRITE: Local FPGA memory offset pointer
self.b2Address = Signal(3) # READ: Address conversion helper
self.bData = Signal(32) # READ: Helper output data
self.b2Address1 = Signal(3) # READ: Address conversion helper #1
self.b2Address2 = Signal(3) # READ: Address conversion helper #2
self.bData1 = Signal(32) # READ: Helper output data #1
self.bData2 = Signal(32) # READ: Helper output data #2
storage = Memory(32, maxwords) # Local FPGA memory
self.specials += storage
@ -179,25 +185,42 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
# --------------------------- Local (FPGA) memory retrieval access -----------------------------------------------
# FPGA local memory read port
rdport = storage.get_port()
self.specials += rdport
rdport1 = storage.get_port()
self.specials += rdport1
rdport2 = storage.get_port()
self.specials += rdport2
self.comb += [ # Read from (FPGA local) memory
self.b2Address.eq(self.b9Offset.storage[0:2]), # Filter bits 0..1 (range 0-3)
If(self.b9Offset.storage < maxwords,
#rdport.adr.eq(self.b9Offset.storage), # w/ translation!
If(self.b2Address == 0,
rdport.adr.eq(self.b9Offset.storage | 3) # 0->3
).Elif(self.b2Address == 1,
rdport.adr.eq((self.b9Offset.storage & 0x1FC) | 2) # 1->2
).Elif(self.b2Address == 2,
rdport.adr.eq((self.b9Offset.storage & 0x1FC) | 1) # 2->1
).Elif(self.b2Address == 3,
rdport.adr.eq(self.b9Offset.storage & 0x1FC) # 3->0
self.b2Address1.eq(self.b9Offset1.storage[0:2]), # Filter bits 0..1 (range 0-3)
If(self.b9Offset1.storage < maxwords,
#rdport.adr.eq(self.b9Offset1.storage), # w/ translation!
If(self.b2Address1 == 0,
rdport1.adr.eq(self.b9Offset1.storage | 3) # 0->3
).Elif(self.b2Address1 == 1,
rdport1.adr.eq((self.b9Offset1.storage & 0x1FC) | 2) # 1->2
).Elif(self.b2Address1 == 2,
rdport1.adr.eq((self.b9Offset1.storage & 0x1FC) | 1) # 2->1
).Elif(self.b2Address1 == 3,
rdport1.adr.eq(self.b9Offset1.storage & 0x1FC) # 3->0
),
self.bData1.eq(rdport1.dat_r) # Assign to external var. ...
),
self.b2Address2.eq(self.b9Offset2.storage[0:2]), # Filter bits 0..1 (range 0-3)
If(self.b9Offset2.storage < maxwords,
#rdport.adr.eq(self.b9Offset2.storage), # w/ translation!
If(self.b2Address2 == 0,
rdport2.adr.eq(self.b9Offset2.storage | 3) # 0->3
).Elif(self.b2Address2 == 1,
rdport2.adr.eq((self.b9Offset2.storage & 0x1FC) | 2) # 1->2
).Elif(self.b2Address2 == 2,
rdport2.adr.eq((self.b9Offset2.storage & 0x1FC) | 1) # 2->1
).Elif(self.b2Address2 == 3,
rdport2.adr.eq(self.b9Offset2.storage & 0x1FC) # 3->0
),
self.bData.eq(rdport.dat_r) # Assign to external var. ...
self.bData2.eq(rdport2.dat_r) # Assign to external var. ...
),
]
self.sync += self.b32Data.storage.eq(self.bData) # Assign to external var. ...
self.sync += self.b32Data1.storage.eq(self.bData1) # Assign to external var. ...
self.sync += self.b32Data2.storage.eq(self.bData2) # Assign to external var. ...
class FPGA2DRAM(Module, AutoCSR, AutoDoc, ModuleDoc):
"""

@ -67,48 +67,34 @@ static int fpgaload(uint32_t *mempt, int16_t len)
return 0; // Timeout
}
static float fp1_read(void)
static float fp1_1_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_read(); // Low-endian, high half word required
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_1_read(); // Low-endian, high half word required
float *fpt = (float *)&v;
return *fpt;
}
static float fp2_read(void)
static float fp1_2_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_read();
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_2_read();
float *fpt = (float *)&v;
return *fpt;
}
static float fp3_read(void)
static float fp2_1_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value3_read();
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_1_read();
float *fpt = (float *)&v;
return *fpt;
}
static float fp4_read(void)
static float fp2_2_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value4_read();
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_2_read();
float *fpt = (float *)&v;
return *fpt;
}
static float fp5_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value5_read();
float *fpt = (float *)&v;
return *fpt;
}
static float fp6_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value6_read();
float *fpt = (float *)&v;
return *fpt;
}
static float fpResult1_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
@ -141,48 +127,48 @@ int key_eval(void)
static uint32_t *sentinel = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE - 1) * sizeof(int32_t));
uint32_t *ui32ptr;
uint16_t *ui16ptr;
uint16_t *ui16ptr1, *ui16ptr2;
int i;
float fp1, fp2, fp3, fpResult1;
float fp4, fp5, fp6, fpResult2;
float fp1_1, fp1_2, fpResult1;
float fp2_1, fp2_2, fpResult2;
switch(kbhit()) {
case 'r': // Reload
printf("\e[35;1m*** Reload ***\e[0m\n");
for(i=0, ui32ptr = (uint32_t *)DRAMDATABASE;i<DRAMDATASIZE;i++) // Setup test data
*ui32ptr++ = i+1;
ui16ptr = (uint16_t *)(DRAMDATABASE + 0 * sizeof(uint16_t)); // Absolute: bytes!
*ui16ptr++ = f2ui16(1.0);
*ui16ptr++ = f2ui16(2.0);
*ui16ptr++ = f2ui16(3.0);
*ui16ptr++ = f2ui16(4.0);
*ui16ptr++ = f2ui16(5.0);
*ui16ptr++ = f2ui16(6.0);
if(fpgaload((uint32_t *)DRAMDATABASE, 512)) {
fp1 = fp1_read();
fp2 = fp2_read();
fp3 = fp3_read();
fp4 = fp4_read();
fp5 = fp5_read();
fp6 = fp6_read();
// FPU#1
ui16ptr1 = (uint16_t *)(DRAMDATABASE + 0 * sizeof(uint32_t)); // Absolute: bytes!
for(i=1;i<=DRAMDATASIZE/2;i++)
*ui16ptr1++ = f2ui16(1.0 * (float)i );
// FPU#2
ui16ptr2 = (uint16_t *)(DRAMDATABASE + (DRAMDATASIZE/2) * sizeof(uint32_t)); // Absolute: bytes!
for(i=1;i<=DRAMDATASIZE/2;i++)
*ui16ptr2++ = f2ui16(1.0 * (float)i );
if(fpgaload((uint32_t *)DRAMDATABASE, 512)) { // 512 * 32-bit= 2048 bytes = 2kB
fp1_1 = fp1_1_read();
fp1_2 = fp1_2_read();
fp2_1 = fp2_1_read();
fp2_2 = fp2_2_read();
fpResult1 = fpResult1_read();
fpResult2 = fpResult2_read();
printf("S=%04Xh:\n", (uint32_t)bfloat16nn_b16Status_read());
printf1("V1=%5.3f ", fp1); // FIXME: printf1 fails for 0.0 output !
printf1("V2=%5.3f ", fp2);
printf1("V3=%5.3f ", fp3);
printf1("RESULT=%6.4f\n", fpResult1);
printf1("V1=%5.3f ", fp4);
printf1("V2=%5.3f ", fp5);
printf1("V3=%5.3f ", fp6);
printf1("RESULT=%6.4f\n", fpResult2);
printf1("V1_1=%6.3f ", fp1_1);
printf1("V1_2=%6.3f ", fp1_2);
printf1("RESULT1=%8.4f\n", fpResult1);
printf1("V2_1=%6.3f ", fp2_1);
printf1("V2_2=%6.3f ", fp2_2);
printf1("RESULT2=%8.4f\n", fpResult2);
/*
for(i=0;i<DRAMDATASIZE;i+=32) {
dram2fpga_b9Offset_write(i);
printf("%d: %d\n", i, dram2fpga_b32Data_read());
for(i=DRAMDATASIZE/2;i<DRAMDATASIZE/2+3;i++) {
dram2fpga_b9Offset2_write(i);
printf("%d: %d\n", i, dram2fpga_b32Data2_read());
}
dram2fpga_b9Offset_write(DRAMDATASIZE - 1);
printf("%d: %d\n", DRAMDATASIZE - 1, dram2fpga_b32Data_read());
dram2fpga_b9Offset2_write(DRAMDATASIZE - 1);
printf("%d: %d\n", DRAMDATASIZE - 1, dram2fpga_b32Data2_read());
*/
}
else
@ -190,29 +176,39 @@ int key_eval(void)
*sentinel = 0; // Invalidate data!
if(fpgaload((uint32_t *)DRAMDATABASE, 512)) {
fp1_1 = fp1_1_read();
fp1_2 = fp1_2_read();
fp2_1 = fp2_1_read();
fp2_2 = fp2_2_read();
fpResult1 = fpResult1_read();
fpResult2 = fpResult2_read();
printf("INVALIDATED: S=%04Xh:\n", bfloat16nn_b16Status_read());
printf1("V1=%5.3f ", fp1_read());
printf1("V2=%5.3f ", fp2_read());
printf1("V3=%5.3f ", fp3_read());
printf1("RESULT=%6.4f\n", fpResult1_read());
printf1("V1=%5.3f ", fp4_read());
printf1("V2=%5.3f ", fp5_read());
printf1("V3=%5.3f ", fp6_read());
printf1("RESULT=%6.4f\n", fpResult2_read());
printf("S=%04Xh:\n", (uint32_t)bfloat16nn_b16Status_read());
printf1("V1_1=%6.3f ", fp1_1);
printf1("V1_2=%6.3f ", fp1_2);
printf1("RESULT1=%8.4f\n", fpResult1);
printf1("V2_1=%6.3f ", fp2_1);
printf1("V2_2=%6.3f ", fp2_2);
printf1("RESULT2=%8.4f\n", fpResult2);
}
else
printf("INVALIDATED: Timeout!");
break;
case 's':
printf("REQUESTED: S=%04Xh:\n", bfloat16nn_b16Status_read());
printf1("V1=%4.2f ", fp1_read());
printf1("V2=%4.2f ", fp2_read());
printf1("V3=%4.2f ", fp3_read());
printf1("RESULT=%6.4f\n", fpResult1_read());
printf1("V1=%4.2f ", fp4_read());
printf1("V2=%4.2f ", fp5_read());
printf1("V3=%4.2f ", fp6_read());
printf1("RESULT=%6.4f\n", fpResult2_read());
case 's':
fp1_1 = fp1_1_read();
fp1_2 = fp1_2_read();
fp2_1 = fp2_1_read();
fp2_2 = fp2_2_read();
fpResult1 = fpResult1_read();
fpResult2 = fpResult2_read();
printf("REQUESTED: S=%04Xh:\n", bfloat16nn_b16Status_read());
printf("S=%04Xh:\n", (uint32_t)bfloat16nn_b16Status_read());
printf1("V1_1=%6.3f ", fp1_1);
printf1("V1_2=%6.3f ", fp1_2);
printf1("RESULT1=%8.4f\n", fpResult1);
printf1("V2_1=%6.3f ", fp2_1);
printf1("V2_2=%6.3f ", fp2_2);
printf1("RESULT2=%8.4f\n", fpResult2);
break;
case 'x': return 1; // Abort indication
default: ;

Loading…
Cancel
Save