diff --git a/bfloat16nn.py b/bfloat16nn.py index 367673c..b854e2e 100755 --- a/bfloat16nn.py +++ b/bfloat16nn.py @@ -208,9 +208,9 @@ class BaseSoC(SoCCore): self.submodules.fpga2dram = fpga2dram = FPGA2DRAM(dma_writer=dma_writer, sync_fifo=sync_fifo_out) self.add_csr("fpga2dram") """ - # Integrate bfloat16NN processor + # Integrate bfloat16NN processor RAMWAITTIME=1 # Minimum wait! - self.submodules.bfloat16nn = bfloat16nn = bfloat16NeuralNetworkCore( + self.submodules.bfloat16nn = bfloat16nn = bfloat16NeuralNetworkCore( RAMWaitTime=RAMWAITTIME, LUCacheSize=MAXWORDS, LoadUnit=dram2fpga, diff --git a/libmodules/bfloat16nncore.py b/libmodules/bfloat16nncore.py index a860d88..03854bc 100644 --- a/libmodules/bfloat16nncore.py +++ b/libmodules/bfloat16nncore.py @@ -46,6 +46,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): :bEnable: To enable running (after data preparation) + :b9ArrayWordLen: Number of words used for calculation of scalar (inner) product + Outputs: ######## @@ -76,52 +78,47 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): description=""" Enable free run """) - + self.b9ArrayWordLen = CSRStorage(9, reset_less=False, + fields=[CSRField("ArrayWordLen", size=9, description="*Field*: 9-Bit value")], + description=""" + Word length of array used for calculation + """) + # Outputs self.b16Status = CSRStorage(16, reset_less=False, fields=[CSRField("Status", size=16, description="*Field*: 16-Bit value")], description=""" Processing stati """) - self.b16Value1 = CSRStorage(16, reset_less=False, + self.b16Value1_1 = CSRStorage(16, reset_less=False, fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], description=""" - Float register 1 + FPU#1 Float register 1 """) - self.b16Value2 = CSRStorage(16, reset_less=False, + self.b16Value1_2 = CSRStorage(16, reset_less=False, fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], description=""" - Float register 2 + FPU#1 Float register 2 """) - self.b16Value3 = CSRStorage(16, reset_less=False, + self.b16Value2_1 = CSRStorage(16, reset_less=False, fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], description=""" - Float register 3 + FPU#2 Float register 1 """) - self.b16Value4 = CSRStorage(16, reset_less=False, + self.b16Value2_2 = CSRStorage(16, reset_less=False, fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], description=""" - Float register 4 - """) - self.b16Value5 = CSRStorage(16, reset_less=False, - fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], - description=""" - Float register 5 - """) - self.b16Value6 = CSRStorage(16, reset_less=False, - fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], - description=""" - Float register 6 + FPU#2 Float register 2 """) self.b16Result1 = CSRStorage(16, reset_less=False, fields=[CSRField("Result1", size=16, description="*Field*: 16-Bit value")], description=""" - Processing result 1 + FPU#1 Processing result """) self.b16Result2 = CSRStorage(16, reset_less=False, fields=[CSRField("Result2", size=16, description="*Field*: 16-Bit value")], description=""" - Processing result 2 + FPU#2 Processing result """) self.bReady = Signal() # To be wired to data pin ... ;) @@ -163,7 +160,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): ) #---------------- bfloat16 FPUs ------------------------------------------------------------- - self.submodules.fpu1 = fpu1 = bfloat16Processor() # Integrate bfloat16 FPU + NFPUCORES=2 # No. of FPUs used + self.submodules.fpu1 = fpu1 = bfloat16Processor() # Integrate bfloat16 FPU self.submodules.fpu2 = fpu2 = bfloat16Processor() # Integrate another one! #---------------- Loaded data testing -------------------------------------------------- @@ -176,13 +174,15 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): If(self.LU_CacheValid & ~self.Loader_Active, # Enter if not active already NextValue(self.Loader_Active, True), # Loader up & running NextValue(self.Loader_Delay, 0), # Reset read delay timer - NextValue(LoadUnit.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel + NextValue(LoadUnit.b9Offset1.storage, LUCacheSize - 1), # Adjust offset to read sentinel + NextValue(LoadUnit.b9Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array NextValue(self.b16Result1.storage, 0), # Indicate # delays NextValue(self.b16Result2.storage, 0), # Indicate # delays NextValue(self.b16Status.storage[0], True), # Current status - NextValue(self.b16Value1.storage, 0), # Nothing loaded so far ... - NextValue(self.b16Value2.storage, 0), # Nothing loaded so far ... - NextValue(self.b16Value3.storage, 0), # Nothing loaded so far ... + NextValue(self.b16Value1_1.storage, 0), # Nothing loaded so far ... + NextValue(self.b16Value1_2.storage, 0), + NextValue(self.b16Value2_1.storage, 0), + NextValue(self.b16Value2_2.storage, 0), NextValue(self.bReady, False), # LED off! NextState("Loader_LOAD1") ).Elif(~self.bEnable.storage, # Externally aborted? @@ -192,8 +192,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): ) Loader_fsm.act("Loader_LOAD1", NextValue(self.b16Status.storage[1], True), # Current status added - If(LoadUnit.b32Data.storage == self.b32Sentinel.storage, # Valid last entry? - NextValue(LoadUnit.b9Offset.storage, 0), # 1st value offset preparation + If(LoadUnit.b32Data1.storage == self.b32Sentinel.storage, # Valid last entry? + NextValue(LoadUnit.b9Offset1.storage, 0), # 1st value offset preparation NextState("Loader_LOAD2") ).Elif(~self.bEnable.storage, # Enable withdrawn? NextState("Loader_IDLE") # Abort! @@ -202,19 +202,29 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): Loader_fsm.act("Loader_LOAD2", NextValue(self.b16Status.storage[2], True), # Current status added If(self.Loader_Delay > RAMWaitTime, - NextValue(self.b16Value1.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 1st date - NextValue(self.b16Value2.storage, LoadUnit.b32Data.storage >> 16), # Pick 2nd date - NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])), - NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])), - NextValue(LoadUnit.b9Offset.storage, 1), # 2nd value offset preparation - NextValue(self.Loader_Delay, 0), # Reset delay - NextState("Loader_LOAD3") + # FPU#1 + NextValue(self.b16Value1_1.storage, LoadUnit.b32Data1.storage & 0xFFFF), # Pick 1st date + NextValue(self.b16Value1_2.storage, LoadUnit.b32Data1.storage >> 16), # Pick 2nd date + NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[0:16])), + NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data1.storage[16:32])), + NextValue(LoadUnit.b9Offset1.storage, LoadUnit.b9Offset1.storage + 1), # Move on to next entry + # FPU#2 + NextValue(self.b16Value2_1.storage, LoadUnit.b32Data2.storage & 0xFFFF), # Pick 1st date + NextValue(self.b16Value2_2.storage, LoadUnit.b32Data2.storage >> 16), # Pick 2nd date + NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[0:16])), + NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[16:32])), + NextValue(LoadUnit.b9Offset2.storage, LoadUnit.b9Offset2.storage + 1), # Move on to next entry + + #NextValue(self.Loader_Delay, 0), # Reset delay + #NextState("Loader_LOAD3") + NextState("Loader_EXEC1") ).Else( # MEM wait cycles NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment ) ) - Loader_fsm.act("Loader_LOAD3", - NextValue(self.b16Status.storage[3], True), # Current status added + """ + Loader_fsm.act("Loader_LOAD3", + NextValue(self.b16Status.storage[3], True), # Current status added If(self.Loader_Delay > RAMWaitTime, NextValue(self.b16Value3.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 3rd date NextValue(self.b16Value4.storage, LoadUnit.b32Data.storage >> 16), # Pick 4th date @@ -225,9 +235,9 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): NextState("Loader_LOAD4") ).Else( # MEM wait cycles NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment - ) + ) ) - Loader_fsm.act("Loader_LOAD4", + Loader_fsm.act("Loader_LOAD4", NextValue(self.b16Status.storage[4], True), # Current status added If(self.Loader_Delay > RAMWaitTime, NextValue(self.b16Value5.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 5th date @@ -237,25 +247,26 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): NextState("Loader_EXEC1") ).Else( # MEM wait cycles NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment - ) - ) - Loader_fsm.act("Loader_EXEC1", + ) + ) + """ + Loader_fsm.act("Loader_EXEC1", NextValue(self.b16Status.storage[5], True), # Current status added - NextValue(fpu1.fadd, True), # This command requested - NextValue(fpu2.fnmsub, True), # This command requested + NextValue(fpu1.fadd, True), # 1st ADD requested + NextValue(fpu2.fadd, True), NextValue(fpu1.fready, False), # Engage trigger FPU#1 NextValue(fpu2.fready, False), # Engage trigger FPU#2 - NextState("Loader_EXEC2") + NextState("Loader_EXEC2") ) - Loader_fsm.act("Loader_EXEC2", + Loader_fsm.act("Loader_EXEC2", NextValue(self.b16Status.storage[6], True), # Current status added If(fpu1.fready & fpu2.fready, NextValue(fpu1.fadd, False), # Clear command request FPU#1 - NextValue(fpu2.fnmsub, False), # Clear command request FPU#2 + NextValue(fpu2.fadd, False), # Clear command request FPU#2 NextValue(self.b16Result1.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!) NextValue(self.b16Result2.storage, fpu2.fresult[16:32]), # Pick result (little endian, high word!) NextValue(self.b16Status.storage[15], True), # Indicate readyness ... NextValue(self.bReady, True), # Indicate readyness (LED on!) (TODO: Remove!) NextState("Loader_IDLE") - ) + ) ) diff --git a/libmodules/bfloat16processor.py b/libmodules/bfloat16processor.py index 93f51e9..2434d47 100644 --- a/libmodules/bfloat16processor.py +++ b/libmodules/bfloat16processor.py @@ -118,8 +118,7 @@ class bfloat16Processor(Module): ).Elif(self.e2 == -1, # Infinity NextValue(self.fresult, self.fs2), # Return infinity NextValue(self.fready, 1), - NextState("FPU_IDLE") - # TODO: VERIFY -> risq5! + NextState("FPU_IDLE") ).Elif(self.fs1[0:31] == 0, # 0+x: Nothing to add? (w/o sign!) If(self.fsub, # Subtract yields negative result! NextValue(self.fresult, self.fs2 ^ 0x80000000), # Invert sign @@ -131,8 +130,7 @@ class bfloat16Processor(Module): NextValue(self.fresult, self.fs2), # Ready! ), NextValue(self.fready, 1), - NextState("FPU_IDLE") - # FIXME: VERIFY! -->risq5! + NextState("FPU_IDLE") ).Elif(self.fs2[0:31] == 0, # x+0: Nothing to add? (w/o sign!) If(self.fnmadd | self.fnmsub, NextValue(self.fresult, self.fs1 ^ 0x80000000), # Ready! @@ -141,15 +139,6 @@ class bfloat16Processor(Module): ), NextValue(self.fready, 1), NextState("FPU_IDLE") - #).Elif((self.fadd | self.fsub) & (self.fs2[0:31] == 0), # Nothing to add? (w/o sign!) - # NextValue(self.fresult, self.fs1), # Ready! - # NextValue(self.fready, 1), - # NextState("FPU_IDLE") - #).Elif((self.fmadd | self.fmsub | self.fnmadd | self.fnmsub) & ((self.e2 == 0) & (self.m2 == 0)), # Nothing to add (w/o sign!) - # If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT - # NextValue(self.sign3, ~self.sign3) # Invert result finally - # ), - # NextState("FRESULT") # Just supply (normalized finally!) result from multiplication! ).Else( # Ok, valid floats supplied ... NextValue(self.s_bit, 0), NextValue(self.branch1, 0), # Reset helpers @@ -286,8 +275,7 @@ class bfloat16Processor(Module): ).Elif(self.e2 == -1, # Infinity NextValue(self.fresult, self.fs2), # Return infinity NextValue(self.fready, 1), - NextState("FPU_IDLE") - # FIXME: Verify -> risq5! + NextState("FPU_IDLE") ).Elif((self.fs1[0:31] == 0) | (self.fs2[0:31] == 0), # Nothing to multiply? (w/o sign!) If(self.fmul, # Single instruction? Straight return. NextValue(self.fresult, 0), # Result will be zero ... diff --git a/libmodules/dramtransfer.py b/libmodules/dramtransfer.py index cb5c258..591e757 100644 --- a/libmodules/dramtransfer.py +++ b/libmodules/dramtransfer.py @@ -9,7 +9,8 @@ # -------- # 21.12.20/KQ Initial test # 30.12.20/KQ Working (renamed) version -# 22.04.21/KQ In transfer renamed +# 22.04.21/KQ Inbound transfer renamed +# 06.05.21/KQ Support for 2 read ports added (for now ...) # from migen import * @@ -30,10 +31,7 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc): ###### #. Load ``b32Address`` with base address of range to read from (DRAM: >= 0x40000000) - - #. Indicate length of range to read by setting up ``b8Len`` (not used currently, - allways ``maxwords`` * 4 bytes will be loaded (words à 32-bit). - + #. Finally, enable processing by setting ``bEnable`` to true (1). #. Once ``bValid`` becomes true (1), FPGA local memory is loaded, deactivate ``bEnable`` @@ -44,20 +42,22 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc): Inputs: ####### - :b32Address: Base DRAM Address to load from - - :b8Len: Length (0..255) of range to read from (i.e. # of bytes - not used currently - allways reading 32 bytes!) + :b32Address: Base DRAM Address to load from :bEnable: To enable running (after initialization) - :b9Offset: Offset (0..511) into local FPGA memory to read from + :b9Offset1: Offset #1 (0..511) into local FPGA memory to read from + + :b9Offset2: Offset #2 (0..511) into local FPGA memory to read from Output: ####### :bValid: Indicate validity of local FPGA memory, i.e. 'loaded' - :b32Data: Local FPGA memory at b9Offset + :b32Data1: Local FPGA memory at b9Offset1 + + :b32Data2: Local FPGA memory at b9Offset2 """ def __init__(self, maxwords=8, dma_reader=None, sync_fifo=None): @@ -67,11 +67,6 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc): description=""" Base DRAM address, to load from """) - self.b8Len = CSRStorage(8, reset_less=True, - fields=[CSRField("Len", size=8, description="*Field*: 8-Bit value (0..max)")], - description=""" - Length of range to load, currently not used (allways 4 assumed) - """) self.bEnable = CSRStorage(1, reset_less=True, fields=[CSRField("Enable", size=1, description="*Field*: bit", values=[ ("0", "DISABLED", "Loading enabled"), @@ -81,11 +76,15 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc): description=""" Enable/disabling DRAM access """) - self.b9Offset = CSRStorage(9, reset_less=True, - #fields=[CSRField("Offset", size=12, description="*Field*: 9-Bit value (0..511)")], - fields=[CSRField("Offset", size=9, description="*Field*: 9-Bit value (0..511)")], + self.b9Offset1 = CSRStorage(9, reset_less=True, + fields=[CSRField("Offset1", size=9, description="*Field*: 9-Bit value (0..511)")], description=""" - Offset added to base address. + Offset added to base address, port #1 + """) + self.b9Offset2 = CSRStorage(9, reset_less=True, + fields=[CSRField("Offset2", size=9, description="*Field*: 9-Bit value (0..511)")], + description=""" + Offset added to base address, port #2 """) # Outputs @@ -98,10 +97,15 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc): description=""" Data valid indication """) - self.b32Data = CSRStorage(32, reset_less=True, - fields=[CSRField("Data", size=32, description="*Field*: 32-Bit value")], + self.b32Data1 = CSRStorage(32, reset_less=True, + fields=[CSRField("Data1", size=32, description="*Field*: 32-Bit value")], description=""" - Actual value read + Actual value read #1 + """) + self.b32Data2 = CSRStorage(32, reset_less=True, + fields=[CSRField("Data2", size=32, description="*Field*: 32-Bit value")], + description=""" + Actual value read #2 """) self.b32RCount = CSRStorage(32, reset_less=True, fields=[CSRField("RCount", size=32, description="*Field*: 32-Bit value")], @@ -111,8 +115,10 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc): # Local 'wire' data self.b32MemPt = Signal(32) # WRITE: Local FPGA memory offset pointer - self.b2Address = Signal(3) # READ: Adress conversion helper - self.bData = Signal(32) # READ: Helper output data + self.b2Address1 = Signal(3) # READ: Adress conversion helper #1 + self.b2Address2 = Signal(3) # READ: Adress conversion helper #2 + self.bData1 = Signal(32) # READ: Helper output data #1 + self.bData2 = Signal(32) # READ: Helper output data #2 storage = Memory(32, maxwords) # Local FPGA memory self.specials += storage @@ -179,25 +185,42 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc): # --------------------------- Local (FPGA) memory retrieval access ----------------------------------------------- # FPGA local memory read port - rdport = storage.get_port() - self.specials += rdport + rdport1 = storage.get_port() + self.specials += rdport1 + rdport2 = storage.get_port() + self.specials += rdport2 self.comb += [ # Read from (FPGA local) memory - self.b2Address.eq(self.b9Offset.storage[0:2]), # Filter bits 0..1 (range 0-3) - If(self.b9Offset.storage < maxwords, - #rdport.adr.eq(self.b9Offset.storage), # w/ translation! - If(self.b2Address == 0, - rdport.adr.eq(self.b9Offset.storage | 3) # 0->3 - ).Elif(self.b2Address == 1, - rdport.adr.eq((self.b9Offset.storage & 0x1FC) | 2) # 1->2 - ).Elif(self.b2Address == 2, - rdport.adr.eq((self.b9Offset.storage & 0x1FC) | 1) # 2->1 - ).Elif(self.b2Address == 3, - rdport.adr.eq(self.b9Offset.storage & 0x1FC) # 3->0 + self.b2Address1.eq(self.b9Offset1.storage[0:2]), # Filter bits 0..1 (range 0-3) + If(self.b9Offset1.storage < maxwords, + #rdport.adr.eq(self.b9Offset1.storage), # w/ translation! + If(self.b2Address1 == 0, + rdport1.adr.eq(self.b9Offset1.storage | 3) # 0->3 + ).Elif(self.b2Address1 == 1, + rdport1.adr.eq((self.b9Offset1.storage & 0x1FC) | 2) # 1->2 + ).Elif(self.b2Address1 == 2, + rdport1.adr.eq((self.b9Offset1.storage & 0x1FC) | 1) # 2->1 + ).Elif(self.b2Address1 == 3, + rdport1.adr.eq(self.b9Offset1.storage & 0x1FC) # 3->0 ), - self.bData.eq(rdport.dat_r) # Assign to external var. ... + self.bData1.eq(rdport1.dat_r) # Assign to external var. ... + ), + self.b2Address2.eq(self.b9Offset2.storage[0:2]), # Filter bits 0..1 (range 0-3) + If(self.b9Offset2.storage < maxwords, + #rdport.adr.eq(self.b9Offset2.storage), # w/ translation! + If(self.b2Address2 == 0, + rdport2.adr.eq(self.b9Offset2.storage | 3) # 0->3 + ).Elif(self.b2Address2 == 1, + rdport2.adr.eq((self.b9Offset2.storage & 0x1FC) | 2) # 1->2 + ).Elif(self.b2Address2 == 2, + rdport2.adr.eq((self.b9Offset2.storage & 0x1FC) | 1) # 2->1 + ).Elif(self.b2Address2 == 3, + rdport2.adr.eq(self.b9Offset2.storage & 0x1FC) # 3->0 + ), + self.bData2.eq(rdport2.dat_r) # Assign to external var. ... ), ] - self.sync += self.b32Data.storage.eq(self.bData) # Assign to external var. ... + self.sync += self.b32Data1.storage.eq(self.bData1) # Assign to external var. ... + self.sync += self.b32Data2.storage.eq(self.bData2) # Assign to external var. ... class FPGA2DRAM(Module, AutoCSR, AutoDoc, ModuleDoc): """ diff --git a/software/source/bfloat16nnlib.c b/software/source/bfloat16nnlib.c index 12cb6fc..de41cda 100644 --- a/software/source/bfloat16nnlib.c +++ b/software/source/bfloat16nnlib.c @@ -67,48 +67,34 @@ static int fpgaload(uint32_t *mempt, int16_t len) return 0; // Timeout } -static float fp1_read(void) +static float fp1_1_read(void) { uint32_t v __attribute__((aligned(16))) = 0; - *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_read(); // Low-endian, high half word required + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_1_read(); // Low-endian, high half word required float *fpt = (float *)&v; return *fpt; } -static float fp2_read(void) +static float fp1_2_read(void) { uint32_t v __attribute__((aligned(16))) = 0; - *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_read(); + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_2_read(); float *fpt = (float *)&v; return *fpt; } -static float fp3_read(void) +static float fp2_1_read(void) { uint32_t v __attribute__((aligned(16))) = 0; - *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value3_read(); + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_1_read(); float *fpt = (float *)&v; return *fpt; } -static float fp4_read(void) +static float fp2_2_read(void) { uint32_t v __attribute__((aligned(16))) = 0; - *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value4_read(); + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_2_read(); float *fpt = (float *)&v; return *fpt; } -static float fp5_read(void) -{ - uint32_t v __attribute__((aligned(16))) = 0; - *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value5_read(); - float *fpt = (float *)&v; - return *fpt; -} -static float fp6_read(void) -{ - uint32_t v __attribute__((aligned(16))) = 0; - *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value6_read(); - float *fpt = (float *)&v; - return *fpt; -} static float fpResult1_read(void) { uint32_t v __attribute__((aligned(16))) = 0; @@ -141,48 +127,48 @@ int key_eval(void) static uint32_t *sentinel = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE - 1) * sizeof(int32_t)); uint32_t *ui32ptr; - uint16_t *ui16ptr; + uint16_t *ui16ptr1, *ui16ptr2; int i; - float fp1, fp2, fp3, fpResult1; - float fp4, fp5, fp6, fpResult2; + float fp1_1, fp1_2, fpResult1; + float fp2_1, fp2_2, fpResult2; switch(kbhit()) { case 'r': // Reload printf("\e[35;1m*** Reload ***\e[0m\n"); for(i=0, ui32ptr = (uint32_t *)DRAMDATABASE;i