Matrice finished, but timeout?!

master
kaqu 1 year ago
parent 45fa4f4f12
commit e99f510d9e
  1. 94
      libmodules/bfloat16nncore.py
  2. 43
      software/source/bfloat16nnlib.c

@ -169,16 +169,15 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
self.submodules += Loader_fsm
self.Loader_Delay = Signal(32, reset_less=True)
self.Loader_Active = Signal()
self.Loader_Active = Signal()
Loader_fsm.act("Loader_IDLE",
If(self.LU_CacheValid & ~self.Loader_Active, # Enter if not active already
NextValue(self.Loader_Active, True), # Loader up & running
NextValue(self.Loader_Delay, 0), # Reset read delay timer
NextValue(LoadUnit.b9Offset1.storage, LUCacheSize - 1), # Adjust offset to read sentinel
NextValue(LoadUnit.b9Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array
NextValue(LoadUnit.b9Offset2.storage, LUCacheSize >> 1), # Adjust offset to start of 2nd array
NextValue(self.b16Result1.storage, 0), # Indicate # delays
NextValue(self.b16Result2.storage, 0), # Indicate # delays
NextValue(self.b16Status.storage[0], True), # Current status
NextValue(self.b16Result2.storage, 0), # Indicate # delays
NextValue(self.b16Value1_1.storage, 0), # Nothing loaded so far ...
NextValue(self.b16Value1_2.storage, 0),
NextValue(self.b16Value2_1.storage, 0),
@ -191,7 +190,7 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
)
)
Loader_fsm.act("Loader_LOAD1",
NextValue(self.b16Status.storage[1], True), # Current status added
NextValue(self.b16Status.storage[0], True), # Current status added
If(LoadUnit.b32Data1.storage == self.b32Sentinel.storage, # Valid last entry?
NextValue(LoadUnit.b9Offset1.storage, 0), # 1st value offset preparation
NextState("Loader_LOAD2")
@ -199,9 +198,11 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
NextState("Loader_IDLE") # Abort!
)
)
#-----> LOOP ENTRY ! (2nd loop onward: fs3 already prepared!)
Loader_fsm.act("Loader_LOAD2",
NextValue(self.b16Status.storage[2], True), # Current status added
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b16Status.storage[1], True), # Current status added
If(self.Loader_Delay > RAMWaitTime, # Required only for 1st entry ...
# FPU#1
NextValue(self.b16Value1_1.storage, LoadUnit.b32Data1.storage & 0xFFFF), # Pick 1st datum (low 16 bits)
NextValue(self.b16Value1_2.storage, LoadUnit.b32Data1.storage >> 16), # Pick 2nd datum (high 16 bits)
@ -215,56 +216,59 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data2.storage[16:32])),
NextValue(LoadUnit.b9Offset2.storage, LoadUnit.b9Offset2.storage + 1), # Move on to next entry
#NextValue(self.Loader_Delay, 0), # Reset delay
#NextState("Loader_LOAD3")
NextState("Loader_EXEC1")
).Else( # MEM wait cycles
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
)
)
"""
Loader_fsm.act("Loader_LOAD3",
NextValue(self.b16Status.storage[3], True), # Current status added
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b16Value3.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 3rd date
NextValue(self.b16Value4.storage, LoadUnit.b32Data.storage >> 16), # Pick 4th date
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])),
NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])),
NextValue(LoadUnit.b9Offset.storage, 2), # 3rd value offset preparation
NextValue(self.Loader_Delay, 0), # Reset delay
NextState("Loader_LOAD4")
).Else( # MEM wait cycles
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
)
)
Loader_fsm.act("Loader_LOAD4",
NextValue(self.b16Status.storage[4], True), # Current status added
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b16Value5.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 5th date
NextValue(self.b16Value6.storage, LoadUnit.b32Data.storage >> 16), # Pick 6th date
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])),
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])),
NextState("Loader_EXEC1")
).Else( # MEM wait cycles
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
)
)
"""
)
Loader_fsm.act("Loader_EXEC1",
NextValue(self.b16Status.storage[5], True), # Current status added
NextValue(fpu1.fadd, True), # 1st ADD requested
NextValue(fpu2.fadd, True),
NextValue(self.b16Status.storage[2], True), # Current status added
If(LoadUnit.b9Offset1.storage < self.b9ArrayWordLen.storage, #(LUCacheSize >> 1),
If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1!
NextValue(fpu1.fmul, True), # 1st MUL requested
NextValue(fpu2.fmul, True),
).Else(
NextValue(fpu1.fmadd, True), # 2nd ... last MUL/ADD requested
NextValue(fpu2.fmadd, True),
)
),
NextValue(fpu1.fready, False), # Engage trigger FPU#1
NextValue(fpu2.fready, False), # Engage trigger FPU#2
NextState("Loader_EXEC2")
)
Loader_fsm.act("Loader_EXEC2",
NextValue(self.b16Status.storage[6], True), # Current status added
NextValue(self.b16Status.storage[3], True), # Current status added
If(fpu1.fready & fpu2.fready,
NextValue(fpu1.fadd, False), # Clear command request FPU#1
NextValue(fpu2.fadd, False), # Clear command request FPU#2
If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1!
NextValue(fpu1.fmul, False), # Clear command request FPU#1
NextValue(fpu2.fmul, False), # Clear command request FPU#2
).Else(
NextValue(fpu1.fmadd, False), # Clear command request FPU#1
NextValue(fpu2.fmadd, False), # Clear command request FPU#2
),
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu1.fresult[16:32])), # Sum will be used for fmadd.s
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, fpu2.fresult[16:32])), # Sum will be used for fmadd.s
If(LoadUnit.b9Offset1.storage < self.b9ArrayWordLen.storage, #(LUCacheSize >> 1), # Words 0 .. 255
NextState("Loader_LOAD2")
).Else( # Finally prepare ADD both result sums (on FPU#1 only!)
NextValue(fpu1.fs1, fpu1.fresult),
NextValue(fpu1.fs2, fpu2.fresult),
NextState("Loader_EXEC3")
)
)
)
Loader_fsm.act("Loader_EXEC3",
NextValue(self.b16Status.storage[4], True), # Current status added
NextValue(fpu1.fadd, True), # Final ADD requested
NextValue(fpu1.fready, False), # Engage trigger FPU#1 (only!)
NextState("Loader_EXEC4")
)
Loader_fsm.act("Loader_EXEC4",
NextValue(self.b16Status.storage[5], True), # Current status added
If(fpu1.fready,
NextValue(fpu1.fadd, False), # Clear command request FPU#1
NextValue(self.b16Result1.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!)
NextValue(self.b16Result2.storage, fpu2.fresult[16:32]), # Pick result (little endian, high word!)
NextValue(self.b16Result2.storage, fpu2.fresult[16:32]), # Useless (control only ...)
NextValue(self.b16Status.storage[15], True), # Indicate readiness ...
NextValue(self.bReady, True), # Indicate readiness (LED on!) (TODO: Remove!)
NextState("Loader_IDLE")

@ -47,15 +47,18 @@ extern int key_eval(void);
#define DRAMDATABASE 0x40190000
#define DRAMDATASIZE 512
static int fpgaload(uint32_t *mempt, int16_t len)
static int fpgaload(uint32_t *mempt, int16_t len, int16_t calclen)
{
uint32_t *sentinel = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE - 1) * sizeof(int32_t));
if((len < 4) | (len > 512)) return -1; // Verify length of transfer was understood!
flush_l2_cache(); // Strictly nec. for longer transfers
bfloat16nn_b32DRAMLoadAddress_write((uint32_t)mempt); // Indicate memory to load from
bfloat16nn_b32Sentinel_write(*sentinel);
bfloat16nn_b9ArrayWordLen_write(calclen); // Indicate array length for calc.
bfloat16nn_bEnable_write(1); // Finally: Engage!
for(int i=0;i<10;i++) { // Max. 100ms delay
for(int i=0;i<100;i++) { // Max. 100ms delay
if(bfloat16nn_b16Status_read() & 0x8000) {
bfloat16nn_bEnable_write(0); // Disable transfer
return 1; // Ok, ready!
@ -132,6 +135,8 @@ int key_eval(void)
float fp1_1, fp1_2, fpResult1;
float fp2_1, fp2_2, fpResult2;
#define MAXCALCLEN 4
switch(kbhit()) {
case 'r': // Reload
printf("\e[35;1m*** Reload ***\e[0m\n");
@ -140,14 +145,24 @@ int key_eval(void)
// FPU#1
ui16ptr1 = (uint16_t *)(DRAMDATABASE + 0 * sizeof(uint32_t)); // Absolute: bytes!
for(i=1;i<=DRAMDATASIZE/2;i++)
*ui16ptr1++ = f2ui16(1.0 * (float)i );
for(i=1;i<=MAXCALCLEN;i++)
*ui16ptr1++ = f2ui16(1.0 * (float)i );
// FPU#2
ui16ptr2 = (uint16_t *)(DRAMDATABASE + (DRAMDATASIZE/2) * sizeof(uint32_t)); // Absolute: bytes!
for(i=1;i<=DRAMDATASIZE/2;i++)
for(i=1;i<=MAXCALCLEN;i++)
*ui16ptr2++ = f2ui16(1.0 * (float)i );
if(fpgaload((uint32_t *)DRAMDATABASE, 512)) { // 512 * 32-bit= 2048 bytes = 2kB
// TODO: Control procedure w/ regular code (matrix inner product)
float sum = 0.0;
for(i=1;i<=MAXCALCLEN;i+=2) {
sum += (1.0 * (float)i) * (1.0 * (float)(i+1));
}
sum *= 2.0; // Sim. 2 FPUs!
printf("Elements/FPU: %d", MAXCALCLEN);
printf1(" S/W SUM (2xFPU): %8.3f\n", sum);
if(fpgaload((uint32_t *)DRAMDATABASE, 512, MAXCALCLEN)) { // 512*32-bit=2048 bytes = 2kB, 256 Words/FPU to calc.
fp1_1 = fp1_1_read();
fp1_2 = fp1_2_read();
fp2_1 = fp2_1_read();
@ -157,7 +172,7 @@ int key_eval(void)
printf("S=%04Xh:\n", (uint32_t)bfloat16nn_b16Status_read());
printf1("V1_1=%6.3f ", fp1_1);
printf1("V1_2=%6.3f ", fp1_2);
printf1("RESULT1=%8.4f\n", fpResult1);
printf1("TOTAL SUM=%8.4f\n", fpResult1);
printf1("V2_1=%6.3f ", fp2_1);
printf1("V2_2=%6.3f ", fp2_2);
printf1("RESULT2=%8.4f\n", fpResult2);
@ -171,11 +186,13 @@ int key_eval(void)
printf("%d: %d\n", DRAMDATASIZE - 1, dram2fpga_b32Data2_read());
*/
}
else
printf("CURRENT: Timeout!");
else {
printf("CURRENT: Timeout! %04Xh ", (uint32_t)bfloat16nn_b16Status_read());
printf("Offset 1: %d ", (uint32_t)dram2fpga_b9Offset1_read());
printf("Offset 2: %d\n", (uint32_t)dram2fpga_b9Offset2_read());
}
*sentinel = 0; // Invalidate data!
if(fpgaload((uint32_t *)DRAMDATABASE, 512)) {
if(fpgaload((uint32_t *)DRAMDATABASE, 512, MAXCALCLEN)) {
fp1_1 = fp1_1_read();
fp1_2 = fp1_2_read();
fp2_1 = fp2_1_read();
@ -191,8 +208,8 @@ int key_eval(void)
printf1("V2_2=%6.3f ", fp2_2);
printf1("RESULT2=%8.4f\n", fpResult2);
}
else
printf("INVALIDATED: Timeout!");
else
printf("INVALIDATED: Timeout! %04Xh:\n", (uint32_t)bfloat16nn_b16Status_read());
break;
case 's':
fp1_1 = fp1_1_read();

Loading…
Cancel
Save