zero bug hunting ...

master
kaqu 1 year ago
parent 122bf1c3c6
commit 7091db2a27
  1. 60
      libmodules/bfloat16nncore.py
  2. 41
      libmodules/bfloat16processor.py
  3. 115
      software/source/bfloat16nnlib.c

@ -98,11 +98,31 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
description="""
Float register 3
""")
self.b16Result = CSRStorage(16, reset_less=False,
fields=[CSRField("Result", size=16, description="*Field*: 16-Bit value")],
self.b16Value4 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
Float register 4
""")
self.b16Value5 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
Processing result
Float register 5
""")
self.b16Value6 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
Float register 6
""")
self.b16Result1 = CSRStorage(16, reset_less=False,
fields=[CSRField("Result1", size=16, description="*Field*: 16-Bit value")],
description="""
Processing result 1
""")
self.b16Result2 = CSRStorage(16, reset_less=False,
fields=[CSRField("Result2", size=16, description="*Field*: 16-Bit value")],
description="""
Processing result 2
""")
self.bReady = Signal() # To be wired to data pin ... ;)
# Local vars.
@ -144,6 +164,7 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
#---------------- bfloat16 FPUs -------------------------------------------------------------
self.submodules.fpu1 = fpu1 = bfloat16Processor() # Integrate bfloat16 FPU
self.submodules.fpu2 = fpu2 = bfloat16Processor() # Integrate another one!
#---------------- Loaded data testing --------------------------------------------------
Loader_fsm = FSM(reset_state="Loader_IDLE") # FSM starts idling ...
@ -156,7 +177,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
NextValue(self.Loader_Active, True), # Loader up & running
NextValue(self.Loader_Delay, 0), # Reset read delay timer
NextValue(LoadUnit.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel
NextValue(self.b16Result.storage, 0), # Indicate # delays
NextValue(self.b16Result1.storage, 0), # Indicate # delays
NextValue(self.b16Result2.storage, 0), # Indicate # delays
NextValue(self.b16Status.storage[0], True), # Current status
NextValue(self.b16Value1.storage, 0), # Nothing loaded so far ...
NextValue(self.b16Value2.storage, 0), # Nothing loaded so far ...
@ -195,7 +217,23 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
NextValue(self.b16Status.storage[3], True), # Current status added
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b16Value3.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 3rd date
NextValue(self.b16Value4.storage, LoadUnit.b32Data.storage >> 16), # Pick 4th date
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])),
NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])),
NextValue(LoadUnit.b9Offset.storage, 2), # 3rd value offset preparation
NextValue(self.Loader_Delay, 0), # Reset delay
NextState("Loader_LOAD4")
).Else( # MEM wait cycles
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
)
)
Loader_fsm.act("Loader_LOAD4",
NextValue(self.b16Status.storage[4], True), # Current status added
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b16Value5.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 5th date
NextValue(self.b16Value6.storage, LoadUnit.b32Data.storage >> 16), # Pick 6th date
NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])),
NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])),
NextState("Loader_EXEC1")
).Else( # MEM wait cycles
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
@ -203,15 +241,19 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
)
Loader_fsm.act("Loader_EXEC1",
NextValue(self.b16Status.storage[5], True), # Current status added
NextValue(fpu1.fnmsub, True), # This command requested
NextValue(fpu1.fready, False), # Engage trigger
NextValue(fpu1.fadd, True), # This command requested
NextValue(fpu2.fnmsub, True), # This command requested
NextValue(fpu1.fready, False), # Engage trigger FPU#1
NextValue(fpu2.fready, False), # Engage trigger FPU#2
NextState("Loader_EXEC2")
)
Loader_fsm.act("Loader_EXEC2",
NextValue(self.b16Status.storage[6], True), # Current status added
If(fpu1.fready,
NextValue(fpu1.fnmsub, False), # Clear command request
NextValue(self.b16Result.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!)
If(fpu1.fready & fpu2.fready,
NextValue(fpu1.fadd, False), # Clear command request FPU#1
NextValue(fpu2.fnmsub, False), # Clear command request FPU#2
NextValue(self.b16Result1.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!)
NextValue(self.b16Result2.storage, fpu2.fresult[16:32]), # Pick result (little endian, high word!)
NextValue(self.b16Status.storage[15], True), # Indicate readyness ...
NextValue(self.bReady, True), # Indicate readyness (LED on!) (TODO: Remove!)
NextState("Loader_IDLE")

@ -119,10 +119,10 @@ class bfloat16Processor(Module):
NextValue(self.fresult, self.fs2), # Return infinity
NextValue(self.fready, 1),
NextState("FPU_IDLE")
).Elif(self.fs1[0:31] == 0, # Nothing to add? (w/o sign!)
).Elif(self.fs1[0:31] == 0, # 0+x: Nothing to add? (w/o sign!)
If(self.fsub, # Subtract yields negative result!
NextValue(self.fresult, self.fs2 ^ 0x80000000), # Invert sign
).Elif(self.fmsub | self.fnmadd, # 0*x=>0! 0-fs3 or -(0+fs3) = +fs3! FIXME->risq5 -*- => +!
).Elif(self.fmsub | self.fnmadd, # 0*x=>0! 0-fs3 or -(0+fs3) = +fs3!
NextValue(self.fresult, self.fs3 ^ 0x80000000), # Invert sign
).Elif(self.fmadd, # 0*x=>0! 0+fs3 = fs3!
NextValue(self.fresult, self.fs3), # Ready!
@ -131,15 +131,24 @@ class bfloat16Processor(Module):
),
NextValue(self.fready, 1),
NextState("FPU_IDLE")
).Elif((self.fadd | self.fsub) & (self.fs2[0:31] == 0), # Nothing to add? (w/o sign!)
NextValue(self.fresult, self.fs1), # Ready!
NextValue(self.fready, 1),
NextState("FPU_IDLE")
).Elif((self.fmadd | self.fmsub | self.fnmadd | self.fnmsub) & ((self.e2 == 0) & (self.m2 == 0)), # Nothing to add (w/o sign!)
If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT FIXME->risq5
NextValue(self.sign3, ~self.sign3) # Invert result finally
# FIXME: VERIFY! -->risq5!
).Elif(self.fs2[0:31] == 0, # x+0: Nothing to add? (w/o sign!)
If(self.fnmadd | self.fnmsub,
NextValue(self.fresult, self.fs1 ^ 0x80000000), # Ready!
).Else(
NextValue(self.fresult, self.fs1), # Ready!
),
NextState("FRESULT") # Just supply (normalized finally!) result from multiplication!
NextValue(self.fready, 1),
NextState("FPU_IDLE")
#).Elif((self.fadd | self.fsub) & (self.fs2[0:31] == 0), # Nothing to add? (w/o sign!)
# NextValue(self.fresult, self.fs1), # Ready!
# NextValue(self.fready, 1),
# NextState("FPU_IDLE")
#).Elif((self.fmadd | self.fmsub | self.fnmadd | self.fnmsub) & ((self.e2 == 0) & (self.m2 == 0)), # Nothing to add (w/o sign!)
# If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT
# NextValue(self.sign3, ~self.sign3) # Invert result finally
# ),
# NextState("FRESULT") # Just supply (normalized finally!) result from multiplication!
).Else( # Ok, valid floats supplied ...
NextValue(self.s_bit, 0),
NextValue(self.branch1, 0), # Reset helpers
@ -234,7 +243,7 @@ class bfloat16Processor(Module):
NextValue(self.m3, self.m3 + self.s_bit),
NextState("FADD8") # Adjust possible overflow ...
).Else( # Nope, all ready
If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT FIXME->risq5
If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT
NextValue(self.sign3, ~self.sign3) # Invert result finally
),
NextState("FRESULT")
@ -248,7 +257,7 @@ class bfloat16Processor(Module):
NextValue(self.m3, self.m3 >> 1), # Adjust mantissa & increment exponent
NextValue(self.e3, self.e3 + 1)
),
If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT FIXME->risq5
If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT
NextValue(self.sign3, ~self.sign3) # Invert result finally
),
NextState("FRESULT")
@ -283,8 +292,8 @@ class bfloat16Processor(Module):
NextState("FPU_IDLE")
).Else( # Ok, valid floats supplied ...
NextValue(self.sign3, self.sign1 ^ self.sign2), # 1. Calculate result sign
NextValue(self.e3, self.e1 + self.e2), # 2. Calculate resulting exponent (add!)
NextValue(self.lm3, self.m1 * self.m2), # 3. Significants multiplication (result size: 2x (sizeof(mantissa)+1) !)
NextValue(self.e3, self.e1 + self.e2), # 2. Calculate resulting exponent (add!)
NextValue(self.lm3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.m1) * Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.m2)), # 3. Significants multiplication (result size: 2x (sizeof(mantissa)+1) !)
NextState("FMUL2")
)
)
@ -327,8 +336,8 @@ class bfloat16Processor(Module):
NextState("FMUL5")
)
FPU_fsm.act("FMUL5",
# 6. Construction of result
NextValue(self.m3, (self.lm3 >> 23) & 0x7FFFFF),
# 6. Construction of result
NextValue(self.m3, ((self.lm3 >> 23) & 0x7FFFFF)[16:23]),
# TODO: e3=se3 omitted ok?
If(self.fmul, # Simple multiplication
NextState("FRESULT")

@ -72,35 +72,80 @@ static float fp1_read(void)
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_read(); // Low-endian, high half word required
float *fpt = (float *)&v;
return *fpt;
if(*fpt != 0.0)
return *fpt;
return(-12.34);
}
// Read the b16Value2 CSR and return it as a float.
//
// The 16-bit register value becomes bits 31..16 of a 32-bit word whose lower
// half is zero; that word is then reinterpreted as a single-precision float.
//
// Returns the converted value, or the marker -12.34 when the value compares
// equal to 0.0 (which also covers -0.0).
// NOTE(review): the marker presumably exists because the caller's printf1
// has trouble printing 0.0 -- confirm before removing it.
static float fp2_read(void)
{
    // Union punning instead of pointer casts: no strict-aliasing violation,
    // and the explicit shift selects the high half word on any byte order.
    union { uint32_t bits; float value; } pun;

    pun.bits = (uint32_t)(uint16_t)bfloat16nn_b16Value2_read() << 16;
    if (pun.value != 0.0)
        return pun.value;
    return(-12.34);
}
// Read the b16Value3 CSR and return it as a float.
//
// The 16-bit register value becomes bits 31..16 of a 32-bit word whose lower
// half is zero; that word is then reinterpreted as a single-precision float.
//
// Returns the converted value, or the marker -12.34 when the value compares
// equal to 0.0 (which also covers -0.0).
// NOTE(review): the marker presumably exists because the caller's printf1
// has trouble printing 0.0 -- confirm before removing it.
static float fp3_read(void)
{
    // Union punning instead of pointer casts: no strict-aliasing violation,
    // and the explicit shift selects the high half word on any byte order.
    union { uint32_t bits; float value; } pun;

    pun.bits = (uint32_t)(uint16_t)bfloat16nn_b16Value3_read() << 16;
    if (pun.value != 0.0)
        return pun.value;
    return(-12.34);
}
static float fpResult_read(void)
// Read the b16Value4 CSR and return it as a float.
//
// The 16-bit register value becomes bits 31..16 of a 32-bit word whose lower
// half is zero; that word is then reinterpreted as a single-precision float.
//
// Returns the converted value, or the marker -12.34 when the value compares
// equal to 0.0 (which also covers -0.0).
// NOTE(review): the marker presumably exists because the caller's printf1
// has trouble printing 0.0 -- confirm before removing it.
static float fp4_read(void)
{
    // Union punning instead of pointer casts: no strict-aliasing violation,
    // and the explicit shift selects the high half word on any byte order.
    union { uint32_t bits; float value; } pun;

    pun.bits = (uint32_t)(uint16_t)bfloat16nn_b16Value4_read() << 16;
    if (pun.value != 0.0)
        return pun.value;
    return(-12.34);
}
// Read the b16Value5 CSR and return it as a float.
//
// The 16-bit register value becomes bits 31..16 of a 32-bit word whose lower
// half is zero; that word is then reinterpreted as a single-precision float.
//
// Returns the converted value, or the marker -12.34 when the value compares
// equal to 0.0 (which also covers -0.0).
// NOTE(review): the marker presumably exists because the caller's printf1
// has trouble printing 0.0 -- confirm before removing it.
static float fp5_read(void)
{
    // Union punning instead of pointer casts: no strict-aliasing violation,
    // and the explicit shift selects the high half word on any byte order.
    union { uint32_t bits; float value; } pun;

    pun.bits = (uint32_t)(uint16_t)bfloat16nn_b16Value5_read() << 16;
    if (pun.value != 0.0)
        return pun.value;
    return(-12.34);
}
// Read the b16Value6 CSR and return it as a float.
//
// The 16-bit register value becomes bits 31..16 of a 32-bit word whose lower
// half is zero; that word is then reinterpreted as a single-precision float.
//
// Returns the converted value, or the marker -12.34 when the value compares
// equal to 0.0 (which also covers -0.0).
// NOTE(review): the marker presumably exists because the caller's printf1
// has trouble printing 0.0 -- confirm before removing it.
static float fp6_read(void)
{
    // Union punning instead of pointer casts: no strict-aliasing violation,
    // and the explicit shift selects the high half word on any byte order.
    union { uint32_t bits; float value; } pun;

    pun.bits = (uint32_t)(uint16_t)bfloat16nn_b16Value6_read() << 16;
    if (pun.value != 0.0)
        return pun.value;
    return(-12.34);
}
// Read the b16Result1 CSR and return it as a float.
//
// The 16-bit register value becomes bits 31..16 of a 32-bit word whose lower
// half is zero; that word is then reinterpreted as a single-precision float.
// Only b16Result1 is read: one register access per call.
//
// Returns the converted value, or the marker -12.34 when the value compares
// equal to 0.0 (which also covers -0.0).
// NOTE(review): the marker presumably exists because the caller's printf1
// has trouble printing 0.0 -- confirm before removing it.
static float fpResult1_read(void)
{
    // Union punning instead of pointer casts: no strict-aliasing violation,
    // and the explicit shift selects the high half word on any byte order.
    union { uint32_t bits; float value; } pun;

    pun.bits = (uint32_t)(uint16_t)bfloat16nn_b16Result1_read() << 16;
    if (pun.value != 0.0)
        return pun.value;
    return(-12.34);
}
// Read the b16Result2 CSR and return it as a float.
//
// The 16-bit register value becomes bits 31..16 of a 32-bit word whose lower
// half is zero; that word is then reinterpreted as a single-precision float.
//
// Returns the converted value, or the marker -12.34 when the value compares
// equal to 0.0 (which also covers -0.0).
// NOTE(review): the marker presumably exists because the caller's printf1
// has trouble printing 0.0 -- confirm before removing it.
static float fpResult2_read(void)
{
    // Union punning instead of pointer casts: no strict-aliasing violation,
    // and the explicit shift selects the high half word on any byte order.
    union { uint32_t bits; float value; } pun;

    pun.bits = (uint32_t)(uint16_t)bfloat16nn_b16Result2_read() << 16;
    if (pun.value != 0.0)
        return pun.value;
    return(-12.34);
}
// Return the upper 16 bits (bits 31..16) of a float's 32-bit representation.
//
// f: value to convert.
// Returns the high half word of f's bit pattern; the low half is discarded
// (truncation, no rounding).
static uint16_t f2ui16(float f)
{
    // Union punning plus a shift: avoids the strict-aliasing violation of a
    // pointer cast and picks the high half word regardless of byte order.
    union { float value; uint32_t bits; } pun;

    pun.value = f;
    return (uint16_t)(pun.bits >> 16);
}
static void dumpfloat(float f)
extern void dumpfloat(float f);
// Print a float's raw 32-bit pattern and its 16-bit form from f2ui16(),
// both in hexadecimal, followed by a newline.
//
// f: value to dump.
void dumpfloat(float f)
{
    // Union punning instead of *(uint32_t *)&f: no strict-aliasing violation.
    union { float value; uint32_t bits; } pun;

    pun.value = f;
    // %X expects unsigned int; cast explicitly because uint32_t may be a
    // different type (e.g. unsigned long) on the target.
    printf("%08Xh -> %04Xh\n", (unsigned int)pun.bits, (unsigned int)f2ui16(f));
}
@ -113,27 +158,39 @@ int key_eval(void)
uint32_t *ui32ptr;
uint16_t *ui16ptr;
int i;
float fp1, fp2, fp3, fpResult;
float fp1, fp2, fp3, fpResult1;
float fp4, fp5, fp6, fpResult2;
switch(kbhit()) {
case 'r': // Reload
printf("\e[35;1m*** Reload ***\e[0m\n");
for(i=0, ui32ptr = (uint32_t *)DRAMDATABASE;i<DRAMDATASIZE;i++) // Setup test data
*ui32ptr++ = i+1;
ui16ptr = (uint16_t *)(DRAMDATABASE + 0 * sizeof(float));
*ui16ptr++ = f2ui16(4.0);
*ui16ptr++ = f2ui16(2.0);
*ui16ptr++ = f2ui16(3.0);
ui16ptr = (uint16_t *)(DRAMDATABASE + 0 * sizeof(uint16_t)); // Absolute: bytes!
*ui16ptr++ = f2ui16(0.0);
*ui16ptr++ = f2ui16(1.0);
*ui16ptr++ = f2ui16(2.0);
*ui16ptr++ = f2ui16(0.0); // fnmsub: -(0*3 - 4) = +4!
*ui16ptr++ = f2ui16(3.0);
*ui16ptr++ = f2ui16(4.0); // FIXME: Fused 0+X returns 0!
if(fpgaload((uint32_t *)DRAMDATABASE, 512)) {
fp1 = fp1_read();
fp2 = fp2_read();
fp3 = fp3_read();
fpResult = fpResult_read();
printf("S=%04Xh ", (uint32_t)bfloat16nn_b16Status_read());
printf1("V1=%4.2f ", fp1);
printf1("V2=%4.2f ", fp2);
printf1("V3=%4.2f ", fp3);
printf1("RESULT=%4.2f\n", fpResult);
fp4 = fp4_read();
fp5 = fp5_read();
fp6 = fp6_read();
fpResult1 = fpResult1_read();
fpResult2 = fpResult2_read();
printf("S=%04Xh:\n", (uint32_t)bfloat16nn_b16Status_read());
printf1("V1=%5.3f ", fp1); // FIXME: printf1 fails for 0.0 output !
printf1("V2=%5.3f ", fp2);
printf1("V3=%5.3f ", fp3);
printf1("RESULT=%5.3f\n", fpResult1);
printf1("V1=%5.3f ", fp4);
printf1("V2=%5.3f ", fp5);
printf1("V3=%5.3f ", fp6);
printf1("RESULT=%5.3f\n", fpResult2);
/*
for(i=0;i<DRAMDATASIZE;i+=32) {
dram2fpga_b9Offset_write(i);
@ -148,21 +205,29 @@ int key_eval(void)
*sentinel = 0; // Invalidate data!
if(fpgaload((uint32_t *)DRAMDATABASE, 512)) {
printf("INVALIDATED: S=%04Xh ", bfloat16nn_b16Status_read());
printf1("V1=%4.2f ", fp1_read());
printf1("V2=%4.2f ", fp2_read());
printf1("V3=%4.2f ", fp3_read());
printf1("RESULT=%4.2f\n", fpResult_read());
printf("INVALIDATED: S=%04Xh:\n", bfloat16nn_b16Status_read());
printf1("V1=%5.3f ", fp1_read());
printf1("V2=%5.3f ", fp2_read());
printf1("V3=%5.3f ", fp3_read());
printf1("RESULT=%5.3f\n", fpResult1_read());
printf1("V1=%5.3f ", fp4_read());
printf1("V2=%5.3f ", fp5_read());
printf1("V3=%5.3f ", fp6_read());
printf1("RESULT=%5.3f\n", fpResult2_read());
}
else
printf("INVALIDATED: Timeout!");
break;
case 's':
printf("REQUESTED: S=%04Xh ", bfloat16nn_b16Status_read());
printf("REQUESTED: S=%04Xh:\n", bfloat16nn_b16Status_read());
printf1("V1=%4.2f ", fp1_read());
printf1("V2=%4.2f ", fp2_read());
printf1("V3=%4.2f ", fp3_read());
printf1("RESULT=%4.2f\n", fpResult_read());
printf1("RESULT=%4.2f\n", fpResult1_read());
printf1("V1=%4.2f ", fp4_read());
printf1("V2=%4.2f ", fp5_read());
printf1("V3=%4.2f ", fp6_read());
printf1("RESULT=%4.2f\n", fpResult2_read());
break;
case 'x': return 1; // Abort indication
default: ;

Loading…
Cancel
Save