From 7091db2a275a803c0eb9702a72df1c2f8199d25a Mon Sep 17 00:00:00 2001 From: kaqu Date: Tue, 4 May 2021 19:18:33 +0200 Subject: [PATCH] zero bug hunting ... --- libmodules/bfloat16nncore.py | 60 ++++++++++++++--- libmodules/bfloat16processor.py | 41 +++++++----- software/source/bfloat16nnlib.c | 115 +++++++++++++++++++++++++------- 3 files changed, 166 insertions(+), 50 deletions(-) diff --git a/libmodules/bfloat16nncore.py b/libmodules/bfloat16nncore.py index c8f1b8e..a860d88 100644 --- a/libmodules/bfloat16nncore.py +++ b/libmodules/bfloat16nncore.py @@ -98,11 +98,31 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): description=""" Float register 3 """) - self.b16Result = CSRStorage(16, reset_less=False, - fields=[CSRField("Result", size=16, description="*Field*: 16-Bit value")], + self.b16Value4 = CSRStorage(16, reset_less=False, + fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], description=""" - Processing result + Float register 4 + """) + self.b16Value5 = CSRStorage(16, reset_less=False, + fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], + description=""" + Float register 5 + """) + self.b16Value6 = CSRStorage(16, reset_less=False, + fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], + description=""" + Float register 6 + """) + self.b16Result1 = CSRStorage(16, reset_less=False, + fields=[CSRField("Result1", size=16, description="*Field*: 16-Bit value")], + description=""" + Processing result 1 """) + self.b16Result2 = CSRStorage(16, reset_less=False, + fields=[CSRField("Result2", size=16, description="*Field*: 16-Bit value")], + description=""" + Processing result 2 + """) self.bReady = Signal() # To be wired to data pin ... ;) # Local vars. @@ -144,6 +164,7 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): #---------------- bfloat16 FPUs ------------------------------------------------------------- self.submodules.fpu1 = fpu1 = bfloat16Processor() # Integrate bfloat16 FPU + self.submodules.fpu2 = fpu2 = bfloat16Processor() # Integrate another one! #---------------- Loaded data testing -------------------------------------------------- Loader_fsm = FSM(reset_state="Loader_IDLE") # FSM starts idling ... @@ -156,7 +177,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): NextValue(self.Loader_Active, True), # Loader up & running NextValue(self.Loader_Delay, 0), # Reset read delay timer NextValue(LoadUnit.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel - NextValue(self.b16Result.storage, 0), # Indicate # delays + NextValue(self.b16Result1.storage, 0), # Indicate # delays + NextValue(self.b16Result2.storage, 0), # Indicate # delays NextValue(self.b16Status.storage[0], True), # Current status NextValue(self.b16Value1.storage, 0), # Nothing loaded so far ... NextValue(self.b16Value2.storage, 0), # Nothing loaded so far ... @@ -195,7 +217,23 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): NextValue(self.b16Status.storage[3], True), # Current status added If(self.Loader_Delay > RAMWaitTime, NextValue(self.b16Value3.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 3rd date + NextValue(self.b16Value4.storage, LoadUnit.b32Data.storage >> 16), # Pick 4th date NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])), + NextValue(fpu2.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])), + NextValue(LoadUnit.b9Offset.storage, 2), # 3rd value offset preparation + NextValue(self.Loader_Delay, 0), # Reset delay + NextState("Loader_LOAD4") + ).Else( # MEM wait cycles + NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment + ) + ) + Loader_fsm.act("Loader_LOAD4", + NextValue(self.b16Status.storage[4], True), # Current status added + If(self.Loader_Delay > RAMWaitTime, + NextValue(self.b16Value5.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 5th date + NextValue(self.b16Value6.storage, LoadUnit.b32Data.storage >> 16), # Pick 6th date + NextValue(fpu2.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])), + NextValue(fpu2.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])), NextState("Loader_EXEC1") ).Else( # MEM wait cycles NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment @@ -203,15 +241,19 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): ) Loader_fsm.act("Loader_EXEC1", NextValue(self.b16Status.storage[5], True), # Current status added - NextValue(fpu1.fnmsub, True), # This command requested - NextValue(fpu1.fready, False), # Engage trigger + NextValue(fpu1.fadd, True), # This command requested + NextValue(fpu2.fnmsub, True), # This command requested + NextValue(fpu1.fready, False), # Engage trigger FPU#1 + NextValue(fpu2.fready, False), # Engage trigger FPU#2 NextState("Loader_EXEC2") ) Loader_fsm.act("Loader_EXEC2", NextValue(self.b16Status.storage[6], True), # Current status added - If(fpu1.fready, - NextValue(fpu1.fnmsub, False), # Clear command request - NextValue(self.b16Result.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!) + If(fpu1.fready & fpu2.fready, + NextValue(fpu1.fadd, False), # Clear command request FPU#1 + NextValue(fpu2.fnmsub, False), # Clear command request FPU#2 + NextValue(self.b16Result1.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!) + NextValue(self.b16Result2.storage, fpu2.fresult[16:32]), # Pick result (little endian, high word!) NextValue(self.b16Status.storage[15], True), # Indicate readyness ... NextValue(self.bReady, True), # Indicate readyness (LED on!) (TODO: Remove!) NextState("Loader_IDLE") diff --git a/libmodules/bfloat16processor.py b/libmodules/bfloat16processor.py index 3a75f8b..b4debd4 100644 --- a/libmodules/bfloat16processor.py +++ b/libmodules/bfloat16processor.py @@ -119,10 +119,10 @@ class bfloat16Processor(Module): NextValue(self.fresult, self.fs2), # Return infinity NextValue(self.fready, 1), NextState("FPU_IDLE") - ).Elif(self.fs1[0:31] == 0, # Nothing to add? (w/o sign!) + ).Elif(self.fs1[0:31] == 0, # 0+x: Nothing to add? (w/o sign!) If(self.fsub, # Subtract yields negative result! NextValue(self.fresult, self.fs2 ^ 0x80000000), # Invert sign - ).Elif(self.fmsub | self.fnmadd, # 0*x=>0! 0-fs3 or -(0+fs3) = +fs3! FIXME->risq5 -*- => +! + ).Elif(self.fmsub | self.fnmadd, # 0*x=>0! 0-fs3 or -(0+fs3) = +fs3! NextValue(self.fresult, self.fs3 ^ 0x80000000), # Invert sign ).Elif(self.fmadd, # 0*x=>0! 0+fs3 = fs3! NextValue(self.fresult, self.fs3), # Ready! @@ -131,15 +131,24 @@ class bfloat16Processor(Module): ), NextValue(self.fready, 1), NextState("FPU_IDLE") - ).Elif((self.fadd | self.fsub) & (self.fs2[0:31] == 0), # Nothing to add? (w/o sign!) - NextValue(self.fresult, self.fs1), # Ready! - NextValue(self.fready, 1), - NextState("FPU_IDLE") - ).Elif((self.fmadd | self.fmsub | self.fnmadd | self.fnmsub) & ((self.e2 == 0) & (self.m2 == 0)), # Nothing to add (w/o sign!) - If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT FIXME->risq5 - NextValue(self.sign3, ~self.sign3) # Invert result finally + # FIXME: VERIFY! -->risq5! + ).Elif(self.fs2[0:31] == 0, # x+0: Nothing to add? (w/o sign!) + If(self.fnmadd | self.fnmsub, + NextValue(self.fresult, self.fs1 ^ 0x80000000), # Ready! + ).Else( + NextValue(self.fresult, self.fs1), # Ready! ), - NextState("FRESULT") # Just supply (normalized finally!) result from multiplication! + NextValue(self.fready, 1), + NextState("FPU_IDLE") + #).Elif((self.fadd | self.fsub) & (self.fs2[0:31] == 0), # Nothing to add? (w/o sign!) + # NextValue(self.fresult, self.fs1), # Ready! + # NextValue(self.fready, 1), + # NextState("FPU_IDLE") + #).Elif((self.fmadd | self.fmsub | self.fnmadd | self.fnmsub) & ((self.e2 == 0) & (self.m2 == 0)), # Nothing to add (w/o sign!) + # If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT + # NextValue(self.sign3, ~self.sign3) # Invert result finally + # ), + # NextState("FRESULT") # Just supply (normalized finally!) result from multiplication! ).Else( # Ok, valid floats supplied ... NextValue(self.s_bit, 0), NextValue(self.branch1, 0), # Reset helpers @@ -234,7 +243,7 @@ class bfloat16Processor(Module): NextValue(self.m3, self.m3 + self.s_bit), NextState("FADD8") # Adjust possible overflow ... ).Else( # Nope, all ready - If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT FIXME->risq5 + If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT NextValue(self.sign3, ~self.sign3) # Invert result finally ), NextState("FRESULT") @@ -248,7 +257,7 @@ class bfloat16Processor(Module): NextValue(self.m3, self.m3 >> 1), # Adjust mantissa & increment exponent NextValue(self.e3, self.e3 + 1) ), - If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT FIXME->risq5 + If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT NextValue(self.sign3, ~self.sign3) # Invert result finally ), NextState("FRESULT") @@ -283,8 +292,8 @@ class bfloat16Processor(Module): NextState("FPU_IDLE") ).Else( # Ok, valid floats supplied ... NextValue(self.sign3, self.sign1 ^ self.sign2), # 1. Calculate result sign - NextValue(self.e3, self.e1 + self.e2), # 2. Calculate resulting exponent (add!) - NextValue(self.lm3, self.m1 * self.m2), # 3. Significants multiplication (result size: 2x (sizeof(mantissa)+1) !) + NextValue(self.e3, self.e1 + self.e2), # 2. Calculate resulting exponent (add!) + NextValue(self.lm3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.m1) * Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.m2)), # 3. Significants multiplication (result size: 2x (sizeof(mantissa)+1) !) NextState("FMUL2") ) ) @@ -327,8 +336,8 @@ class bfloat16Processor(Module): NextState("FMUL5") ) FPU_fsm.act("FMUL5", - # 6. Construction of result - NextValue(self.m3, (self.lm3 >> 23) & 0x7FFFFF), + # 6. Construction of result + NextValue(self.m3, ((self.lm3 >> 23) & 0x7FFFFF)[16:23]), # TODO: e3=se3 omitted ok? If(self.fmul, # Simple multiplication NextState("FRESULT") diff --git a/software/source/bfloat16nnlib.c b/software/source/bfloat16nnlib.c index 73b383a..952438e 100644 --- a/software/source/bfloat16nnlib.c +++ b/software/source/bfloat16nnlib.c @@ -72,35 +72,80 @@ static float fp1_read(void) uint32_t v __attribute__((aligned(16))) = 0; *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_read(); // Low-endian, high half word required float *fpt = (float *)&v; - return *fpt; + if(*fpt != 0.0) + return *fpt; + return(-12.34); } static float fp2_read(void) { uint32_t v __attribute__((aligned(16))) = 0; *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_read(); float *fpt = (float *)&v; - return *fpt; + if(*fpt != 0.0) + return *fpt; + return(-12.34); } static float fp3_read(void) { uint32_t v __attribute__((aligned(16))) = 0; *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value3_read(); float *fpt = (float *)&v; - return *fpt; + if(*fpt != 0.0) + return *fpt; + return(-12.34); } -static float fpResult_read(void) +static float fp4_read(void) +{ + uint32_t v __attribute__((aligned(16))) = 0; + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value4_read(); + float *fpt = (float *)&v; + if(*fpt != 0.0) + return *fpt; + return(-12.34); +} +static float fp5_read(void) +{ + uint32_t v __attribute__((aligned(16))) = 0; + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value5_read(); + float *fpt = (float *)&v; + if(*fpt != 0.0) + return *fpt; + return(-12.34); +} +static float fp6_read(void) +{ + uint32_t v __attribute__((aligned(16))) = 0; + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value6_read(); + float *fpt = (float *)&v; + if(*fpt != 0.0) + return *fpt; + return(-12.34); +} +static float fpResult1_read(void) { uint32_t v __attribute__((aligned(16))) = 0; - *(((uint16_t *)&v) + 1) = bfloat16nn_b16Result_read(); + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Result1_read(); float *fpt = (float *)&v; - return *fpt; + if(*fpt != 0.0) + return *fpt; + return(-12.34); +} +static float fpResult2_read(void) +{ + uint32_t v __attribute__((aligned(16))) = 0; + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Result2_read(); + float *fpt = (float *)&v; + if(*fpt != 0.0) + return *fpt; + return(-12.34); } static uint16_t f2ui16(float f) { return *(((uint16_t *)&f)+1); // High half word needed (low-endian), hence ... } -static void dumpfloat(float f) +extern void dumpfloat(float f); +void dumpfloat(float f) { printf("%08Xh -> %04Xh\n", *(uint32_t *)&f, f2ui16(f)); } @@ -113,27 +158,39 @@ int key_eval(void) uint32_t *ui32ptr; uint16_t *ui16ptr; int i; - float fp1, fp2, fp3, fpResult; + float fp1, fp2, fp3, fpResult1; + float fp4, fp5, fp6, fpResult2; switch(kbhit()) { case 'r': // Reload printf("\e[35;1m*** Reload ***\e[0m\n"); for(i=0, ui32ptr = (uint32_t *)DRAMDATABASE;i