From 7a592492fc0dba9f56c3eb9c58415c5d0cca449b Mon Sep 17 00:00:00 2001 From: kaqu Date: Mon, 3 May 2021 12:12:14 +0200 Subject: [PATCH] bfloat16 fadd working! --- libmodules/bfloat16nncore.py | 72 ++++++++++++++------------------- libmodules/bfloat16processor.py | 56 ++++++++++++++++--------- software/source/bfloat16nnlib.c | 60 +++++++++++++++++---------- 3 files changed, 107 insertions(+), 81 deletions(-) diff --git a/libmodules/bfloat16nncore.py b/libmodules/bfloat16nncore.py index b24392e..613df7e 100644 --- a/libmodules/bfloat16nncore.py +++ b/libmodules/bfloat16nncore.py @@ -49,7 +49,7 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): Outputs: ######## - :b32Result: Processing result + :b16Result: Processing result :bReady: Ready indication (wire to LED ... ;) @@ -78,28 +78,28 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): """) # Outputs - self.b32Status = CSRStorage(32, reset_less=False, - fields=[CSRField("Status", size=32, description="*Field*: 32-Bit value")], + self.b16Status = CSRStorage(16, reset_less=False, + fields=[CSRField("Status", size=16, description="*Field*: 16-Bit value")], description=""" Processing stati """) - self.b32Value1 = CSRStorage(32, reset_less=False, - fields=[CSRField("Value", size=32, description="*Field*: 32-Bit value")], + self.b16Value1 = CSRStorage(16, reset_less=False, + fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], description=""" Float register 1 """) - self.b32Value2 = CSRStorage(32, reset_less=False, - fields=[CSRField("Value", size=32, description="*Field*: 32-Bit value")], + self.b16Value2 = CSRStorage(16, reset_less=False, + fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], description=""" Float register 2 """) - self.b32Value3 = CSRStorage(32, reset_less=False, - fields=[CSRField("Value", size=32, description="*Field*: 32-Bit value")], + self.b16Value3 = CSRStorage(16, reset_less=False, + fields=[CSRField("Value", 
size=16, description="*Field*: 16-Bit value")], description=""" Float register 3 """) - self.b32Result = CSRStorage(32, reset_less=False, - fields=[CSRField("Result", size=32, description="*Field*: 32-Bit value")], + self.b16Result = CSRStorage(16, reset_less=False, + fields=[CSRField("Result", size=16, description="*Field*: 16-Bit value")], description=""" Processing result """) @@ -156,20 +156,20 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): NextValue(self.Loader_Active, True), # Loader up & running NextValue(self.Loader_Delay, 0), # Reset read delay timer NextValue(LoadUnit.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel - NextValue(self.b32Result.storage, 0), # Indicate # delays - NextValue(self.b32Status.storage[0], True), # Current status - NextValue(self.b32Value1.storage, 0), # Nothing loaded so far ... - NextValue(self.b32Value2.storage, 0), # Nothing loaded so far ... - NextValue(self.b32Value3.storage, 0), # Nothing loaded so far ... + NextValue(self.b16Result.storage, 0), # Indicate # delays + NextValue(self.b16Status.storage[0], True), # Current status + NextValue(self.b16Value1.storage, 0), # Nothing loaded so far ... + NextValue(self.b16Value2.storage, 0), # Nothing loaded so far ... + NextValue(self.b16Value3.storage, 0), # Nothing loaded so far ... NextValue(self.bReady, False), # LED off! NextState("Loader_LOAD1") ).Elif(~self.bEnable.storage, # Externally aborted? - NextValue(self.b32Status.storage, 0), # Current status: inactive + NextValue(self.b16Status.storage, 0), # Current status: inactive NextValue(self.Loader_Active, False), # Reset in sync w/ global activation ) ) Loader_fsm.act("Loader_LOAD1", - NextValue(self.b32Status.storage[1], True), # Current status added + NextValue(self.b16Status.storage[1], True), # Current status added If(LoadUnit.b32Data.storage == self.b32Sentinel.storage, # Valid last entry? 
NextValue(LoadUnit.b9Offset.storage, 0), # 1st value offset preparation NextState("Loader_LOAD2") @@ -178,10 +178,12 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): ) ) Loader_fsm.act("Loader_LOAD2", - NextValue(self.b32Status.storage[2], True), # Current status added + NextValue(self.b16Status.storage[2], True), # Current status added If(self.Loader_Delay > RAMWaitTime, - NextValue(self.b32Value1.storage, LoadUnit.b32Data.storage), # Pick 1st date - NextValue(fpu.fs1, LoadUnit.b32Data.storage), + NextValue(self.b16Value1.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 1st date + NextValue(self.b16Value2.storage, LoadUnit.b32Data.storage >> 16), # Pick 2nd date + NextValue(fpu.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])), + NextValue(fpu.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])), NextValue(LoadUnit.b9Offset.storage, 1), # 2nd value offset preparation NextValue(self.Loader_Delay, 0), # Reset delay NextState("Loader_LOAD3") @@ -190,39 +192,27 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): ) ) Loader_fsm.act("Loader_LOAD3", - NextValue(self.b32Status.storage[3], True), # Current status added + NextValue(self.b16Status.storage[3], True), # Current status added If(self.Loader_Delay > RAMWaitTime, - NextValue(self.b32Value2.storage, LoadUnit.b32Data.storage), # Pick 2nd date - NextValue(fpu.fs2, LoadUnit.b32Data.storage), - NextValue(LoadUnit.b9Offset.storage, 2), # 3rd value offset preparation - NextValue(self.Loader_Delay, 0), # Reset delay - NextState("Loader_LOAD4") + NextValue(self.b16Value3.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 3rd date + NextValue(fpu.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])), + NextState("Loader_EXEC1") ).Else( # MEM wait cycles NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment ) ) - Loader_fsm.act("Loader_LOAD4", - NextValue(self.b32Status.storage[4], True), 
# Current status added - If(self.Loader_Delay > RAMWaitTime, - NextValue(self.b32Value3.storage, LoadUnit.b32Data.storage), # Pick 3rd date - NextValue(fpu.fs3, LoadUnit.b32Data.storage), - NextState("Loader_EXEC1") - ).Else( # MEM wait cycles - NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment - ) - ) Loader_fsm.act("Loader_EXEC1", - NextValue(self.b32Status.storage[5], True), # Current status added + NextValue(self.b16Status.storage[5], True), # Current status added NextValue(fpu.fadd, True), # This command requested NextValue(fpu.fready, False), # Engage trigger NextState("Loader_EXEC2") ) Loader_fsm.act("Loader_EXEC2", - NextValue(self.b32Status.storage[6], True), # Current status added + NextValue(self.b16Status.storage[6], True), # Current status added If(fpu.fready, NextValue(fpu.fadd, False), # Clear command request - NextValue(self.b32Result.storage, fpu.fresult), # Pick result - NextValue(self.b32Status.storage[31], True), # Indicate readyness ... + NextValue(self.b16Result.storage, fpu.fresult[16:32]), # Pick result (little endian, high word!) + NextValue(self.b16Status.storage[15], True), # Indicate readyness ... NextValue(self.bReady, True), # Indicate readyness (LED on!) NextState("Loader_IDLE") ) diff --git a/libmodules/bfloat16processor.py b/libmodules/bfloat16processor.py index 6e30712..d877b8e 100644 --- a/libmodules/bfloat16processor.py +++ b/libmodules/bfloat16processor.py @@ -3,7 +3,7 @@ # # bfloat16processor.py # -# bfloat16 processing +# bfloat16 processing (1 bit sign, 8 bit exponent, 7 bit mantissa) # # History: # -------- @@ -50,16 +50,20 @@ class bfloat16Processor(Module): self.e1 = Signal((8,True), reset_less=True) # Signed exponents! self.e2 = Signal((8,True), reset_less=True) self.e3 = Signal((8,True), reset_less=True) - self.m1 = Signal((23+1+3,False), reset_less=True) # Unsigned mantissas! TODO: Verify sign! 
- self.m2 = Signal((24+1+3,False), reset_less=True) # 23 bits + 1bit (1.xx = 0x800000) - self.m3 = Signal((25+1+3,True), reset_less=True) # + Sign + R(0)/Guard & Sticky bits + #self.m1 = Signal((23+1+3,False), reset_less=True) # Unsigned mantissas! TODO: Verify sign! + self.m1 = Signal((7+2+3,False), reset_less=True) # Unsigned mantissas! TODO: Verify sign! + #self.m2 = Signal((24+1+3,False), reset_less=True) # 23 bits + 1bit (1.xx = 0x800000) + self.m2 = Signal((7+2+3,False), reset_less=True) # 7 bits + 1bit (1.xx = 0x800000) + 2 spare + #self.m3 = Signal((25+1+3,True), reset_less=True) # + Sign + R(0)/Guard & Sticky bits + self.m3 = Signal((8+2+3,True), reset_less=True) # + Sign + R(0)/Guard & Sticky bits self.lm3 = Signal((64,True), reset_less=True) # MUL long result self.s32 = Signal((32,True), reset_less=True) # Signed 32-bit self.s_bit = Signal() # Sticky bit (for rounding control) self.branch1 = Signal() # Branch helpers self.branch2 = Signal() - self.i = Signal(5) # Loop counter, range 0..31 + #self.i = Signal(5) # Loop counter, range 0..31 + self.i = Signal(4) # Loop counter, range 0..15 FPU_fsm = FSM(reset_state="FPU_IDLE") # FSM starts idling ... self.submodules += FPU_fsm @@ -72,16 +76,20 @@ class bfloat16Processor(Module): NextValue(self.sign2, self.fs2[31] ^ self.fsub), # Invert sign for subtraction! 
NextValue(self.e1, self.fs1[23:31] - 127), NextValue(self.e2, self.fs2[23:31] - 127), - NextValue(self.m1, Cat(0,0,0, self.fs1[0:23], 1, 0)), # | 0x00800000 + R/G/S bits - NextValue(self.m2, Cat(0,0,0, self.fs2[0:23], 1, 0)), # | 0x00800000 + R/G/S bits + #NextValue(self.m1, Cat(0,0,0, self.fs1[0:23], 1, 0)), # | 0x00800000 + R/G/S bits + NextValue(self.m1, Cat(0,0,0, self.fs1[0:7], 1, 0)), # | 0x00800000 + R/G/S bits + #NextValue(self.m2, Cat(0,0,0, self.fs2[0:23], 1, 0)), # | 0x00800000 + R/G/S bits + NextValue(self.m2, Cat(0,0,0, self.fs2[0:7], 1, 0)), # | 0x00800000 + R/G/S bits NextState("FADD1") ).Elif((self.fmin | self.fmax | self.fmadd | self.fmsub | self.fnmadd | self.fnmsub | self.fmul | self.fdiv) & ~self.fready, # Triggers set & ready flag reset externally! NextValue(self.sign1, self.fs1[31]), NextValue(self.sign2, self.fs2[31]), NextValue(self.e1, self.fs1[23:31] - 127), NextValue(self.e2, self.fs2[23:31] - 127), - NextValue(self.m1, Cat(self.fs1[0:23], 1, 0)), # | 0x00800000 - NextValue(self.m2, Cat(self.fs2[0:23], 1, 0)), # | 0x00800000 + #NextValue(self.m1, Cat(self.fs1[0:23], 1, 0)), # | 0x00800000 + NextValue(self.m1, Cat(self.fs1[0:7], 1, 0, 0,0,0)), # | 0x00800000 + #NextValue(self.m2, Cat(self.fs2[0:23], 1, 0)), # | 0x00800000 + NextValue(self.m2, Cat(self.fs2[0:7], 1, 0, 0,0,0)), # | 0x00800000 If(self.fdiv, # Division NextState("FDIV1"), ).Elif(self.fmin, # Minimum @@ -94,7 +102,8 @@ class bfloat16Processor(Module): ).Elif(self.fsqrt & ~self.fready, # Trigger set & ready flag reset externally! NextValue(self.sign1, self.fs1[31]), NextValue(self.e1, self.fs1[23:31] - 127), - NextValue(self.m1, Cat(self.fs1[0:23], 1, 0)), # | 0x00800000 + #NextValue(self.m1, Cat(self.fs1[0:23], 1, 0)), # | 0x00800000 + NextValue(self.m1, Cat(self.fs1[0:7], 1, 0, 0,0,0)), # | 0x00800000 NextState("FSQRT1"), ) ) @@ -205,7 +214,8 @@ class bfloat16Processor(Module): FPU_fsm.act("FADD6", NextValue(self.FPU_state, 6), # 6. 
Normalization of result: Overflow - If(self.m3[24], # & 0x01000000, + #If(self.m3[24], # & 0x01000000, + If(self.m3[7+1], # & 0x01000000, NextValue(self.m3, self.m3 >> 1), # Adjust mantissa & increment exponent NextValue(self.e3, self.e3 + 1) ).Else( @@ -216,7 +226,8 @@ class bfloat16Processor(Module): FPU_fsm.act("FADD7", # 7. Normalization: Result NextValue(self.FPU_state, 7), - If(~self.m3[23] & (self.i < 23), # & 0x00800000 (limit to max. loops) + #If(~self.m3[23] & (self.i < 23), # & 0x00800000 (limit to max. loops) + If(~self.m3[7] & (self.i < 7), # & 0x00800000 (limit to max. loops) NextValue(self.m3, self.m3 << 1), # Subtraction normalization NextValue(self.e3, self.e3 - 1), NextValue(self.i, self.i + 1), # Count loops ... @@ -231,7 +242,8 @@ class bfloat16Processor(Module): ) FPU_fsm.act("FADD8", NextValue(self.FPU_state, 8), - If(self.m3[24], # & 0x01000000, # Overflow? + #If(self.m3[24], # & 0x01000000, # Overflow? + If(self.m3[7+1], # & 0x01000000, # Overflow? NextValue(self.m3, self.m3 >> 1), # Adjust mantissa & increment exponent NextValue(self.e3, self.e3 + 1) ), @@ -241,7 +253,8 @@ class bfloat16Processor(Module): FPU_fsm.act("self.fresult", # Result contruction & possible rounding NextValue(self.FPU_state, 9), # 6. Build the actual resulting float - NextValue(self.fresult, Cat(self.m3[0:23], self.e3+127, self.sign3)), + #NextValue(self.fresult, Cat(self.m3[0:23], self.e3+127, self.sign3)), + NextValue(self.fresult, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0 ,0,0,0,0, self.m3[0:7], self.e3+127, self.sign3)), NextValue(self.fready, 1), # Indicate ready to main decoder NextState("FPU_IDLE") ) @@ -326,8 +339,10 @@ class bfloat16Processor(Module): NextValue(self.sign2, self.fs3[31] ^ (self.fmsub | self.fnmsub)), # Invert sign for subtraction! 
NextValue(self.e1, self.e3), NextValue(self.e2, self.fs3[23:31] - 127), - NextValue(self.m1, Cat(0,0,0, self.m3[0:23], 1, 0)), # | 0x00800000 + R/G/S bits - NextValue(self.m2, Cat(0,0,0, self.fs3[0:23], 1, 0)), # | 0x00800000 + R/G/S bits + #NextValue(self.m1, Cat(0,0,0, self.m3[0:23], 1, 0)), # | 0x00800000 + R/G/S bits + NextValue(self.m1, Cat(0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.m3[0:7], 1, 0)), # | 0x00800000 + R/G/S bits + #NextValue(self.m2, Cat(0,0,0, self.fs3[0:23], 1, 0)), # | 0x00800000 + R/G/S bits + NextValue(self.m2, Cat(0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.fs3[0:7], 1, 0)), # | 0x00800000 + R/G/S bits NextState("FADD1") ) @@ -363,7 +378,8 @@ class bfloat16Processor(Module): ) ) FPU_fsm.act("FDIV2", - If(self.i < 24, + #If(self.i < 24, + If(self.i < 8, NextValue(self.FPU_state, 2), If(self.m1 < self.m2, NextValue(self.m3, self.m3 << 1), # Append a zero @@ -376,7 +392,8 @@ class bfloat16Processor(Module): ).Else( # Loop exceeded # 4. Normalization NextValue(self.FPU_state, 3), - If(~self.m3[23], # & 0x00800000 + #If(~self.m3[23], # & 0x00800000 + If(~self.m3[7], # & 0x00800000 NextValue(self.m3, self.m3 << 1), # Subtraction normalization NextValue(self.e3, self.e3 - 1), ).Else( @@ -398,7 +415,8 @@ class bfloat16Processor(Module): NextState("FPU_IDLE") ).Else( # Better fast, than accurate! Use Newton-Raphson in S/W for better accuracy! # Goldschmidt's algorithm (only 1 digit after decimal point ok, error varies, s.b) - If((self.m1[0:23] != 0) | (self.e1 == 1), # Not 2^x (m==0!) and x!=1 + #If((self.m1[0:23] != 0) | (self.e1 == 1), # Not 2^x (m==0!) and x!=1 + If((self.m1[0:7] != 0) | (self.e1 == 1), # Not 2^x (m==0!) and x!=1 #return sqrt_approx(f, 0x0004B0D2); // Minimized error (max. 
3.5%) NextValue(self.branch1, 1), # Use 0x0004B0D2 for minimized error (<= 3.5%) ).Else( diff --git a/software/source/bfloat16nnlib.c b/software/source/bfloat16nnlib.c index 4c55d38..4a23a25 100644 --- a/software/source/bfloat16nnlib.c +++ b/software/source/bfloat16nnlib.c @@ -56,7 +56,7 @@ static int fpgaload(uint32_t *mempt, int16_t len) bfloat16nn_b32Sentinel_write(*sentinel); bfloat16nn_bEnable_write(1); // Finally: Engage! for(int i=0;i<10;i++) { // Max. 100ms delay - if(bfloat16nn_b32Status_read() & 0x80000000) { + if(bfloat16nn_b16Status_read() & 0x8000) { bfloat16nn_bEnable_write(0); // Disable transfer return 1; // Ok, ready! } @@ -69,53 +69,71 @@ static int fpgaload(uint32_t *mempt, int16_t len) static float fp1_read(void) { - uint32_t v __attribute__((aligned(16))) = bfloat16nn_b32Value1_read(); - float *fpt = (float *)&v; + uint32_t v __attribute__((aligned(16))) = 0; + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_read(); // Little-endian, high half word required + float *fpt = (float *)&v; return *fpt; } static float fp2_read(void) { - uint32_t v __attribute__((aligned(16))) = bfloat16nn_b32Value2_read(); + uint32_t v __attribute__((aligned(16))) = 0; + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_read(); float *fpt = (float *)&v; return *fpt; } static float fp3_read(void) { - uint32_t v __attribute__((aligned(16))) = bfloat16nn_b32Value3_read(); + uint32_t v __attribute__((aligned(16))) = 0; + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Value3_read(); float *fpt = (float *)&v; return *fpt; } static float fpResult_read(void) { - uint32_t v __attribute__((aligned(16))) = bfloat16nn_b32Result_read(); + uint32_t v __attribute__((aligned(16))) = 0; + *(((uint16_t *)&v) + 1) = bfloat16nn_b16Result_read(); float *fpt = (float *)&v; return *fpt; } +static uint16_t f2ui16(float f) +{ + return *(((uint16_t *)&f)+1); // High half word needed (little-endian), hence ... 
+} + +void dumpfloat(float f) +{ + printf("%08Xh -> %04Xh\n", *(uint32_t *)&f, f2ui16(f)); +} int key_eval(void) { extern void printf1(const char *fmt, float f1); static uint32_t *sentinel = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE - 1) * sizeof(int32_t)); - uint32_t *uiptr; - float *fptr; + uint32_t *ui32ptr; + uint16_t *ui16ptr; int i; + float fp1, fp2, fp3, fpResult; switch(kbhit()) { case 'r': // Reload printf("\e[35;1m*** Reload ***\e[0m\n"); - for(i=0, uiptr = (uint32_t *)DRAMDATABASE;i