diff --git a/libmodules/bfloat16nncore.py b/libmodules/bfloat16nncore.py index e84e7e8..e32cbcf 100644 --- a/libmodules/bfloat16nncore.py +++ b/libmodules/bfloat16nncore.py @@ -90,6 +90,11 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): description=""" Processing stati """) + self.b16FPUStates = CSRStorage(16, reset_less=False, + fields=[CSRField("FPUStates", size=16, description="*Field*: 16-Bit value")], + description=""" + FPU states: Low FPU#1, High FPU#2 + """) self.b16Value1_1 = CSRStorage(16, reset_less=False, fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")], description=""" @@ -223,26 +228,28 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): ) Loader_fsm.act("Loader_EXEC1", NextValue(self.b16Status.storage[2], True), # Current status added - If(LoadUnit.b9Offset1.storage < self.b9ArrayWordLen.storage, #(LUCacheSize >> 1), - If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1! - NextValue(fpu1.fmul, True), # 1st ADD requested - NextValue(fpu2.fmul, True), - ).Else( - NextValue(fpu1.fmadd, True), # 2nd ... last MUL/ADD requested - NextValue(fpu2.fmadd, True), - ) + #If(LoadUnit.b9Offset1.storage < self.b9ArrayWordLen.storage, #(LUCacheSize >> 1), + If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1! + NextValue(fpu1.fmul, True), # 1st MUL requested + NextValue(fpu2.fmul, True), + ).Else( + NextValue(fpu1.fmadd, True), # 2nd ... last MUL/ADD requested + NextValue(fpu2.fmadd, True), ), NextValue(fpu1.fready, False), # Engage trigger FPU#1 - NextValue(fpu2.fready, False), # Engage trigger FPU#2 + NextValue(fpu2.fready, False), # Engage trigger FPU#2 + #), NextState("Loader_EXEC2") ) Loader_fsm.act("Loader_EXEC2", NextValue(self.b16Status.storage[3], True), # Current status added - If(fpu1.fready & fpu2.fready, - If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1! + NextValue(self.b16Status.storage[8], fpu1.fready), # TODO: Remove! 
+ NextValue(self.b16Status.storage[9], fpu2.fready), # TODO: Remove! + If(fpu1.fready & fpu2.fready, + If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1! (Actually: Entry #0) NextValue(fpu1.fmul, False), # Clear command request FPU#1 NextValue(fpu2.fmul, False), # Clear command request FPU#2 - ).Else( + ).Else( # Entries 1..len NextValue(fpu1.fmadd, False), # Clear command request FPU#1 NextValue(fpu2.fmadd, False), # Clear command request FPU#2 ), @@ -274,3 +281,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc): NextState("Loader_IDLE") ) ) + + self.sync += [ # Show individual FPU states + self.b16FPUStates.storage[0:8].eq(fpu1.FPU_state[0:8]), + self.b16FPUStates.storage[8:16].eq(fpu2.FPU_state[0:8]), + ] \ No newline at end of file diff --git a/libmodules/bfloat16processor.py b/libmodules/bfloat16processor.py index 2434d47..a459199 100644 --- a/libmodules/bfloat16processor.py +++ b/libmodules/bfloat16processor.py @@ -104,7 +104,7 @@ class bfloat16Processor(Module): ) FPU_fsm.act("FADD1", - NextValue(self.FPU_state, 1), + NextValue(self.FPU_state, 20), # 1. Verify valid ranges 1st! If(((self.fs1[0:31] == 0x7FFFFFFF) | (self.fs2[0:31] == 0x7FFFFFFF)) | ((self.sign1 ^ self.sign2) & ((self.e1 == -1) & (self.e2 == -1))), @@ -168,7 +168,7 @@ class bfloat16Processor(Module): ) ) FPU_fsm.act("FADD3", - NextValue(self.FPU_state, 3), + NextValue(self.FPU_state, 24), # 3. Add mantissas (as both are of same base now) If(~self.sign1 & ~self.sign2, # Negotiate sign -> ADD/SUB NextValue(self.m3, self.m1 + self.m2) @@ -186,7 +186,7 @@ class bfloat16Processor(Module): NextState("FADD4") ) FPU_fsm.act("FADD4", - NextValue(self.FPU_state, 4), + NextValue(self.FPU_state, 25), # 4. Retrieve sign & unsigned absolute value If(self.m3 < 0, NextValue(self.sign3, 1), # Pull sign @@ -198,7 +198,7 @@ class bfloat16Processor(Module): NextState("FADD5") ) FPU_fsm.act("FADD5", - NextValue(self.FPU_state, 5), + NextValue(self.FPU_state, 26), # 5. 
Rounding to nearest/even (FCS_FRM=0x00) If(self.m3[0:3] == 0x7, # Remainder (all set?): REMAINDER(0) + GUARD(MSB) + STICKYBIT (ORed rest) NextValue(self.s_bit, 1) # Indicate rounding @@ -209,7 +209,7 @@ class bfloat16Processor(Module): NextState("FADD6") ) FPU_fsm.act("FADD6", - NextValue(self.FPU_state, 6), + NextValue(self.FPU_state, 27), # 6. Normalization of result: Overflow #If(self.m3[24], # & 0x01000000, If(self.m3[7+1], # & 0x01000000, @@ -222,7 +222,7 @@ class bfloat16Processor(Module): ) FPU_fsm.act("FADD7", # 7. Normalization: Result - NextValue(self.FPU_state, 7), + NextValue(self.FPU_state, 28), #If(~self.m3[23] & (self.i < 23), # & 0x00800000 (limit to max. loops) If(~self.m3[7] & (self.i < 7), # & 0x00800000 (limit to max. loops) NextValue(self.m3, self.m3 << 1), # Subtraction normalization @@ -241,7 +241,7 @@ class bfloat16Processor(Module): ) ) FPU_fsm.act("FADD8", - NextValue(self.FPU_state, 8), + NextValue(self.FPU_state, 29), #If(self.m3[24], # & 0x01000000, # Overflow? If(self.m3[7+1], # & 0x01000000, # Overflow? NextValue(self.m3, self.m3 >> 1), # Adjust mantissa & increment exponent @@ -254,7 +254,7 @@ class bfloat16Processor(Module): ) # End of fadd.s processing FPU_fsm.act("FRESULT", # Result contruction & possible rounding - NextValue(self.FPU_state, 9), + NextValue(self.FPU_state, 30), # 6. Build the actual resulting float NextValue(self.fresult, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0 ,0,0,0,0, self.m3[0:7], self.e3+127, self.sign3)), NextValue(self.fready, 1), # Indicate ready to main decoder @@ -295,7 +295,7 @@ class bfloat16Processor(Module): ) ) FPU_fsm.act("FMUL2", - NextValue(self.FPU_state, 2), + NextValue(self.FPU_state, 2), # 4. MSB set in significants (i.e. bit[45])? # Bitoffset: 48 32 16 0 If(self.lm3[47], # & 0x0000800000000000, TODO: Verify bit# (45 or 47?)! @@ -309,6 +309,7 @@ class bfloat16Processor(Module): ) ) FPU_fsm.act("FMUL3", + NextValue(self.FPU_state, 3), # 5. 
Rounding to nearest/even (FCS_FRM=0x00) If(self.lm3[22] & self.lm3[23], # & 0xC00000) == 0xC00000 Remainder (to be skipped): RESULTBIT(0) + REMAINDERBIT(MSB) set? If(self.lm3[0:22] != 0, # Sticky-Bit S (ORed rest) set? @@ -324,6 +325,7 @@ class bfloat16Processor(Module): ) ) FPU_fsm.act("FMUL4", + NextValue(self.FPU_state, 4), # Overflow normalization # Bit:48 32 16 0 If(self.lm3[47], # & 0x0000800000000000 @@ -333,6 +335,7 @@ class bfloat16Processor(Module): NextState("FMUL5") ) FPU_fsm.act("FMUL5", + NextValue(self.FPU_state, 5), # 6. Construction of result NextValue(self.m3, ((self.lm3 >> 23) & 0x7FFFFF)[16:23]), # TODO: e3=se3 omitted ok? @@ -344,6 +347,7 @@ class bfloat16Processor(Module): ) # End of fmul.s processing FPU_fsm.act("FMADD1", + NextValue(self.FPU_state, 6), # Result->fs1: sign3/e3/m3 -> sign1/e1/m1 & fs1, fs3->fs2: fs3 -> sign2/e2/m2 & fs2 NextValue(self.sign1, self.sign3), # Negate mult. result w/ fxxx NextValue(self.sign2, self.fs3[31] ^ (self.fmsub | self.fnmsub)), # Invert sign for subtraction! diff --git a/libmodules/dramtransfer.py b/libmodules/dramtransfer.py index 591e757..2e3dc70 100644 --- a/libmodules/dramtransfer.py +++ b/libmodules/dramtransfer.py @@ -204,6 +204,7 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc): ), self.bData1.eq(rdport1.dat_r) # Assign to external var. ... ), + self.b2Address2.eq(self.b9Offset2.storage[0:2]), # Filter bits 0..1 (range 0-3) If(self.b9Offset2.storage < maxwords, #rdport.adr.eq(self.b9Offset2.storage), # w/ translation! diff --git a/software/source/bfloat16nnlib.c b/software/source/bfloat16nnlib.c index ac861ca..a3df6a8 100644 --- a/software/source/bfloat16nnlib.c +++ b/software/source/bfloat16nnlib.c @@ -51,14 +51,16 @@ static int fpgaload(uint32_t *mempt, int16_t len, int16_t calclen) { uint32_t *sentinel = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE - 1) * sizeof(int32_t)); - if((len < 4) | (len > 512)) return -1; // Verify length of transfer was understood! 
- - flush_l2_cache(); // Strictly nec. for longer transfers - bfloat16nn_b32DRAMLoadAddress_write((uint32_t)mempt); // Indicate memory to load from - bfloat16nn_b32Sentinel_write(*sentinel); - bfloat16nn_b9ArrayWordLen_write(calclen); // Indicate array length for calc. + if((len < 4) | (len > 512)) return -1; // Verify length of transfer was understood! + if((calclen < 2) | (calclen > len/2)) return -2; // Reasonable calc amount? + bfloat16nn_bEnable_write(0); // Disable transfer (if still active for some reason ...) + flush_l2_cache(); // Strictly nec. for longer transfers + bfloat16nn_b9ArrayWordLen_write(calclen); // Indicate array length for calc. + bfloat16nn_b32DRAMLoadAddress_write((uint32_t)mempt); // Indicate memory to load from + //*sentinel = 0xAAFF01A3; + bfloat16nn_b32Sentinel_write(*sentinel); bfloat16nn_bEnable_write(1); // Finally: Engage! - for(int i=0;i<100;i++) { // Max. 100ms delay + for(int i=0;i<10;i++) { // Max. 10 polls (timeout now 1/10 of the former 100-poll bound) if(bfloat16nn_b16Status_read() & 0x8000) { bfloat16nn_bEnable_write(0); // Disable transfer return 1; // Ok, ready! @@ -135,13 +137,13 @@ int key_eval(void) float fp1_1, fp1_2, fpResult1; float fp2_1, fp2_2, fpResult2; -#define MAXCALCLEN 4 +#define MAXCALCLEN 16 switch(kbhit()) { case 'r': // Reload printf("\e[35;1m*** Reload ***\e[0m\n"); for(i=0, ui32ptr = (uint32_t *)DRAMDATABASE;i