Up & running!

master
kaqu 1 year ago
parent e99f510d9e
commit 24ef44e4d5
  1. 36
      libmodules/bfloat16nncore.py
  2. 22
      libmodules/bfloat16processor.py
  3. 1
      libmodules/dramtransfer.py
  4. 40
      software/source/bfloat16nnlib.c

@ -90,6 +90,11 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
description="""
Processing stati
""")
self.b16FPUStates = CSRStorage(16, reset_less=False,
fields=[CSRField("FPUStates", size=16, description="*Field*: 16-Bit value")],
description="""
FPU states: Low FPU#1, High FPU#2
""")
self.b16Value1_1 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
@ -223,26 +228,28 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
)
Loader_fsm.act("Loader_EXEC1",
NextValue(self.b16Status.storage[2], True), # Current status added
If(LoadUnit.b9Offset1.storage < self.b9ArrayWordLen.storage, #(LUCacheSize >> 1),
If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1!
NextValue(fpu1.fmul, True), # 1st ADD requested
NextValue(fpu2.fmul, True),
).Else(
NextValue(fpu1.fmadd, True), # 2nd ... last MUL/ADD requested
NextValue(fpu2.fmadd, True),
)
#If(LoadUnit.b9Offset1.storage < self.b9ArrayWordLen.storage, #(LUCacheSize >> 1),
If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1!
NextValue(fpu1.fmul, True), # 1st ADD requested
NextValue(fpu2.fmul, True),
).Else(
NextValue(fpu1.fmadd, True), # 2nd ... last MUL/ADD requested
NextValue(fpu2.fmadd, True),
),
NextValue(fpu1.fready, False), # Engage trigger FPU#1
NextValue(fpu2.fready, False), # Engage trigger FPU#2
NextValue(fpu2.fready, False), # Engage trigger FPU#2
#),
NextState("Loader_EXEC2")
)
Loader_fsm.act("Loader_EXEC2",
NextValue(self.b16Status.storage[3], True), # Current status added
If(fpu1.fready & fpu2.fready,
If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1!
NextValue(self.b16Status.storage[8], fpu1.fready), # TODO: Remove!
NextValue(self.b16Status.storage[9], fpu2.fready), # TODO: Remove!
If(fpu1.fready & fpu2.fready,
If(LoadUnit.b9Offset1.storage == 1, # As pointer already moved ahead 1! (Actually: Entry #0)
NextValue(fpu1.fmul, False), # Clear command request FPU#1
NextValue(fpu2.fmul, False), # Clear command request FPU#2
).Else(
).Else( # Entries 1..len
NextValue(fpu1.fmadd, False), # Clear command request FPU#1
NextValue(fpu2.fmadd, False), # Clear command request FPU#2
),
@ -274,3 +281,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
NextState("Loader_IDLE")
)
)
self.sync += [ # Show individual FPU states
self.b16FPUStates.storage[0:8].eq(fpu1.FPU_state[0:8]),
self.b16FPUStates.storage[8:16].eq(fpu2.FPU_state[0:8]),
]

@ -104,7 +104,7 @@ class bfloat16Processor(Module):
)
FPU_fsm.act("FADD1",
NextValue(self.FPU_state, 1),
NextValue(self.FPU_state, 20),
# 1. Verify valid ranges 1st!
If(((self.fs1[0:31] == 0x7FFFFFFF) | (self.fs2[0:31] == 0x7FFFFFFF))
| ((self.sign1 ^ self.sign2) & ((self.e1 == -1) & (self.e2 == -1))),
@ -168,7 +168,7 @@ class bfloat16Processor(Module):
)
)
FPU_fsm.act("FADD3",
NextValue(self.FPU_state, 3),
NextValue(self.FPU_state, 24),
# 3. Add mantissas (as both are of same base now)
If(~self.sign1 & ~self.sign2, # Negotiate sign -> ADD/SUB
NextValue(self.m3, self.m1 + self.m2)
@ -186,7 +186,7 @@ class bfloat16Processor(Module):
NextState("FADD4")
)
FPU_fsm.act("FADD4",
NextValue(self.FPU_state, 4),
NextValue(self.FPU_state, 25),
# 4. Retrieve sign & unsigned absolute value
If(self.m3 < 0,
NextValue(self.sign3, 1), # Pull sign
@ -198,7 +198,7 @@ class bfloat16Processor(Module):
NextState("FADD5")
)
FPU_fsm.act("FADD5",
NextValue(self.FPU_state, 5),
NextValue(self.FPU_state, 26),
# 5. Rounding to nearest/even (FCS_FRM=0x00)
If(self.m3[0:3] == 0x7, # Remainder (all set?): REMAINDER(0) + GUARD(MSB) + STICKYBIT (ORed rest)
NextValue(self.s_bit, 1) # Indicate rounding
@ -209,7 +209,7 @@ class bfloat16Processor(Module):
NextState("FADD6")
)
FPU_fsm.act("FADD6",
NextValue(self.FPU_state, 6),
NextValue(self.FPU_state, 27),
# 6. Normalization of result: Overflow
#If(self.m3[24], # & 0x01000000,
If(self.m3[7+1], # & 0x01000000,
@ -222,7 +222,7 @@ class bfloat16Processor(Module):
)
FPU_fsm.act("FADD7",
# 7. Normalization: Result
NextValue(self.FPU_state, 7),
NextValue(self.FPU_state, 28),
#If(~self.m3[23] & (self.i < 23), # & 0x00800000 (limit to max. loops)
If(~self.m3[7] & (self.i < 7), # & 0x00800000 (limit to max. loops)
NextValue(self.m3, self.m3 << 1), # Subtraction normalization
@ -241,7 +241,7 @@ class bfloat16Processor(Module):
)
)
FPU_fsm.act("FADD8",
NextValue(self.FPU_state, 8),
NextValue(self.FPU_state, 29),
#If(self.m3[24], # & 0x01000000, # Overflow?
If(self.m3[7+1], # & 0x01000000, # Overflow?
NextValue(self.m3, self.m3 >> 1), # Adjust mantissa & increment exponent
@ -254,7 +254,7 @@ class bfloat16Processor(Module):
) # End of fadd.s processing
FPU_fsm.act("FRESULT", # Result contruction & possible rounding
NextValue(self.FPU_state, 9),
NextValue(self.FPU_state, 30),
# 6. Build the actual resulting float
NextValue(self.fresult, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0 ,0,0,0,0, self.m3[0:7], self.e3+127, self.sign3)),
NextValue(self.fready, 1), # Indicate ready to main decoder
@ -295,7 +295,7 @@ class bfloat16Processor(Module):
)
)
FPU_fsm.act("FMUL2",
NextValue(self.FPU_state, 2),
NextValue(self.FPU_state, 2),
# 4. MSB set in significands (i.e. bit[45])?
# Bitoffset: 48 32 16 0
If(self.lm3[47], # & 0x0000800000000000, TODO: Verify bit# (45 or 47?)!
@ -309,6 +309,7 @@ class bfloat16Processor(Module):
)
)
FPU_fsm.act("FMUL3",
NextValue(self.FPU_state, 3),
# 5. Rounding to nearest/even (FCS_FRM=0x00)
If(self.lm3[22] & self.lm3[23], # & 0xC00000) == 0xC00000 Remainder (to be skipped): RESULTBIT(0) + REMAINDERBIT(MSB) set?
If(self.lm3[0:22] != 0, # Sticky-Bit S (ORed rest) set?
@ -324,6 +325,7 @@ class bfloat16Processor(Module):
)
)
FPU_fsm.act("FMUL4",
NextValue(self.FPU_state, 4),
# Overflow normalization
# Bit:48 32 16 0
If(self.lm3[47], # & 0x0000800000000000
@ -333,6 +335,7 @@ class bfloat16Processor(Module):
NextState("FMUL5")
)
FPU_fsm.act("FMUL5",
NextValue(self.FPU_state, 5),
# 6. Construction of result
NextValue(self.m3, ((self.lm3 >> 23) & 0x7FFFFF)[16:23]),
# TODO: e3=se3 omitted ok?
@ -344,6 +347,7 @@ class bfloat16Processor(Module):
) # End of fmul.s processing
FPU_fsm.act("FMADD1",
NextValue(self.FPU_state, 6),
# Result->fs1: sign3/e3/m3 -> sign1/e1/m1 & fs1, fs3->fs2: fs3 -> sign2/e2/m2 & fs2
NextValue(self.sign1, self.sign3), # Negate mult. result w/ f<n>xxx
NextValue(self.sign2, self.fs3[31] ^ (self.fmsub | self.fnmsub)), # Invert sign for subtraction!

@ -204,6 +204,7 @@ class DRAM2FPGA(Module, AutoCSR, AutoDoc, ModuleDoc):
),
self.bData1.eq(rdport1.dat_r) # Assign to external var. ...
),
self.b2Address2.eq(self.b9Offset2.storage[0:2]), # Filter bits 0..1 (range 0-3)
If(self.b9Offset2.storage < maxwords,
#rdport.adr.eq(self.b9Offset2.storage), # w/ translation!

@ -51,14 +51,16 @@ static int fpgaload(uint32_t *mempt, int16_t len, int16_t calclen)
{
uint32_t *sentinel = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE - 1) * sizeof(int32_t));
if((len < 4) | (len > 512)) return -1; // Verify length of transfer was understood!
flush_l2_cache(); // Strictly nec. for longer transfers
bfloat16nn_b32DRAMLoadAddress_write((uint32_t)mempt); // Indicate memory to load from
bfloat16nn_b32Sentinel_write(*sentinel);
bfloat16nn_b9ArrayWordLen_write(calclen); // Indicate array length for calc.
if((len < 4) | (len > 512)) return -1; // Verify length of transfer was understood!
if((calclen < 2) | (calclen > len/2)) return -2; // Reasonable calc amount?
bfloat16nn_bEnable_write(0); // Disable transfer (if still active for some reason ...)
flush_l2_cache(); // Strictly nec. for longer transfers
bfloat16nn_b9ArrayWordLen_write(calclen); // Indicate array length for calc.
bfloat16nn_b32DRAMLoadAddress_write((uint32_t)mempt); // Indicate memory to load from
//*sentinel = 0xAAFF01A3;
bfloat16nn_b32Sentinel_write(*sentinel);
bfloat16nn_bEnable_write(1); // Finally: Engage!
for(int i=0;i<100;i++) { // Max. 100ms delay
for(int i=0;i<10;i++) { // Max. 100ms delay
if(bfloat16nn_b16Status_read() & 0x8000) {
bfloat16nn_bEnable_write(0); // Disable transfer
return 1; // Ok, ready!
@ -135,13 +137,13 @@ int key_eval(void)
float fp1_1, fp1_2, fpResult1;
float fp2_1, fp2_2, fpResult2;
#define MAXCALCLEN 4
#define MAXCALCLEN 16
switch(kbhit()) {
case 'r': // Reload
printf("\e[35;1m*** Reload ***\e[0m\n");
for(i=0, ui32ptr = (uint32_t *)DRAMDATABASE;i<DRAMDATASIZE;i++) // Setup test data
*ui32ptr++ = i+1;
*ui32ptr++ = 0; // Clear all memory ...
// FPU#1
ui16ptr1 = (uint16_t *)(DRAMDATABASE + 0 * sizeof(uint32_t)); // Absolute: bytes!
@ -169,7 +171,7 @@ int key_eval(void)
fp2_2 = fp2_2_read();
fpResult1 = fpResult1_read();
fpResult2 = fpResult2_read();
printf("S=%04Xh:\n", (uint32_t)bfloat16nn_b16Status_read());
printf("S=%04Xh: FS=%04Xh\n", (uint32_t)bfloat16nn_b16Status_read(), (uint32_t)bfloat16nn_b16FPUStates_read());
printf1("V1_1=%6.3f ", fp1_1);
printf1("V1_2=%6.3f ", fp1_2);
printf1("TOTAL SUM=%8.4f\n", fpResult1);
@ -186,8 +188,8 @@ int key_eval(void)
printf("%d: %d\n", DRAMDATASIZE - 1, dram2fpga_b32Data2_read());
*/
}
else {
printf("CURRENT: Timeout! %04Xh ", (uint32_t)bfloat16nn_b16Status_read());
else {
printf("CURRENT: TIMEOUT! S=%04Xh: FS=%04Xh\n", (uint32_t)bfloat16nn_b16Status_read(), (uint32_t)bfloat16nn_b16FPUStates_read());
printf("Offset 1: %d ", (uint32_t)dram2fpga_b9Offset1_read());
printf("Offset 2: %d\n", (uint32_t)dram2fpga_b9Offset2_read());
}
@ -198,9 +200,8 @@ int key_eval(void)
fp2_1 = fp2_1_read();
fp2_2 = fp2_2_read();
fpResult1 = fpResult1_read();
fpResult2 = fpResult2_read();
printf("INVALIDATED: S=%04Xh:\n", bfloat16nn_b16Status_read());
printf("S=%04Xh:\n", (uint32_t)bfloat16nn_b16Status_read());
fpResult2 = fpResult2_read();
printf("INVALIDATED: S=%04Xh: FS=%04Xh\n", (uint32_t)bfloat16nn_b16Status_read(), (uint32_t)bfloat16nn_b16FPUStates_read());
printf1("V1_1=%6.3f ", fp1_1);
printf1("V1_2=%6.3f ", fp1_2);
printf1("RESULT1=%8.4f\n", fpResult1);
@ -208,8 +209,8 @@ int key_eval(void)
printf1("V2_2=%6.3f ", fp2_2);
printf1("RESULT2=%8.4f\n", fpResult2);
}
else
printf("INVALIDATED: Timeout! %04Xh:\n", (uint32_t)bfloat16nn_b16Status_read());
else
printf("INVALIDATED: S=%04Xh: FS=%04Xh\n", (uint32_t)bfloat16nn_b16Status_read(), (uint32_t)bfloat16nn_b16FPUStates_read());
break;
case 's':
fp1_1 = fp1_1_read();
@ -217,9 +218,8 @@ int key_eval(void)
fp2_1 = fp2_1_read();
fp2_2 = fp2_2_read();
fpResult1 = fpResult1_read();
fpResult2 = fpResult2_read();
printf("REQUESTED: S=%04Xh:\n", bfloat16nn_b16Status_read());
printf("S=%04Xh:\n", (uint32_t)bfloat16nn_b16Status_read());
fpResult2 = fpResult2_read();
printf("REQUESTED: S=%04Xh: FS=%04Xh\n", (uint32_t)bfloat16nn_b16Status_read(), (uint32_t)bfloat16nn_b16FPUStates_read());
printf1("V1_1=%6.3f ", fp1_1);
printf1("V1_2=%6.3f ", fp1_2);
printf1("RESULT1=%8.4f\n", fpResult1);

Loading…
Cancel
Save