bfloat16 fadd working!

master
kaqu 1 year ago
parent 8838cf4f9d
commit 7a592492fc
  1. 72
      libmodules/bfloat16nncore.py
  2. 56
      libmodules/bfloat16processor.py
  3. 60
      software/source/bfloat16nnlib.c

@ -49,7 +49,7 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
Outputs:
########
:b32Result: Processing result
:b16Result: Processing result
:bReady: Ready indication (wire to LED ... ;)
@ -78,28 +78,28 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
""")
# Outputs
self.b32Status = CSRStorage(32, reset_less=False,
fields=[CSRField("Status", size=32, description="*Field*: 32-Bit value")],
self.b16Status = CSRStorage(16, reset_less=False,
fields=[CSRField("Status", size=16, description="*Field*: 16-Bit value")],
description="""
Processing stati
""")
self.b32Value1 = CSRStorage(32, reset_less=False,
fields=[CSRField("Value", size=32, description="*Field*: 32-Bit value")],
self.b16Value1 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
Float register 1
""")
self.b32Value2 = CSRStorage(32, reset_less=False,
fields=[CSRField("Value", size=32, description="*Field*: 32-Bit value")],
self.b16Value2 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
Float register 2
""")
self.b32Value3 = CSRStorage(32, reset_less=False,
fields=[CSRField("Value", size=32, description="*Field*: 32-Bit value")],
self.b16Value3 = CSRStorage(16, reset_less=False,
fields=[CSRField("Value", size=16, description="*Field*: 16-Bit value")],
description="""
Float register 3
""")
self.b32Result = CSRStorage(32, reset_less=False,
fields=[CSRField("Result", size=32, description="*Field*: 32-Bit value")],
self.b16Result = CSRStorage(16, reset_less=False,
fields=[CSRField("Result", size=16, description="*Field*: 16-Bit value")],
description="""
Processing result
""")
@ -156,20 +156,20 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
NextValue(self.Loader_Active, True), # Loader up & running
NextValue(self.Loader_Delay, 0), # Reset read delay timer
NextValue(LoadUnit.b9Offset.storage, LUCacheSize - 1), # Adjust offset to read sentinel
NextValue(self.b32Result.storage, 0), # Indicate # delays
NextValue(self.b32Status.storage[0], True), # Current status
NextValue(self.b32Value1.storage, 0), # Nothing loaded so far ...
NextValue(self.b32Value2.storage, 0), # Nothing loaded so far ...
NextValue(self.b32Value3.storage, 0), # Nothing loaded so far ...
NextValue(self.b16Result.storage, 0), # Indicate # delays
NextValue(self.b16Status.storage[0], True), # Current status
NextValue(self.b16Value1.storage, 0), # Nothing loaded so far ...
NextValue(self.b16Value2.storage, 0), # Nothing loaded so far ...
NextValue(self.b16Value3.storage, 0), # Nothing loaded so far ...
NextValue(self.bReady, False), # LED off!
NextState("Loader_LOAD1")
).Elif(~self.bEnable.storage, # Externally aborted?
NextValue(self.b32Status.storage, 0), # Current status: inactive
NextValue(self.b16Status.storage, 0), # Current status: inactive
NextValue(self.Loader_Active, False), # Reset in sync w/ global activation
)
)
Loader_fsm.act("Loader_LOAD1",
NextValue(self.b32Status.storage[1], True), # Current status added
NextValue(self.b16Status.storage[1], True), # Current status added
If(LoadUnit.b32Data.storage == self.b32Sentinel.storage, # Valid last entry?
NextValue(LoadUnit.b9Offset.storage, 0), # 1st value offset preparation
NextState("Loader_LOAD2")
@ -178,10 +178,12 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
)
)
Loader_fsm.act("Loader_LOAD2",
NextValue(self.b32Status.storage[2], True), # Current status added
NextValue(self.b16Status.storage[2], True), # Current status added
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b32Value1.storage, LoadUnit.b32Data.storage), # Pick 1st date
NextValue(fpu.fs1, LoadUnit.b32Data.storage),
NextValue(self.b16Value1.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 1st date
NextValue(self.b16Value2.storage, LoadUnit.b32Data.storage >> 16), # Pick 2nd date
NextValue(fpu.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])),
NextValue(fpu.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])),
NextValue(LoadUnit.b9Offset.storage, 1), # 2nd value offset preparation
NextValue(self.Loader_Delay, 0), # Reset delay
NextState("Loader_LOAD3")
@ -190,39 +192,27 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
)
)
Loader_fsm.act("Loader_LOAD3",
NextValue(self.b32Status.storage[3], True), # Current status added
NextValue(self.b16Status.storage[3], True), # Current status added
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b32Value2.storage, LoadUnit.b32Data.storage), # Pick 2nd date
NextValue(fpu.fs2, LoadUnit.b32Data.storage),
NextValue(LoadUnit.b9Offset.storage, 2), # 3rd value offset preparation
NextValue(self.Loader_Delay, 0), # Reset delay
NextState("Loader_LOAD4")
).Else( # MEM wait cycles
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
)
)
Loader_fsm.act("Loader_LOAD4",
NextValue(self.b32Status.storage[4], True), # Current status added
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b32Value3.storage, LoadUnit.b32Data.storage), # Pick 3rd date
NextValue(fpu.fs3, LoadUnit.b32Data.storage),
NextValue(self.b16Value3.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 3rd date
NextValue(fpu.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])),
NextState("Loader_EXEC1")
).Else( # MEM wait cycles
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
)
)
)
Loader_fsm.act("Loader_EXEC1",
NextValue(self.b32Status.storage[5], True), # Current status added
NextValue(self.b16Status.storage[5], True), # Current status added
NextValue(fpu.fadd, True), # This command requested
NextValue(fpu.fready, False), # Engage trigger
NextState("Loader_EXEC2")
)
Loader_fsm.act("Loader_EXEC2",
NextValue(self.b32Status.storage[6], True), # Current status added
NextValue(self.b16Status.storage[6], True), # Current status added
If(fpu.fready,
NextValue(fpu.fadd, False), # Clear command request
NextValue(self.b32Result.storage, fpu.fresult), # Pick result
NextValue(self.b32Status.storage[31], True), # Indicate readyness ...
NextValue(self.b16Result.storage, fpu.fresult[16:32]), # Pick result (little endian, high word!)
NextValue(self.b16Status.storage[15], True), # Indicate readyness ...
NextValue(self.bReady, True), # Indicate readyness (LED on!)
NextState("Loader_IDLE")
)

@ -3,7 +3,7 @@
#
# bfloat16processor.py
#
# bfloat16 processing
# bfloat16 processing (1 bit sign, 8 bit exponent, 7 bit mantissa)
#
# History:
# --------
@ -50,16 +50,20 @@ class bfloat16Processor(Module):
self.e1 = Signal((8,True), reset_less=True) # Signed exponents!
self.e2 = Signal((8,True), reset_less=True)
self.e3 = Signal((8,True), reset_less=True)
self.m1 = Signal((23+1+3,False), reset_less=True) # Unsigned mantissas! TODO: Verify sign!
self.m2 = Signal((24+1+3,False), reset_less=True) # 23 bits + 1bit (1.xx = 0x800000)
self.m3 = Signal((25+1+3,True), reset_less=True) # + Sign + R(0)/Guard & Sticky bits
#self.m1 = Signal((23+1+3,False), reset_less=True) # Unsigned mantissas! TODO: Verify sign!
self.m1 = Signal((7+2+3,False), reset_less=True) # Unsigned mantissas! TODO: Verify sign!
#self.m2 = Signal((24+1+3,False), reset_less=True) # 23 bits + 1bit (1.xx = 0x800000)
self.m2 = Signal((7+2+3,False), reset_less=True) # 7 bits + 1bit (1.xx = 0x800000) + 2 spare
#self.m3 = Signal((25+1+3,True), reset_less=True) # + Sign + R(0)/Guard & Sticky bits
self.m3 = Signal((8+2+3,True), reset_less=True) # + Sign + R(0)/Guard & Sticky bits
self.lm3 = Signal((64,True), reset_less=True) # MUL long result
self.s32 = Signal((32,True), reset_less=True) # Signed 32-bit
self.s_bit = Signal() # Sticky bit (for rounding control)
self.branch1 = Signal() # Branch helpers
self.branch2 = Signal()
self.i = Signal(5) # Loop counter, range 0..31
#self.i = Signal(5) # Loop counter, range 0..31
self.i = Signal(4) # Loop counter, range 0..15
FPU_fsm = FSM(reset_state="FPU_IDLE") # FSM starts idling ...
self.submodules += FPU_fsm
@ -72,16 +76,20 @@ class bfloat16Processor(Module):
NextValue(self.sign2, self.fs2[31] ^ self.fsub), # Invert sign for subtraction!
NextValue(self.e1, self.fs1[23:31] - 127),
NextValue(self.e2, self.fs2[23:31] - 127),
NextValue(self.m1, Cat(0,0,0, self.fs1[0:23], 1, 0)), # | 0x00800000 + R/G/S bits
NextValue(self.m2, Cat(0,0,0, self.fs2[0:23], 1, 0)), # | 0x00800000 + R/G/S bits
#NextValue(self.m1, Cat(0,0,0, self.fs1[0:23], 1, 0)), # | 0x00800000 + R/G/S bits
NextValue(self.m1, Cat(0,0,0, self.fs1[0:7], 1, 0)), # | 0x00800000 + R/G/S bits
#NextValue(self.m2, Cat(0,0,0, self.fs2[0:23], 1, 0)), # | 0x00800000 + R/G/S bits
NextValue(self.m2, Cat(0,0,0, self.fs2[0:7], 1, 0)), # | 0x00800000 + R/G/S bits
NextState("FADD1")
).Elif((self.fmin | self.fmax | self.fmadd | self.fmsub | self.fnmadd | self.fnmsub | self.fmul | self.fdiv) & ~self.fready, # Triggers set & ready flag reset externally!
NextValue(self.sign1, self.fs1[31]),
NextValue(self.sign2, self.fs2[31]),
NextValue(self.e1, self.fs1[23:31] - 127),
NextValue(self.e2, self.fs2[23:31] - 127),
NextValue(self.m1, Cat(self.fs1[0:23], 1, 0)), # | 0x00800000
NextValue(self.m2, Cat(self.fs2[0:23], 1, 0)), # | 0x00800000
#NextValue(self.m1, Cat(self.fs1[0:23], 1, 0)), # | 0x00800000
NextValue(self.m1, Cat(self.fs1[0:7], 1, 0, 0,0,0)), # | 0x00800000
#NextValue(self.m2, Cat(self.fs2[0:23], 1, 0)), # | 0x00800000
NextValue(self.m2, Cat(self.fs2[0:7], 1, 0, 0,0,0)), # | 0x00800000
If(self.fdiv, # Division
NextState("FDIV1"),
).Elif(self.fmin, # Minimum
@ -94,7 +102,8 @@ class bfloat16Processor(Module):
).Elif(self.fsqrt & ~self.fready, # Trigger set & ready flag reset externally!
NextValue(self.sign1, self.fs1[31]),
NextValue(self.e1, self.fs1[23:31] - 127),
NextValue(self.m1, Cat(self.fs1[0:23], 1, 0)), # | 0x00800000
#NextValue(self.m1, Cat(self.fs1[0:23], 1, 0)), # | 0x00800000
NextValue(self.m1, Cat(self.fs1[0:7], 1, 0, 0,0,0)), # | 0x00800000
NextState("FSQRT1"),
)
)
@ -205,7 +214,8 @@ class bfloat16Processor(Module):
FPU_fsm.act("FADD6",
NextValue(self.FPU_state, 6),
# 6. Normalization of result: Overflow
If(self.m3[24], # & 0x01000000,
#If(self.m3[24], # & 0x01000000,
If(self.m3[7+1], # & 0x01000000,
NextValue(self.m3, self.m3 >> 1), # Adjust mantissa & increment exponent
NextValue(self.e3, self.e3 + 1)
).Else(
@ -216,7 +226,8 @@ class bfloat16Processor(Module):
FPU_fsm.act("FADD7",
# 7. Normalization: Result
NextValue(self.FPU_state, 7),
If(~self.m3[23] & (self.i < 23), # & 0x00800000 (limit to max. loops)
#If(~self.m3[23] & (self.i < 23), # & 0x00800000 (limit to max. loops)
If(~self.m3[7] & (self.i < 7), # & 0x00800000 (limit to max. loops)
NextValue(self.m3, self.m3 << 1), # Subtraction normalization
NextValue(self.e3, self.e3 - 1),
NextValue(self.i, self.i + 1), # Count loops ...
@ -231,7 +242,8 @@ class bfloat16Processor(Module):
)
FPU_fsm.act("FADD8",
NextValue(self.FPU_state, 8),
If(self.m3[24], # & 0x01000000, # Overflow?
#If(self.m3[24], # & 0x01000000, # Overflow?
If(self.m3[7+1], # & 0x01000000, # Overflow?
NextValue(self.m3, self.m3 >> 1), # Adjust mantissa & increment exponent
NextValue(self.e3, self.e3 + 1)
),
@ -241,7 +253,8 @@ class bfloat16Processor(Module):
FPU_fsm.act("self.fresult", # Result contruction & possible rounding
NextValue(self.FPU_state, 9),
# 6. Build the actual resulting float
NextValue(self.fresult, Cat(self.m3[0:23], self.e3+127, self.sign3)),
#NextValue(self.fresult, Cat(self.m3[0:23], self.e3+127, self.sign3)),
NextValue(self.fresult, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0 ,0,0,0,0, self.m3[0:7], self.e3+127, self.sign3)),
NextValue(self.fready, 1), # Indicate ready to main decoder
NextState("FPU_IDLE")
)
@ -326,8 +339,10 @@ class bfloat16Processor(Module):
NextValue(self.sign2, self.fs3[31] ^ (self.fmsub | self.fnmsub)), # Invert sign for subtraction!
NextValue(self.e1, self.e3),
NextValue(self.e2, self.fs3[23:31] - 127),
NextValue(self.m1, Cat(0,0,0, self.m3[0:23], 1, 0)), # | 0x00800000 + R/G/S bits
NextValue(self.m2, Cat(0,0,0, self.fs3[0:23], 1, 0)), # | 0x00800000 + R/G/S bits
#NextValue(self.m1, Cat(0,0,0, self.m3[0:23], 1, 0)), # | 0x00800000 + R/G/S bits
NextValue(self.m1, Cat(0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.m3[0:7], 1, 0)), # | 0x00800000 + R/G/S bits
#NextValue(self.m2, Cat(0,0,0, self.fs3[0:23], 1, 0)), # | 0x00800000 + R/G/S bits
NextValue(self.m2, Cat(0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.fs3[0:7], 1, 0)), # | 0x00800000 + R/G/S bits
NextState("FADD1")
)
@ -363,7 +378,8 @@ class bfloat16Processor(Module):
)
)
FPU_fsm.act("FDIV2",
If(self.i < 24,
#If(self.i < 24,
If(self.i < 8,
NextValue(self.FPU_state, 2),
If(self.m1 < self.m2,
NextValue(self.m3, self.m3 << 1), # Append a zero
@ -376,7 +392,8 @@ class bfloat16Processor(Module):
).Else( # Loop exceeded
# 4. Normalization
NextValue(self.FPU_state, 3),
If(~self.m3[23], # & 0x00800000
#If(~self.m3[23], # & 0x00800000
If(~self.m3[7], # & 0x00800000
NextValue(self.m3, self.m3 << 1), # Subtraction normalization
NextValue(self.e3, self.e3 - 1),
).Else(
@ -398,7 +415,8 @@ class bfloat16Processor(Module):
NextState("FPU_IDLE")
).Else( # Better fast, than accurate! Use Newton-Raphson in S/W for better accuracy!
# Goldschmidt's algorithm (only 1 digit after decimal point ok, error varies, s.b)
If((self.m1[0:23] != 0) | (self.e1 == 1), # Not 2^x (m==0!) and x!=1
#If((self.m1[0:23] != 0) | (self.e1 == 1), # Not 2^x (m==0!) and x!=1
If((self.m1[0:7] != 0) | (self.e1 == 1), # Not 2^x (m==0!) and x!=1
#return sqrt_approx(f, 0x0004B0D2); // Minimized error (max. 3.5%)
NextValue(self.branch1, 1), # Use 0x0004B0D2 for minimized error (<= 3.5%)
).Else(

@ -56,7 +56,7 @@ static int fpgaload(uint32_t *mempt, int16_t len)
bfloat16nn_b32Sentinel_write(*sentinel);
bfloat16nn_bEnable_write(1); // Finally: Engage!
for(int i=0;i<10;i++) { // Max. 100ms delay
if(bfloat16nn_b32Status_read() & 0x80000000) {
if(bfloat16nn_b16Status_read() & 0x8000) {
bfloat16nn_bEnable_write(0); // Disable transfer
return 1; // Ok, ready!
}
@ -69,53 +69,71 @@ static int fpgaload(uint32_t *mempt, int16_t len)
// Return the Value1 register contents as a float.
// The b16 variant zeroes a 32-bit word, stores the 16-bit register value into
// its second uint16_t slot and reads the word back through a float pointer.
// NOTE(review): this text is a diff rendering without +/- markers; the b32 read
// and the first `fpt` declaration look like the removed lines -- confirm against
// the repository before building.
static float fp1_read(void)
{
uint32_t v __attribute__((aligned(16))) = bfloat16nn_b32Value1_read();
float *fpt = (float *)&v;
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_read(); // Little-endian layout assumed: index 1 is the high half-word
float *fpt = (float *)&v;
return *fpt;
}
// Return the Value2 register contents as a float.
// The b16 variant zeroes a 32-bit word, stores the 16-bit register value into
// its second uint16_t slot and reads the word back through a float pointer.
// NOTE(review): this text is a diff rendering without +/- markers; the b32 read
// looks like the removed line -- confirm against the repository before building.
static float fp2_read(void)
{
uint32_t v __attribute__((aligned(16))) = bfloat16nn_b32Value2_read();
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_read(); // Little-endian layout assumed: index 1 is the high half-word
float *fpt = (float *)&v;
return *fpt;
}
// Return the Value3 register contents as a float.
// The b16 variant zeroes a 32-bit word, stores the 16-bit register value into
// its second uint16_t slot and reads the word back through a float pointer.
// NOTE(review): this text is a diff rendering without +/- markers; the b32 read
// looks like the removed line -- confirm against the repository before building.
static float fp3_read(void)
{
uint32_t v __attribute__((aligned(16))) = bfloat16nn_b32Value3_read();
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value3_read(); // Little-endian layout assumed: index 1 is the high half-word
float *fpt = (float *)&v;
return *fpt;
}
// Return the Result register contents as a float.
// The b16 variant zeroes a 32-bit word, stores the 16-bit register value into
// its second uint16_t slot and reads the word back through a float pointer.
// NOTE(review): this text is a diff rendering without +/- markers; the b32 read
// looks like the removed line -- confirm against the repository before building.
static float fpResult_read(void)
{
uint32_t v __attribute__((aligned(16))) = bfloat16nn_b32Result_read();
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Result_read(); // Little-endian layout assumed: index 1 is the high half-word
float *fpt = (float *)&v;
return *fpt;
}
// Return the upper 16 bits of the 32-bit encoding of f, i.e. the value
// truncated (not rounded) to the 1/8/7-bit bfloat16 layout.
// Assumes float is a 32-bit type, as the rest of this file does.
static uint16_t f2ui16(float f)
{
    // Pun through a union rather than casting &f to uint16_t*: the cast
    // breaks the effective-type (strict aliasing) rule, and indexing the
    // half-words by position only works on little-endian targets.
    union { float f; uint32_t u; } bits;

    bits.f = f;
    return (uint16_t)(bits.u >> 16); // High half-word, independent of byte order
}
// Print the raw 32-bit encoding of f followed by its upper 16 bits
// (as returned by f2ui16), both in upper-case hex.
void dumpfloat(float f)
{
    // Union pun instead of *(uint32_t *)&f, which breaks strict aliasing.
    union { float f; uint32_t u; } bits;

    bits.f = f;
    // %X expects unsigned int; uint32_t may be unsigned long on some
    // 32-bit toolchains, so cast both arguments explicitly.
    printf("%08Xh -> %04Xh\n", (unsigned int)bits.u, (unsigned int)f2ui16(f));
}
int key_eval(void)
{
extern void printf1(const char *fmt, float f1);
static uint32_t *sentinel = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE - 1) * sizeof(int32_t));
uint32_t *uiptr;
float *fptr;
uint32_t *ui32ptr;
uint16_t *ui16ptr;
int i;
float fp1, fp2, fp3, fpResult;
switch(kbhit()) {
case 'r': // Reload
printf("\e[35;1m*** Reload ***\e[0m\n");
for(i=0, uiptr = (uint32_t *)DRAMDATABASE;i<DRAMDATASIZE;i++) // Setup test data
*uiptr++ = i+1;
fptr = (float *)(DRAMDATABASE + 0 * sizeof(float));
*fptr++ = 1.0;
*fptr++ = 2.0;
*fptr++ = 3.0;
for(i=0, ui32ptr = (uint32_t *)DRAMDATABASE;i<DRAMDATASIZE;i++) // Setup test data
*ui32ptr++ = i+1;
ui16ptr = (uint16_t *)(DRAMDATABASE + 0 * sizeof(float));
*ui16ptr++ = f2ui16(1.0);
*ui16ptr++ = f2ui16(2.0);
*ui16ptr++ = f2ui16(3.0);
if(fpgaload((uint32_t *)DRAMDATABASE, 512)) {
printf("S=%08Xh ", bfloat16nn_b32Status_read());
printf1("V1=%4.2f ", fp1_read());
printf1("V2=%4.2f ", fp2_read());
printf1("V3=%4.2f ", fp3_read());
printf1("RESULT=%4.2f\n", fpResult_read());
fp1 = fp1_read();
fp2 = fp2_read();
fp3 = fp3_read();
fpResult = fpResult_read();
printf("S=%04Xh ", (uint32_t)bfloat16nn_b16Status_read());
printf1("V1=%4.2f ", fp1);
printf1("V2=%4.2f ", fp2);
printf1("V3=%4.2f ", fp3);
printf1("RESULT=%4.2f\n", fpResult);
/*
for(i=0;i<DRAMDATASIZE;i+=32) {
dram2fpga_b9Offset_write(i);
@ -130,7 +148,7 @@ int key_eval(void)
*sentinel = 0; // Invalidate data!
if(fpgaload((uint32_t *)DRAMDATABASE, 512)) {
printf("INVALIDATED: S=%08Xh ", bfloat16nn_b32Status_read());
printf("INVALIDATED: S=%04Xh ", bfloat16nn_b16Status_read());
printf1("V1=%4.2f ", fp1_read());
printf1("V2=%4.2f ", fp2_read());
printf1("V3=%4.2f ", fp3_read());
@ -140,7 +158,7 @@ int key_eval(void)
printf("INVALIDATED: Timeout!");
break;
case 's':
printf("REQUESTED: S=%08Xh ", bfloat16nn_b32Status_read());
printf("REQUESTED: S=%04Xh ", bfloat16nn_b16Status_read());
printf1("V1=%4.2f ", fp1_read());
printf1("V2=%4.2f ", fp2_read());
printf1("V3=%4.2f ", fp3_read());

Loading…
Cancel
Save