fnmadd/fnmsub fixed (finally!)

master
kaqu 1 year ago
parent f7c5c95ffa
commit 122bf1c3c6
  1. 28
      libmodules/bfloat16nncore.py
  2. 22
      libmodules/bfloat16processor.py

@ -142,8 +142,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
)
)
#---------------- bfloat16 FPU -------------------------------------------------------------
self.submodules.fpu = fpu = bfloat16Processor() # Integrate bfloat16 FPU
#---------------- bfloat16 FPUs -------------------------------------------------------------
self.submodules.fpu1 = fpu1 = bfloat16Processor() # Integrate bfloat16 FPU
#---------------- Loaded data testing --------------------------------------------------
Loader_fsm = FSM(reset_state="Loader_IDLE") # FSM starts idling ...
@ -182,8 +182,8 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b16Value1.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 1st datum (low word)
NextValue(self.b16Value2.storage, LoadUnit.b32Data.storage >> 16), # Pick 2nd datum (high word)
NextValue(fpu.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])),
NextValue(fpu.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])),
NextValue(fpu1.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])),
NextValue(fpu1.fs2, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[16:32])),
NextValue(LoadUnit.b9Offset.storage, 1), # 2nd value offset preparation
NextValue(self.Loader_Delay, 0), # Reset delay
NextState("Loader_LOAD3")
@ -195,29 +195,25 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
NextValue(self.b16Status.storage[3], True), # Current status added
If(self.Loader_Delay > RAMWaitTime,
NextValue(self.b16Value3.storage, LoadUnit.b32Data.storage & 0xFFFF), # Pick 3rd datum (low word)
NextValue(fpu.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])),
NextValue(fpu1.fs3, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, LoadUnit.b32Data.storage[0:16])),
NextState("Loader_EXEC1")
).Else( # MEM wait cycles
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
)
)
"""
fpu.fnmadd
fpu.fnmsub
"""
)
Loader_fsm.act("Loader_EXEC1",
NextValue(self.b16Status.storage[5], True), # Current status added
NextValue(fpu.fmsub, True), # This command requested
NextValue(fpu.fready, False), # Engage trigger
NextValue(fpu1.fnmsub, True), # This command requested
NextValue(fpu1.fready, False), # Engage trigger
NextState("Loader_EXEC2")
)
Loader_fsm.act("Loader_EXEC2",
NextValue(self.b16Status.storage[6], True), # Current status added
If(fpu.fready,
NextValue(fpu.fmsub, False), # Clear command request
NextValue(self.b16Result.storage, fpu.fresult[16:32]), # Pick result (little endian, high word!)
If(fpu1.fready,
NextValue(fpu1.fnmsub, False), # Clear command request
NextValue(self.b16Result.storage, fpu1.fresult[16:32]), # Pick result (little endian, high word!)
NextValue(self.b16Status.storage[15], True), # Indicate readiness ...
NextValue(self.bReady, True), # Indicate readiness (LED on!)
NextValue(self.bReady, True), # Indicate readiness (LED on!) (TODO: Remove!)
NextState("Loader_IDLE")
)
)

@ -122,9 +122,9 @@ class bfloat16Processor(Module):
).Elif(self.fs1[0:31] == 0, # Nothing to add? (w/o sign!)
If(self.fsub, # Subtract yields negative result!
NextValue(self.fresult, self.fs2 ^ 0x80000000), # Invert sign
).Elif(self.fmsub | self.fnmsub, # 0*x=>0! 0-fs3 = +fs3!
).Elif(self.fmsub | self.fnmadd, # 0*x=>0! 0-fs3 or -(0+fs3) = +fs3! FIXME->risq5 -*- => +!
NextValue(self.fresult, self.fs3 ^ 0x80000000), # Invert sign
).Elif(self.fmadd | self.fnmadd, # 0*x=>0! 0+fs3 = fs3!
).Elif(self.fmadd, # 0*x=>0! 0+fs3 = fs3!
NextValue(self.fresult, self.fs3), # Ready!
).Else( # Straight add
NextValue(self.fresult, self.fs2), # Ready!
@ -136,7 +136,10 @@ class bfloat16Processor(Module):
NextValue(self.fready, 1),
NextState("FPU_IDLE")
).Elif((self.fmadd | self.fmsub | self.fnmadd | self.fnmsub) & ((self.e2 == 0) & (self.m2 == 0)), # Nothing to add (w/o sign!)
NextState("FRESULT") # Just supply (normalized finally!) result from multiplication!
If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT FIXME->risq5
NextValue(self.sign3, ~self.sign3) # Invert result finally
),
NextState("FRESULT") # Just supply (normalized finally!) result from multiplication!
).Else( # Ok, valid floats supplied ...
NextValue(self.s_bit, 0),
NextValue(self.branch1, 0), # Reset helpers
@ -231,6 +234,9 @@ class bfloat16Processor(Module):
NextValue(self.m3, self.m3 + self.s_bit),
NextState("FADD8") # Adjust possible overflow ...
).Else( # Nope, all ready
If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT FIXME->risq5
NextValue(self.sign3, ~self.sign3) # Invert result finally
),
NextState("FRESULT")
)
)
@ -242,13 +248,15 @@ class bfloat16Processor(Module):
NextValue(self.m3, self.m3 >> 1), # Adjust mantissa & increment exponent
NextValue(self.e3, self.e3 + 1)
),
If(self.fnmadd | self.fnmsub, # sign3/e3/m3 used in FRESULT FIXME->risq5
NextValue(self.sign3, ~self.sign3) # Invert result finally
),
NextState("FRESULT")
) # End of fadd.s processing
FPU_fsm.act("FRESULT", # Result construction & possible rounding
NextValue(self.FPU_state, 9),
# 6. Build the actual resulting float
#NextValue(self.fresult, Cat(self.m3[0:23], self.e3+127, self.sign3)),
# 6. Build the actual resulting float
NextValue(self.fresult, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0 ,0,0,0,0, self.m3[0:7], self.e3+127, self.sign3)),
NextValue(self.fready, 1), # Indicate ready to main decoder
NextState("FPU_IDLE")
@ -324,11 +332,11 @@ class bfloat16Processor(Module):
# TODO: e3=se3 omitted ok?
If(self.fmul, # Simple multiplication
NextState("FRESULT")
).Else( # Fused multiply-add?
NextValue(self.sign3, self.sign3 ^ (self.fnmadd | self.fnmsub)), # Negate mult. result w/ f<n>xxx
).Else( # Fused multiply-add?
NextState("FMADD1")
)
) # End of fmul.s processing
FPU_fsm.act("FMADD1",
# Result->fs1: sign3/e3/m3 -> sign1/e1/m1 & fs1, fs3->fs2: fs3 -> sign2/e2/m2 & fs2
NextValue(self.sign1, self.sign3), # Negate mult. result w/ f<n>xxx

Loading…
Cancel
Save