bugs fixed, fmadd ok

master
kaqu 1 year ago
parent ea794dfa9c
commit 69b4557926
  1. 7
      libmodules/bfloat16nncore.py
  2. 48
      libmodules/bfloat16processor.py

@ -201,22 +201,21 @@ class bfloat16NeuralNetworkCore(Module, AutoCSR, AutoDoc, ModuleDoc):
NextValue(self.Loader_Delay, self.Loader_Delay + 1), # Increment
)
)
"""
fpu.fmadd
"""
fpu.fmsub
fpu.fnmadd
fpu.fnmsub
"""
Loader_fsm.act("Loader_EXEC1",
NextValue(self.b16Status.storage[5], True), # Current status added
NextValue(fpu.fmax, True), # This command requested
NextValue(fpu.fmadd, True), # This command requested
NextValue(fpu.fready, False), # Engage trigger
NextState("Loader_EXEC2")
)
Loader_fsm.act("Loader_EXEC2",
NextValue(self.b16Status.storage[6], True), # Current status added
If(fpu.fready,
NextValue(fpu.fmax, False), # Clear command request
NextValue(fpu.fmadd, False), # Clear command request
NextValue(self.b16Result.storage, fpu.fresult[16:32]), # Pick result (little endian, high word!)
NextValue(self.b16Status.storage[15], True), # Indicate readiness ...
NextValue(self.bReady, True), # Indicate readiness (LED on!)

@ -8,6 +8,7 @@
# History:
# --------
# 22.04.21/KQ Initial version
# 03.05.21/KQ FMADD1 state bugs fixed (fs1/fs2 not prepared?)
#
from migen import *
@ -77,9 +78,11 @@ class bfloat16Processor(Module):
NextValue(self.e1, self.fs1[23:31] - 127),
NextValue(self.e2, self.fs2[23:31] - 127),
#NextValue(self.m1, Cat(0,0,0, self.fs1[0:23], 1, 0)), # | 0x00800000 + R/G/S bits
NextValue(self.m1, Cat(0,0,0, self.fs1[0:7], 1, 0)), # | 0x00800000 + R/G/S bits
#NextValue(self.m1, Cat(0,0,0, self.fs1[0:7], 1, 0)), # | 0x00800000 + R/G/S bits FIXME!
NextValue(self.m1, Cat(0,0,0, self.fs1[16:23], 1, 0)), # | 0x00800000 + R/G/S bits
#NextValue(self.m2, Cat(0,0,0, self.fs2[0:23], 1, 0)), # | 0x00800000 + R/G/S bits
NextValue(self.m2, Cat(0,0,0, self.fs2[0:7], 1, 0)), # | 0x00800000 + R/G/S bits
#NextValue(self.m2, Cat(0,0,0, self.fs2[0:7], 1, 0)), # | 0x00800000 + R/G/S bits FIXME!
NextValue(self.m2, Cat(0,0,0, self.fs2[16:23], 1, 0)), # | 0x00800000 + R/G/S bits
NextState("FADD1")
).Elif((self.fmin | self.fmax | self.fmadd | self.fmsub | self.fnmadd | self.fnmsub | self.fmul | self.fdiv) & ~self.fready, # Triggers set & ready flag reset externally!
NextValue(self.sign1, self.fs1[31]),
@ -87,9 +90,11 @@ class bfloat16Processor(Module):
NextValue(self.e1, self.fs1[23:31] - 127),
NextValue(self.e2, self.fs2[23:31] - 127),
#NextValue(self.m1, Cat(self.fs1[0:23], 1, 0)), # | 0x00800000
NextValue(self.m1, Cat(self.fs1[0:7], 1, 0, 0,0,0)), # | 0x00800000
#NextValue(self.m1, Cat(self.fs1[0:7], 1, 0, 0,0,0)), # | 0x00800000 FIXME!
NextValue(self.m1, Cat(self.fs1[16:23], 1, 0, 0,0,0)), # | 0x00800000
#NextValue(self.m2, Cat(self.fs2[0:23], 1, 0)), # | 0x00800000
NextValue(self.m2, Cat(self.fs2[0:7], 1, 0, 0,0,0)), # | 0x00800000
#NextValue(self.m2, Cat(self.fs2[0:7], 1, 0, 0,0,0)), # | 0x00800000 FIXME!
NextValue(self.m2, Cat(self.fs2[16:23], 1, 0, 0,0,0)), # | 0x00800000
If(self.fdiv, # Division
NextState("FDIV1"),
).Elif(self.fmin, # Minimum
@ -103,7 +108,8 @@ class bfloat16Processor(Module):
NextValue(self.sign1, self.fs1[31]),
NextValue(self.e1, self.fs1[23:31] - 127),
#NextValue(self.m1, Cat(self.fs1[0:23], 1, 0)), # | 0x00800000
NextValue(self.m1, Cat(self.fs1[0:7], 1, 0, 0,0,0)), # | 0x00800000
#NextValue(self.m1, Cat(self.fs1[0:7], 1, 0, 0,0,0)), # | 0x00800000 FIXME!
NextValue(self.m1, Cat(self.fs1[16:23], 1, 0, 0,0,0)), # | 0x00800000
NextState("FSQRT1"),
)
)
@ -141,7 +147,7 @@ class bfloat16Processor(Module):
NextValue(self.fready, 1),
NextState("FPU_IDLE")
).Elif((self.fmadd | self.fmsub | self.fnmadd | self.fnmsub) & ((self.e2 == 0) & (self.m2 == 0)), # Nothing to add (w/o sign!)
NextState("self.fresult") # Just supply (normalized finally!) result from multiplication!
NextState("FRESULT") # Just supply (normalized finally!) result from multiplication!
).Else( # Ok, valid floats supplied ...
NextValue(self.s_bit, 0),
NextValue(self.branch1, 0), # Reset helpers
@ -236,7 +242,7 @@ class bfloat16Processor(Module):
NextValue(self.m3, self.m3 + self.s_bit),
NextState("FADD8") # Adjust possible overflow ...
).Else( # Nope, all ready
NextState("self.fresult")
NextState("FRESULT")
)
)
)
@ -247,10 +253,10 @@ class bfloat16Processor(Module):
NextValue(self.m3, self.m3 >> 1), # Adjust mantissa & increment exponent
NextValue(self.e3, self.e3 + 1)
),
NextState("self.fresult")
NextState("FRESULT")
) # End of fadd.s processing
FPU_fsm.act("self.fresult", # Result construction & possible rounding
FPU_fsm.act("FRESULT", # Result construction & possible rounding
NextValue(self.FPU_state, 9),
# 6. Build the actual resulting float
#NextValue(self.fresult, Cat(self.m3[0:23], self.e3+127, self.sign3)),
@ -327,23 +333,33 @@ class bfloat16Processor(Module):
# 6. Construction of result
NextValue(self.m3, (self.lm3 >> 23) & 0x7FFFFF),
# TODO: e3=se3 omitted ok?
If(self.fmul, # Simple multiplication
NextState("self.fresult")
If(self.fmul, # Simple multiplication
NextState("FRESULT")
).Else( # Fused multiply-add?
NextValue(self.sign3, self.sign3 ^ (self.fnmadd | self.fnmsub)), # Negate mult. result w/ f<n>xxx
NextState("FMADD1")
)
) # End of fmul.s processing
FPU_fsm.act("FMADD1",
FPU_fsm.act("FMADD1",
# sign3/e3/m3 -> sign1/e1/m1, fs3 -> sign2/e2/m2
NextValue(self.sign1, self.sign3), # Take over sign of mult. result (f<n>xxx negation was applied to sign3 in the previous state)
NextValue(self.sign2, self.fs3[31] ^ (self.fmsub | self.fnmsub)), # Invert sign for subtraction!
NextValue(self.e1, self.e3),
NextValue(self.e2, self.fs3[23:31] - 127),
#NextValue(self.m1, Cat(0,0,0, self.m3[0:23], 1, 0)), # | 0x00800000 + R/G/S bits
NextValue(self.m1, Cat(0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.m3[0:7], 1, 0)), # | 0x00800000 + R/G/S bits
#NextValue(self.m1, Cat(0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.m3[0:7], 1, 0)), # | 0x00800000 + R/G/S bits
NextValue(self.m1, Cat(0,0,0, self.m3[0:7], 1, 0)), # | 0x00800000 + R/G/S bits
#NextValue(self.m2, Cat(0,0,0, self.fs3[0:23], 1, 0)), # | 0x00800000 + R/G/S bits
NextValue(self.m2, Cat(0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.fs3[0:7], 1, 0)), # | 0x00800000 + R/G/S bits
NextState("FADD1")
#NextValue(self.m2, Cat(0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.fs3[16:23], 1, 0)), # | 0x00800000 + R/G/S bits
NextValue(self.m2, Cat(0,0,0, self.fs3[16:23], 1, 0)), # | 0x00800000 + R/G/S bits
# sign3/e3/m3 -> fs1 (reconstruction, nec. for compares, s.a.!)
NextValue(self.fs1, Cat(0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, self.m3[0:7], (self.e3+127)[0:8], self.sign3)),
# fs3 -> fs2
NextValue(self.fs2, self.fs3),
NextState("FADD1") # Add fs1 & fs2!
)
FPU_fsm.act("FDIV1",
@ -397,7 +413,7 @@ class bfloat16Processor(Module):
NextValue(self.m3, self.m3 << 1), # Subtraction normalization
NextValue(self.e3, self.e3 - 1),
).Else(
NextState("self.fresult")
NextState("FRESULT")
)
)
) # End of fdiv.s processing

Loading…
Cancel
Save