bfloat16nn/bfloat16nn.py

299 lines
14 KiB
Python
Executable File

#!/usr/bin/env python3
#
# bfloat16nn.py
#
# This file has been derived from LiteX-Boards/colorlight_5b_75x.py
# Copyright (c) 2020 Florent Kermarrec <florent@enjoy-digital.fr>
# SPDX-License-Identifier: BSD-2-Clause
#
# Disclaimer: Still a proof of concept with large timings violations on the IP/UDP and
# Etherbone stack that need to be optimized. It was initially just used to validate the reversed
# pinout but happens to work on hardware...
#
# History:
# --------
# 21.04.21/KQ Initially derived version
#
# Build/Use ----------------------------------------------------------------------------------------
# - 'python3 bfloat16nn.py --build --revision=7.0 --uart-name=crossover --with-etherbone --ip-address=192.168.1.20 --csr-csv=build/csr.csv'
# to generate
# - 'python3 bfloat16nn.py --load' to download to FPGA
# - 'ping 192.168.1.20' to verify ethernet connection - via LEFT(!) RJ45 port
# - 'wishbone-tool --ethernet-host 192.168.1.20 --server terminal --csr-csv build/csr.csv'
# You should see the LiteX BIOS and be able to interact with it
# - To load a file to RAM (@0x40000000 len=0x400000) use:
# wishbone-tool --ethernet-host 192.168.1.20 --server load-file --csr-csv build/csr.csv
# --load-address 0x40000000
# --load-name build/colorlight_5a_75b/software/<filename>
# To disassemble raw file:
# ../fpga/litex/riscv64-unknown-elf-gcc-8.3.0-2019.08.0-x86_64-linux-ubuntu14/bin/riscv64-unknown-elf-objdump
# -D -b binary ./build/colorlight_5a_75b/software/bios/bios.bin -m riscv
#
import os
import argparse
import sys
import time
from migen import *
from migen.genlib.misc import WaitTimer
from migen.genlib.resetsync import AsyncResetSynchronizer
from litex.build.io import DDROutput
from litex_boards.platforms import colorlight_5a_75b
from litex.build.lattice.trellis import trellis_args, trellis_argdict
from litex.soc.cores.clock import *
from litex.soc.cores.spi_flash import ECP5SPIFlash
from litex.soc.cores.gpio import GPIOOut
from litex.soc.integration.soc_core import *
from litex.soc.integration.builder import *
from litex.soc.interconnect.csr import AutoCSR, CSRStatus, CSRStorage, CSRField
from litex.soc.interconnect.stream import SyncFIFO
from litedram.modules import M12L16161A, M12L64322A
from litedram.phy import GENSDRPHY, HalfRateGENSDRPHY
from litedram.frontend.dma import LiteDRAMDMAReader, LiteDRAMDMAWriter
from liteeth.phy.ecp5rgmii import LiteEthPHYRGMII
from litex.build.generic_platform import *
import litex.soc.doc as lxsocdoc
from libmodules.dramtransfer import DRAM2FPGA, FPGA2DRAM
from libmodules.systime import SysTime
from libmodules.bfloat16nncore import bfloat16NeuralNetworkCore
from helpers.prepare_firmware import copyjob
# CRG ----------------------------------------------------------------------------------------------
class _CRG(Module):
def __init__(self, platform, sys_clk_freq, use_internal_osc=False, with_usb_pll=False, with_rst=True, sdram_rate="1:1"):
self.clock_domains.cd_sys = ClockDomain()
if sdram_rate == "1:2":
self.clock_domains.cd_sys2x = ClockDomain()
self.clock_domains.cd_sys2x_ps = ClockDomain(reset_less=True)
else:
self.clock_domains.cd_sys_ps = ClockDomain(reset_less=True)
# # #
# Clk / Rst
if not use_internal_osc:
clk = platform.request("clk25")
clk_freq = 25e6
else:
clk = Signal()
div = 5
self.specials += Instance("OSCG",
p_DIV = div,
o_OSC = clk)
clk_freq = 310e6/div
rst_n = 1 if not with_rst else platform.request("user_btn_n", 0)
# PLL
self.submodules.pll = pll = ECP5PLL()
self.comb += pll.reset.eq(~rst_n)
pll.register_clkin(clk, clk_freq)
pll.create_clkout(self.cd_sys, sys_clk_freq)
if sdram_rate == "1:2":
pll.create_clkout(self.cd_sys2x, 2*sys_clk_freq)
pll.create_clkout(self.cd_sys2x_ps, 2*sys_clk_freq, phase=180) # Idealy 90° but needs to be increased.
else:
pll.create_clkout(self.cd_sys_ps, sys_clk_freq, phase=180) # Idealy 90° but needs to be increased.
# USB PLL
if with_usb_pll:
self.submodules.usb_pll = usb_pll = ECP5PLL()
self.comb += usb_pll.reset.eq(~rst_n)
usb_pll.register_clkin(clk, clk_freq)
self.clock_domains.cd_usb_12 = ClockDomain()
self.clock_domains.cd_usb_48 = ClockDomain()
usb_pll.create_clkout(self.cd_usb_12, 12e6, margin=0)
usb_pll.create_clkout(self.cd_usb_48, 48e6, margin=0)
# SDRAM clock
sdram_clk = ClockSignal("sys2x_ps" if sdram_rate == "1:2" else "sys_ps")
self.specials += DDROutput(1, 0, platform.request("sdram_clock"), sdram_clk)
# BaseSoC ------------------------------------------------------------------------------------------
class BaseSoC(SoCCore):
def __init__(self, board, revision, with_ethernet=False, with_etherbone=False, eth_phy=0, ip_address=None, mac_address=None, sys_clk_freq=60e6, use_internal_osc=False, sdram_rate="1:1", **kwargs):
platform = colorlight_5a_75b.Platform(revision="7.0")
# SoCCore ----------------------------------------------------------------------------------
SoCCore.__init__(self, platform, int(sys_clk_freq),
ident = "LiteX SoC on Colorlight " + board.upper(),
ident_version = True,
**kwargs)
# CRG --------------------------------------------------------------------------------------
with_rst = kwargs["uart_name"] not in ["serial", "bridge"] # serial_rx shared with user_btn_n.
with_usb_pll = kwargs.get("uart_name", None) == "usb_acm"
self.submodules.crg = _CRG(platform, sys_clk_freq, use_internal_osc=use_internal_osc, with_usb_pll=with_usb_pll,with_rst=with_rst, sdram_rate=sdram_rate)
# SDR SDRAM --------------------------------------------------------------------------------
if not self.integrated_main_ram_size:
sdrphy_cls = HalfRateGENSDRPHY if sdram_rate == "1:2" else GENSDRPHY
self.submodules.sdrphy = sdrphy_cls(platform.request("sdram"))
if board == "5a-75e" and revision == "6.0":
sdram_cls = M12L64322A
sdram_size = 0x80000000
else:
sdram_cls = M12L16161A
sdram_size = 0x40000000
self.add_sdram("sdram",
phy = self.sdrphy,
module = sdram_cls(sys_clk_freq, sdram_rate),
origin = self.mem_map["main_ram"],
size = kwargs.get("max_sdram_size", sdram_size),
l2_cache_size = kwargs.get("l2_size", 8192),
l2_cache_min_data_width = kwargs.get("min_l2_data_width", 128),
l2_cache_reverse = True
)
# Ethernet / Etherbone ---------------------------------------------------------------------
if with_ethernet or with_etherbone:
self.submodules.ethphy = LiteEthPHYRGMII(
clock_pads = self.platform.request("eth_clocks", eth_phy),
pads = self.platform.request("eth", eth_phy))
self.add_csr("ethphy")
if with_ethernet:
self.add_ethernet(phy=self.ethphy)
if with_etherbone:
self.add_etherbone(
phy=self.ethphy,
ip_address = ip_address,
mac_address = mac_address,
)
# Base counter (used for clocking)
counter = Signal(32) # 32-Bit counter
self.sync += counter.eq(counter + 1)
# System time (count)
self.submodules.systime = systime = SysTime(comparecount=0x0000EA90)
self.add_csr("systime")
# DRAM access section
MAXWORDS = 512 #512 # Transfer length 32 x 32-bit, FIFO depth (511 L1 cache currently possible = 9-bit!)
# Load unit memory access
self.submodules.dma_reader = dma_reader = LiteDRAMDMAReader(self.sdram.crossbar.get_port(), fifo_depth=MAXWORDS, fifo_buffered=True)
dma_reader.add_csr()
self.add_csr("dma_reader")
# Load unit transfer
self.submodules.sync_fifo_in = sync_fifo_in = SyncFIFO([("data", 32)], MAXWORDS, True)
self.comb += dma_reader.source.connect(sync_fifo_in.sink) # Connect DMA-Reader.source -> FIFO.sink
# Load unit (LU)
self.submodules.dram2fpga = dram2fpga = DRAM2FPGA(maxwords=MAXWORDS, dma_reader=dma_reader, sync_fifo=sync_fifo_in)
self.add_csr("dram2fpga")
""" *** Not used currently ! ***
MAXWRITEWORDS = 1 # Transfer length 1 x 32-bit = 4 byte maximum (SU)
# Store unit memory access
self.submodules.dma_writer = dma_writer = LiteDRAMDMAWriter(self.sdram.crossbar.get_port(), fifo_depth=MAXWRITEWORDS, fifo_buffered=True)
dma_writer.add_csr()
self.add_csr("dma_writer")
# Store unit transfer
self.submodules.sync_fifo_out = sync_fifo_out = SyncFIFO([("data", 32)], MAXWRITEWORDS, True)
self.comb += sync_fifo_out.source.connect(dma_writer.sink) # Connect FIFO.source -> DMA-Writer.sink
# Store unit (SU)
self.submodules.fpga2dram = fpga2dram = FPGA2DRAM(dma_writer=dma_writer, sync_fifo=sync_fifo_out)
self.add_csr("fpga2dram")
"""
# Integrate bfloat16NN processor
RAMWAITTIME=1 # Minimum wait!
self.submodules.bfloat16nn = bfloat16nn = bfloat16NeuralNetworkCore(
RAMWaitTime=RAMWAITTIME,
LUCacheSize=MAXWORDS,
LoadUnit=dram2fpga,
StoreUnit=None, # *** Not used currently: fpga2dram,
)
self.add_csr("bfloat16nn")
# USERLED blink (on-board LED)
# only w/ uart-name=crossover option:
if kwargs["uart_name"] not in ["serial", "bridge"]:
self.comb += platform.request("user_led_n").eq(~(bfloat16nn.bReady))
# Build --------------------------------------------------------------------------------------------
def main():
parser = argparse.ArgumentParser(description="LiteX SoC on Colorlight 5A-75X")
builder_args(parser)
soc_core_args(parser)
trellis_args(parser)
parser.add_argument("--build", action="store_true", help="Build bitstream")
parser.add_argument("--load", action="store_true", help="Load bitstream")
parser.add_argument("--board", default="5a-75b", help="Board type: 5a-75b (default) & don't change!")
parser.add_argument("--revision", default="7.0", type=str, help="Board revision 7.0 (default) & don't change!")
parser.add_argument("--with-ethernet", action="store_true", help="Enable Ethernet support")
parser.add_argument("--with-etherbone", action="store_true", help="Enable Etherbone support")
parser.add_argument("--eth-phy", default=0, type=int, help="Ethernet PHY 0 or 1 (default=0)")
parser.add_argument("--ip-address", default="192.168.1.50", help="Ethernet IP address of the board.")
parser.add_argument("--mac-address", default="0x726b895bc2e2", help="Ethernet MAC address of the board.")
parser.add_argument("--sys-clk-freq", default=60e6, type=float, help="System clock frequency (default=60MHz)")
parser.add_argument("--use-internal-osc", action="store_true", help="Use internal oscillator")
parser.add_argument("--sdram-rate", default="1:1", help="SDRAM Rate 1:1 Full Rate (default), 1:2 Half Rate")
parser.add_argument("--csr_csv", default="build/csr.csv", help="CSR list location")
parser.add_argument("--doc", action="store_true", help="Create doc files for sphinx generator")
parser.add_argument("--flash", action="store_true", help="Load bitstream to flash")
args = parser.parse_args()
#assert not (args.with_ethernet and args.with_etherbone)
soc = BaseSoC(board=args.board, revision=args.revision,
with_ethernet = args.with_ethernet,
with_etherbone = args.with_etherbone,
eth_phy = args.eth_phy,
ip_address = args.ip_address,
mac_address = int(args.mac_address, 0),
sys_clk_freq = args.sys_clk_freq,
use_internal_osc = args.use_internal_osc,
sdram_rate = args.sdram_rate,
**soc_core_argdict(args))
# 32MBit SPIFlash ------------------------------------------------------------------------
flashbase = 0xc0000000
flashoffset = 0x100000 # Used to be zero (default)
soc.mem_map["spiflash"] = flashbase # Length: 0x01000000 ('til 0xc1000000 - 1)
# Boot at +1MB
soc.add_constant("FLASH_BOOT_ADDRESS", soc.mem_map["spiflash"] + 1024*1024) # 0xc0100000
soc.add_spi_flash(name="spiflash", mode="1x", dummy_cycles=8, clk_freq=5e6)
builder = Builder(soc, **builder_argdict(args))
# Now override boot address (used to be zero/default)
args.ecppack_bootaddr = flashbase + flashoffset # 0xC0100000
builder.build(**trellis_argdict(args), run=args.build) # Written here to (local) build tree
if args.doc:
print("Generating documentation for sphinx ...")
lxsocdoc.generate_docs(soc, "build/documentation/", project_name="neopixelar", author="KQ")
print("Generate via: 'sphinx-build -b html build/documentation build/documentation/html'")
if args.load:
prog = soc.platform.create_programmer()
prog.load_bitstream(os.path.join(builder.gateware_dir, soc.build_name + ".svf"))
return
if args.flash: # Convert Bit File to Jtag Write Flash command
name = os.path.join(builder.gateware_dir, soc.build_name)
print(f"Executing ./bit_to_flash.py {name}.bit {name}.svf.flash")
from helpers.bit_to_flash import convertBitToFlashFile
convertBitToFlashFile(name + ".bit", name + ".svf.flash", address=0)
from helpers.load_to_flash import load2flash
load2flash(name + ".svf.flash")
return
if __name__ == "__main__":
starttime = time.time()
copyjob() # Create backup if nec. & move our firmware to the correct location
main() # Create FPGA & load/flash
print("Time used: {0} min.".format(int((time.time() - starttime)/60.0)))