parent
a71d4f908e
commit
57ff44b72b
@ -0,0 +1,68 @@ |
||||
# bfloat16nn - an FPGA project # |
||||
|
||||
This project demonstrates the use of half-precision floats on an FPGA, dubbed 'bfloat16nn'. |
||||
The project requires a colorlight-5a-75b board. A RISC-V CPU (RV32I) is incorporated. |
||||
The project also makes use of LiteDRAM DMA capabilities. |
||||
|
||||
(Hint: project has been tested on Linux Mint 20 only, but should run on other Linux versions as well ...) |
||||
|
||||
## Installation ## |
||||
|
||||
### 1. Software ### |
||||
|
||||
To use this project effectively, you will have to install LiteX, see https://github.com/enjoy-digital/litex for details (and project Trellis, NextPNR & YoSys requirements). |
||||
Also, it is recommended to install the board support, see https://github.com/litex-hub/litex-boards, |
||||
as well as the RISC-V toolchain (see https://github.com/sifive/freedom-tools/releases). |
||||
To communicate with your board via network, install the wishbone tools, see https://github.com/litex-hub/wishbone-utils. |
||||
|
||||
To use the automatic documentation feature, you will have to install sphinx, see https://www.sphinx-doc.org/en/master. Also its wavedrom extension has to be installed, see https://pypi.org/project/wavedrom. |
||||
Some helpful links for RST docstring formats: |
||||
http://daouzli.com/blog/docstring.html & |
||||
https://thomas-cokelaer.info/tutorials/sphinx/rest_syntax.html |
||||
|
||||
The project assumes a local 'fpga' path within the home directory of the user, where all the above mentioned software packages are installed. |
||||
Furthermore, the project assumes a virtual environment named 'fpga' where all project relevant python libs are registered (this is not strictly necessary ... maybe software/ramcreate.sh has to be adjusted, as well as the python interpreter settings within VSC!). |
||||
The actual project may be installed anywhere, but local paths will have to be adjusted (firmware/main.c, software/ramcreate.sh ... worx for me ;). |
||||
|
||||
### 2. Hardware ### |
||||
|
||||
A JTAG programmer will be required for successful device programming. Thanx to Wolfgang, I'm using the Versaloon (s/w for blue-pill STM32), see https://github.com/zoobab/versaloon. To use this device, you also will have to install openocd via 'apt install openocd'. See https://git.hacknology.de/wolfgang/colorlight#user-content-class-hub75sender for details, on how to connect the JTAG adapter. |
||||
|
||||
For board specific details see |
||||
https://github.com/enjoy-digital/colorlite/blob/master. |
||||
Other helpful links to board data: |
||||
https://github.com/q3k/chubby75/blob/master/5a-75b/hardware_V7.0.md |
||||
https://saturn.ffzg.hr/rot13/index.cgi?colorlight_5a_75b |
||||
https://github.com/trabucayre/litexOnColorlightLab004/ |
||||
https://blog.pcbxprt.com/index.php/2020/07/19/running-risc-v-core-on-small-fpga-board/ |
||||
|
||||
## Program structure: ## |
||||
1. bfloat16nn.py - this is the main FPGA building source |
||||
2. start_terminal_service.sh - once the FPGA has been loaded, this will prepare the terminal service |
||||
3. libmodules subdir - contains the actual bfloat16 processing units, DRAM DMA helpers & system time support |
||||
4. helpers subdir - contains python helpers for load & flash etc. (not used here) |
||||
5. firmware subdir - contains some modified BIOS files (relative to the original version) |
||||
6. software subdir - contains a separate build, load & flash logic for separate (RV32i) application code |
||||
(the rest is of minor importance ...) |
||||
|
||||
## Quickstart ## |
||||
|
||||
After installation of the relevant toolchains: |
||||
1. Open the project in VSC (or use your favourite IDE & maybe adjust some settings ;), adjust local paths if nec. ... |
||||
2. Connect your JTAG adapter as described in Wolfgang's documentation @ https://git.hacknology.de/wolfgang/colorlight |
||||
3. Run bfloat16nn.py with these options (you may omit the --doc option if there is no Sphinx installed): |
||||
--build --load --revision=7.0 --uart-name=crossover --with-etherbone --ip-address=192.168.1.20 --csr-csv=build/csr.csv --doc |
||||
to create & load the project to on-board SRAM via the USB/JTAG-Adapter (this takes its time ...) |
||||
|
||||
## Individual (separate) applications ## |
||||
1. This time, open up a terminal & cd to the project local 'software' subdirectory |
||||
2. You can load an application to RAM bank 1: |
||||
./ramcreate.sh main bfloat16nnlib 1 |
||||
3. To run the (now) RAM based application, type 'cd ..' within terminal |
||||
4. Connect the Litex-Terminal to the board via: |
||||
./start_terminal_service.sh |
||||
5. Type 'ramboot' into terminal, the RAM based application should come up now |
||||
6. You can load an application to RAM bank 2: |
||||
./ramcreate.sh main bfloat16nnlib 2 |
||||
7. Now, use 'ramboot' again! The system should swap to RAM bank #2 and boot the application right away |
||||
8. This is the testing loop; once you're happy w/ your application, it needs to be flashed |
@ -1,107 +0,0 @@ |
||||
#!/usr/bin/env python3 |
||||
|
||||
# |
||||
# remotetest.py |
||||
# Test remote access to DRAM (start from project root) |
||||
# Requires litex server running (~/fpga/bin/lxserver --udp --udp-ip=192.168.1.20) |
||||
# |
||||
# History: |
||||
# -------- |
||||
# 20.09.20/KQ Initial version |
||||
# |
||||
|
||||
import argparse |
||||
import time |
||||
from litex import RemoteClient |
||||
|
||||
def test(csr_csv):
    """Run the DRAM read test through the ``dramtest`` CSR block.

    Ten times in a row the test programs address/offset/length, enables the
    read, polls ``dramtest_bValid`` (at most ten reads) and compares the
    returned word against the expected pattern.  Results go to stdout.

    Args:
        csr_csv: Path to the generated CSR list (e.g. ``build/csr.csv``)
            passed on to ``RemoteClient``.
    """
    # From build/csr.csv we know:
    #   csr_register,dramtest_b32Address,0x82004000,4,rw
    #   csr_register,dramtest_b12Offset,0x82004010,2,rw
    #   csr_register,dramtest_b8Len,0x82004018,1,rw
    #   csr_register,dramtest_bEnable,0x8200401c,1,rw
    #   csr_register,dramtest_b32Data,0x82004020,4,rw
    #   csr_register,dramtest_bValid,0x82004030,1,rw
    #
    # You may add manually to ~/fpga/litex/litex/litex/tools/remote/csr_builder.py:
    #   def writearray(self, value):  # 21.09.20/KQ
    #       if self.mode not in ["rw", "wo"]:
    #           raise KeyError(self.name + "register not writable")
    #       self.writefn(self.addr, value)
    base_address = 0x40190000
    expected_data = 0x40190000  # NOTE(review): pattern presumably preloaded in DRAM - confirm

    wb = RemoteClient(csr_csv=csr_csv)  # Access wishbone bus
    wb.open()                           # to remote client
    try:
        print("DRAM read test: ")
        for _ in range(10):
            print(".", end="")
            wb.regs.dramtest_bEnable.write(0)                # Disable read
            wb.regs.dramtest_b32Address.write(base_address)  # Base address
            wb.regs.dramtest_b12Offset.write(0)              # + Offset
            wb.regs.dramtest_b8Len.write(4)                  # Len (bytes)
            wb.regs.dramtest_bEnable.write(1)                # Enable read

            # Poll without sleeping; each remote read already takes a while.
            for _ in range(10):
                bValid = wb.regs.dramtest_bValid.read()
                if bValid != 0:
                    break

            if bValid == 1:  # Valid data available?
                data = wb.regs.dramtest_b32Data.read()  # Get it!
                print("DATA: {}".format(hex(data)), end=" ")
                if data != expected_data:
                    print("*** WRONG!!! ***")
                else:
                    print("OK")
            else:  # Shouldn't happen ...
                print("Data not valid in time?!")
    finally:
        # Always release the remote connection, even if a register access
        # raised halfway through the test.
        wb.close()
    print(" Done.")
||||
|
||||
def test2(csr_csv):
    """Run the DRAM read test through the ``dma_reader`` CSR block.

    For ten consecutive 32-bit word addresses the test programs base and
    length, triggers ``dma_reader_start``, dumps ``dma_reader_loop`` twenty
    times for diagnostics, then polls ``dma_reader_done`` (at most ten reads)
    and prints the value finally read back.  Results go to stdout.

    Args:
        csr_csv: Path to the generated CSR list (e.g. ``build/csr.csv``)
            passed on to ``RemoteClient``.
    """
    # From build/csr.csv we know:
    #   csr_register,dma_reader_base,0x82004000,4,rw
    #   csr_register,dma_reader_length,0x82004010,4,rw
    #   csr_register,dma_reader_start,0x82004020,1,rw
    #   csr_register,dma_reader_done,0x82004024,1,ro
    #   csr_register,dma_reader_loop,0x82004028,1,rw
    base_address = 0x40190000

    wb = RemoteClient(csr_csv=csr_csv)  # Access wishbone bus
    wb.open()                           # to remote client
    try:
        print("DRAM read test #2: ")
        for i in range(10):
            address = base_address + i * 4
            wb.regs.dma_reader_base.write(address)  # Base address
            wb.regs.dma_reader_length.write(4)      # Len (bytes)
            wb.regs.dma_reader_start.write(1)       # Enable read, triggers _start.re for one cycle ...
            for _ in range(20):
                print(wb.regs.dma_reader_loop.read(), end=" ")

            # Poll without sleeping; each remote read already takes a while.
            for _ in range(10):
                bValid = wb.regs.dma_reader_done.read()
                print("_done=", bValid)
                if bValid == 1:
                    break

            if bValid == 1:  # Valid data available?
                data = wb.regs.dma_reader_loop.read()  # Get it!
                print("@{}: DATA={}".format(hex(address), hex(data)))
            else:  # Shouldn't happen ...
                print("Data not valid in time?!")
    finally:
        # Always release the remote connection, even if a register access
        # raised halfway through the test.
        wb.close()
    print(" Done.")
||||
|
||||
def main():
    """Parse the command line and run the DMA-reader DRAM test."""
    cli = argparse.ArgumentParser(
        description="LiteX SoC on Colorlight 5A-75X Testbench"
    )
    cli.add_argument(
        "--csr-csv",
        default="build/csr.csv",
        help="CSR list location",
    )
    options = cli.parse_args()
    # test() (the 'dramtest' CSR variant) is deliberately not run here.
    test2(options.csr_csv)
||||
|
||||
if __name__ == "__main__": |
||||
main() |
Binary file not shown.
@ -1,31 +1,12 @@ |
||||
# LIBMODULES # |
||||
|
||||
THE project files (where the problem in question is solved 🤔 ...). |
||||
## File contents: ## |
||||
|
||||
__dramtransfer.py__ - contains main helpers for DRAM access. |
||||
|
||||
__bfloat16nncore.py__ - Neural network core processing |
||||
|
||||
__bfloat16processor.py__ - contains the bfloat16nn processing paths. |
||||
|
||||
A note on the square root logic: I have implemented a variant of Goldschmidt's |
||||
algorithm which allows for up to ⚠ 3.5% error, but there is simply no replacement for speed! |
||||
If you need more accuracy, you will have to implement Newton-Raphson in s/w or perhaps |
||||
doubles w/ external lib. calls. Example: |
||||
|
||||
// Newton-Raphson approximation (6 digits after decimal ok) |
||||
#define MAXITERATION 128 |
||||
#define ACCURRACY 1E-16 |
||||
|
||||
float f = <value>; // Whatever you wanna calc.! |
||||
float approx = 0.5 * f; // 1st approximation |
||||
float betterapprox; |
||||
for(int i=0;i < MAXITERATION;i++) { |
||||
betterapprox = 0.5 * (approx + f/approx); |
||||
if(f_abs(betterapprox - approx) < ACCURRACY) |
||||
break; |
||||
approx = betterapprox; |
||||
} |
||||
__bfloat16nncore.py__ - Neural network core processing (more or less an empty framework, for now just one function included) |
||||
|
||||
__bfloat16processor.py__ - contains the bfloat16 processing cores (# may be increased on larger chips) |
||||
|
||||
__systime.py__ - contains system time support |
||||
|
@ -1,6 +1,5 @@ |
||||
main.c - This is the rudimentary BIOS loop |
||||
illumination.c - This is a sample application, demonstrating the use |
||||
of the 'neopixelengine' (the documentation can be found |
||||
under ./build/documentation/http/index.html) |
||||
bfloat16nnlib.c - This is a sample application, demonstrating the speedup |
||||
of the 'bfloat16nn' FPGA solution |
||||
my_vsnprintf.c - Some helpers. For float formatting use printf1() (really ugly!) |
||||
systime.c - (Daily) Time helper |
||||
|
@ -1,201 +0,0 @@ |
||||
//
|
||||
// dramtransfer.c
|
||||
// DRAM transfer routines
|
||||
//
|
||||
// History:
|
||||
// --------
|
||||
// 07.10.20/KQ Initial version
|
||||
// 18.10.20/KQ RAM version w/ command interpreter loop ready
|
||||
//
|
||||
|
||||
#include <stdio.h> |
||||
#include <stdlib.h> |
||||
#include <console.h> |
||||
#include <string.h> |
||||
#include <uart.h> |
||||
#include <system.h> |
||||
#include <id.h> |
||||
#include <irq.h> |
||||
#include <crc.h> |
||||
#include "boot.h" |
||||
#include "readline.h" |
||||
#include "helpers.h" |
||||
#include "command.h" |
||||
|
||||
#include "../../build/colorlight_5a_75b/software/include/generated/csr.h" |
||||
#include "../../build/colorlight_5a_75b/software/include/generated/soc.h" |
||||
#include "../../build/colorlight_5a_75b/software/include/generated/mem.h" |
||||
#include "../../build/colorlight_5a_75b/software/include/generated/git.h" |
||||
|
||||
#include <spiflash.h> |
||||
|
||||
#include <liblitedram/sdram.h> |
||||
|
||||
#include <libliteeth/udp.h> |
||||
#include <libliteeth/mdio.h> |
||||
|
||||
#include <liblitespi/spiflash.h> |
||||
|
||||
#include <liblitesdcard/sdcard.h> |
||||
|
||||
#include "../include/systime.h" |
||||
#include "../include/dramtransfer.h" |
||||
|
||||
extern void busy_wait(unsigned int ms); // Worx!
|
||||
extern char kbhit(void); |
||||
extern int key_eval(void); |
||||
|
||||
static int iDelay = 15; // 100ms by default
|
||||
|
||||
int key_eval(void) |
||||
{ |
||||
switch(kbhit()) { |
||||
case 'w': if(iDelay < 200) |
||||
iDelay += 10;
|
||||
break; |
||||
case 's': if(iDelay > 10)
|
||||
iDelay -= 10;
|
||||
break; |
||||
case 'x': return 1; // Abort indication
|
||||
default: ; |
||||
} |
||||
return(0); |
||||
} |
||||
|
||||
// Store FPGA data to DRAM
|
||||
int store_FPGA(uint32_t iBaseAddress, int8_t bLen) |
||||
{ |
||||
int i; |
||||
|
||||
flush_l2_cache(); // Strictly nec. for longer transfers
|
||||
|
||||
if((iBaseAddress & 0xF) != 0) { |
||||
iBaseAddress &= 0xFFFFFFF0; // Enforce 16-byte alignment
|
||||
printf("*** load_FPGA(): iBaseAddress alignment needs to be 16 Byte! Using %08Xh\n", iBaseAddress); |
||||
}
|
||||
if((iBaseAddress < 0x40000000) || (iBaseAddress > 0x40400000)) { |
||||
printf("*** load_FPGA(): iBaseAddress not within DRAM range!\n"); |
||||
return 0; |
||||
}
|
||||
|
||||
fpga2dram_bEnable_write(0); // For address change, turn off first!
|
||||
fpga2dram_b32Address_write(iBaseAddress); // Provide a valid DRAM address
|
||||
fpga2dram_bEnable_write(1); // Start DMA pickup & FIFO fill ...
|
||||
for(i = 0; i < TIMEOUT; i++) { |
||||
if(fpga2dram_bValid_read()) // Wait 'til transfer done
|
||||
break; |
||||
} |
||||
fpga2dram_bEnable_write(0); // Indicate termination of action
|
||||
if(i>=TIMEOUT) { // Timing?
|
||||
printf("*** TIMEOUT: bValid not set?\n"); |
||||
return 0; |
||||
} |
||||
printf("State: %d\n", fpga2dram_b32Data_read()); |
||||
printf("Transferred: %d\n", fpga2dram_b32WCount_read()); |
||||
return 1; // Ok, FPGA data stored to DRAM!
|
||||
} |
||||
|
||||
// Load FPGA from DRAM
|
||||
int load_FPGA(uint32_t iBaseAddress, int8_t bLen) |
||||
{ |
||||
int i; |
||||
|
||||
flush_l2_cache(); // Strictly nec. for longer transfers
|
||||
|
||||
if((iBaseAddress & 0xF) != 0) { |
||||
iBaseAddress &= 0xFFFFFFF0; // Enforce 16-byte alignment
|
||||
printf("*** load_FPGA(): iBaseAddress alignment needs to be 16 Byte! Using %08Xh\n", iBaseAddress); |
||||
}
|
||||
if((iBaseAddress < 0x40000000) || (iBaseAddress > 0x40400000)) { |
||||
printf("*** load_FPGA(): iBaseAddress not within DRAM range!\n"); |
||||
return 0; |
||||
}
|
||||
|
||||
dramtransfer_bEnable_write(0); // For address change, turn off first!
|
||||
dramtransfer_b32Address_write(iBaseAddress); // Provide a valid DRAM address
|
||||
dramtransfer_bEnable_write(1); // Start DMA pickup & FIFO fill ...
|
||||
for(i = 0; i < TIMEOUT; i++) { |
||||
if(dramtransfer_bValid_read()) // Wait 'til transfer done
|
||||
break; |
||||
} |
||||
dramtransfer_bEnable_write(0); // Indicate termination of action
|
||||
if(i>=TIMEOUT) { // Timing?
|
||||
printf("*** TIMEOUT: bValid not set?\n"); |
||||
return 0; |
||||
} |
||||
return 1; // Ok, FPGA loaded!
|
||||
} |
||||
|
||||
// Retrieve FPGA memory@offset
|
||||
int32_t retrieve_FPGA(int iOffset) |
||||
{
|
||||
if((iOffset < 0) || (iOffset >= FIFOSIZE)) { |
||||
printf("*** retrieve_FPGA(): Invalid offset?!\n"); |
||||
iOffset = 0; |
||||
} |
||||
dramtransfer_b9Offset_write(iOffset); // ->memory[offset]
|
||||
return dramtransfer_b32Data_read(); // memory[offset]
|
||||
} |
||||
|
||||
#define BASEADDRESS 0x40190000 |
||||
#define MAXLOOPS 64 // 8x 0x400 (8x 256 * 4 = 8x 1024) => 0x2000
|
||||
|
||||
void dramtest(void)
|
||||
{ |
||||
uint32_t *TxPtr;
|
||||
int i, j; |
||||
uint32_t iStart; |
||||
|
||||
printf("---- DRAM writer test ----\n"); |
||||
store_FPGA(0x40190000,4); |
||||
|
||||
#ifdef DRAMREAD |
||||
printf("---- DRAM transfer test ----\n"); |
||||
iStart = systime(1); // Reset to defined value ...
|
||||
int iSum = 0; // Time aggregate
|
||||
for(j = 0; j < MAXLOOPS; j++) {
|
||||
// Attention: int32_t Assignments require at least 4 bytes boundary alignment!
|
||||
TxPtr = (uint32_t *)((BASEADDRESS & 0xFFFFFFFC) + j * (FIFOSIZE * sizeof(int32_t))); |
||||
for(i = 0; i < FIFOSIZE; i++) { // Fill source range data within DRAM
|
||||
*(uint32_t *)(TxPtr+i) = j*FIFOSIZE + i+1; |
||||
} |
||||
iStart = systime(0); // Expect 0.70 ms/transfer ...
|
||||
if(load_FPGA((uint32_t)TxPtr, 0)) { // Load <n> bytes from 0x40190000 ... (16 byte aligned!)
|
||||
iSum += (systime(0) - iStart); // Time delta added
|
||||
for(i=0;i<FIFOSIZE;i++) { |
||||
if((j*FIFOSIZE + i + 1) != retrieve_FPGA(i)) { |
||||
printf("%d: TxPtr -> %08X (%d 32-bit words) (Count before: %d) ", j, (uint32_t)TxPtr, FIFOSIZE, dramtransfer_b32RCount_read()); |
||||
printf("\n*** FAIL mem[%03d] %d != %08Xh\n", i, j * FIFOSIZE + i + 1, retrieve_FPGA(i)); |
||||
break; |
||||
} |
||||
} |
||||
} |
||||
else { |
||||
printf("*** load_FPGA(): FAILED?!\n"); |
||||
break; |
||||
}
|
||||
} |
||||
if(j >= MAXLOOPS) { |
||||
printf("PASS: %d bytes transferred in %dms w/ %d chuncks of %d bytes/each (0.%d ms/transfer), range %08Xh-%08Xh.\n",
|
||||
MAXLOOPS * 4 * FIFOSIZE, iSum, MAXLOOPS, 4 * FIFOSIZE, iSum*100/MAXLOOPS, BASEADDRESS, BASEADDRESS+MAXLOOPS*4*FIFOSIZE-1); |
||||
} |
||||
#endif |
||||
#ifdef XXX |
||||
printf("\n---- System time test ----\n");
|
||||
systime(86399999-1000); |
||||
printf("Pre: %d\n", systime(0)); |
||||
busy_wait(2000); |
||||
printf("Post: %d\n", systime(0)); |
||||
|
||||
int hh, mm, ss; // ------------------ Time adjust test ----------------------
|
||||
setsystime(13,55,0); |
||||
getsystime(&hh, &mm, &ss); |
||||
printf("Time now: %02d:%02d:%02d\nWait 60s, press [x] to terminate ...\n", hh, mm, ss);
|
||||
while(!key_eval()) { |
||||
busy_wait(60000); // 60s
|
||||
getsystime(&hh, &mm, &ss); |
||||
printf("Time now: %02d:%02d:%02d\nWait 60s, press [x] to terminate ...\n", hh, mm, ss);
|
||||
} |
||||
#endif |
||||
} |
||||
|
@ -1,117 +0,0 @@ |
||||
//
|
||||
// fpga2dram.c
|
||||
// DRAM transfer tests
|
||||
//
|
||||
// History:
|
||||
// --------
|
||||
// 25.01.21/KQ Initial version
|
||||
//
|
||||
|
||||
#include <stdio.h> |
||||
#include <stdlib.h> |
||||
#include <console.h> |
||||
#include <string.h> |
||||
#include <uart.h> |
||||
#include <system.h> |
||||
#include <id.h> |
||||
#include <irq.h> |
||||
#include <crc.h> |
||||
#include "boot.h" |
||||
#include "readline.h" |
||||
#include "helpers.h" |
||||
#include "command.h" |
||||
|
||||
#include "../../build/colorlight_5a_75b/software/include/generated/csr.h" |
||||
#include "../../build/colorlight_5a_75b/software/include/generated/soc.h" |
||||
#include "../../build/colorlight_5a_75b/software/include/generated/mem.h" |
||||
#include "../../build/colorlight_5a_75b/software/include/generated/git.h" |
||||
|
||||
#include <spiflash.h> |
||||
|
||||
#include <liblitedram/sdram.h> |
||||
|
||||
#include <libliteeth/udp.h> |
||||
#include <libliteeth/mdio.h> |
||||
|
||||
#include <liblitespi/spiflash.h> |
||||
|
||||
#include <liblitesdcard/sdcard.h> |
||||
|
||||
#include "../include/systime.h" |
||||
#include "../include/dramtransfer.h" |
||||
|
||||
extern void busy_wait(unsigned int ms); // Worx!
|
||||
extern char kbhit(void); |
||||
extern int key_eval(void); |
||||
|
||||
static int iDelay = 15; // 100ms by default
|
||||
|
||||
int key_eval(void) |
||||
{ |
||||
switch(kbhit()) { |
||||
case 'w': if(iDelay < 200) |
||||
iDelay += 10;
|
||||
break; |
||||
case 's': if(iDelay > 10)
|
||||
iDelay -= 10;
|
||||
break; |
||||
case 'x': return 1; // Abort indication
|
||||
default: ; |
||||
} |
||||
return(0); |
||||
} |
||||
|
||||
#define WRITETIMEOUT 1024 |
||||
// Store FPGA data to DRAM
|
||||
int store_FPGA(uint32_t iBaseAddress, int8_t bLen) |
||||
{ |
||||
int i; |
||||
|
||||
flush_l2_cache(); // Strictly nec. for longer transfers
|
||||
|
||||
#ifdef ALLADDRESS |
||||
if((iBaseAddress & 0xF) != 0) { |
||||
iBaseAddress &= 0xFFFFFFF0; // Enforce 16-byte alignment
|
||||
printf("*** store_FPGA(): iBaseAddress alignment needs to be 16 Byte! Using %08Xh\n", iBaseAddress); |
||||
}
|
||||
#endif |
||||
if((iBaseAddress < 0x40000000) || (iBaseAddress > 0x40400000)) { |
||||
printf("*** store_FPGA(): iBaseAddress not within DRAM range!\n"); |
||||
return 0; |
||||
}
|
||||
|
||||
fpga2dram_bEnable_write(0); // For address change, turn off first!
|
||||
fpga2dram_b32Address_write(iBaseAddress); // Provide a valid DRAM address (length: 4x4=16 bytes implied)
|
||||
fpga2dram_bEnable_write(1); // Start DMA pickup & FIFO fill ...
|
||||
for(i = 0; i < WRITETIMEOUT; i++) { //TIMEOUT; i++) {
|
||||
if(fpga2dram_bValid_read()) // Wait 'til transfer done
|
||||
break; |
||||
} |
||||
fpga2dram_bEnable_write(0); // Indicate termination of action
|
||||
if(i>=WRITETIMEOUT) { // Timing?
|
||||
printf("*** TIMEOUT: bValid not set?\n"); |
||||
return 0; |
||||
} |
||||
printf("State: %d\n", fpga2dram_b32Data_read()); |
||||
printf("Transferred: %d\n", fpga2dram_b32WCount_read()); |
||||
return 1; // Ok, FPGA data stored to DRAM!
|
||||
} |
||||
|
||||
// 0..3 -> 12..15
|
||||
// 4..7 -> 8..11
|
||||
// 8..11 -> 4..7
|
||||
// 12..15 -> 0..3
|
||||
#define BASEADDRESS 0x40190004 // 0x40190000 ok (16-byte aligned)
|
||||
|
||||
void dramtest(void)
|
||||
{ |
||||
printf("---- DRAM DMA writer test ----\n"); |
||||
store_FPGA(BASEADDRESS, 4); |
||||
printf("FSM #%08Xh\t\tCounter #%d\n", (uint32_t)fpga2dram_b32Data_read(), (uint32_t)fpga2dram_b32WCount_read());
|
||||
busy_wait(1000); |
||||
printf("FSM #%08Xh\t\tCounter #%d\n", (uint32_t)fpga2dram_b32Data_read(), (uint32_t)fpga2dram_b32WCount_read());
|
||||
busy_wait(1000); |
||||
printf("FSM #%08Xh\t\tCounter #%d\n", (uint32_t)fpga2dram_b32Data_read(), (uint32_t)fpga2dram_b32WCount_read());
|
||||
printf("Done."); |
||||
} |
||||
|
@ -1,217 +0,0 @@ |
||||
//
|
||||
// illumination.c
|
||||
// The Neopixel-Engine demonstration
|
||||
//
|
||||
// History:
|
||||
// --------
|
||||
// 07.10.20/KQ Initial version
|
||||
// 18.10.20/KQ RAM version w/ command interpreter loop ready
|
||||
// 02.01.21/KQ Includes DRAM DMA data access now
|
||||
//
|
||||
|
||||
#include <stdio.h> |
||||
#include <stdlib.h> |
||||
#include <console.h> |
||||
#include <string.h> |
||||
#include <uart.h> |
||||
#include <system.h> |
||||
#include <id.h> |
||||
#include <irq.h> |
||||
#include <crc.h> |
||||
|
||||
#include "boot.h" |
||||
#include "readline.h" |
||||
#include "helpers.h" |
||||
#include "command.h" |
||||
|
||||
#include "../../build/colorlight_5a_75b/software/include/generated/csr.h" |
||||
#include "../../build/colorlight_5a_75b/software/include/generated/soc.h" |
||||
#include "../../build/colorlight_5a_75b/software/include/generated/mem.h" |
||||
#include "../../build/colorlight_5a_75b/software/include/generated/git.h" |
||||
|
||||
#include <spiflash.h> |
||||
|
||||
#include <liblitedram/sdram.h> |
||||
|
||||
#include <libliteeth/udp.h> |
||||
#include <libliteeth/mdio.h> |
||||
|
||||
#include <liblitespi/spiflash.h> |
||||
|
||||
#include <liblitesdcard/sdcard.h> |
||||
|
||||
#include "../include/illumination.h" |
||||
|
||||
extern void busy_wait(unsigned int ms); // Worx!
|
||||
|
||||
// Considering timing: 2x 2x 511 (2044 LEDs) superfast
|
||||
// 2x 3x 511 (3066 LEDs) fast enough
|
||||
#define MAXTABLES 3 // 1..64 MUST match h/w! Use Power of 2!
|
||||
#define MAXLEDS 512 // 1..512 MUST match h/w! Use Power of 2!
|
||||
static int32_t arLEDBuffer[MAXTABLES*3][MAXLEDS] __attribute__((aligned(16)));; // GRB values
|
||||
|
||||
extern char kbhit(void); |
||||
extern int key_eval(void); |
||||
|
||||
static int iDelay = 15; // 100ms by default
|
||||
|
||||
int key_eval(void) |
||||
{ |
||||
switch(kbhit()) { |
||||
case 'w': if(iDelay < 200) |
||||
iDelay += 10;
|
||||
break; |
||||
case 's': if(iDelay > 10)
|
||||
iDelay -= 10;
|
||||
break; |
||||
case 'x': return 1; // Abort indication
|
||||
default: ; |
||||
} |
||||
return(0); |
||||
} |
||||
|
||||
void enable_LEDS(int iEnable) |
||||
{ |
||||
|
||||
static uint32_t uiLoopCount = 0; |
||||
|
||||
if(iEnable) { |
||||
npe_b6NoOfTables_write(MAXTABLES > 63? 63 : MAXTABLES ); // Prepare # of tables (0..63)
|
||||
npe_b9Len_write(MAXLEDS > 511? 511 : MAXLEDS); // Prepare length (0..511)
|
||||
npe2_b6NoOfTables_write(MAXTABLES > 63? 63 : MAXTABLES ); // Prepare # of tables (0..63)
|
||||
npe2_b9Len_write(MAXLEDS > 511? 511 : MAXLEDS); // Prepare length (0..511)
|
||||
npe3_b6NoOfTables_write(MAXTABLES > 63? 63 : MAXTABLES ); // Prepare # of tables (0..63)
|
||||
npe3_b9Len_write(MAXLEDS > 511? 511 : MAXLEDS); // Prepare length (0..511)
|
||||
for(int j=0;j<MAXTABLES;j++) { |
||||
//printf("DRAM->[%d] = %08Xh\n", j, (uint32_t)&arLEDBuffer[j]);
|
||||
npe_b6StoreOffset_write(j); // Indicate which entry to use for address storage
|
||||
npe_b32DRAMAddress_write((uint32_t)&arLEDBuffer[j]); // Base address of LED buffer
|
||||
npe2_b6StoreOffset_write(j); // Indicate which entry to use for address storage
|
||||
npe2_b32DRAMAddress_write((uint32_t)&arLEDBuffer[MAXTABLES+j]); // Base address of LED buffer
|
||||
npe3_b6StoreOffset_write(j); // Indicate which entry to use for address storage
|
||||
npe3_b32DRAMAddress_write((uint32_t)&arLEDBuffer[2*MAXTABLES+j]); // Base address of LED buffer
|
||||
} |
||||
uiLoopCount = dramtransfer_b32RCount_read(); |
||||
} |
||||
else { |
||||
printf("Disabling NPE, transfer count %d w/ %d tables of length %d\n",
|
||||
dramtransfer_b32RCount_read() - uiLoopCount, MAXTABLES, MAXLEDS); |
||||
} |
||||
|
||||
flush_l2_cache(); // Strictly nec. for longer transfers
|
||||
|
||||
busy_wait(10); // Testing still ...
|
||||
npe_bEnable_write(iEnable ? 1 : 0); // Enable/disable
|
||||
npe2_bEnable_write(iEnable ? 1 : 0); // Enable/disable
|
||||
npe3_bEnable_write(iEnable ? 1 : 0); // Enable/disable
|
||||
} |
||||
|
||||
void clear_LEDs(int iTable) |
||||
{ |
||||
for(int i=0;i<MAXLEDS;i++)
|
||||
arLEDBuffer[iTable][i] = 0;
|
||||
flush_l2_cache(); // Strictly nec. for longer transfers
|
||||
} |
||||
|
||||
void load_triple_LEDs(int iTable, int32_t green, int32_t red, int32_t blue) |
||||
{ |
||||
for(int i=0;i<MAXLEDS;i+=3) { |
||||
arLEDBuffer[iTable][i] = green; |
||||
arLEDBuffer[iTable][i+1] = red; |
||||
arLEDBuffer[iTable][i+2] = blue; |
||||
}
|
||||
flush_l2_cache(); // Strictly nec. for longer transfers
|
||||
} |
||||
|
||||
int illumination(void) |
||||
{
|
||||
int32_t green = 0x040000; |
||||
int32_t red = 0x000400; |
||||
int32_t blue = 0x000004; |
||||
int iTable; |
||||
|
||||
printf("----- Illumination demo -----\n"); |
||||
|
||||
// Prepare first output
|
||||
iTable = 0; |
||||
load_triple_LEDs(iTable, green, red, blue); // 1st load
|
||||
iTable = 1; |
||||
load_triple_LEDs(iTable, red, blue, green);
|
||||
iTable = 2; |
||||
load_triple_LEDs(iTable, blue, green, red);
|
||||
enable_LEDS(1); // Engage!
|
||||
busy_wait(2000); |
||||
|
||||
// Let them flicker ...
|
||||
for(int i=0;i<100;i++) { |
||||
int32_t temp = green; |
||||
green = red; |
||||
red = blue; |
||||
blue = temp; |
||||
for(iTable=0;iTable<MAXTABLES*3;iTable++) |
||||
load_triple_LEDs(iTable, green, red, blue); |
||||
busy_wait(iDelay); // Slow down a bit
|
||||
if(key_eval()) { |
||||
enable_LEDS(0); |
||||
return 1; |
||||
} |
||||
} |
||||
// Make 3 run along ...
|
||||
for(iTable=0;iTable<MAXTABLES*3;iTable++) { |
||||
clear_LEDs(iTable); // Reset buffers
|
||||
if(iTable != 0) {
|
||||
green = 0x404000; |
||||
red = 0x004040; |
||||
blue = 0x400040; |
||||
} |
||||
else { |
||||
green = 0x040000; |
||||
red = 0x000400; |
||||
blue = 0x000004;
|
||||
} |
||||
arLEDBuffer[iTable][0] = green; |
||||
arLEDBuffer[iTable][1] = red; |
||||
arLEDBuffer[iTable][2] = blue; |
||||
} |
||||
int max_LED = MAXLEDS-1; // 1..512
|
||||
for(int i=0;i<MAXLEDS-3;i++) { // Forward shift 3
|
||||
flush_l2_cache(); // Strictly nec. for longer transfers
|
||||
busy_wait(iDelay); |
||||
for(iTable=0;iTable<MAXTABLES*3;iTable++) { |
||||
for(int j=0;j<max_LED;j++) |
||||
arLEDBuffer[iTable][max_LED - j] = arLEDBuffer[iTable][(max_LED - 1) - j]; |
||||
arLEDBuffer[iTable][i] = 0; |
||||
} |
||||
if(key_eval()) { |
||||
enable_LEDS(0); |
||||
return 1; |
||||
} |
||||
} |
||||
printf("Halfway thru ...\n"); |
||||
for(int i=0;i<MAXLEDS-1;i++) { // Backward shift 3
|
||||
flush_l2_cache(); // Strictly nec. for longer transfers
|
||||
busy_wait(iDelay); |
||||
for(iTable=0;iTable<MAXTABLES*3;iTable++) { |
||||
for(int j=0;j<max_LED;j++) |
||||
arLEDBuffer[iTable][j] = arLEDBuffer[iTable][j+1]; |
||||
arLEDBuffer[iTable][max_LED-i] = 0; |
||||
} |
||||
if(key_eval()) { |
||||
enable_LEDS(0); |
||||
return 1; |
||||
} |
||||
} |
||||
flush_l2_cache(); // Strictly nec. for longer transfers
|
||||
|
||||
busy_wait(400); |
||||
|
||||
// Prepare final output
|
||||
for(iTable=0;iTable<MAXTABLES*3;iTable++)
|
||||
load_triple_LEDs(iTable, 0x010000, 0x000100, 0x000001); // 1st load
|
||||
flush_l2_cache(); // Strictly nec. for longer transfers
|
||||
busy_wait(500); |
||||
enable_LEDS(0); |
||||
printf("Finished!\n"); |
||||
return 0; |
||||
} |
||||
|
Loading…
Reference in new issue