//
// bfloat16nnlib.c
// bfloat16 neural network support
//
// History:
// --------
// 24.04.21/KQ Initial version
//

#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <console.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <uart.h>
|
|
|
|
#include <system.h>
|
|
|
|
#include <id.h>
|
|
|
|
#include <irq.h>
|
|
|
|
#include <crc.h>
|
|
|
|
#include "boot.h"
|
|
|
|
#include "readline.h"
|
|
|
|
#include "helpers.h"
|
|
|
|
#include "command.h"
|
|
|
|
|
|
|
|
#include "../../build/colorlight_5a_75b/software/include/generated/csr.h"
|
|
|
|
#include "../../build/colorlight_5a_75b/software/include/generated/soc.h"
|
|
|
|
#include "../../build/colorlight_5a_75b/software/include/generated/mem.h"
|
|
|
|
#include "../../build/colorlight_5a_75b/software/include/generated/git.h"
|
|
|
|
|
|
|
|
#include <spiflash.h>
|
|
|
|
|
|
|
|
#include <liblitedram/sdram.h>
|
|
|
|
|
|
|
|
#include <libliteeth/udp.h>
|
|
|
|
#include <libliteeth/mdio.h>
|
|
|
|
|
|
|
|
#include <liblitespi/spiflash.h>
|
|
|
|
|
|
|
|
#include <liblitesdcard/sdcard.h>
|
|
|
|
|
|
|
|
#include "../include/systime.h"
|
|
|
|
#include "../include/bfloat16nnlib.h"
|
|
|
|
|
|
|
|
// Prototypes for routines this file calls before (or without) seeing their definition.
void busy_wait(unsigned int ms); // Delay helper, parameter is named ms (author's note: works)
char kbhit(void);                // Its return value selects the command in key_eval()
int key_eval(void);              // Key handler, defined further down in this file
|
|
|
|
|
|
|
|
// Two equally sized DRAM areas, laid out back to back starting at DRAMDATABASE1.
// The sizes are counted in 32-bit words.
#define DRAMDATASIZE1 400 // Author's note: 512 OK => 2 Load cycles (2*512)!
#define DRAMDATASIZE2 DRAMDATASIZE1

#define DRAMDATABASE1 0x40190000
#define DRAMDATABASE2 (DRAMDATABASE1 + (DRAMDATASIZE1 * sizeof(uint32_t))) // First byte after area #1
|
|
|
|
|
|
|
|
// Register snapshots taken by fpgaload() just before it returns; key_eval() prints them.
static uint32_t fpgastate; // Last value read from the status register
static uint32_t fpustates; // Last value read from the FPU-states register
|
|
|
|
static int fpgaload(uint32_t *mempt1, uint32_t *mempt2, uint16_t calclen, int bReload1, int bReload2)
|
|
|
|
{
|
|
|
|
uint32_t *sentinel1 = (uint32_t *)(DRAMDATABASE1 + ((DRAMDATASIZE1-1) * sizeof(uint32_t)));
|
|
|
|
uint32_t *sentinel2 = (uint32_t *)(DRAMDATABASE2 + ((DRAMDATASIZE2-1) * sizeof(uint32_t)));
|
|
|
|
static uint32_t seqno = 0x41434142; // Just some marker pattern ;)
|
|
|
|
|
|
|
|
bfloat16nn_bEnable_write(0); // Disable transfer (if still active for some reason ...)
|
|
|
|
*sentinel1 = seqno++;
|
|
|
|
*sentinel2 = seqno++;
|
|
|
|
bfloat16nn_b32Sentinel1_write(*sentinel1);
|
|
|
|
bfloat16nn_b32Sentinel2_write(*sentinel2);
|
|
|
|
flush_l2_cache(); // Strictly nec. for longer transfers
|
|
|
|
bfloat16nn_b10ArrayWordLen_write(calclen>>1); // Indicate array length for calc. but honour split!
|
|
|
|
bfloat16nn_b32DRAMLoadAddress1_write((uint32_t)mempt1); // Indicate memory to load from
|
|
|
|
bfloat16nn_b32DRAMLoadAddress2_write((uint32_t)mempt2); // Indicate memory to load from
|
|
|
|
bfloat16nn_bReload1_write(bReload1 ? 1 : 0); // Reload mem#1
|
|
|
|
bfloat16nn_bReload2_write(bReload2 ? 1 : 0); // Reload mem#2
|
|
|
|
bfloat16nn_bEnable_write(1); // Finally: Engage!
|
|
|
|
for(int i=0;i<2000;i++) { // Max. 100ms delay
|
|
|
|
if(bfloat16nn_b16Status_read() & 0x8000) {
|
|
|
|
fpgastate = (uint32_t)bfloat16nn_b16Status_read();
|
|
|
|
fpustates = (uint32_t)bfloat16nn_b16FPUStates_read();
|
|
|
|
bfloat16nn_bEnable_write(0); // Disable transfer
|
|
|
|
return 1; // Ok, ready!
|
|
|
|
}
|
|
|
|
else
|
|
|
|
busy_wait(1); // Just wait some time ...
|
|
|
|
}
|
|
|
|
fpgastate = (uint32_t)bfloat16nn_b16Status_read();
|
|
|
|
fpustates = (uint32_t)bfloat16nn_b16FPUStates_read();
|
|
|
|
bfloat16nn_bEnable_write(0); // Disable transfer
|
|
|
|
return 0; // Timeout
|
|
|
|
}
|
|
|
|
|
|
|
|
// Read result register #1 and widen its 16-bit value to a float.
//
// The low 16 bits of the register value become the upper half of the 32-bit
// float pattern and the lower half is zero. The pattern is built with a shift
// and copied with memcpy(), so the result does not depend on byte order and
// no object is accessed through a pointer of an incompatible type.
static float fpResult1_read(void)
{
    uint32_t bits = (uint32_t)(uint16_t)bfloat16nn_b16Result1_read() << 16;
    float value;

    memcpy(&value, &bits, sizeof value);
    return value;
}
|
|
|
|
// Read result register #2 and widen its 16-bit value to a float.
//
// The low 16 bits of the register value become the upper half of the 32-bit
// float pattern and the lower half is zero. The pattern is built with a shift
// and copied with memcpy(), so the result does not depend on byte order and
// no object is accessed through a pointer of an incompatible type.
static float fpResult2_read(void)
{
    uint32_t bits = (uint32_t)(uint16_t)bfloat16nn_b16Result2_read() << 16;
    float value;

    memcpy(&value, &bits, sizeof value);
    return value;
}
|
|
|
|
|
|
|
|
// Return the upper 16 bits of a float's 32-bit pattern.
//
// The lower 16 bits are simply dropped (truncation, no rounding). The pattern
// is copied out with memcpy() and shifted, so the result does not depend on
// byte order and the float is never read through an incompatible pointer type.
static uint16_t f2ui16(float f)
{
    uint32_t bits;

    memcpy(&bits, &f, sizeof bits);
    return (uint16_t)(bits >> 16);
}
|
|
|
|
|
|
|
|
extern void dumpfloat(float f);

// Print the 32-bit pattern of f followed by the 16-bit value f2ui16() derives from it.
//
// The pattern is copied out with memcpy() instead of dereferencing a cast
// pointer, and both arguments are converted to unsigned int so that they
// match the %X conversions regardless of how uint32_t is defined.
void dumpfloat(float f)
{
    uint32_t bits;

    memcpy(&bits, &f, sizeof bits);
    printf("%08Xh -> %04Xh\n", (unsigned)bits, (unsigned)f2ui16(f));
}
|
|
|
|
|
|
|
|
int key_eval(void)
|
|
|
|
{
|
|
|
|
extern void printf1(const char *fmt, float f1);
|
|
|
|
//uint32_t *ui32ptr1, *ui32ptr2;
|
|
|
|
uint16_t *ui16ptr1, *ui16ptr2;
|
|
|
|
float fpResult1, fpResult2;
|
|
|
|
uint32_t starttime;
|
|
|
|
uint32_t deltatime;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
#define MAXCALCLEN 784 //784 //16 OK
|
|
|
|
|
|
|
|
switch(kbhit()) {
|
|
|
|
case 'r': // Reload
|
|
|
|
printf("\e[35;1m*** Reload ***\e[0m\n");
|
|
|
|
printf("Elements/FPU: %d\n", MAXCALCLEN);
|
|
|
|
printf("DRAM1->%08Xh DRAM2->%08Xh\n", DRAMDATABASE1, DRAMDATABASE2);
|
|
|
|
/*
|
|
|
|
for(i=0, ui32ptr1 = (uint32_t *)DRAMDATABASE1;i<DRAMDATASIZE1;i++) // Setup test data
|
|
|
|
*ui32ptr1++ = 0; // Clear all memory ...
|
|
|
|
for(i=0, ui32ptr2 = (uint32_t *)DRAMDATABASE2;i<DRAMDATASIZE2;i++) // Setup test data
|
|
|
|
*ui32ptr2++ = 0; // Clear all memory ...
|
|
|
|
|
|
|
|
for(i=0, ui32ptr1 = (uint32_t *)DRAMDATABASE1;i<DRAMDATASIZE1;i++) // Setup test data
|
|
|
|
*ui32ptr1++ = i+1;
|
|
|
|
for(i=0, ui32ptr2 = (uint32_t *)DRAMDATABASE2;i<DRAMDATASIZE2;i++) // Setup test data
|
|
|
|
*ui32ptr2++ = i+1;
|
|
|
|
*/
|
|
|
|
// TODO: Control procedure w/ regular code (matrice inner product)
|
|
|
|
float *floatptr1 = (float *)DRAMDATABASE1;
|
|
|
|
float *floatptr2 = (float *)(DRAMDATABASE1 + (DRAMDATASIZE1 * sizeof(float)));
|
|
|
|
for(i=1;i<=MAXCALCLEN;i++) {
|
|
|
|
*floatptr1++ = (1.0 * (float)i);
|
|
|
|
*floatptr2++ = (1.0 * (float)i);
|
|
|
|
}
|
|
|
|
floatptr1 = (float *)DRAMDATABASE1;
|
|
|
|
floatptr2 = (float *)(DRAMDATABASE1 + (DRAMDATASIZE1 * sizeof(float))); // Absolute: bytes!
|
|
|
|
starttime = systime(0);
|
|
|
|
float sum = 0.0;
|
|
|
|
for(i=1;i<=MAXCALCLEN;i++) {
|
|
|
|
sum += ((*floatptr1++) * (*floatptr2++)); // 1*1+2*2+3*3+4*4 = 1+4+9+16 = 5+9+16 = 14+16=30
|
|
|
|
}
|
|
|
|
deltatime = systime(0)-starttime;
|
|
|
|
printf("S/W Delta t: %dms ", deltatime);
|
|
|
|
printf1("\t\t\tS/W SUM=%8.4f\n", sum);
|
|
|
|
|
|
|
|
ui16ptr1 = (uint16_t *)DRAMDATABASE1; // Absolute: bytes! Matrice/row
|
|
|
|
for(i=1;i<=MAXCALCLEN;i++)
|
|
|
|
*ui16ptr1++ = f2ui16(1.0 * (float)i );
|
|
|
|
ui16ptr2 = (uint16_t *)DRAMDATABASE2; // Absolute: bytes! Vector
|
|
|
|
for(i=1;i<=MAXCALCLEN;i++)
|
|
|
|
*ui16ptr2++ = f2ui16(1.0 * (float)i );
|
|
|
|
|
|
|
|
starttime = systime(0);
|
|
|
|
if(fpgaload((uint32_t *)DRAMDATABASE1, (uint32_t *)DRAMDATABASE2, MAXCALCLEN, 1, 1)) {
|
|
|
|
deltatime = systime(0)-starttime;
|
|
|
|
printf("H/W Delta t: %dms ", deltatime);
|
|
|
|
fpResult1 = fpResult1_read();
|
|
|
|
fpResult2 = fpResult2_read();
|
|
|
|
printf("(S=%04Xh: FS=%04Xh)", fpgastate, fpustates);
|
|
|
|
printf1("\tH/W SUM=%8.4f\n", fpResult1);
|
|
|
|
//printf1("\t(FPU#2=%8.4f)\n", fpResult2);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
printf("CURRENT TIMEOUT: S=%04Xh: FS=%04Xh ", fpgastate, fpustates);
|
|
|
|
printf("Offset 1: %d (%d) ", (uint32_t)dram2fpga1_b9Offset_read(), dram2fpga1_b32Data_read());
|
|
|
|
printf("Offset 2: %d (%d) ", (uint32_t)dram2fpga2_b9Offset_read(), dram2fpga2_b32Data_read());
|
|
|
|
printf("Sentinel 1: %08Xh=%08Xh ", bfloat16nn_b32Sentinel1_read(), dram2fpga1_b32Data_read());
|
|
|
|
printf("Sentinel 2: %08Xh=%08Xh ", bfloat16nn_b32Sentinel2_read(), dram2fpga2_b32Data_read());
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
ui32ptr1 = (uint32_t *)(DRAMDATABASE1 + (DRAMDATASIZE1-4)*sizeof(uint32_t));
|
|
|
|
ui32ptr2 = (uint32_t *)(DRAMDATABASE2 + (DRAMDATASIZE2-4)*sizeof(uint32_t));
|
|
|
|
for(i=DRAMDATASIZE1-4;i<(DRAMDATASIZE1-1);i++) {
|
|
|
|
dram2fpga1_b9Offset_write(i);
|
|
|
|
dram2fpga2_b9Offset_write(i);
|
|
|
|
printf("%d:\t1:%d/%d\t2:%d/%d\n", i, dram2fpga1_b32Data_read(), *ui32ptr1++, dram2fpga2_b32Data_read(), *ui32ptr2++);
|
|
|
|
}
|
|
|
|
dram2fpga1_b9Offset_write(i);
|
|
|
|
dram2fpga2_b9Offset_write(i);
|
|
|
|
printf("%d: 1:%08Xh/%08Xh", i, dram2fpga1_b32Data_read(), *ui32ptr1++);
|
|
|
|
printf("\t2:%08Xh/%08Xh\n", dram2fpga2_b32Data_read(), *ui32ptr2++);
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
case 's':
|
|
|
|
fpResult1 = fpResult1_read();
|
|
|
|
fpResult2 = fpResult2_read();
|
|
|
|
printf("REQUESTED: S=%04Xh: FS=%04Xh\n", (uint32_t)bfloat16nn_b16Status_read(), (uint32_t)bfloat16nn_b16FPUStates_read());
|
|
|
|
printf1("RESULT1=%8.4f\n", fpResult1);
|
|
|
|
printf1("RESULT2=%8.4f\n", fpResult2);
|
|
|
|
break;
|
|
|
|
case 'x': return 1; // Abort indication
|
|
|
|
default: ;
|
|
|
|
}
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Interactive demo: print the key help, keep calling key_eval() until it
// returns non-zero, then print the closing message.
void bfloat16nn_demo(void)
{
    printf("\e[33;1mbfloat16nn demo: Press [r]eload, [s]how or e[x]it ...\e[0m\n");

    for (;;) {
        if (key_eval() != 0)
            break; // key_eval() reports the e[x]it key with a non-zero return
    }

    printf("\e[33;1mbfloat16nn demo terminated.\e[0m\n");
}
|