Half-precision floats handling
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
bfloat16nn/software/source/bfloat16nnlib.c

300 lines
10 KiB

//
// bfloat16nnlib.c
// bfloat16 neural network support
//
// History:
// --------
// 24.04.21/KQ Initial version
//
#include <stdio.h>
#include <stdlib.h>
#include <console.h>
#include <string.h>
#include <uart.h>
#include <system.h>
#include <id.h>
#include <irq.h>
#include <crc.h>
#include "boot.h"
#include "readline.h"
#include "helpers.h"
#include "command.h"
#include "../../build/colorlight_5a_75b/software/include/generated/csr.h"
#include "../../build/colorlight_5a_75b/software/include/generated/soc.h"
#include "../../build/colorlight_5a_75b/software/include/generated/mem.h"
#include "../../build/colorlight_5a_75b/software/include/generated/git.h"
#include <spiflash.h>
#include <liblitedram/sdram.h>
#include <libliteeth/udp.h>
#include <libliteeth/mdio.h>
#include <liblitespi/spiflash.h>
#include <liblitesdcard/sdcard.h>
#include "../include/systime.h"
#include "../include/bfloat16nnlib.h"
// Helpers defined outside this file
extern void busy_wait(unsigned int ms); // Worx!
extern char kbhit(void);
extern int key_eval(void);
// Base address of the DRAM buffer that is filled with test data and handed to fpgaload()
#define DRAMDATABASE 0x40190000
// Buffer size; used throughout this file as a count of 32-bit words
#define DRAMDATASIZE 1024 // 512 OK, 800 FAIL => 2 Load cycles (2*512)!
// Status / FPU state register snapshots stored by fpgaload(): register values on timeout, 0 on success
static uint32_t fpgastate, fpustates;
// fpgaload: kick off a DRAM -> FPGA load and wait until the unit reports ready.
//
//   mempt    Start of the data in DRAM; its address is written to the load address CSR.
//   len      Transfer length (presumably in 32-bit words - TODO confirm); it is only
//            range-checked here (4..DRAMDATASIZE), the hardware is not told about it.
//   calclen  Array length written to the ArrayWordLen CSR (2..len/2).
//
// Returns  1  status bit 15 was seen set (fpgastate/fpustates are reset to 0)
//          0  timeout (fpgastate/fpustates hold the last status / FPU state register values)
//         -1  len out of range
//         -2  calclen out of range
// The error codes are non-zero as well, so callers have to test for "> 0".
//
// Note: both marker words are always placed relative to DRAMDATABASE, independent of mempt.
static int fpgaload(uint32_t *mempt, int16_t len, int16_t calclen)
{
    enum {
        STATUS_READY = 0x8000,  // Bit 15 of the status register ends the wait
        POLL_ROUNDS  = 2000     // Number of status polls, busy_wait(1) between them
    };
    uint32_t *sentinel1 = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE/2 - 1) * sizeof(int32_t));
    uint32_t *sentinel2 = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE - 1) * sizeof(int32_t));

    if (len < 4 || len > DRAMDATASIZE) {
        printf("*** fpgaload: len out of range!\n");
        return -1;  // Verify length of transfer was understood!
    }
    if (calclen < 2 || calclen > len / 2) {
        printf("*** fpgaload: calclen out of range!\n");
        return -2;  // Reasonable calc amount?
    }

    bfloat16nn_bEnable_write(0);                // Disable transfer (if still active for some reason ...)
    *sentinel1 = 0x41434142;                    // Marker in the last word of the first buffer half
    *sentinel2 = 0x41434142 + 1;                // Marker in the last word of the buffer
    bfloat16nn_b32Sentinel_write(*sentinel1);   // Hand the first marker value to the hardware
    flush_l2_cache();                           // Strictly nec. for longer transfers
    bfloat16nn_b10ArrayWordLen_write(calclen);  // Indicate array length for calc.
    bfloat16nn_b32DRAMLoadAddress_write((uint32_t)(uintptr_t)mempt);  // Indicate memory to load from
    bfloat16nn_bEnable_write(1);                // Finally: Engage!

    // Up to POLL_ROUNDS polls; roughly 2 s in total if busy_wait(1) waits 1 ms as its prototype suggests
    for (int round = 0; round < POLL_ROUNDS; round++) {
        if (bfloat16nn_b16Status_read() & STATUS_READY) {
            bfloat16nn_bEnable_write(0);        // Disable transfer
            fpgastate = 0;
            fpustates = 0;
            return 1;                           // Ok, ready!
        }
        busy_wait(1);                           // Just wait some time ...
    }

    // Keep the register contents so the caller can report why the load did not finish
    fpgastate = (uint32_t)bfloat16nn_b16Status_read();
    fpustates = (uint32_t)bfloat16nn_b16FPUStates_read();
    bfloat16nn_bEnable_write(0);                // Disable transfer
    return 0;                                   // Timeout
}
/*
static float fp1_1_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_1_read(); // Low-endian, high half word required
float *fpt = (float *)&v;
return *fpt;
}
static float fp1_2_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value1_2_read();
float *fpt = (float *)&v;
return *fpt;
}
static float fp2_1_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_1_read();
float *fpt = (float *)&v;
return *fpt;
}
static float fp2_2_read(void)
{
uint32_t v __attribute__((aligned(16))) = 0;
*(((uint16_t *)&v) + 1) = bfloat16nn_b16Value2_2_read();
float *fpt = (float *)&v;
return *fpt;
}
*/
// Read the 16-bit Result1 register and widen it to a float:
// the register value becomes the upper 16 bits of the 32-bit pattern, the lower 16 bits are zero.
// Uses shift + memcpy instead of pointer casts, so it neither breaks the aliasing rules
// nor depends on where the "high half word" sits in memory.
static float fpResult1_read(void)
{
    uint32_t bits = (uint32_t)bfloat16nn_b16Result1_read() << 16;
    float value;

    memcpy(&value, &bits, sizeof value);
    return value;
}
// Read the 16-bit Result2 register and widen it to a float:
// the register value becomes the upper 16 bits of the 32-bit pattern, the lower 16 bits are zero.
// Uses shift + memcpy instead of pointer casts, so it neither breaks the aliasing rules
// nor depends on where the "high half word" sits in memory.
static float fpResult2_read(void)
{
    uint32_t bits = (uint32_t)bfloat16nn_b16Result2_read() << 16;
    float value;

    memcpy(&value, &bits, sizeof value);
    return value;
}
// Return the upper 16 bits of the 32-bit pattern of f (sign, exponent, top 7 mantissa bits).
// The lower 16 bits are simply dropped (truncation, no rounding).
// memcpy + shift replaces the former pointer cast: no aliasing violation, no byte-order dependence.
static uint16_t f2ui16(float f)
{
    uint32_t bits;

    memcpy(&bits, &f, sizeof bits);
    return (uint16_t)(bits >> 16);
}
extern void dumpfloat(float f);
// Print the 32-bit pattern of f next to the 16-bit value f2ui16() derives from it.
// The pattern is copied out with memcpy (no aliasing cast) and both arguments are
// converted to unsigned int so that they match the %X conversions.
void dumpfloat(float f)
{
    uint32_t bits;

    memcpy(&bits, &f, sizeof bits);
    printf("%08Xh -> %04Xh\n", (unsigned int)bits, (unsigned int)f2ui16(f));
}
#define MAXCALCLEN (284) //784 //16 OK

// Print the dram2fpga data registers for offsets first .. limit-1, one line per offset.
static void dump_dram2fpga_window(int first, int limit)
{
    for (int offset = first; offset < limit; offset++) {
        dram2fpga_b10Offset1_write(offset);
        dram2fpga_b10Offset2_write(offset);
        int data1 = (int)dram2fpga_b32Data1_read();
        int data2 = (int)dram2fpga_b32Data2_read();
        printf("%d: %d=%d\n", offset, data1, data2);
    }
}

// Print the diagnostics for a load that timed out: saved states, current offsets, sentinels, data windows.
static void dump_load_timeout(void)
{
    printf("CURRENT TIMEOUT: S=%04Xh: FS=%04Xh ", (unsigned int)fpgastate, (unsigned int)fpustates);

    int offset1 = (int)dram2fpga_b10Offset1_read();
    int data1 = (int)dram2fpga_b32Data1_read();
    printf("Offset 1: %d (%d) ", offset1, data1);

    int offset2 = (int)dram2fpga_b10Offset2_read();
    int data2 = (int)dram2fpga_b32Data2_read();
    printf("Offset 2: %d (%d)", offset2, data2);

    unsigned int marker = (unsigned int)bfloat16nn_b32Sentinel_read();
    unsigned int current = (unsigned int)dram2fpga_b32Data1_read();
    printf("Sentinels: %08Xh %08Xh\n", marker, current);

    dump_dram2fpga_window(0, 10);                                       // Start of buffer
    dump_dram2fpga_window(DRAMDATASIZE/2 - 5, DRAMDATASIZE/2 + 5);      // Around the middle
    dump_dram2fpga_window(512 - 5, 512 + 5);                            // Around word 512 (same window while DRAMDATASIZE is 1024)
    dump_dram2fpga_window(DRAMDATASIZE - 10, DRAMDATASIZE - 1);         // End of buffer, last word excluded

    // Last word is printed separately and marked with asterisks
    dram2fpga_b10Offset1_write(DRAMDATASIZE - 1);
    dram2fpga_b10Offset2_write(DRAMDATASIZE - 1);
    data1 = (int)dram2fpga_b32Data1_read();
    data2 = (int)dram2fpga_b32Data2_read();
    printf("%d:*%d=%d*\n", DRAMDATASIZE - 1, data1, data2);
}

// key_eval: fetch one key code via kbhit() and run the matching demo action.
//   'r'  Fill the DRAM buffer with a test pattern, run fpgaload() twice and report the outcome
//   's'  Show the current status, FPU state and both result registers
//   'x'  Abort
// Returns 1 for 'x', 0 in every other case.
//
// fpgaload() returns > 0 on success, 0 on timeout and < 0 for rejected arguments
// (it prints its own message then), so all three outcomes are told apart here.
int key_eval(void)
{
    extern void printf1(const char *fmt, float f1);
    uint32_t *const sentinel = (uint32_t *)(DRAMDATABASE + (DRAMDATASIZE - 1) * sizeof(int32_t));
    uint32_t *words = (uint32_t *)DRAMDATABASE;
    float fpResult1;
    float fpResult2;
    uint32_t starttime;
    uint32_t deltatime;
    int rc;
    int i;

    switch (kbhit()) {
    case 'r':   // Reload
        printf("\e[35;1m*** Reload ***\e[0m\n");
        printf("Elements/FPU: %d\n", MAXCALCLEN);
        for (i = 0; i < DRAMDATASIZE; i++)  // Clear all memory (every word is overwritten again right below)
            words[i] = 0;
        for (i = 0; i < DRAMDATASIZE; i++)  // Setup test data: word i holds i+1
            words[i] = (uint32_t)(i + 1);
        /*
        // TODO: Control procedure w/ regular code (matrice inner product)
        float *floatptr = (float *)DRAMDATABASE;
        float *floatptr2 = (float *)(DRAMDATABASE + (MAXCALCLEN/2) * sizeof(float)); // Absolute: bytes!
        for(i=1;i<=MAXCALCLEN/2;i++) {
        *floatptr++ = (1.0 * (float)i);
        *floatptr2++ = (1.0 * (float)i);
        }
        floatptr = (float *)DRAMDATABASE;
        floatptr2 = (float *)(DRAMDATABASE + (MAXCALCLEN/2) * sizeof(float)); // Absolute: bytes!
        starttime = systime(0);
        float sum = 0.0;
        for(i=1;i<=MAXCALCLEN/2;i++) {
        sum += ((*floatptr++) * (*floatptr2++)); // 1*1+2*2+3*3+4*4 = 1+4+9+16 = 5+9+16 = 14+16=30
        }
        deltatime = systime(0)-starttime;
        printf("S/W Delta t: %dms ", deltatime);
        printf1("\t\t\tS/W SUM=%8.4f\n", sum);
        // FPU#1
        uint16_t *ui16ptr1 = (uint16_t *)(DRAMDATABASE + 0 * sizeof(uint32_t)); // Absolute: bytes!
        for(i=1;i<=MAXCALCLEN;i++)
        *ui16ptr1++ = f2ui16(1.0 * (float)i );
        // FPU#2
        uint16_t *ui16ptr2 = (uint16_t *)(DRAMDATABASE + (DRAMDATASIZE/2) * sizeof(uint32_t)); // Absolute: bytes!
        for(i=1;i<=MAXCALCLEN;i++)
        *ui16ptr2++ = f2ui16(1.0 * (float)i );
        */
        starttime = systime(0);
        rc = fpgaload((uint32_t *)DRAMDATABASE, DRAMDATASIZE, MAXCALCLEN/2);
        if (rc > 0) {
            deltatime = systime(0) - starttime;
            printf("H/W Delta t: %dms ", (int)deltatime);
            fpResult1 = fpResult1_read();
            (void)fpResult2_read();     // Value not shown here; read kept so the register access sequence stays the same
            printf("(S=%04Xh: FS=%04Xh)", (unsigned int)fpgastate, (unsigned int)fpustates);
            // NOTE(review): the value comes from the hardware result register, yet the label says "S/W" - confirm wording
            printf1("\tS/W SUM=%8.4f\n", fpResult1);
        }
        else if (rc == 0) {
            dump_load_timeout();
        }
        // rc < 0: arguments rejected, fpgaload() has reported that already

        // NOTE(review): fpgaload() writes its second marker to this very word before it
        // starts the transfer, so this store looks ineffective - TODO confirm intent
        *sentinel = 0;  // Invalidate data!
        rc = fpgaload((uint32_t *)DRAMDATABASE, DRAMDATASIZE, MAXCALCLEN/2);
        if (rc > 0) {
            (void)fpResult1_read();     // Results are not printed in this pass; reads kept as before
            (void)fpResult2_read();
            printf("INVALIDATED: S=%04Xh: FS=%04Xh\n", (unsigned int)fpgastate, (unsigned int)fpustates);
        }
        else if (rc == 0) {
            printf("INVALIDATED TIMEOUT: S=%04Xh: FS=%04Xh\n", (unsigned int)fpgastate, (unsigned int)fpustates);
        }
        break;

    case 's': { // Show
        fpResult1 = fpResult1_read();
        fpResult2 = fpResult2_read();
        unsigned int status = (unsigned int)bfloat16nn_b16Status_read();
        unsigned int fpu = (unsigned int)bfloat16nn_b16FPUStates_read();
        printf("REQUESTED: S=%04Xh: FS=%04Xh\n", status, fpu);
        printf1("RESULT1=%8.4f\n", fpResult1);
        printf1("RESULT2=%8.4f\n", fpResult2);
        break;
    }

    case 'x':
        return 1;   // Abort indication

    default:
        break;
    }
    return 0;
}
// Demo entry point: print the key help, then keep calling key_eval()
// until it returns non-zero, and print the closing message.
void bfloat16nn_demo(void)
{
    printf("\e[33;1mbfloat16nn demo: Press [r]eload, [s]how or e[x]it ...\e[0m\n");

    for (;;) {
        if (key_eval() != 0)    // Non-zero return: leave the demo
            break;
    }

    printf("\e[33;1mbfloat16nn demo terminated.\e[0m\n");
}