// // bfloat16nnlib.c // bfloat16 neural network support // // History: // -------- // 24.04.21/KQ Initial version // #include #include #include #include #include #include #include #include #include #include "boot.h" #include "readline.h" #include "helpers.h" #include "command.h" #include "../../build/colorlight_5a_75b/software/include/generated/csr.h" #include "../../build/colorlight_5a_75b/software/include/generated/soc.h" #include "../../build/colorlight_5a_75b/software/include/generated/mem.h" #include "../../build/colorlight_5a_75b/software/include/generated/git.h" #include #include #include #include #include #include #include "../include/systime.h" #include "../include/bfloat16nnlib.h" extern void busy_wait(unsigned int ms); // Worx! extern char kbhit(void); extern int key_eval(void); #define DRAMDATABASE1 0x40190000 #define DRAMDATASIZE1 400 // 512 OK => 2 Load cycles (2*512)! #define DRAMDATABASE2 (DRAMDATABASE1 + (DRAMDATASIZE1*sizeof(int32_t))) #define DRAMDATASIZE2 DRAMDATASIZE1 static uint32_t fpgastate, fpustates; static int fpgaload(uint32_t *mempt1, uint32_t *mempt2, uint16_t calclen, int bReload1, int bReload2) { uint32_t *sentinel1 = (uint32_t *)(DRAMDATABASE1 + ((DRAMDATASIZE1-1) * sizeof(uint32_t))); uint32_t *sentinel2 = (uint32_t *)(DRAMDATABASE2 + ((DRAMDATASIZE2-1) * sizeof(uint32_t))); static uint32_t seqno = 0x41434142; // Just some marker pattern ;) bfloat16nn_bEnable_write(0); // Disable transfer (if still active for some reason ...) *sentinel1 = seqno++; *sentinel2 = seqno++; bfloat16nn_b32Sentinel1_write(*sentinel1); bfloat16nn_b32Sentinel2_write(*sentinel2); flush_l2_cache(); // Strictly nec. for longer transfers bfloat16nn_b10ArrayWordLen_write(calclen>>1); // Indicate array length for calc. but honour split! bfloat16nn_b32DRAMLoadAddress1_write((uint32_t)mempt1); // Indicate memory to load from bfloat16nn_b32DRAMLoadAddress2_write((uint32_t)mempt2); // Indicate memory to load from bfloat16nn_bReload1_write(bReload1 ? 1 : 0); // Reload mem#1 bfloat16nn_bReload2_write(bReload2 ? 1 : 0); // Reload mem#2 bfloat16nn_bEnable_write(1); // Finally: Engage! for(int i=0;i<2000;i++) { // Max. 100ms delay if(bfloat16nn_b16Status_read() & 0x8000) { fpgastate = (uint32_t)bfloat16nn_b16Status_read(); fpustates = (uint32_t)bfloat16nn_b16FPUStates_read(); bfloat16nn_bEnable_write(0); // Disable transfer return 1; // Ok, ready! } else busy_wait(1); // Just wait some time ... } fpgastate = (uint32_t)bfloat16nn_b16Status_read(); fpustates = (uint32_t)bfloat16nn_b16FPUStates_read(); bfloat16nn_bEnable_write(0); // Disable transfer return 0; // Timeout } static float fpResult1_read(void) { uint32_t v __attribute__((aligned(16))) = 0; *(((uint16_t *)&v) + 1) = bfloat16nn_b16Result1_read(); float *fpt = (float *)&v; return *fpt; } static float fpResult2_read(void) { uint32_t v __attribute__((aligned(16))) = 0; *(((uint16_t *)&v) + 1) = bfloat16nn_b16Result2_read(); float *fpt = (float *)&v; return *fpt; } static uint16_t f2ui16(float f) { return *(((uint16_t *)&f)+1); // High half word needed (low-endian), hence ... } extern void dumpfloat(float f); void dumpfloat(float f) { printf("%08Xh -> %04Xh\n", *(uint32_t *)&f, f2ui16(f)); } int key_eval(void) { extern void printf1(const char *fmt, float f1); //uint32_t *ui32ptr1, *ui32ptr2; uint16_t *ui16ptr1, *ui16ptr2; float fpResult1, fpResult2; uint32_t starttime; uint32_t deltatime; int i; #define MAXCALCLEN 784 //784 //16 OK switch(kbhit()) { case 'r': // Reload printf("\e[35;1m*** Reload ***\e[0m\n"); printf("Elements/FPU: %d\n", MAXCALCLEN); printf("DRAM1->%08Xh DRAM2->%08Xh\n", DRAMDATABASE1, DRAMDATABASE2); /* for(i=0, ui32ptr1 = (uint32_t *)DRAMDATABASE1;i