A nice MLP neural network in HIP for AMD GPUs


Postby hbyte » Sat Jul 13, 2024 9:52 pm

This is my first rough draft of an MLP running on an AMD GPU, and it solves XOR really fast. It needs 1dkern.cpp, which is listed at the end.
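To build it, I compile the main listing as a single file with hipcc (the file name below is just a placeholder; since the listing #includes 1dkern.cpp, only the one file needs compiling):

Code:
hipcc mlp_xor_hip.cpp -o mlp_xor
./mlp_xor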

Code:
#include "hip/hip_runtime.h"
#include <stdio.h>
#include <stdlib.h>
#include <fstream>
#include <iostream>
#include <ctime>
#include <time.h>
#include "1dkern.cpp"

using namespace std;

typedef struct{

float *B_a,*A_a,*W_a,*D_a,*P_a,*P_w,*E_a,*V_a;   /* biases, activations, weights, deltas, pre-deltas, pre-weights, errors, training values */
float *L_a,*WL_a,*C_e;                           /* unused in this draft */
int *LU_a,*P_l,*W_l,*A_f;                        /* per-layer constants, act-layer offsets, weight-layer offsets, activation-function ids */
int ITER,NLAYS,NN,BS,NVAL,WTOT,NS;               /* iteration, layer count, parallel nets, batch size, values per sample, total weights, net size */
float Lrt,Decy,Mmnt;                             /* learning rate, weight decay, momentum */
}myparam;

int *LU_a;   /* device copy of the per-layer constants (6 ints per layer) */

float getlrand(float lower,float upper){

           return ((float) rand()/ RAND_MAX) * (upper-lower) + lower;

}

/* Call all Act functions - includes rng for noisy Acts */
__device__ float mytanh_(float x)
{
  /* use the intrinsic for numerical stability; expf(2*x) overflows for large |x| */
  return tanhf(x);
}

__device__ float sigmoid(float v)
{
   return 1 / (1 + expf(-v));
}


__device__ float InvSig(const float sum,const float act)
{
   return (act*(1-act)*sum) ;
}
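
/* ActFun: dir == 0 evaluates the activation on v; dir == 1 returns the
   activation derivative (using u as the already-computed activation)
   multiplied by the incoming value v. type selects Tanh (0), Sigmoid (1)
   or pass-through "Norm" (2); seed is reserved for the noisy activations
   mentioned above. */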

__device__ float ActFun(int type, float v, int dir, float u,int seed) {
    switch(type) {
        case 0: /* Tanh */
            return (dir == 0) ? mytanh_(v) : (1.0f - mytanh_(u) * mytanh_(u)) * v;
        case 1: /* Sigmoid */
            return (dir == 0) ? 1.0f / (1.0f + expf(-v)) : u * (1.0f - u) * v;
        case 2: /* Norm */
            return v;
        default:
            return v;
    }
}


__device__ float rng(unsigned int m_w,unsigned int m_z){

   /* Marsaglia-style multiply-with-carry generator; the two seeds are
      combined into a 32-bit value and returned as a float */
   m_z = 36969 * (m_z & 65535) + (m_z >> 16);
   m_w = 18000 * (m_w & 65535) + (m_w >> 16);

   return (m_z << 16) + m_w;  /* 32-bit result */

}
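
/* INIT_PARAM: optional device-side randomisation of the weights, deltas and
   biases using the rng() helper above. In call_nn() this kernel is left
   commented out and the host-side initialisation in the net constructor is
   used instead. */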

__global__ void INIT_PARAM(myparam PARAM){

int TW,NW;

TW=PARAM.W_l[PARAM.NLAYS-1];
NW=PARAM.P_l[PARAM.NLAYS-1];


GPU_1D_KERN_LOOP(index,TW){

const int ls = (index / 1 )        %TW;      

PARAM.W_a[ls] = -(2425-rng(150+ls,40)/1000000);
PARAM.D_a[ls] = -(2425-rng(150+ls,40)/1000000);
PARAM.P_w[ls] = -(2425-rng(150+ls,40)/1000000);
PARAM.P_a[ls] = -(2425-rng(150+ls,40)/1000000);

                        }

__syncthreads();


GPU_1D_KERN_LOOP(index,NW){

const int ls = (index / 1 )        %NW;      

PARAM.B_a[ls] = -(2425-rng(150+ls,40)/1000000);


                        }


}
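
/* NN_FF: forward pass. LU_a holds six constants per layer:
   [0] NIN  - inputs to the layer        [1] NOUT - outputs of the layer
   [2] SIN  - offset of the input acts   [3] SOUT - offset of the output acts
   [4] WSTR - first weight index         [5] WEND - one past the last weight
   A_a is laid out as [net][batch][neuron] with stride NETSIZE per batch entry. */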



__global__ void NN_FF(myparam PARAM,const int* const __restrict__ LU_a){

//hipStream_t astream;
//hipStreamCreateWithFlags(&astream,hipStreamNonBlocking);

const int NLAYS = PARAM.NLAYS;

for(int LAYER=1;LAYER<NLAYS;LAYER++){


/* Set Constants */
const int NIN  = LU_a[LAYER*6+0];
const int NOUT = LU_a[LAYER*6+1];
const int SIN  = LU_a[LAYER*6+2];
const int SOUT = LU_a[LAYER*6+3];


const int NVAL = PARAM.NVAL;            //Number of Training Values (XoR or RL)
const int NUMNET = PARAM.NN;
const int ITER = PARAM.ITER;
const int BS = PARAM.BS;
const int NETSIZE = PARAM.NS;



if(LAYER==1){

/*Set Inputs*/

GPU_1D_KERN_LOOP(index,NIN*NUMNET*BS){

   const int ls = (index / 1 )        %NIN;      //Outputs / layer size
   const int bs = (index / NIN)        %BS;
   const int nn  = (index / NIN / BS) %NUMNET;
   
   
   PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+ls]=ActFun(PARAM.A_f[LAYER-1],PARAM.V_a[ITER*BS*NVAL+bs*NVAL+ls],0,0,1);
   
   
                                    }
__syncthreads();            

                     }
                                    
if(1){

GPU_1D_KERN_LOOP(index,NOUT*NUMNET*BS){

   const int ls = (index / 1 )          %NOUT;      //Outputs / layer size
   const int bs = (index / NOUT)        %BS;
   const int nn  = (index / NOUT / BS)  %NUMNET;

   //Reset this layer's outputs to the bias before the weighted inputs are accumulated
   PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls]=PARAM.B_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls];

}

__syncthreads();            

//Feedforward to MLP layer

GPU_1D_KERN_LOOP(index, NIN*NOUT*NUMNET*BS){

   
   
      const int ls = (index / 1 )                  %NOUT;      //Outputs / layer size
      const int col = (index / NOUT )             %NIN;      //Inputs /  size of feature map
      const int bs = (index / NOUT / NIN )          %BS;
      const int nn  = (index / NOUT / NIN / BS )   %NUMNET;
                           
//Accumulate the weighted inputs (the bias is already in A_a from the reset above)
atomicAdd(&PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls],PARAM.W_a[PARAM.W_l[LAYER-1]+(col+ls*NIN)]*PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SIN+col]);

                                          }

__syncthreads();            
                                    
if(1){

GPU_1D_KERN_LOOP(index, NOUT*BS*NUMNET){

      const int ls = (index / 1 )            %NOUT;      //Outputs / layer size
      const int bs = (index / NOUT  )        %BS;
      const int nn  = (index / NOUT / BS )   %NUMNET;
      
PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls] = ActFun(PARAM.A_f[LAYER],PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls],0,0,1);
                           }

}
                                    
__syncthreads();                                    


}
                                    }//Layers Loop

}
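
/* NN_BP: backward pass. The output-layer error is the activation derivative
   applied to (target - output); hidden errors are the weighted sums of the
   errors in the layer above. Weights are then updated with a momentum term
   (Mmnt * previous delta), a learning-rate term (Lrt * error * input
   activation) and weight decay (-Decy * previous weight). */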
__global__ void NN_BP(myparam PARAM, const int* const __restrict__ LU_a) {
    const int NLAYS = PARAM.NLAYS;
    const int BS = PARAM.BS;
    const int NUMNET = PARAM.NN;
    const int NETSIZE = PARAM.NS;

    for(int LAYER = NLAYS-1; LAYER > 0; LAYER--) {
        const int NIN = LU_a[LAYER*6+0];
        const int NOUT = LU_a[LAYER*6+1];
        const int SIN = LU_a[LAYER*6+2];
        const int SOUT = LU_a[LAYER*6+3];
        const int WSTR = LU_a[LAYER*6+4];

        if(LAYER == NLAYS-1) {
            // Output layer error
            GPU_1D_KERN_LOOP(index, NOUT*BS*NUMNET) {
                const int nl = index % NOUT;
                const int bs = (index / NOUT) % BS;
                const int nn = index / (NOUT * BS);

                int ERR_POS = nn*BS*NETSIZE + bs*NETSIZE + SOUT + nl;
                int VAL_POS = PARAM.ITER*BS*PARAM.NVAL + bs*PARAM.NVAL + (PARAM.NVAL-NOUT) + nl;
                int ACT_POS = nn*BS*NETSIZE + bs*NETSIZE + SOUT + nl;

                float target = PARAM.V_a[VAL_POS];
                float output = PARAM.A_a[ACT_POS];
               
            //PARAM.E_a[ERR_POS] = (target - output) * output * (1 - output); // Derivative of sigmoid
            //PARAM.E_a[ERR_POS] = (target - output);
            
            PARAM.E_a[ERR_POS] = ActFun(PARAM.A_f[NLAYS-1],target-output,1,output,1);
            }
        }
      
         
            // Hidden layer error calculation
            GPU_1D_KERN_LOOP(index, NIN*BS*NUMNET) {
                const int nl = index % NIN;
                const int bs = (index / NIN) % BS;
                const int nn = index / (NIN * BS);

                int ERR_LO = nn*BS*NETSIZE + bs*NETSIZE + SIN + nl;
                PARAM.E_a[ERR_LO] = 0;

                for(int j = 0; j < NOUT; j++) {
                    int ERR_UP = nn*BS*NETSIZE + bs*NETSIZE + SOUT + j;
                    int WGT_POS = WSTR + j*NIN + nl;
                    PARAM.E_a[ERR_LO] += PARAM.W_a[WGT_POS] * PARAM.E_a[ERR_UP];
                }
            }
            __syncthreads();
         
            // Apply activation derivative to hidden layer error
            GPU_1D_KERN_LOOP(index, NIN*BS*NUMNET) {
                const int nl = index % NIN;
                const int bs = (index / NIN) % BS;
                const int nn = index / (NIN * BS);

                int ERR_LO = nn*BS*NETSIZE + bs*NETSIZE + SIN + nl;
                int ACT_POS = ERR_LO;

                float act = PARAM.A_a[ACT_POS];
            
            PARAM.E_a[ERR_LO] = ActFun(PARAM.A_f[LAYER-1],PARAM.E_a[ERR_LO],1,PARAM.A_a[ACT_POS],1);
                atomicAdd(&PARAM.B_a[ERR_LO],PARAM.Lrt*ActFun(PARAM.A_f[LAYER-1],PARAM.E_a[ERR_LO],1,PARAM.A_a[ACT_POS],1));
            
            }
       

        __syncthreads();

         

      //Update PA and PW
      GPU_1D_KERN_LOOP(index, NIN*NOUT) {
         const int nl = (index / 1 )             %NOUT;   //Number of labels
         const int us = (index /NOUT )          %NIN;      //layer size
         
         
         PARAM.P_a[WSTR+nl*NIN+us]=PARAM.D_a[WSTR+nl*NIN+us];
         PARAM.P_w[WSTR+nl*NIN+us]=PARAM.W_a[WSTR+nl*NIN+us];
         
         }

      __syncthreads();

        // Update weights
         GPU_1D_KERN_LOOP(index, NIN*NOUT*BS*NUMNET) {
            const int bs = (index / 1)                     %BS;
         const int nn  = (index / BS)                    %NUMNET;
         const int nl = (index / BS / NUMNET )             %NOUT;   //Number of labels
         const int us = (index / BS / NUMNET /NOUT )       %NIN;      //layer size   

         
         atomicAdd(&PARAM.W_a[WSTR+nl*NIN+us],PARAM.Mmnt*PARAM.P_a[WSTR+nl*NIN+us]);            //Deltas   
   
         atomicAdd(&PARAM.W_a[WSTR+nl*NIN+us],PARAM.Lrt*PARAM.E_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl]*PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SIN+us]);         //Lrate
   
         atomicAdd(&PARAM.W_a[WSTR+nl*NIN+us],-PARAM.Decy*PARAM.P_w[WSTR+nl*NIN+us]);            //Preweights

         PARAM.D_a[WSTR+nl*NIN+us] = PARAM.W_a[WSTR+nl*NIN+us];

        }
      
      __syncthreads();
      
         GPU_1D_KERN_LOOP(index, NIN*NOUT) {
         const int nl = (index / 1 )             %NOUT;   //Number of labels
         const int us = (index /NOUT )          %NIN;      //layer size
         
         
         atomicAdd(&PARAM.D_a[WSTR+nl*NIN+us],-PARAM.P_a[WSTR+nl*NIN+us]);
         
         }
               

        __syncthreads();
      
      
   
   }
}
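
/* Host-side driver: the net class builds the per-layer offset tables
   (PLAYR, WPLAYR, L_ua), initialises the weights and the XOR training data,
   allocates device buffers with hipMallocManaged, and then launches
   NN_FF / NN_BP once per training iteration in call_nn(). */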

class net{
public:
myparam PARAM_DEV,PARAM_HOST;
int *L_ua,*PLAYR,*LAYERS,*ACTFI,*WPLAYR;
int NLAYS,BS,TI,NUMNET;
float *VAL;
clock_t startTime, endTime;

ofstream errfile;

~net(){
/*
hipFree(PARAM_DEV.B_a);hipFree(PARAM_DEV.A_a);hipFree(PARAM_DEV.W_a);hipFree(PARAM_DEV.D_a);hipFree(PARAM_DEV.P_a);hipFree(PARAM_DEV.P_w);hipFree(PARAM_DEV.E_a);hipFree(PARAM_DEV.V_a);
hipFree(PARAM_DEV.L_a);hipFree(PARAM_DEV.WL_a);hipFree(PARAM_DEV.C_e);   
hipFree(PARAM_DEV.P_l);hipFree(PARAM_DEV.W_l);hipFree(PARAM_DEV.A_f);   
hipFree(LU_a);hipFree(PARAM_DEV.LU_a);

delete(PARAM_HOST.B_a);delete(PARAM_HOST.A_a);delete(PARAM_HOST.W_a);delete(PARAM_HOST.D_a);delete(PARAM_HOST.P_a);delete(PARAM_HOST.P_w);delete(PARAM_HOST.E_a);delete(PARAM_HOST.V_a);
delete(PARAM_HOST.L_a);delete(PARAM_HOST.WL_a);delete(PARAM_HOST.C_e);   
delete(PARAM_HOST.P_l);delete(PARAM_HOST.W_l);delete(PARAM_HOST.A_f);   


delete PLAYR,LAYERS,ACTFI,WPLAYR,VAL,L_ua;
*/
}

net(int LAYERS_[],const char* ACTF[],int NLAYERS){

TI = 2000;                  //Number of Iterations
BS = PARAM_DEV.BS = 12;         //Batch Size
PARAM_DEV.ITER = 0;   //Start of Iterations
NUMNET = PARAM_DEV.NN = 1;   //Number of Parallel Nets
PARAM_DEV.Lrt = (0.5/NUMNET)*BS; PARAM_DEV.Mmnt = 0.0025/(BS*NUMNET); PARAM_DEV.Decy = 0.000065/(BS*NUMNET);

errfile.open("errfile.dat",ios::out);

srand(199283);

NLAYS = PARAM_DEV.NLAYS = NLAYERS;
LAYERS = new int[NLAYERS];
PLAYR = new int[NLAYERS];
WPLAYR = new int[NLAYERS];
ACTFI = new int[NLAYERS];
L_ua = new int[NLAYS*6];      //Layer Constants

for(int i=0;i<NLAYS;i++){

ACTFI[i]=0;

if(ACTF[i][0]=='T'&&ACTF[i][1]=='A'&&ACTF[i][2]=='N'&&ACTF[i][3]=='H'){ACTFI[i]=0;}
if(ACTF[i][0]=='S'&&ACTF[i][1]=='I'&&ACTF[i][2]=='G'&&ACTF[i][3]=='M'){ACTFI[i]=1;}
if(ACTF[i][0]=='N'&&ACTF[i][1]=='O'&&ACTF[i][2]=='R'&&ACTF[i][3]=='M'){ACTFI[i]=2;}


if(i==0){
PLAYR[0]  = LAYERS_[0];
WPLAYR[0] = 0;
      }
if(i>0){
PLAYR[i]  = PLAYR[i-1]+LAYERS_[i];            //Neurons
WPLAYR[i] = WPLAYR[i-1]+LAYERS_[i]*LAYERS_[i-1];   //Weights   
      }
LAYERS[i] = LAYERS_[i];      
                     }

PARAM_DEV.NVAL = LAYERS[NLAYS-1]+LAYERS[0];         //Set number of Values to learn (3)
PARAM_DEV.WTOT = WPLAYR[NLAYS-1];
PARAM_DEV.NS   = PLAYR[NLAYS-1];               //Netsize

for(int i=1;i<NLAYS;i++){
if(i>1){
L_ua[i*6+0] = PLAYR[i-1]-PLAYR[i-2];//NIN
L_ua[i*6+1] = PLAYR[i]-PLAYR[i-1];//NOUT
L_ua[i*6+2] = PLAYR[i-2];//SIN
L_ua[i*6+3] = PLAYR[i-1];//SOUT
L_ua[i*6+4] = WPLAYR[i-1];//WSTR
L_ua[i*6+5] = WPLAYR[i];//WEND
   }else{
L_ua[i*6+0] = PLAYR[0];//NIN
L_ua[i*6+1] = PLAYR[1]-PLAYR[0];//NOUT
L_ua[i*6+2] = 0;//SIN
L_ua[i*6+3] = PLAYR[0];//SOUT
L_ua[i*6+4] = WPLAYR[i-1];//WSTR
L_ua[i*6+5] = WPLAYR[i];//WEND
}
}




PARAM_HOST.W_a = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.P_w = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.D_a = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.P_a = new float[WPLAYR[NLAYS-1]];

PARAM_HOST.E_a = new float[NUMNET*BS*PLAYR[NLAYS-1]];
PARAM_HOST.A_a = new float[NUMNET*BS*PLAYR[NLAYS-1]];
PARAM_HOST.B_a = new float[NUMNET*BS*PLAYR[NLAYS-1]];

for(int i=0;i<BS*NUMNET*PLAYR[NLAYS-1];i++){

PARAM_HOST.A_a[i] = 0;
PARAM_HOST.B_a[i] = getlrand(0,1);
PARAM_HOST.E_a[i] = 0;               
                              }


VAL  = new float[(LAYERS[NLAYS-1]+LAYERS[0])*BS*TI];
prep_val(VAL,0);
                                 /*Init weights and deltas*/
for(int i=0;i<WPLAYR[NLAYS-1];i++){

//PARAM_HOST.W_a[i] = getlrand(0,1);
PARAM_HOST.W_a[i] = getlrand(-1.0,1.0)/sqrt((float)LAYERS[0]); /*Xavier-style initialization: scale by 1/sqrt(fan-in)*/

PARAM_HOST.D_a[i] = 0;//getlrand(0,1);
PARAM_HOST.P_a[i] = 0;//getlrand(0,1);
PARAM_HOST.P_w[i] = 0;//getlrand(0,1);
                        
                           }

                                                }



void mem_alloc(){


hipMallocManaged((void **)&PARAM_DEV.A_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float));   //ACT
hipMallocManaged((void **)&PARAM_DEV.B_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float));   //BIAS
hipMallocManaged((void **)&PARAM_DEV.E_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float));   //ERR
hipMallocManaged((void **)&PARAM_DEV.V_a, TI * BS * (LAYERS[0]+LAYERS[NLAYS-1]) * sizeof(float)); //VAL
hipMallocManaged((void **)&PARAM_DEV.W_a, WPLAYR[NLAYS-1] *  sizeof(float));   //WGT
hipMallocManaged((void **)&PARAM_DEV.D_a, WPLAYR[NLAYS-1] *  sizeof(float));   //DLT
hipMallocManaged((void **)&PARAM_DEV.P_a, WPLAYR[NLAYS-1] *  sizeof(float));   //PREDLT
hipMallocManaged((void **)&PARAM_DEV.P_w, WPLAYR[NLAYS-1] *  sizeof(float));   //PWGT
hipMallocManaged((void **)&PARAM_DEV.A_f, NLAYS * sizeof(int));   //ACTFI

hipMemcpy(PARAM_DEV.A_f, ACTFI, NLAYS * sizeof(int), hipMemcpyHostToDevice);
hipMemcpy(PARAM_DEV.V_a,VAL, TI * BS * (LAYERS[0]+LAYERS[NLAYS-1]) * sizeof(float), hipMemcpyHostToDevice);  /*Batch load Values Transfer to device*/
hipMallocManaged((void **)&PARAM_DEV.P_l, NLAYS *  sizeof(int));   //Pointer to Act layers
hipMallocManaged((void **)&PARAM_DEV.W_l, NLAYS *  sizeof(int));   //Pointer to Wgt layers
hipMallocManaged((void **)&LU_a, NLAYS * 6 *  sizeof(int));   //Per-layer constants (6 ints per layer)


hipMemcpy(PARAM_DEV.B_a, PARAM_HOST.B_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(PARAM_DEV.W_a, PARAM_HOST.W_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(PARAM_DEV.D_a, PARAM_HOST.D_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(PARAM_DEV.P_a, PARAM_HOST.P_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(PARAM_DEV.P_w, PARAM_HOST.P_w, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyHostToDevice);

hipMemcpy(PARAM_DEV.W_l, WPLAYR, NLAYS * sizeof(int), hipMemcpyHostToDevice);
hipMemcpy(PARAM_DEV.P_l, PLAYR, NLAYS * sizeof(int), hipMemcpyHostToDevice);
hipMemcpy(LU_a, L_ua, NLAYS * 6 * sizeof(int), hipMemcpyHostToDevice);


                  }

void prep_val(float *VAL,int EPOCH){

  float xord[4][3] = {
        {0, 0, 0},
        {0, 1, 1},
        {1, 0, 1},
        {1, 1, 0}
    };
int select=0;

for(int i=0;i<BS*TI;i++){

if(1){
select = rand()%4;
}else{
select++;
if(select==4)select=0;
}


VAL[i*3]   = xord[select][0];
VAL[i*3+1] = xord[select][1];
VAL[i*3+2] = xord[select][2];

                     
}

}

void call_nn(){


int NSIZE = NUMNET*BS*WPLAYR[NLAYS-1];

int MAXTHREADS = 1024;      //Threads per block

int NUMBLOCKS,NUMTHREADS;

if(NSIZE<MAXTHREADS){
   
NUMBLOCKS =4;
NUMTHREADS = NSIZE/4;   
}else{

NUMBLOCKS = 4*(NSIZE/32);
NUMTHREADS = 4*32; 


}

cout<<" NSIZE="<<NSIZE<<" NUMBLOCKS = "<<NUMBLOCKS<<" NUMTHREADS = "<<NUMTHREADS<<"\n";


GpuInit myinit_aconn(1024,1024,1,1);   //Grid of 1024 blocks, 1024 threads per block
GpuInit* myinitaconn;
myinitaconn = &myinit_aconn;


PARAM_DEV.ITER = 0;      /*Cycle through ITER on device above NN_FF
                    Each time we run NN_FF we process
                    BS x Values
                  */
                  
//INIT_PARAM<<<1024,1024,0,0>>>(PARAM_DEV);                  

if(0){   
hipMemcpy(PARAM_HOST.W_a, PARAM_DEV.W_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.P_w, PARAM_DEV.P_w, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.D_a, PARAM_DEV.D_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.P_a, PARAM_DEV.P_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);

cout<<"\nWeights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.W_a[i]<<",";
}
cout<<"\n\nPre Weights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.P_w[i]<<",";
}
cout<<"\n\nDelta Weights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.D_a[i]<<",";
}
cout<<"\n\nPre Deltas:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.P_a[i]<<",";
}
      }

//exit(1);

startTime = clock();

for(int iter=0;iter<TI;iter++){
                  
PARAM_DEV.ITER=iter;
                  
NN_FF<<<myinitaconn->grid,myinitaconn->block,0,myinitaconn->stream1>>>(PARAM_DEV,LU_a);

NN_BP<<<myinitaconn->grid,myinitaconn->block,0,myinitaconn->stream1>>>(PARAM_DEV,LU_a);

if(iter>TI-7){
hipMemcpy(PARAM_HOST.A_a, PARAM_DEV.A_a, BS*PLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.E_a, PARAM_DEV.E_a, BS*PLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);

cout<<"Iter:"<<iter<<"\n";
for(int j=BS-1;j<BS;j++){
cout<<"Batch="<<j<<"\n";
cout<<"[";
for(int i=0;i<PLAYR[NLAYS-1];i++){
cout<<"--Act="<<PARAM_HOST.A_a[j*PLAYR[NLAYS-1]+i]<<"--Err=";
cout<<PARAM_HOST.E_a[j*PLAYR[NLAYS-1]+i]<<"\n";
                              }
cout<<"]";                              
                                    }
               
hipMemcpy(PARAM_HOST.W_a, PARAM_DEV.W_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.P_w, PARAM_DEV.P_w, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.D_a, PARAM_DEV.D_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.P_a, PARAM_DEV.P_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);

cout<<"\nWeights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.W_a[i]<<",";
}
cout<<"\n\nPre Weights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.P_w[i]<<",";
}
cout<<"\n\nDelta Weights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.D_a[i]<<",";
}
cout<<"\n\nPre Deltas:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.P_a[i]<<",";
}

         }

if(1){
float TERR = 0;   //error of the last output neuron, logged each iteration
hipMemcpy(&TERR, PARAM_DEV.E_a+PLAYR[NLAYS-1]-1,sizeof(float), hipMemcpyDeviceToHost);
errfile<<fabs(TERR)<<"\n";
//errfile<<TERR<<"\n";

      }
                           }//Iter
                           
hipDeviceSynchronize();   //make sure all queued kernels have finished before stopping the clock
endTime = clock();
cout<<"\nHIP gpu took "<<1000.0f*((endTime-startTime)/(float)CLOCKS_PER_SEC) << " ms. \n\n*******************\n*******************" << endl;


errfile.close();

                                    }

};

int main(){

int topo[] = {2,4,1};
//const char* acts[] = {"NORM","TANH","TANH"};
const char* acts[] = {"NORM","SIGM","SIGM"};

int nlayers = 3;

net mynet(topo,acts,nlayers);

mynet.mem_alloc();
mynet.call_nn();

cout<<"Hello\n";
         }


Code:
#include <algorithm>
#include <complex>
#include <iostream>
#include <math.h>
#include <vector>


/*Begin 1DKern definition */

/*This is a direct copy of TensorFlow's 1D kernel-loop helper code*/

namespace detail {
template <typename T>
class GpuGridRange {

   struct Iterator {
      __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
      __device__ T operator*() const { return index_;}
      __device__ Iterator& operator++() {
         index_ += delta_;
         return *this;
      }

      __device__ bool operator!=(const Iterator& other) const {
         bool greater = index_ > other.index_;
         bool less = index_ < other.index_;
         if(!other.delta_){
         return less;
         }
         if(!delta_){
         return greater;
         }   

      return less || greater;
      }   

      private:
      T index_;
      const T delta_;

   };   //end Iterator struct


   public:
          __device__ GpuGridRange(T begin,T delta,T end)
      : begin_(begin),delta_(delta),end_(end) {}
   
   __device__ Iterator begin() const {return Iterator(begin_,delta_); }
   __device__ Iterator end() const {return Iterator(end_,0);}

   private:
   T begin_;
   T delta_;
   T end_;   
   


};   //end GpuGridRange class
};   //end namespace detail

template <typename T>   //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeX(T count) {
return detail::GpuGridRange<T>(

   /*begin*/blockIdx.x * blockDim.x + threadIdx.x,
   /*delta*/gridDim.x * blockDim.x, /*end*/count
            );

}

template <typename T>   //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeY(T count) {
return detail::GpuGridRange<T>(

   /*begin*/blockIdx.y * blockDim.y + threadIdx.y,
   /*delta*/gridDim.y * blockDim.y, /*end*/count
            );

}
template <typename T>   //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeZ(T count) {
return detail::GpuGridRange<T>(

   /*begin*/blockIdx.z * blockDim.z + threadIdx.z,
   /*delta*/gridDim.z * blockDim.z, /*end*/count
            );

}


#define GPU_1D_KERN_LOOP(i, n) \
  for (int i : ::GpuGridRangeX<int>(n))

#define GPU_AXIS_KERNEL_LOOP(i, n, axis) \
  for (int i : ::GpuGridRange##axis<int>(n))
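
/* Example: GPU_1D_KERN_LOOP(i, n) expands to a grid-stride loop, so a kernel
   launched with any grid/block size covers all n indices, e.g.
   (out and in are hypothetical buffers):

     GPU_1D_KERN_LOOP(index, count) {
         out[index] = 2.0f * in[index];
     }
*/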


/*End 1DKern definition*/


class GpuInit{
public:

dim3 grid;
dim3 block;
hipStream_t stream1;   //Could be an array of streams for multiple streams

GpuInit(unsigned int gridsizeX,unsigned int blocksizeX,unsigned int gridsizeY,unsigned int blocksizeY){

/*If using GPU_1D_KERN_LOOP */

grid.x = gridsizeX;   grid.y = gridsizeY;
block.x = blocksizeX; block.y= blocksizeY;

hipStreamCreate(&stream1);


//cout<<"Grid dimensions are: "<<grid.x<<"--"<<grid.y<<"--"<<grid.z<<"\n";
//cout<<"Block dimensions are: "<<block.x<<"--"<<block.y<<"--"<<block.z<<"\n";

}

      };



__device__ double atomicAdd_(double* address, double val)
{
    unsigned long long int* address_as_ull =
                              (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;

    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                               __longlong_as_double(assumed)));

    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);

    return __longlong_as_double(old);
}



The Scrabble One space cruiser entered the downtown slipstream and followed the port control beacons into the landing bay.

Downtown was filled with people - all dressed in raincoats, all weathering the storms brought here by the wormholes that littered Z-space.

She climbed down from the cockpit and greeted the immigration agent. "I am Lisa Short - farscraper extraordinaire. Here, take my passport." She bowed gracefully to the robot inspector.

The agent took the passport and scanned its heavy drive onto its cloud, then greeted her in reply, "Welcome, Lisa Short. Welcome to Velcra One, the greatest offworld colony in the Belt system. May you find everything you desire." The robot's slender arms extended towards a brightly lit rolling cityscape that stretched out to a vanishing horizon bordered only by the empty darkness of space.

She looked the robot up and down. She could tell the immigration robot was a recycled sex worker from the way it winked suggestively. In fact, as she turned to leave she caught a glimpse of his kinky undergarments that only just covered his fake-skin bottom.

"Oh dear its here again.", she sighed as she entered the vast space port concourse with people and robots and pretty much everyone all shuffling about with theyre luggage in every direction - like an agar plate filled with bacteria it just seem to grow in all directions. She followed one of them and then made an acute turn into a service alley which led out into downtown. She emmerged and found herself on a relatively quiet street. Well quiet for downtown.

It was the Homgee district, notorious for being held and run by the drug-syndicate family from Io - the Homgee.

Learn more about Z-Space