
A nice MLP neural network in HIP for AMD GPUs

PostPosted: Sat Jul 13, 2024 9:52 pm
by hbyte
This is my first rough draft of an MLP running on an AMD GPU and solving XOR really fast! It needs 1dkern.cpp, which is listed at the end.
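
To build it (a minimal sketch, assuming ROCm's hipcc is installed and the listing below is saved as mlp_xor_hip.cpp next to 1dkern.cpp - those file names are just my choice; 1dkern.cpp is #included, so only one file is compiled):

Code: Select all
hipcc -O2 mlp_xor_hip.cpp -o mlp_xor_hip
./mlp_xor_hip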

Code: Select all
#include "hip/hip_runtime.h"
#include <stdio.h>
#include <stdlib.h>
#include <fstream>
#include <iostream>
#include <ctime>
#include <time.h>
#include "1dkern.cpp"

using namespace std;

typedef struct{

float *B_a,*A_a,*W_a,*D_a,*P_a,*P_w,*E_a,*V_a;
float *L_a,*WL_a,*C_e;   
int *LU_a,*P_l,*W_l,*A_f;   
int ITER,NLAYS,NN,BS,NVAL,WTOT,NS;
float Lrt,Decy,Mmnt;
}myparam;

int *LU_a;

float getlrand(float lower,float upper){

           return ((float) rand()/ RAND_MAX) * (upper-lower) + lower;

}

/* Call all Act functions - includes rng for noisy Acts */
__device__ float mytanh_(float x)
{
  float exp2x = expf(2*x);
  return (exp2x - 1) / (exp2x + 1);
}

__device__ float sigmoid(float v)
{
   return 1 / (1 + expf(-v));
}


__device__ float InvSig(const float sum,const float act)
{
   return (act*(1-act)*sum) ;
}
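
/* Convention used by ActFun below: dir==0 returns f(v) (forward pass);
   dir==1 returns f'(u)*v, where u is the cached activation and v is the
   incoming error term (backward pass). */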

__device__ float ActFun(int type, float v, int dir, float u,int seed) {
    switch(type) {
        case 0: /* Tanh */
            return (dir == 0) ? mytanh_(v) : (1.0f - mytanh_(u) * mytanh_(u)) * v;
        case 1: /* Sigmoid */
            return (dir == 0) ? 1.0f / (1.0f + expf(-v)) : u * (1.0f - u) * v;
        case 2: /* Norm */
            return v;
        default:
            return v;
    }
}


__device__ float rng(unsigned int m_w,unsigned int m_z){
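   /* Marsaglia-style multiply-with-carry generator: two 16-bit MWC streams
      (constants 36969 and 18000) combined into a 32-bit value and returned
      as a float. Used by INIT_PARAM for crude on-device weight noise. */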

   m_z = 36969 * (m_z & 65535) + (m_z >> 16);
   m_w = 18000 * (m_w & 65535) + (m_w >> 16);

   return (m_z << 16) + m_w;  /* 32-bit result */
   

}

__global__ void INIT_PARAM(myparam PARAM){

int TW,NW;

TW=PARAM.W_l[PARAM.NLAYS-1];
NW=PARAM.P_l[PARAM.NLAYS-1];


GPU_1D_KERN_LOOP(index,TW){

const int ls = (index / 1 )        %TW;      

PARAM.W_a[ls] = -(2425-rng(150+ls,40)/1000000);
PARAM.D_a[ls] = -(2425-rng(150+ls,40)/1000000);
PARAM.P_w[ls] = -(2425-rng(150+ls,40)/1000000);
PARAM.P_a[ls] = -(2425-rng(150+ls,40)/1000000);

                        }

__syncthreads();


GPU_1D_KERN_LOOP(index,NW){

const int ls = (index / 1 )        %NW;      

PARAM.B_a[ls] = -(2425-rng(150+ls,40)/1000000);


                        }


}



__global__ void NN_FF(myparam PARAM,const int* const __restrict__ LU_a){

//hipStream_t astream;
//hipStreamCreateWithFlags(&astream,hipStreamNonBlocking);

const int NLAYS = PARAM.NLAYS;

for(int LAYER=1;LAYER<NLAYS;LAYER++){


/* Set Constants */
const int NIN  = LU_a[LAYER*6+0];
const int NOUT = LU_a[LAYER*6+1];
const int SIN  = LU_a[LAYER*6+2];
const int SOUT = LU_a[LAYER*6+3];


const int NVAL = PARAM.NVAL;            //Number of Training Values (XoR or RL)
const int NUMNET = PARAM.NN;
const int ITER = PARAM.ITER;
const int BS = PARAM.BS;
const int NETSIZE = PARAM.NS;
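
/* Activation layout: A_a[nn*BS*NETSIZE + bs*NETSIZE + neuron], i.e. one
   flattened net of NETSIZE neurons per (net nn, batch sample bs).
   SIN/SOUT are the offsets of this layer's input and output neurons
   inside that flattened net. */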



if(LAYER==1){

/*Set Inputs*/

GPU_1D_KERN_LOOP(index,NIN*NUMNET*BS){

   const int ls = (index / 1 )        %NIN;      //Inputs / layer size
   const int bs = (index / NIN)        %BS;
   const int nn  = (index / NIN / BS) %NUMNET;
   
   
   PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+ls]=ActFun(PARAM.A_f[LAYER-1],PARAM.V_a[ITER*BS*NVAL+bs*NVAL+ls],0,0,1);
   
   
                                    }
__syncthreads();            

                     }
                                    
if(1){

GPU_1D_KERN_LOOP(index,NOUT*NUMNET*BS){

   const int ls = (index / 1 )          %NOUT;      //Outputs / layer size
   const int bs = (index / NOUT)        %BS;
   const int nn  = (index / NOUT / BS)  %NUMNET;

   //Start the accumulator at the bias so it is only added once per output neuron
   PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls]=PARAM.B_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls];


}

__syncthreads();            

//Feedforward to MLP layer
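/* Computes a_out[ls] = f( sum_col W[ls*NIN+col] * a_in[col] + bias ):
   the output slot was preloaded with the bias above, the atomicAdd below
   accumulates the weighted sum (threads race on the same output neuron),
   and the activation f is applied in the following loop. */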

GPU_1D_KERN_LOOP(index, NIN*NOUT*NUMNET*BS){

   
   
      const int ls = (index / 1 )                  %NOUT;      //Outputs / layer size
      const int col = (index / NOUT )             %NIN;      //Inputs /  size of feature map
      const int bs = (index / NOUT / NIN )          %BS;
      const int nn  = (index / NOUT / NIN / BS )   %NUMNET;
                           
//accumulate the weighted sum; the bias was already loaded into the output slot above
atomicAdd(&PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls],PARAM.W_a[PARAM.W_l[LAYER-1]+(col+ls*NIN)]*PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SIN+col]);

                                          }

__syncthreads();            
                                    
if(1){

GPU_1D_KERN_LOOP(index, NOUT*BS*NUMNET){

      const int ls = (index / 1 )            %NOUT;      //Outputs / layer size
      const int bs = (index / NOUT  )        %BS;
      const int nn  = (index / NOUT / BS )   %NUMNET;
      
PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls] = ActFun(PARAM.A_f[LAYER],PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+ls],0,0,1);
                           }

}
                                    
__syncthreads();                                    


}
                                    }//Layers Loop

}
__global__ void NN_BP(myparam PARAM, const int* const __restrict__ LU_a) {
    const int NLAYS = PARAM.NLAYS;
    const int BS = PARAM.BS;
    const int NUMNET = PARAM.NN;
    const int NETSIZE = PARAM.NS;

    for(int LAYER = NLAYS-1; LAYER > 0; LAYER--) {
        const int NIN = LU_a[LAYER*6+0];
        const int NOUT = LU_a[LAYER*6+1];
        const int SIN = LU_a[LAYER*6+2];
        const int SOUT = LU_a[LAYER*6+3];
        const int WSTR = LU_a[LAYER*6+4];

        if(LAYER == NLAYS-1) {
            // Output layer error
            GPU_1D_KERN_LOOP(index, NOUT*BS*NUMNET) {
                const int nl = index % NOUT;
                const int bs = (index / NOUT) % BS;
                const int nn = index / (NOUT * BS);

                int ERR_POS = nn*BS*NETSIZE + bs*NETSIZE + SOUT + nl;
                int VAL_POS = PARAM.ITER*BS*PARAM.NVAL + bs*PARAM.NVAL + (PARAM.NVAL-NOUT) + nl;
                int ACT_POS = nn*BS*NETSIZE + bs*NETSIZE + SOUT + nl;

                float target = PARAM.V_a[VAL_POS];
                float output = PARAM.A_a[ACT_POS];
               
            //PARAM.E_a[ERR_POS] = (target - output) * output * (1 - output); // Derivative of sigmoid
            //PARAM.E_a[ERR_POS] = (target - output);
            
            PARAM.E_a[ERR_POS] = ActFun(PARAM.A_f[NLAYS-1],target-output,1,output,1);
            }
        }
      
         
            // Hidden layer error calculation
            GPU_1D_KERN_LOOP(index, NIN*BS*NUMNET) {
                const int nl = index % NIN;
                const int bs = (index / NIN) % BS;
                const int nn = index / (NIN * BS);

                int ERR_LO = nn*BS*NETSIZE + bs*NETSIZE + SIN + nl;
                PARAM.E_a[ERR_LO] = 0;

                for(int j = 0; j < NOUT; j++) {
                    int ERR_UP = nn*BS*NETSIZE + bs*NETSIZE + SOUT + j;
                    int WGT_POS = WSTR + j*NIN + nl;
                    PARAM.E_a[ERR_LO] += PARAM.W_a[WGT_POS] * PARAM.E_a[ERR_UP];
                }
            }
            __syncthreads();
         
            // Apply activation derivative to hidden layer error
            GPU_1D_KERN_LOOP(index, NIN*BS*NUMNET) {
                const int nl = index % NIN;
                const int bs = (index / NIN) % BS;
                const int nn = index / (NIN * BS);

                int ERR_LO = nn*BS*NETSIZE + bs*NETSIZE + SIN + nl;
                int ACT_POS = ERR_LO;

                float act = PARAM.A_a[ACT_POS];
            
            PARAM.E_a[ERR_LO] = ActFun(PARAM.A_f[LAYER-1],PARAM.E_a[ERR_LO],1,PARAM.A_a[ACT_POS],1);
                atomicAdd(&PARAM.B_a[ERR_LO],PARAM.Lrt*ActFun(PARAM.A_f[LAYER-1],PARAM.E_a[ERR_LO],1,PARAM.A_a[ACT_POS],1));
            
            }
       

        __syncthreads();

         

      //Update PA and PW
      GPU_1D_KERN_LOOP(index, NIN*NOUT) {
         const int nl = (index / 1 )             %NOUT;   //Number of labels
         const int us = (index /NOUT )          %NIN;      //layer size
         
         
         PARAM.P_a[WSTR+nl*NIN+us]=PARAM.D_a[WSTR+nl*NIN+us];
         PARAM.P_w[WSTR+nl*NIN+us]=PARAM.W_a[WSTR+nl*NIN+us];
         
         }

      __syncthreads();
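
      /* Per-weight update performed by the three atomicAdds below:
         W += Mmnt*P_a         (momentum, previous delta)
            + Lrt*E_out*A_in   (learning rate * output error * input activation)
            - Decy*P_w         (decay on the previous weight)
         D_a then stores the updated weight, and the final loop subtracts P_a
         so that D_a ends up holding the change used as next iteration's momentum. */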

        // Update weights
         GPU_1D_KERN_LOOP(index, NIN*NOUT*BS*NUMNET) {
            const int bs = (index / 1)                     %BS;
         const int nn  = (index / BS)                    %NUMNET;
         const int nl = (index / BS / NUMNET )             %NOUT;   //Number of labels
         const int us = (index / BS / NUMNET /NOUT )       %NIN;      //layer size   

         
         atomicAdd(&PARAM.W_a[WSTR+nl*NIN+us],PARAM.Mmnt*PARAM.P_a[WSTR+nl*NIN+us]);            //Deltas   
   
         atomicAdd(&PARAM.W_a[WSTR+nl*NIN+us],PARAM.Lrt*PARAM.E_a[nn*BS*NETSIZE+bs*NETSIZE+SOUT+nl]*PARAM.A_a[nn*BS*NETSIZE+bs*NETSIZE+SIN+us]);         //Lrate
   
         atomicAdd(&PARAM.W_a[WSTR+nl*NIN+us],-PARAM.Decy*PARAM.P_w[WSTR+nl*NIN+us]);            //Preweights

         PARAM.D_a[WSTR+nl*NIN+us] = PARAM.W_a[WSTR+nl*NIN+us];

        }
      
      __syncthreads();
      
         GPU_1D_KERN_LOOP(index, NIN*NOUT) {
         const int nl = (index / 1 )             %NOUT;   //Number of labels
         const int us = (index /NOUT )          %NIN;      //layer size
         
         
         atomicAdd(&PARAM.D_a[WSTR+nl*NIN+us],-PARAM.P_a[WSTR+nl*NIN+us]);
         
         }
               

        __syncthreads();
      
      
   
   }
}

class net{
public:
myparam PARAM_DEV,PARAM_HOST;
int *L_ua,*PLAYR,*LAYERS,*ACTFI,*WPLAYR;
int NLAYS,BS,TI,NUMNET;
float *VAL;
clock_t startTime, endTime;

ofstream errfile;

~net(){
/*
hipFree(PARAM_DEV.B_a);hipFree(PARAM_DEV.A_a);hipFree(PARAM_DEV.W_a);hipFree(PARAM_DEV.D_a);hipFree(PARAM_DEV.P_a);hipFree(PARAM_DEV.P_w);hipFree(PARAM_DEV.E_a);hipFree(PARAM_DEV.V_a);
hipFree(PARAM_DEV.L_a);hipFree(PARAM_DEV.WL_a);hipFree(PARAM_DEV.C_e);   
hipFree(PARAM_DEV.P_l);hipFree(PARAM_DEV.W_l);hipFree(PARAM_DEV.A_f);   
hipFree(LU_a);hipFree(PARAM_DEV.LU_a);

delete(PARAM_HOST.B_a);delete(PARAM_HOST.A_a);delete(PARAM_HOST.W_a);delete(PARAM_HOST.D_a);delete(PARAM_HOST.P_a);delete(PARAM_HOST.P_w);delete(PARAM_HOST.E_a);delete(PARAM_HOST.V_a);
delete(PARAM_HOST.L_a);delete(PARAM_HOST.WL_a);delete(PARAM_HOST.C_e);   
delete(PARAM_HOST.P_l);delete(PARAM_HOST.W_l);delete(PARAM_HOST.A_f);   


delete PLAYR,LAYERS,ACTFI,WPLAYR,VAL,L_ua;
*/
}

net(int LAYERS_[],const char* ACTF[],int NLAYERS){

TI = 2000;                  //Number of Iterations
BS = PARAM_DEV.BS = 12;         //Batch Size
PARAM_DEV.ITER = 0;   //Start of Iterations
NUMNET = PARAM_DEV.NN = 1;   //Number of Parallel Nets
PARAM_DEV.Lrt = (0.5/NUMNET)*BS; PARAM_DEV.Mmnt = 0.0025/(BS*NUMNET); PARAM_DEV.Decy = 0.000065/(BS*NUMNET);

errfile.open("errfile.dat",ios::out);

srand(199283);

NLAYS = PARAM_DEV.NLAYS = NLAYERS;
LAYERS = new int[NLAYERS];
PLAYR = new int[NLAYERS];
WPLAYR = new int[NLAYERS];
ACTFI = new int[NLAYERS];
L_ua = new int[NLAYS*6];      //Layer Constants

for(int i=0;i<NLAYS;i++){

ACTFI[i]=0;

if(ACTF[i][0]=='T'&&ACTF[i][1]=='A'&&ACTF[i][2]=='N'&&ACTF[i][3]=='H'){ACTFI[i]=0;}
if(ACTF[i][0]=='S'&&ACTF[i][1]=='I'&&ACTF[i][2]=='G'&&ACTF[i][3]=='M'){ACTFI[i]=1;}
if(ACTF[i][0]=='N'&&ACTF[i][1]=='O'&&ACTF[i][2]=='R'&&ACTF[i][3]=='M'){ACTFI[i]=2;}


if(i==0){
PLAYR[0]  = LAYERS_[0];
WPLAYR[0] = 0;
      }
if(i>0){
PLAYR[i]  = PLAYR[i-1]+LAYERS_[i];            //Neurons
WPLAYR[i] = WPLAYR[i-1]+LAYERS_[i]*LAYERS_[i-1];   //Weights   
      }
LAYERS[i] = LAYERS_[i];      
                     }

PARAM_DEV.NVAL = LAYERS[NLAYS-1]+LAYERS[0];         //Values per sample: inputs + outputs (3 for XOR)
PARAM_DEV.WTOT = WPLAYR[NLAYS-1];
PARAM_DEV.NS   = PLAYR[NLAYS-1];               //Netsize
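
/* L_ua packs 6 constants per layer: NIN, NOUT, SIN, SOUT, WSTR, WEND.
   Worked example for topo {2,4,1}: PLAYR = {2,6,7}, WPLAYR = {0,8,12}, so
     layer 1: NIN=2 NOUT=4 SIN=0 SOUT=2 WSTR=0 WEND=8
     layer 2: NIN=4 NOUT=1 SIN=2 SOUT=6 WSTR=8 WEND=12 */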

for(int i=1;i<NLAYS;i++){
if(i>1){
L_ua[i*6+0] = PLAYR[i-1]-PLAYR[i-2];//NIN
L_ua[i*6+1] = PLAYR[i]-PLAYR[i-1];//NOUT
L_ua[i*6+2] = PLAYR[i-2];//SIN
L_ua[i*6+3] = PLAYR[i-1];//SOUT
L_ua[i*6+4] = WPLAYR[i-1];//WSTR
L_ua[i*6+5] = WPLAYR[i];//WEND
   }else{
L_ua[i*6+0] = PLAYR[0];//NIN
L_ua[i*6+1] = PLAYR[1]-PLAYR[0];//NOUT
L_ua[i*6+2] = 0;//SIN
L_ua[i*6+3] = PLAYR[0];//SOUT
L_ua[i*6+4] = WPLAYR[i-1];//WSTR
L_ua[i*6+5] = WPLAYR[i];//WEND
}
}




PARAM_HOST.W_a = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.P_w = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.D_a = new float[WPLAYR[NLAYS-1]];
PARAM_HOST.P_a = new float[WPLAYR[NLAYS-1]];

PARAM_HOST.E_a = new float[NUMNET*BS*PLAYR[NLAYS-1]];
PARAM_HOST.A_a = new float[NUMNET*BS*PLAYR[NLAYS-1]];
PARAM_HOST.B_a = new float[NUMNET*BS*PLAYR[NLAYS-1]];

for(int i=0;i<BS*NUMNET*PLAYR[NLAYS-1];i++){

PARAM_HOST.A_a[i] = 0;
PARAM_HOST.B_a[i] = getlrand(0,1);
PARAM_HOST.E_a[i] = 0;               
                              }


VAL  = new float[(LAYERS[NLAYS-1]+LAYERS[0])*BS*TI];
prep_val(VAL,0);
                                 /*Init weights and deltas*/
for(int i=0;i<WPLAYR[NLAYS-1];i++){

//PARAM_HOST.W_a[i] = getlrand(0,1);
PARAM_HOST.W_a[i] = getlrand(-1.0,1.0)/sqrt((float)LAYERS[0]); /*Xavier-style initialization: scale by 1/sqrt(fan_in)*/

PARAM_HOST.D_a[i] = 0;//getlrand(0,1);
PARAM_HOST.P_a[i] = 0;//getlrand(0,1);
PARAM_HOST.P_w[i] = 0;//getlrand(0,1);
                        
                           }

                                                }



void mem_alloc(){

PARAM_HOST.A_a = new float[BS*NUMNET*PLAYR[NLAYS-1]];

hipMallocManaged((void **)&PARAM_DEV.A_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float));   //ACT
hipMallocManaged((void **)&PARAM_DEV.B_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float));   //BIAS
hipMallocManaged((void **)&PARAM_DEV.E_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float));   //ERR
hipMallocManaged((void **)&PARAM_DEV.V_a, TI * BS * (LAYERS[0]+LAYERS[NLAYS-1]) * sizeof(float)); //VAL
hipMallocManaged((void **)&PARAM_DEV.W_a, WPLAYR[NLAYS-1] *  sizeof(float));   //WGT
hipMallocManaged((void **)&PARAM_DEV.D_a, WPLAYR[NLAYS-1] *  sizeof(float));   //DLT
hipMallocManaged((void **)&PARAM_DEV.P_a, WPLAYR[NLAYS-1] *  sizeof(float));   //PREDLT
hipMallocManaged((void **)&PARAM_DEV.P_w, WPLAYR[NLAYS-1] *  sizeof(float));   //PWGT
hipMallocManaged((void **)&PARAM_DEV.A_f, NLAYS * sizeof(int));   //ACTFI

hipMemcpy(PARAM_DEV.A_f, ACTFI, NLAYS * sizeof(int), hipMemcpyHostToDevice);
hipMemcpy(PARAM_DEV.V_a,VAL, TI * BS * (LAYERS[0]+LAYERS[NLAYS-1]) * sizeof(float), hipMemcpyHostToDevice);  /*Batch load Values Transfer to device*/
hipMallocManaged((void **)&PARAM_DEV.P_l, NLAYS *  sizeof(int));   //Pointer to Act layers
hipMallocManaged((void **)&PARAM_DEV.W_l, NLAYS *  sizeof(int));   //Pointer to Wgt layers
hipMallocManaged((void **)&LU_a, NLAYS * 6 *  sizeof(int));   //Per-layer constants (NIN,NOUT,SIN,SOUT,WSTR,WEND)


hipMemcpy(PARAM_DEV.B_a, PARAM_HOST.B_a, NUMNET*BS*PLAYR[NLAYS-1] * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(PARAM_DEV.W_a, PARAM_HOST.W_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(PARAM_DEV.D_a, PARAM_HOST.D_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(PARAM_DEV.P_a, PARAM_HOST.P_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(PARAM_DEV.P_w, PARAM_HOST.P_w, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyHostToDevice);

hipMemcpy(PARAM_DEV.W_l, WPLAYR, NLAYS * sizeof(int), hipMemcpyHostToDevice);
hipMemcpy(PARAM_DEV.P_l, PLAYR, NLAYS * sizeof(int), hipMemcpyHostToDevice);
hipMemcpy(LU_a, L_ua, NLAYS * 6 * sizeof(int), hipMemcpyHostToDevice);


                  }

void prep_val(float *VAL,int EPOCH){

  float xord[4][3] = {
        {0, 0, 0},
        {0, 1, 1},
        {1, 0, 1},
        {1, 1, 0}
    };
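/* VAL holds TI*BS samples of NVAL=3 floats each: {in0, in1, target}.
   NN_FF reads the first two as the input layer and NN_BP compares the
   network output against the last one. */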
int select=0;

for(int i=0;i<BS*TI;i++){

if(1){
select = rand()%4;
}else{
select++;
if(select==4)select=0;
}


VAL[i*3]   = xord[select][0];
VAL[i*3+1] = xord[select][1];
VAL[i*3+2] = xord[select][2];

                     
}

}

void call_nn(){


int NSIZE = NUMNET*BS*WPLAYR[NLAYS-1];

int MAXTHREADS = 1024;      //Threads per block

int NUMBLOCKS,NUMTHREADS;

if(NSIZE<MAXTHREADS){
   
NUMBLOCKS =4;
NUMTHREADS = NSIZE/4;   
}else{

NUMBLOCKS = 4*(NSIZE/32);
NUMTHREADS = 4*32; 


}

cout<<" NSIZE="<<NSIZE<<" NUMBLOCKS = "<<NUMBLOCKS<<" NUMTHREADS = "<<NUMTHREADS<<"\n";


GpuInit myinit_aconn(1024,1024,1,1);   //Launch config: 1024 blocks of 1024 threads (1D)
GpuInit* myinitaconn;
myinitaconn = &myinit_aconn;


PARAM_DEV.ITER = 0;      /* ITER selects the current batch of training values on the device;
                            each NN_FF/NN_BP call processes BS samples */
                  
//INIT_PARAM<<<1024,1024,0,0>>>(PARAM_DEV);                  

if(0){   
hipMemcpy(PARAM_HOST.W_a, PARAM_DEV.W_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.P_w, PARAM_DEV.P_w, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.D_a, PARAM_DEV.D_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.P_a, PARAM_DEV.P_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);

cout<<"\nWeights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.W_a[i]<<",";
}
cout<<"\n\nPre Weights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.P_w[i]<<",";
}
cout<<"\n\nDelta Weights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.D_a[i]<<",";
}
cout<<"\n\nPre Deltas:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.P_a[i]<<",";
}
      }

//exit(1);

startTime = clock();

for(int iter=0;iter<TI;iter++){
                  
PARAM_DEV.ITER=iter;
                  
NN_FF<<<myinitaconn->grid,myinitaconn->block,0,myinitaconn->stream1>>>(PARAM_DEV,LU_a);

NN_BP<<<myinitaconn->grid,myinitaconn->block,0,myinitaconn->stream1>>>(PARAM_DEV,LU_a);

if(iter>TI-7){
hipMemcpy(PARAM_HOST.A_a, PARAM_DEV.A_a, BS*PLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.E_a, PARAM_DEV.E_a, BS*PLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);

cout<<"Iter:"<<iter<<"\n";
for(int j=BS-1;j<BS;j++){
cout<<"Batch="<<j<<"\n";
cout<<"[";
for(int i=0;i<PLAYR[NLAYS-1];i++){
cout<<"--Act="<<PARAM_HOST.A_a[j*PLAYR[NLAYS-1]+i]<<"--Err=";
cout<<PARAM_HOST.E_a[j*PLAYR[NLAYS-1]+i]<<"\n";
                              }
cout<<"]";                              
                                    }
               
hipMemcpy(PARAM_HOST.W_a, PARAM_DEV.W_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.P_w, PARAM_DEV.P_w, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.D_a, PARAM_DEV.D_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);
hipMemcpy(PARAM_HOST.P_a, PARAM_DEV.P_a, WPLAYR[NLAYS-1] * sizeof(float), hipMemcpyDeviceToHost);

cout<<"\nWeights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.W_a[i]<<",";
}
cout<<"\n\nPre Weights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.P_w[i]<<",";
}
cout<<"\n\nDelta Weights:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.D_a[i]<<",";
}
cout<<"\n\nPre Deltas:";
for(int i=0;i<WPLAYR[NLAYS-1];i++){
cout<<PARAM_HOST.P_a[i]<<",";
}

         }

if(1){
float* TERR;
TERR = new float[1];
hipMemcpy(TERR, PARAM_DEV.E_a+PLAYR[NLAYS-1]-1,sizeof(float), hipMemcpyDeviceToHost);
errfile<<fabs(TERR[0])<<"\n";
//errfile<<TERR[0]<<"\n";

      }
                           }//Iter
                           
endTime = clock();
cout<<"\nHIP gpu took "<<1000.0f*((endTime-startTime)/(float)CLOCKS_PER_SEC) << " ms. \n\n*******************\n*******************" << endl;


errfile.close();

                                    }

};

int main(){

int topo[] = {2,4,1};
//const char* acts[] = {"NORM","TANH","TANH"};
const char* acts[] = {"NORM","SIGM","SIGM"};

int nlayers = 3;

net mynet(topo,acts,nlayers);

mynet.mem_alloc();
mynet.call_nn();

cout<<"Hello\n";
         }


Code: Select all
#include <algorithm>
#include <complex>
#include <iostream>
#include <math.h>
#include <vector>


/*Begin 1DKern definition */

/*This is a direct copy of TensorFlow's 1DKern code*/

namespace detail {
template <typename T>
class GpuGridRange {

   struct Iterator {
      __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
      __device__ T operator*() const { return index_;}
      __device__ Iterator& operator++() {
         index_ += delta_;
         return *this;
      }

      __device__ bool operator!=(const Iterator& other) const {
         bool greater = index_ > other.index_;
         bool less = index_ < other.index_;
         if(!other.delta_){
         return less;
         }
         if(!delta_){
         return greater;
         }   

      return less || greater;
      }   

      private:
      T index_;
      const T delta_;

   };   //end Iterator struct


   public:
          __device__ GpuGridRange(T begin,T delta,T end)
      : begin_(begin),delta_(delta),end_(end) {}
   
   __device__ Iterator begin() const {return Iterator(begin_,delta_); }
   __device__ Iterator end() const {return Iterator(end_,0);}

   private:
   T begin_;
   T delta_;
   T end_;   
   


};   //end GpuGridRange class
}   //end namespace detail

template <typename T>   //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeX(T count) {
return detail::GpuGridRange<T>(

   /*begin*/blockIdx.x * blockDim.x + threadIdx.x,
   /*delta*/gridDim.x * blockDim.x, /*end*/count
            );

}

template <typename T>   //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeY(T count) {
return detail::GpuGridRange<T>(

   /*begin*/blockIdx.y * blockDim.y + threadIdx.y,
   /*delta*/gridDim.y * blockDim.y, /*end*/count
            );

}
template <typename T>   //Allows you to use GPU iterator with all data types
__device__ detail::GpuGridRange<T> GpuGridRangeZ(T count) {
return detail::GpuGridRange<T>(

   /*begin*/blockIdx.z * blockDim.z + threadIdx.z,
   /*delta*/gridDim.z * blockDim.z, /*end*/count
            );

}


#define GPU_1D_KERN_LOOP(i, n) \
  for (int i : ::GpuGridRangeX<int>(n))

#define GPU_AXIS_KERNEL_LOOP(i, n, axis) \
  for (int i : ::GpuGridRange##axis<int>(n))
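
/* Usage sketch: GPU_1D_KERN_LOOP is a grid-stride loop - every thread in the
   launch walks through [0, n) in steps of gridDim.x*blockDim.x, e.g.

     GPU_1D_KERN_LOOP(i, n) {
         out[i] = 2.0f * in[i];   // 'out' and 'in' are hypothetical buffers, for illustration only
     }
*/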


/*End 1DKern definition*/


class GpuInit{
public:

dim3 grid;
dim3 block;
hipStream_t stream1;   //Could be a array of streams for multiple streams

GpuInit(unsigned int gridsizeX,unsigned int blocksizeX,unsigned int gridsizeY,unsigned int blocksizeY){

/*If using GPU_1D_KERN_LOOP */

grid.x = gridsizeX;   grid.y = gridsizeY;
block.x = blocksizeX; block.y= blocksizeY;

hipStreamCreate(&stream1);


//cout<<"Grid dimensions are: "<<grid.x<<"--"<<grid.y<<"--"<<grid.z<<"\n";
//cout<<"Block dimensions are: "<<block.x<<"--"<<block.y<<"--"<<block.z<<"\n";

}

      };
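
/* The two functions below are the standard CAS-based double-precision
   atomicAdd fallback (the loop from the CUDA programming guide) for devices
   without a native double atomicAdd. They are not called by the MLP code
   above, which only uses float atomics. */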


__device__ double atomicAdd__(double* address, double val)
{
    unsigned long long int* address_as_ull =
                             (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                               __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}


__device__ double atomicAdd_(double* address, double val)
{
    unsigned long long int* address_as_ull =
                              (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;

    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                               __longlong_as_double(assumed)));

    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);

    return __longlong_as_double(old);
}



The Scrabble One space cruiser entered the downtown slipstream and followed the port control beacons into the landing bay.

Downtown was filled with people - all dressed in raincoats, all weathering the storms brought here by the wormholes that littered Z-space.

She climbed down from the cockpit and greeted the immigration agent. "I am Lisa Short - farscraper extraordinaire. Here, take my passport." She bowed gracefully to the robot inspector.

The agent took the passport and scanned its heavy drive onto its cloud. The robot then greeted her in reply, "Welcome, Lisa Short. Welcome to Velcra One, the greatest offworld colony in the Belt system. May you find everything you desire." The robot's slender arms extended towards a brightly lit, rolling cityscape that stretched out into a vanishing horizon bordered only by the empty darkness of space.

She looked the robot up and down. She could tell the immigration robot was a recycled sex worker from the way it winked suggestively. In fact, as she turned to leave she caught a glimpse of his kinky undergarments that only just covered his fake-skin bottom.

"Oh dear, it's here again," she sighed as she entered the vast spaceport concourse, with people and robots and pretty much everyone shuffling about with their luggage in every direction - like an agar plate filled with bacteria, it just seemed to grow in all directions. She followed one of them and then made an acute turn into a service alley which led out into downtown. She emerged and found herself on a relatively quiet street. Well, quiet for downtown.

It was the Homgee district, notorious for being held and run by the drug syndicate family from Io - the Homgee.

Learn more about Z-Space