First few components of the Attention Kit

Postby hbyte » Mon Aug 19, 2024 5:34 pm

I am starting to build a very basic Transformer. Here are a few of the parts that I will use to build what I am calling my Attention Kit, aptly named as some of the parts were provided by a language model built from transformers.

The insides of an LLM like ChatGPT, or any general-purpose Transformer, contain a sequence Encoder and Decoder.

You might have read the earlier post, the Guts of a Transformer, and that should help you understand the following code snippets that comprise the components of both the Encoder and Decoder.
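
Before the kernels themselves, here is a rough outline of how I see the two halves fitting together. The layer functions below are empty placeholders just to show the order the kernels get called in; the names and the feed-forward/layer-norm steps are not part of the kit yet.

Code: Select all
// Placeholder outline only: how one Encoder layer and one Decoder layer
// would call the kernels in this post (feed-forward and layer-norm steps
// are not shown here).

void encoder_layer(/* device buffers */)
{
    // transform_embeddings_kernel_   : source embeddings -> Q, K, V
    // attention_kernel_ (AMASK = 0)  : full self-attention over the source
}

void decoder_layer(/* device buffers */)
{
    // transform_embeddings_kernel_   : target embeddings -> Q, K, V
    // attention_kernel_ (AMASK = 1)  : masked self-attention, no lookahead
    // cross-attention                : Q from the decoder, K and V from the
    //                                  Encoder output
}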

Here is the CUDA code for each component:

Convert Sentence Embeddings into Query, Key and Value vectors:

Code: Select all
__global__ void transform_embeddings_kernel_(
    const int N,  // batch size
    const int T,  // sequence length
    const int H,  // number of heads
    const int D,  // embedding dimension
    const float * __restrict__ input,  // input embeddings. shape = (N,T,D)
    const float * __restrict__ W_q,    // query weights. shape = (D,H*D)
    const float * __restrict__ W_k,    // key weights. shape = (D,H*D)
    const float * __restrict__ W_v,    // value weights. shape = (D,H*D)
    float * __restrict__ Q,            // output queries. shape = (N,T,H,D)
    float * __restrict__ K,            // output keys. shape = (N,T,H,D)
    float * __restrict__ V             // output values. shape = (N,T,H,D)
) {
    // One (n,t,h) triple per x-thread, one output dimension d per y-thread.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int total_elements = N * T * H;
    int d = threadIdx.y;

    // Guard the output dimension before any loads so we never read past
    // the end of the weight matrices.
    if (d >= D) return;

    // Grid-stride loop over every (batch, position, head) combination.
    for (int i = index; i < total_elements; i += blockDim.x * gridDim.x) {
        int n = i % N;            // batch
        int t = (i / N) % T;      // sequence position
        int h = i / (T * N);      // head

        float q_sum = 0.0f, k_sum = 0.0f, v_sum = 0.0f;

        int idx = n*T*H*D + t*H*D + h*D + d;   // output index in (N,T,H,D)

        // Project the D-dimensional embedding through each weight matrix.
        for (int e = 0; e < D; ++e) {
            float input_val = input[n*T*D + t*D + e];
            int wgt_idx = e*H*D + h*D + d;

            q_sum += input_val * W_q[wgt_idx];
            k_sum += input_val * W_k[wgt_idx];
            v_sum += input_val * W_v[wgt_idx];
        }

        Q[idx] = q_sum;
        K[idx] = k_sum;
        V[idx] = v_sum;
    }
}
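
A quick note on launching it: the kernel maps the output dimension d to threadIdx.y, so blockDim.y has to cover D. The snippet below is just how I call it from my own test code; the block size of 8 and the d_ buffer names are my choices, not anything fixed by the kernel.

Code: Select all
#include <cuda_runtime.h>

// Sketch of a launch wrapper from my test harness. d_input, d_Wq, d_Wk,
// d_Wv, d_Q, d_K, d_V are device buffers already allocated and filled.
// blockDim.y = D so every output dimension gets a thread; keep 8*D <= 1024
// (the per-block thread limit), or shrink block.x for larger D.
void launch_transform_embeddings(const float *d_input,
                                 const float *d_Wq, const float *d_Wk,
                                 const float *d_Wv,
                                 float *d_Q, float *d_K, float *d_V,
                                 int N, int T, int H, int D)
{
    int total = N * T * H;                       // one x-thread per (n,t,h)
    dim3 block(8, D);
    dim3 grid((total + block.x - 1) / block.x);

    transform_embeddings_kernel_<<<grid, block>>>(N, T, H, D,
                                                  d_input, d_Wq, d_Wk, d_Wv,
                                                  d_Q, d_K, d_V);
    cudaDeviceSynchronize();
}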


Calculate Attention Scores with Masking!

Code: Select all
__global__ void attention_kernel_(
    const int N,  // batch size
    const int T,  // sequence length
    const int H,  // number of heads
    const int D,  // embedding dimension
    const int AMASK,                 // 1 = apply the lookahead (causal) mask
    const float * __restrict__ mask, // additive mask. shape = (T,T)
    const float * __restrict__ Q, // query. shape = (N,T,H,D)
    const float * __restrict__ K, // key. shape = (N,T,H,D)
    const float * __restrict__ V, // value. shape = (N,T,H,D)
    float * __restrict__ output   // output. shape = (N,T,H,D)
) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int total_elements = N * T * H;
    int d = threadIdx.y;

    // Guard the output dimension so we never index V or output out of bounds.
    if (d >= D) return;

    // Grid-stride loop over every (batch, query position, head) combination.
    // Note: every d-thread recomputes the same score row for its (n,t_q,h).
    for (int i = index; i < total_elements; i += blockDim.x * gridDim.x) {
        int h = i % H;            // head
        int t_q = (i / H) % T;    // query position
        int n = i / (H * T);      // batch

        float max_score = -INFINITY;
        float *scores = new float[T];  // per-thread scratch on the device heap; freed below

        // First pass: compute scaled dot products and track the running max
        // (subtracting the max later keeps the softmax numerically stable).
        for (int t_k = 0; t_k < T; ++t_k) {
            float dot_product = 0.0f;
            for (int e = 0; e < D; ++e) {
                dot_product += Q[n*T*H*D + t_q*H*D + h*D + e] * K[n*T*H*D + t_k*H*D + h*D + e];
            }
            if (AMASK == 1) {
                // Lookahead mask: masked positions hold a large negative value,
                // so their softmax weight collapses to (almost) zero.
                scores[t_k] = (dot_product / sqrtf((float)D)) + mask[t_q*T + t_k];
            } else {
                scores[t_k] = dot_product / sqrtf((float)D);
            }
            max_score = fmaxf(max_score, scores[t_k]);
        }

        // Second pass: exponentiate and accumulate the normaliser.
        float sum_exp = 0.0f;
        for (int t_k = 0; t_k < T; ++t_k) {
            scores[t_k] = expf(scores[t_k] - max_score);
            sum_exp += scores[t_k];
        }

        // Third pass: weighted sum of the values for this output dimension.
        float weighted_sum = 0.0f;
        for (int t_k = 0; t_k < T; ++t_k) {
            float attention_weight = scores[t_k] / sum_exp;
            weighted_sum += attention_weight * V[n*T*H*D + t_k*H*D + h*D + d];
        }

        output[n*T*H*D + t_q*H*D + h*D + d] = weighted_sum;

        delete[] scores;   // avoid leaking the device-heap allocation
    }
}
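
Two notes on using this one. The mask argument is a (T,T) additive mask that I build on the host: 0 where the query is allowed to look and a large negative number for future positions, so the softmax squashes them to (almost) zero weight. And because the kernel allocates its scores scratch buffer with device-side new, the device heap has to be big enough for all the live threads. The sketch below is from my own test code; the buffer names, the block size of 8 and the -1e9 value are my choices for illustration.

Code: Select all
#include <cuda_runtime.h>
#include <vector>

// Build the (T,T) lookahead mask on the host: 0 = allowed, -1e9 = masked.
std::vector<float> make_lookahead_mask(int T)
{
    std::vector<float> mask((size_t)T * T, 0.0f);
    for (int t_q = 0; t_q < T; ++t_q)
        for (int t_k = t_q + 1; t_k < T; ++t_k)   // t_k > t_q is the future
            mask[t_q * T + t_k] = -1e9f;
    return mask;
}

// Launch wrapper from my test harness. d_mask, d_Q, d_K, d_V, d_out are
// device buffers already allocated and filled; keep 8*D <= 1024.
void launch_attention(const float *d_mask, const float *d_Q,
                      const float *d_K, const float *d_V, float *d_out,
                      int N, int T, int H, int D)
{
    int total = N * T * H;                       // one x-thread per (n,t_q,h)
    dim3 block(8, D);
    dim3 grid((total + block.x - 1) / block.x);

    // The kernel calls new[] per thread, so grow the device heap to fit
    // every live thread's T-float scratch buffer (set before the launch).
    size_t heap = (size_t)grid.x * block.x * block.y * T * sizeof(float);
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, heap + (8 << 20));

    attention_kernel_<<<grid, block>>>(N, T, H, D, /*AMASK=*/1,
                                       d_mask, d_Q, d_K, d_V, d_out);
    cudaDeviceSynchronize();
}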


I will upload more later!