The insides of an LLM like chatGPT and any General Purpose Transformer contains a sequence Encoder and Decoder.
You might of read earlier the Guts of a Transformer and that should help you understand the following code snippets that comprise the components of both the Encoder and Decoder.
Here is the CUDA code for each component:
Convert Sentence Embeddings into Query, Key and Value's:
- Code: Select all
__global__ void transform_embeddings_kernel_(
const int N, // batch size
const int T, // sequence length
const int H, // number of heads
const int D, // embedding dimension
const float * __restrict__ input, // input embeddings. shape = (N,T,D)
const float * __restrict__ W_q, // query weights. shape = (D,H*D)
const float * __restrict__ W_k, // key weights. shape = (D,H*D)
const float * __restrict__ W_v, // value weights. shape = (D,H*D)
float * __restrict__ Q, // output queries. shape = (N,T,H,D)
float * __restrict__ K, // output keys. shape = (N,T,H,D)
float * __restrict__ V // output values. shape = (N,T,H,D)
) {
//int index = blockDim.x*blockDim.y*blockIdx.x + blockDim.x*threadIdx.y + threadIdx.x;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int total_elements = N * T * H;
for (int i = index; i < total_elements; i += blockDim.x * gridDim.x) {
int n = i % N; //Batch
int t = (i / N) % T; //Sequence
int h = i / (T * N); //Number of Heads
int d = threadIdx.y;
float q_sum = 0.0f, k_sum = 0.0f, v_sum = 0.0f;
int idx = n*T*H*D + t*H*D + h*D + d;
for (int i = 0; i < D; ++i) {
float input_val = input[n*T*D + t*D + i];
int wgt_idx = i*H*D + h*D + d;
q_sum += input_val * W_q[wgt_idx];
k_sum += input_val * W_k[wgt_idx];
v_sum += input_val * W_v[wgt_idx];
}
if(d<D){
Q[idx] = q_sum;
K[idx] = k_sum;
V[idx] = v_sum;
}
}
}
Calculate Attention Scores with Masking!
- Code: Select all
__global__ void attention_kernel_(
const int N, // batch size
const int T, // sequence length
const int H, // number of heads
const int D, // embedding dimension
const int AMASK,
const float * __restrict__ mask,
const float * __restrict__ Q, // query. shape = (N,T,H,D)
const float * __restrict__ K, // key. shape = (N,T,H,D)
const float * __restrict__ V, // value. shape = (N,T,H,D)
float * __restrict__ output // output. shape = (N,T,H,D)
) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int total_elements = N * T * H;
for (int i = index; i < total_elements; i += blockDim.x * gridDim.x) {
int h = i % H; //Number of heads
int t_q = (i / H) % T; //Sequence len
int n = i / (H * T); //Batch
int d = threadIdx.y;
float max_score = -INFINITY;
float* scores; scores = new float[T]; // Assuming T is known at compile time, otherwise use dynamic shared memory
// First pass: compute dot products and find max
for (int t_k = 0; t_k < T; ++t_k) {
float dot_product = 0.0f;
for (int i = 0; i < D; ++i) {
dot_product += Q[n*T*H*D + t_q*H*D + h*D + i] * K[n*T*H*D + t_k*H*D + h*D + i];
}
if(AMASK==1){ /*Apply Lookahead Mask*/
scores[t_k] = ( dot_product / sqrtf(D) ) + mask[t_q*T+t_k]; //add -LARGE_NUMBER for SoftMax not to have divide by zero
}else{
scores[t_k] = dot_product / sqrtf(D) ;
}
max_score = fmaxf(max_score, scores[t_k]);
}
// Second pass: compute exp and sum
float sum_exp = 0.0f;
for (int t_k = 0; t_k < T; ++t_k) {
scores[t_k] = expf(scores[t_k] - max_score);
sum_exp += scores[t_k];
}
// Third pass: compute weighted sum
float weighted_sum = 0.0f;
for (int t_k = 0; t_k < T; ++t_k) {
float attention_weight = scores[t_k] / sum_exp;
weighted_sum += attention_weight * V[n*T*H*D + t_k*H*D + h*D + d];
}
output[n*T*H*D + t_q*H*D + h*D + d] = weighted_sum;
}
}
I will upload more later!