Here is a side project I am working on: a very simple, basic tokenizer for a bespoke transformer language model — my own GPT (General Purpose Transformer). I will provide the code for every step in building the complete model once it's done. But for now, here is a bit of code that converts a sentence string into a list of binary words:
- Code: Select all
// Splits a space-separated sentence into words and prints every word's
// characters as 8-bit binary patterns, least-significant bit first.
//
// Output written to std::cout, in order:
//   1. the sentence, then "String length:<n>";
//   2. every input character followed by a comma;
//   3. for each word: the word on its own line, then one row of eight
//      "bit|" cells for each of the 20 character slots of the word
//      (unused slots are zero padding).
//
// Limits: at most 10 words per sentence and 20 characters per word. Words
// beyond the tenth and characters beyond the twentieth are ignored instead of
// being written past the end of the buffers. Every space starts a new word,
// so consecutive, leading or trailing spaces produce empty words.
//
// str: NUL-terminated sentence; a null pointer is ignored.
void make_wl(char str[]){
    const int kMaxWords = 10;    // Sentence len max = 10
    const int kMaxWordLen = 20;  // Word len max = 20
    const int kBitsPerChar = 8;

    if (!str)
        return;

    // Zero-initialised automatic storage: every word is NUL-terminated and
    // zero-padded from the start, and nothing has to be freed on exit.
    char word[kMaxWords][kMaxWordLen + 1] = {};
    // length of binary representation of a full sentence of 20 char words
    int binword[kMaxWords * kMaxWordLen * kBitsPerChar] = {};

    const size_t len = strlen(str);  // computed once, not on every iteration
    std::cout << str << "\n";
    std::cout << "String length:" << len << "\n";

    int ltr = 0, wrd = 0;
    for (size_t i = 0; i < len; i++) {
        std::cout << str[i] << ",";
        if (str[i] == ' ') {
            wrd++;
            ltr = 0;
        } else if (wrd < kMaxWords && ltr < kMaxWordLen) {
            // Only store what fits; overflow text is echoed above but dropped here.
            word[wrd][ltr] = str[i];
            ltr++;
        }
    }

    // wrd counts separators, so there is one more word than spaces seen.
    const int wordCount = (wrd < kMaxWords) ? wrd + 1 : kMaxWords;

    for (int i = 0; i < wordCount; i++) {
        std::cout << "\n" << word[i] << "\n";
        for (int j = 0; j < kMaxWordLen; j++) {
            // Work on the unsigned value so bytes >= 0x80 never yield -1 bits.
            unsigned char val = static_cast<unsigned char>(word[i][j]);
            std::cout << "\n";
            for (int k = 0; k < kBitsPerChar; k++) {
                const int idx = (i * kMaxWordLen + j) * kBitsPerChar + k;
                binword[idx] = val % 2;
                val /= 2;
                std::cout << binword[idx] << "|";
            }
        }
    }
}