Transformer Language Models

A somewhat basic implementation of a transformer language model.

device = get_device()
print(device)
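
get_device() is a small helper imported elsewhere in the project. A minimal sketch of what it is assumed to do (prefer CUDA, then Apple MPS, then fall back to CPU); not the project's exact code:

# assumed device-selection helper, not the actual implementation
import torch

def get_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    if torch.backends.mps.is_available():
        return torch.device('mps')
    return torch.device('cpu')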

Data formatting

  • https://buomsoo-kim.github.io/attention/2020/04/21/Attention-mechanism-19.md/
# dataset = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1')
# print(dataset['train'][88])
# read the unstructured text file into a pandas DataFrame
df = pd.read_fwf('../data/text/tiny_shakespeare.txt', header=None)
print("Dataframe: ", df.head(2))
sentences = df[0].tolist()
print("List: ", sentences[:2])
v = Vocab(sentences)
print("Vocab: ", len(v), ''.join(v.vocabulary))
# squash list of sentences into one large list of characters
data = []
for line in sentences:
    data.extend(line + ' ')
print(data[:100])
n = len(data)
train = data[:int(n*0.9)]
val = data[int(n*0.9):]
print(val[:25])
context_length = 3
x = data[:context_length]
y = data[1:context_length+1]
print(x, y)
for t in range(context_length):
    context = x[:t+1]
    target = y[t]
    print(context, target)
ids = v.stoi(data)
print(ids[:10])
print(v.itos(ids[:10]))
def get_random_batch(token_sequence, context_length, batch_size, device=device):
    ix = torch.randint(len(token_sequence) - context_length, (batch_size,)) # random start indices in [0, L - context_length)
    x = torch.stack([token_sequence[i:i+context_length] for i in ix])       # inputs
    y = torch.stack([token_sequence[i+1:i+1+context_length] for i in ix])   # targets, shifted right by one
    return (x.to(device),y.to(device))
context_length = 10
batch_size = 8
x, y = get_random_batch(torch.LongTensor(ids), context_length, batch_size)
print(f'Batches X: {x.shape}, Y: {y.shape}')
for i in range(batch_size):
    print('X: ', [v.itos(int(el)) for el in x[i]])
    print('Y: ', [v.itos(int(el)) for el in y[i]])

Attention

vocab_size = 10
batch_size = 5
embed_dim = 20
context_size = 8
dropout = 0.2
head_size = 16
# embedded input (float)
x = torch.randn(batch_size, context_size, embed_dim) #(B,T,C)
print(x.shape)
att = AttentionHead(embed_dim, head_size, context_size, dropout)
xx = att(x)
print(xx.shape)
num_heads = 3
multi_att = MultiHeadAttention(num_heads, head_size, embed_dim, context_size, dropout)
xxx = multi_att(x)
print(xxx.shape)
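
AttentionHead and MultiHeadAttention are imported from the project. Below is a sketch consistent with the constructor signatures and output shapes used above, following the standard causal scaled dot-product attention recipe (an assumption, not the project's exact code):

import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionHead(nn.Module):
    # one head of causal (masked) self-attention
    def __init__(self, embed_dim, head_size, context_size, dropout):
        super().__init__()
        self.key = nn.Linear(embed_dim, head_size, bias=False)
        self.query = nn.Linear(embed_dim, head_size, bias=False)
        self.value = nn.Linear(embed_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):  # x: (B, T, embed_dim)
        B, T, C = x.shape
        k = self.key(x)                                               # (B, T, head_size)
        q = self.query(x)                                             # (B, T, head_size)
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5           # (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # block attention to future positions
        wei = self.dropout(F.softmax(wei, dim=-1))
        return wei @ self.value(x)                                    # (B, T, head_size)

class MultiHeadAttention(nn.Module):
    # several heads in parallel, concatenated and projected back to embed_dim
    def __init__(self, num_heads, head_size, embed_dim, context_size, dropout):
        super().__init__()
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_size, context_size, dropout) for _ in range(num_heads)])
        self.proj = nn.Linear(num_heads * head_size, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)           # (B, T, num_heads * head_size)
        return self.dropout(self.proj(out))                           # (B, T, embed_dim)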

Feed forward

ff = FeedFoward(embed_dim, dropout)
ff_x = ff(x)
print(ff_x.shape)
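
FeedFoward (spelled as it is used above) is assumed to be the usual position-wise MLP with a 4x expansion; a sketch, reusing the nn import from the attention sketch:

# assumed position-wise feed-forward network
class FeedFoward(nn.Module):
    def __init__(self, embed_dim, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),   # expand
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim),   # project back
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)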

Block

b = Block(embed_dim, num_heads, context_size, dropout)
bb = b(x)
print(bb.shape)
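
Block is assumed to be a pre-norm transformer block that wires the attention and feed-forward pieces together with residual connections. In this sketch head_size is derived as embed_dim // num_heads, which is an assumption; the classes from the sketches above are reused.

# assumed transformer block: communication (attention) followed by computation (MLP)
class Block(nn.Module):
    def __init__(self, embed_dim, num_heads, context_size, dropout):
        super().__init__()
        head_size = embed_dim // num_heads
        self.sa = MultiHeadAttention(num_heads, head_size, embed_dim, context_size, dropout)
        self.ffwd = FeedFoward(embed_dim, dropout)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))     # residual around attention
        x = x + self.ffwd(self.ln2(x))   # residual around the MLP
        return x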

GPT model
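
GPTLanguageModel is imported from the project. The sketch below is an assumption modeled on the standard minimal GPT recipe and on the constructor, (logits, loss) and generate calls used in this section; it reuses the Block sketch and the torch / nn / F imports from above.

# assumed minimal GPT: token + position embeddings, a stack of Blocks, and a language-model head
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embd, block_size, n_head, n_layer, dropout):
        super().__init__()
        self.block_size = block_size
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):  # idx: (B, T) token ids
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)                                     # (B, T, n_embd)
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))   # (T, n_embd)
        x = self.blocks(tok_emb + pos_emb)
        logits = self.lm_head(self.ln_f(x))                                     # (B, T, vocab_size)
        loss = None
        if targets is not None:
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.block_size:]                 # crop to the context window
            logits, _ = self(idx_cond)
            probs = F.softmax(logits[:, -1, :], dim=-1)          # distribution over the next token
            idx_next = torch.multinomial(probs, num_samples=1)   # sample one token id
            idx = torch.cat((idx, idx_next), dim=1)
        return idx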

batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
eval_interval = 500
learning_rate = 3e-4
device = get_device()
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
vocab_size = len(v)
m = GPTLanguageModel(vocab_size, n_embd, block_size, n_head, n_layer, dropout)
print(device)
m = m.to(device)
x = torch.randint(vocab_size, (batch_size, block_size)).to(device)
logits, loss = m(x)  # smoke-test forward pass (no targets)
# print(logits.shape)
# @torch.no_grad()
# def estimate_loss():
#     out = {}
#     model.eval()
#     for split in ['train', 'val']:
#         losses = torch.zeros(eval_iters)
#         for k in range(eval_iters):
#             X, Y = get_batch(split)
#             logits, loss = model(X, Y)
#             losses[k] = loss.item()
#         out[split] = losses.mean()
#     model.train()
#     return out
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
acc_loss = []
max_iters = 1  # just one optimization step for this demo; increase for real training
for step in range(max_iters):

    # sample a batch of data (from the full corpus here; swap in the train split once evaluation is enabled)
    xb, yb = get_random_batch(torch.LongTensor(ids), block_size, batch_size, device=device)

    # evaluate the loss
    logits, loss = m(xb.to(device), yb.to(device))
    print(loss.item())
    acc_loss.append(loss.item())
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
plt.plot(acc_loss)
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(''.join(v.itos(m.generate(context, max_new_tokens=50)[0].tolist())))