device = get_device()
print(device)
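get_device is a small helper defined elsewhere in this project; a minimal sketch of such a helper (the body below is an assumption, not the project's actual code) could be:

import torch

def get_device():
    # prefer Apple's Metal backend, then CUDA, then fall back to CPU
    if torch.backends.mps.is_available():
        return torch.device('mps')
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')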
Transformer Language Models
A somewhat basic implementation of a transformer language model.
Data formatting
- https://buomsoo-kim.github.io/attention/2020/04/21/Attention-mechanism-19.md/
# dataset = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1')
# print(dataset['train'][88])
# read unstructured text into a pandas DataFrame
df = pd.read_fwf('../data/text/tiny_shakespeare.txt', header=None)
print("Dataframe: ", df.head(2))
sentences = df[0].tolist()
print("List: ", sentences[:2])
v = Vocab(sentences)
print("Vocab: ", len(v), ''.join(v.vocabulary))
# squash list of sentences into one large list of characters
data = []
for line in sentences:
    data.extend(line + ' ')
print(data[:100])
n = len(data)
train = data[:int(n*0.9)]
val = data[int(n*0.9):]
print(val[:25])
context_length = 3
x = data[:context_length]
y = data[1:context_length+1]
print(x, y)
for t in range(context_length):
    context = x[:t+1]
    target = y[t]
    print(context, target)
ids = v.stoi(data)
print(ids[:10])
print(v.itos(ids[:10]))
def get_random_batch(token_sequence, context_length, batch_size, device=device):
    ix = torch.randint(len(token_sequence) - context_length, (batch_size,))  # max index is (L - context_length)
    x = torch.stack([token_sequence[i:i+context_length] for i in ix])
    y = torch.stack([token_sequence[i+1:i+1+context_length] for i in ix])
    return (x.to(device), y.to(device))
context_length = 10
batch_size = 8
x, y = get_random_batch(torch.LongTensor(ids), context_length, batch_size)
print(f'Batches X: {x.shape}, Y: {y.shape}')
for i in range(batch_size):
    # v.itos(int(x[i]))
    print('X: ', [v.itos(int(el)) for el in x[i]])
    print('Y: ', [v.itos(int(el)) for el in y[i]])
Attention
vocab_size = 10
batch_size = 5
embed_dim = 20
context_size = 8
dropout = 0.2
head_size = 16
# embedded input (float)
x = torch.randn(batch_size, context_size, embed_dim)  # (B,T,C)
print(x.shape)
att = AttentionHead(embed_dim, head_size, context_size, dropout)
xx = att(x)
print(xx.shape)
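AttentionHead comes from the project's model code. A minimal sketch of a single masked self-attention head with this constructor signature (embed_dim, head_size, context_size, dropout) might look like the following; it is an assumption, not the actual implementation:

import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionHead(nn.Module):
    """One head of masked (causal) self-attention."""
    def __init__(self, embed_dim, head_size, context_size, dropout):
        super().__init__()
        self.key = nn.Linear(embed_dim, head_size, bias=False)
        self.query = nn.Linear(embed_dim, head_size, bias=False)
        self.value = nn.Linear(embed_dim, head_size, bias=False)
        # lower-triangular mask so position t only attends to positions <= t
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        # scaled dot-product attention scores
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)  # (B, T, head_size)
        return wei @ v     # (B, T, head_size)

With the toy shapes above, such a head maps (5, 8, 20) to (5, 8, 16), which is what the printed xx.shape should show.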
num_heads = 3
multi_att = MultiHeadAttention(num_heads, head_size, embed_dim, context_size, dropout)
xxx = multi_att(x)
print(xxx.shape)
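MultiHeadAttention is also defined in the project. Building on the AttentionHead sketch above (and reusing its imports), a sketch with the same constructor signature could be:

class MultiHeadAttention(nn.Module):
    """Several attention heads in parallel, concatenated and projected back to embed_dim."""
    def __init__(self, num_heads, head_size, embed_dim, context_size, dropout):
        super().__init__()
        self.heads = nn.ModuleList([
            AttentionHead(embed_dim, head_size, context_size, dropout)
            for _ in range(num_heads)
        ])
        self.proj = nn.Linear(num_heads * head_size, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, num_heads * head_size)
        return self.dropout(self.proj(out))                  # (B, T, embed_dim)

The final projection back to embed_dim is why xxx.shape matches the input shape (5, 8, 20).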
Feed forward
ff = FeedFoward(embed_dim, dropout)
ff_x = ff(x)
print(ff_x.shape)
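FeedFoward (spelling kept as the project's identifier) is the position-wise MLP applied after attention. A sketch under the assumption of the common 4x inner expansion, reusing the nn import from the attention sketches:

class FeedFoward(nn.Module):
    """Position-wise MLP: expand, non-linearity, project back to embed_dim."""
    def __init__(self, embed_dim, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),  # 4x expansion is an assumed, common choice
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)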
Block
b = Block(embed_dim, num_heads, context_length, dropout)
bb = b(x)
print(bb.shape)
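Block wires the pieces together. A sketch with this constructor signature, building on the sketches above (how head_size is derived inside the block is an assumption):

class Block(nn.Module):
    """Transformer block: multi-head self-attention then feed-forward,
    each with a residual connection and pre-layer-norm."""
    def __init__(self, embed_dim, num_heads, context_length, dropout):
        super().__init__()
        head_size = embed_dim // num_heads  # assumed split of the embedding across heads
        self.sa = MultiHeadAttention(num_heads, head_size, embed_dim, context_length, dropout)
        self.ffwd = FeedFoward(embed_dim, dropout)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x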
GPT model
batch_size = 64  # how many independent sequences will we process in parallel?
block_size = 256  # what is the maximum context length for predictions?
eval_interval = 500
learning_rate = 3e-4
device = get_device()
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
vocab_size = len(v)
m = GPTLanguageModel(vocab_size, n_embd, block_size, n_head, n_layer, dropout)
print(device)
device = torch.device('mps')
m = m.to(device)
x = torch.randint(vocab_size, (batch_size, block_size)).to(device)
logits, loss = m(x)
# print(logits.shape)
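GPTLanguageModel is defined in the project's model code. Given its constructor signature and the fact that it is called both without targets (returning loss=None) and with targets during training, a sketch building on the Block sketch above (and the same imports) might be:

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer: token + position embeddings, a stack of Blocks,
    a final LayerNorm and a linear head over the vocabulary."""
    def __init__(self, vocab_size, n_embd, block_size, n_head, n_layer, dropout):
        super().__init__()
        self.block_size = block_size
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)                                    # (B, T, n_embd)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, n_embd)
        x = self.blocks(tok_emb + pos_emb)
        logits = self.lm_head(self.ln_f(x))                                          # (B, T, vocab_size)
        loss = None
        if targets is not None:
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))
        return logits, loss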
# @torch.no_grad()
# def estimate_loss():
# out = {}
# model.eval()
# for split in ['train', 'val']:
# losses = torch.zeros(eval_iters)
# for k in range(eval_iters):
# X, Y = get_batch(split)
# logits, loss = model(X, Y)
# losses[k] = loss.item()
# out[split] = losses.mean()
# model.train()
# return out
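The commented-out estimate_loss above expects a get_batch(split) helper that is not defined in this script. An adaptation that reuses get_random_batch and the train/val character splits from earlier might look like this (the split handling is an assumption):

@torch.no_grad()
def estimate_loss(model, eval_iters=eval_iters):
    out = {}
    model.eval()
    # reuse the character splits from above, mapped to id tensors
    splits = {'train': torch.LongTensor(v.stoi(train)), 'val': torch.LongTensor(v.stoi(val))}
    for split, tokens in splits.items():
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_random_batch(tokens, block_size, batch_size, device=device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out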
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
acc_loss = []
max_iters = 1
for iter in range(max_iters):
    # sample a batch of data
    xb, yb = get_random_batch(torch.LongTensor(ids), block_size, batch_size, device=device)

    # evaluate the loss
    logits, loss = m(xb.to(device), yb.to(device))
    print(loss.item())
    acc_loss.append(loss.item())
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
plt.plot(acc_loss)
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(''.join(v.itos(m.generate(context, max_new_tokens=50)[0].tolist())))
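generate is a method of GPTLanguageModel defined elsewhere. A typical autoregressive sampling loop with this signature, to be read as a method of the sketched model class above (an assumption, not the project's actual code):

@torch.no_grad()
def generate(self, idx, max_new_tokens):
    # idx is a (B, T) tensor of token ids holding the current context
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -self.block_size:]                 # crop to the last block_size tokens
        logits, _ = self(idx_cond)
        logits = logits[:, -1, :]                            # keep only the last time step
        probs = F.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)   # sample one token per sequence
        idx = torch.cat((idx, idx_next), dim=1)              # append to the running context
    return idx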