device = get_device()
print(device)
mps
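get_device() is defined elsewhere in the repo; a minimal sketch of what it presumably does (an assumption, not the actual helper): prefer CUDA if present, then Apple's MPS backend, then fall back to CPU.
import torch

def get_device():
    # Hypothetical device picker: CUDA first, then MPS, then CPU.
    if torch.cuda.is_available():
        return torch.device('cuda')
    if torch.backends.mps.is_available():
        return torch.device('mps')
    return torch.device('cpu')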
A somewhat basic implementation of a transformer model
# read unstructured text into pd
df = pd.read_fwf('../data/text/tiny_shakespeare.txt', header=None)
print("Dataframe: ", df.head(2))
sentences = df[0].tolist()
print("List: ", sentences[:2])
v = Vocab(sentences)
print("Vocab: ", len(v), ''.join(v.vocabulary))
Dataframe: 0
0 First Citizen:
1 Before we proceed any further, hear me speak.
List: ['First Citizen:', 'Before we proceed any further, hear me speak.']
Vocab: 68 !$&',-.3:;<bos><eos><pad><unk>?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
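Vocab is also defined outside this notebook; a hedged sketch of a character-level vocabulary consistent with the output above (sorted characters plus four special tokens, exposing the stoi()/itos() lookups used further down) might look like this:
class Vocab:
    """Character-level vocabulary with special tokens -- a sketch, not the repo's class."""
    def __init__(self, sentences, specials=('<bos>', '<eos>', '<pad>', '<unk>')):
        chars = set(''.join(sentences)) | {' '}          # every character seen, plus space
        self.vocabulary = sorted(chars | set(specials))  # 68 tokens, as printed above
        self._stoi = {tok: i for i, tok in enumerate(self.vocabulary)}

    def __len__(self):
        return len(self.vocabulary)

    def stoi(self, token):
        return self._stoi.get(token, self._stoi['<unk>'])  # unknown tokens map to <unk>

    def itos(self, index):
        return self.vocabulary[index]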
# squash list of sentences into one large list of characters
data = []
for line in sentences:
    data.extend(line + ' ')
print(data[:100])
['F', 'i', 'r', 's', 't', ' ', 'C', 'i', 't', 'i', 'z', 'e', 'n', ':', ' ', 'B', 'e', 'f', 'o', 'r', 'e', ' ', 'w', 'e', ' ', 'p', 'r', 'o', 'c', 'e', 'e', 'd', ' ', 'a', 'n', 'y', ' ', 'f', 'u', 'r', 't', 'h', 'e', 'r', ',', ' ', 'h', 'e', 'a', 'r', ' ', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', '.', ' ', 'A', 'l', 'l', ':', ' ', 'S', 'p', 'e', 'a', 'k', ',', ' ', 's', 'p', 'e', 'a', 'k', '.', ' ', 'F', 'i', 'r', 's', 't', ' ', 'C', 'i', 't', 'i', 'z', 'e', 'n', ':', ' ', 'Y', 'o', 'u', ' ', 'a']
context_length = 3
x = data[:context_length]
y = data[1:context_length+1]
print(x, y)
for t in range(context_length):
    context = x[:t+1]
    target = y[t]
    print(context, target)
['F', 'i', 'r'] ['i', 'r', 's']
['F'] i
['F', 'i'] r
['F', 'i', 'r'] s
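The cell that encodes the character stream into integer ids appears to be missing from this export; the two outputs below look like the result of something along these lines (assuming the Vocab stoi()/itos() methods sketched above):
ids = [v.stoi(c) for c in data]           # map every character to its vocabulary id
print(ids[:10])                           # first ten token ids
print([v.itos(i) for i in ids[:10]])      # and decoded back to characters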
[52, 13, 11, 10, 6, 4, 40, 13, 6, 13]
['F', 'i', 'r', 's', 't', ' ', 'C', 'i', 't', 'i']
def get_random_batch(token_sequence, context_length, batch_size, device=device):
    ix = torch.randint(len(token_sequence) - context_length, (batch_size,))  # random start indices in [0, len - context_length)
    x = torch.stack([token_sequence[i:i+context_length] for i in ix])
    y = torch.stack([token_sequence[i+1:i+1+context_length] for i in ix])
    return (x.to(device), y.to(device))
context_length = 10
batch_size = 8
x, y = get_random_batch(torch.LongTensor(ids), context_length, batch_size)
print(f'Batches X: {x.shape}, Y: {y.shape}')
for i in range(batch_size):
    # v.itos(int(x[i]))
    print('X: ', [v.itos(int(el)) for el in x[i]])
    print('Y: ', [v.itos(int(el)) for el in y[i]])
Batches X: torch.Size([8, 10]), Y: torch.Size([8, 10])
X: [' ', 'D', 'E', 'R', 'B', 'Y', ':', ' ', 'I', ' ']
Y: ['D', 'E', 'R', 'B', 'Y', ':', ' ', 'I', ' ', 'p']
X: ['b', 'e', ' ', 'r', 'e', 's', 't', 'r', 'a', 'i']
Y: ['e', ' ', 'r', 'e', 's', 't', 'r', 'a', 'i', 'n']
X: ['y', 'o', 'u', ' ', 'c', 'a', 'n', ',', ' ', 'p']
Y: ['o', 'u', ' ', 'c', 'a', 'n', ',', ' ', 'p', 'a']
X: [' ', 'p', 'e', 'r', 'p', 'e', 't', 'u', 'a', 'l']
Y: ['p', 'e', 'r', 'p', 'e', 't', 'u', 'a', 'l', ' ']
X: ['l', 'o', 'r', 'd', ';', ' ', 'f', 'o', 'r', ' ']
Y: ['o', 'r', 'd', ';', ' ', 'f', 'o', 'r', ' ', 't']
X: [' ', 'M', 'o', 'r', 'e', ' ', 'g', 'r', 'a', 'v']
Y: ['M', 'o', 'r', 'e', ' ', 'g', 'r', 'a', 'v', 'e']
X: ['o', 'n', ' ', 't', 'o', 'w', 'a', 'r', 'd', 's']
Y: ['n', ' ', 't', 'o', 'w', 'a', 'r', 'd', 's', ' ']
X: ['n', 't', 'l', 'e', 'n', 'e', 's', 's', ',', ' ']
Y: ['t', 'l', 'e', 'n', 'e', 's', 's', ',', ' ', 'K']
vocab_size = 10
batch_size = 5
embed_dim = 20
context_size = 8
dropout = 0.2
head_size = 16
# embedded input (float)
x = torch.randn(batch_size, context_size, embed_dim) #(B,T,C)
print(x.shape)
att = AttentionHead(embed_dim, head_size, context_size, dropout)
xx = att(x)
print(xx.shape)
torch.Size([5, 8, 20])
torch.Size([5, 8, 16])
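AttentionHead is defined elsewhere in the repo; for reference, a minimal sketch of a single causal self-attention head that matches the constructor arguments and output shape above (an assumption about the implementation, not a copy of it):
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionHead(nn.Module):
    """One head of masked (causal) self-attention -- a sketch, not the repo's code."""
    def __init__(self, embed_dim, head_size, context_size, dropout):
        super().__init__()
        self.key = nn.Linear(embed_dim, head_size, bias=False)
        self.query = nn.Linear(embed_dim, head_size, bias=False)
        self.value = nn.Linear(embed_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):                                   # x: (B, T, embed_dim)
        B, T, C = x.shape
        k = self.key(x)                                     # (B, T, head_size)
        q = self.query(x)                                   # (B, T, head_size)
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5   # (B, T, T) scaled attention scores
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # causal mask: no peeking ahead
        wei = self.dropout(F.softmax(wei, dim=-1))
        return wei @ self.value(x)                          # (B, T, head_size)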
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
eval_interval = 500
learning_rate = 3e-4
device = get_device()
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
vocab_size = len(v)
m = GPTLanguageModel(vocab_size, n_embd, block_size, n_head, n_layer, dropout)
device = torch.device('mps')
m = m.to(device)
x = torch.randint(vocab_size, (batch_size, block_size)).to(device)
logits, loss = m(x)
# print(logits.shape)
AssertionError: Torch not compiled with CUDA enabled
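The AssertionError above suggests that something inside the model (or a helper it calls) still references 'cuda' explicitly even though the tensors were moved to mps. For reference, a hedged sketch of a device-agnostic GPTLanguageModel matching the constructor and the (logits, loss) forward signature used here, built on the AttentionHead sketched earlier (again an assumption, not the repo's implementation):
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    """Several AttentionHeads in parallel, concatenated and projected back to n_embd."""
    def __init__(self, n_embd, n_head, block_size, dropout):
        super().__init__()
        head_size = n_embd // n_head
        self.heads = nn.ModuleList(
            [AttentionHead(n_embd, head_size, block_size, dropout) for _ in range(n_head)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, n_embd)
        return self.dropout(self.proj(out))

class Block(nn.Module):
    """Transformer block: pre-norm residual attention followed by a pre-norm residual MLP."""
    def __init__(self, n_embd, n_head, block_size, dropout):
        super().__init__()
        self.sa = MultiHeadAttention(n_embd, n_head, block_size, dropout)
        self.ffwd = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd), nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd), nn.Dropout(dropout))
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):
    """Character-level GPT -- a sketch with the same constructor/forward signature as used above."""
    def __init__(self, vocab_size, n_embd, block_size, n_head, n_layer, dropout):
        super().__init__()
        self.block_size = block_size
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok = self.token_embedding(idx)                                    # (B, T, n_embd)
        pos = self.position_embedding(torch.arange(T, device=idx.device))  # (T, n_embd)
        x = self.blocks(tok + pos)
        logits = self.lm_head(self.ln_f(x))                                # (B, T, vocab_size)
        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(B * T, -1), targets.view(B * T))
        return logits, loss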
# @torch.no_grad()
# def estimate_loss():
#     out = {}
#     model.eval()
#     for split in ['train', 'val']:
#         losses = torch.zeros(eval_iters)
#         for k in range(eval_iters):
#             X, Y = get_batch(split)
#             logits, loss = model(X, Y)
#             losses[k] = loss.item()
#         out[split] = losses.mean()
#     model.train()
#     return out
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
acc_loss = []
max_iters = 10
for iter in range(max_iters):
    # sample a batch of data
    xb, yb = get_random_batch(torch.LongTensor(ids), block_size, batch_size, device=device)
    # evaluate the loss
    logits, loss = m(xb, yb)
    print(loss.item())
    acc_loss.append(loss.item())
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
2.1705524921417236
2.950320243835449
2.371941328048706
2.4078457355499268
2.359858512878418
2.304844379425049
2.3548460006713867
2.318650007247925
2.296562910079956
2.2918810844421387
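acc_loss is collected above but never examined; assuming matplotlib is available, a quick way to eyeball the training curve:
import matplotlib.pyplot as plt

plt.plot(acc_loss)                 # one point per optimisation step
plt.xlabel('iteration')
plt.ylabel('cross-entropy loss')
plt.title('training loss')
plt.show()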