# keep iteration counts small for training debugging
ITER_MAX = 1
set_seed(42)
Neural Net Language Models
Overview
The Nimrod Language Models (LM) module provides a comprehensive framework for developing, training, and deploying advanced natural language processing models, with a focus on flexibility, performance, and cutting-edge research.
Key Features
- 🔤 Advanced Language Model Architectures
- 🧠 Transformer-based Models
- 🚀 High-Performance NLP Utilities
- 🔧 Configurable Model Components
- 🌐 Multi-Language Support
Core Components
Language Model Architectures
- Transformer-based Models
- Sequence-to-Sequence Models
- Encoder-Decoder Architectures
- Causal Language Models
- Masked Language Models
Key Capabilities
- Tokenization
- Embedding Strategies
- Attention Mechanisms
- Transfer Learning
- Few-Shot Learning
- Prompt Engineering
Supported Model Types
- BERT-like Models
- GPT-style Architectures
- T5 Variants
- BART
- RoBERTa
- XLNet
# reading with pandas
df = pd.read_csv('../data/text/names.txt', header=None, names=['name'])
data = list(df.name)
print("names: ", data[:3])
Data formatting
Given the last n tokens, we predict token n+1.
= list("alexandra")
s print(s)
= [(x,y) for x, y in zip(s, s[1:])]
bigram print(bigram)
= [ (x,y,z) for x, y, z in zip(s, s[1:], s[2:])]
trigram print(trigram)
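The same `zip` pattern generalizes to any context length. As a small illustration (the helper below is not part of the library), building (context, target) pairs for an arbitrary n:

```python
def ngram_pairs(tokens, n):
    # the n previous tokens form the context used to predict the next token
    return [(tuple(tokens[i:i + n]), tokens[i + n]) for i in range(len(tokens) - n)]

print(ngram_pairs(list("alexandra"), 3))
# [(('a', 'l', 'e'), 'x'), (('l', 'e', 'x'), 'a'), ...]
```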
Tiny Shakespeare char LM dataset
# reading directly in plain python
lines = []
with open('../data/text/tiny_shakespeare.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        if line.strip():
            # only append non-blank lines
            lines.append(line)
# add sentence tokens
# data = [['<bos>'] + list(line.strip()) + ['<eos>'] for line in lines]
# data = [list(line.strip()) for line in lines]
data = [list(line) for line in lines]
print("data: ", data[:3])
def make_dataset(
    words: List[str],         # data is a list of sentences which are a list of words
    v: Vocab,                 # vocabulary class for mapping words to indices
    verbose: bool = False,    # print debug info
    context_length=3          # number of words/tokens to use as context
):
    X = []
    y = []
    for word in words:
        s = list(word)
        if verbose:
            print('row: ', s)
        # init prefix with padding while len < context_length
        for i in range(context_length - 1):
            sequence = v.stoi(s[:i+1])
            pad_len = context_length - len(sequence)
            pad = [v.stoi("<pad>")] * pad_len
            X.append(pad + sequence)
            y.append(v.stoi(s[i+1]))
            if verbose:
                print(["<pad>"] + s[:i+1], s[i+1])
        # for sequences of length == context_length
        i = 0
        while i < (len(s) - context_length):
            X.append(v.stoi(s[i:context_length+i]))
            y.append(v.stoi(s[i+context_length]))
            if verbose:
                print(s[i:context_length+i], s[i+context_length])
            i += 1
    return torch.tensor(X), torch.tensor(y)
For each row in the dataset we expand all of its (context, target) n-gram combinations:
v = Vocab(data_path='../data/text/tiny_shakespeare.txt', specials=['<unk>','<pad>'])
print("vocabulary: ", v.vocabulary)
print("vocabulary size: ", len(v))
CONTEXT_LEN = 3
X, y = make_dataset(data[:80], v, verbose=True, context_length=CONTEXT_LEN)
print("X: ", X.shape, "y:", y.shape)
MLP LM Model
[A Neural Probabilistic Language Model (Bengio et al., 2003)](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)
# F.one_hot(torch.tensor(5), num_classes=n_vocab).float()@C # == C[5]
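The commented line relies on the identity that a one-hot vector times an embedding matrix selects the corresponding row, i.e. an embedding lookup. A quick self-contained check (sizes chosen arbitrarily):

```python
import torch
import torch.nn.functional as F

n_vocab, n_emb = 30, 10
C = torch.randn(n_vocab, n_emb)           # embedding matrix
one_hot = F.one_hot(torch.tensor(5), num_classes=n_vocab).float()
assert torch.allclose(one_hot @ C, C[5])  # one-hot matmul == row lookup
```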
NNLM
NNLM (n_vocab:int=30, n_emb:int=10, n_context:int=3, n_h:int=100)
Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes:

    import torch.nn as nn
    import torch.nn.functional as F

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 20, 5)
            self.conv2 = nn.Conv2d(20, 20, 5)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call `to()`, etc.

Note: as per the example above, an `__init__()` call to the parent class must be made before assignment on the child.
| | Type | Default | Details |
|---|---|---|---|
| n_vocab | int | 30 | vocabulary size |
| n_emb | int | 10 | embedding dimension |
| n_context | int | 3 | context size (bigram/trigram, etc.) |
| n_h | int | 100 | hidden layer size |
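For intuition, a Bengio-style MLP language model with these hyperparameters embeds each of the `n_context` tokens, concatenates the embeddings, applies a tanh hidden layer, and projects to vocabulary logits. The sketch below only illustrates that architecture; the library's actual `NNLM` implementation may differ in its details.

```python
import torch
import torch.nn as nn

class MLPLanguageModelSketch(nn.Module):
    # illustrative only, not the library's NNLM
    def __init__(self, n_vocab=30, n_emb=10, n_context=3, n_h=100):
        super().__init__()
        self.emb = nn.Embedding(n_vocab, n_emb)       # token embeddings C
        self.fc1 = nn.Linear(n_context * n_emb, n_h)  # hidden layer
        self.fc2 = nn.Linear(n_h, n_vocab)            # output logits

    def forward(self, x):                 # x: (B, T) token indices, T == n_context
        e = self.emb(x)                   # (B, T, n_emb)
        e = e.flatten(1)                  # (B, T * n_emb) concatenated context
        h = torch.tanh(self.fc1(e))       # (B, n_h)
        return self.fc2(h)                # (B, n_vocab) logits

m = MLPLanguageModelSketch()
print(m(torch.randint(0, 30, (4, 3))).shape)  # torch.Size([4, 30])
```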
NNLMConfig
NNLMConfig (n_vocab:int=30, n_emb:int=10, n_context:int=3, n_h:int=100)
Usage
# config model
conf = NNLMConfig(n_vocab=len(v), n_context=CONTEXT_LEN)
lm = NNLM(**asdict(conf))

# test data
bs = 25
x = torch.randint(conf.n_vocab, (bs, conf.n_context))  # (B, T) with values between 0 and n_vocab
print("X (B, T):", x.shape)

# prediction
y = lm(x)
print("Y_hat logits (B, n_vocab):", y.shape)
Train MLP LM
handmade dataset
Xtr, Ytr = make_dataset(data[:80], v, context_length=CONTEXT_LEN)
Xdev, Ydev = make_dataset(data[80:90], v)
Xte, Yte = make_dataset(data[90:100], v)
print("Xtr (B, T): ", Xtr.shape, "Ytr (B): ", Ytr.shape, "data:", len(data[:80]))
print("len Xtr: ", len(Xtr))
print("CONTEXT_LEN: ", CONTEXT_LEN)
Overfit on the first 80 rows
device = get_device()
device = 'cpu'
# lm.to(device)

# overfit on one big batch
optim = SGD(lm.parameters(), lr=0.01, momentum=0.9)
train_loss = []
ITER_MAX = 1000
for i in tqdm(range(ITER_MAX)):
    # for batch in dm.train_dataloader():
    #     Xtr, Ytr = batch
    #     Ytr = Ytr[:, -1]
    Xtr = Xtr.to(device)
    Ytr = Ytr.to(device)

    optim.zero_grad()
    logits = lm(Xtr)
    loss = F.cross_entropy(logits, Ytr)

    loss.backward()
    optim.step()

    train_loss.append(loss.item())
    if not(i % 250):
        print(loss.item())
plt.plot(train_loss)
Sample
# infer on CPU
lm.to('cpu')
prompt = "The country of "
sequences = lm.sample(prompt, v, max_new_tokens=250, temperature=0.6)
print(sequences)
Dataloader
cfg = OmegaConf.load("../config/text/data/tinyshakespeare.yaml")
# use <unk> and <pad> to be consistent with the manual data preprocessing and keep the same vocabulary size
v = Vocab(data_path='../data/text/tiny_shakespeare.txt', specials=['<unk>','<pad>'])
print("vocabulary: ", v.vocabulary)
print("vocabulary size: ", len(v))
print(cfg)
cfg.train_val_test_split = [0.8, 0.1, 0.1]
# by default data_path is relative to the recipe folder, so it needs to be updated for notebooks
cfg.data_path = "../data/text/tiny_shakespeare.txt"
cfg.context_size = CONTEXT_LEN
cfg.batch_size = 2700  # large batch to mimic the manual dataset's order of magnitude
cfg.random_split = False
cfg.specials = ['<unk>', '<pad>']
cfg.add_sentence_tokens = False
print(cfg)
dm = instantiate(cfg)
dm.setup()
print("vocab size: ", dm.vocab_size)
# setup large batch to overfit / test model
Xtr, Ytr = next(iter(dm.train_dataloader()))
# target is last token in sequence
Ytr = Ytr[:, -1]
print("Xtr (B, T): ", Xtr.shape, "Ytr (B): ", Ytr.shape)
X, Y = dm.train_ds[0]
print(dm.ds.from_tokens(X), dm.ds.from_tokens(Y))
conf = NNLMConfig(n_vocab=len(v), n_context=CONTEXT_LEN)
print(len(v), CONTEXT_LEN)
lm = NNLM(**asdict(conf))
bs = 10
x = torch.randint(conf.n_vocab, (bs, conf.n_context))  # (B, T) with values between 0 and n_vocab
print("X (B, T):", x.shape)
lm(x).shape
# checking data tokens are between 0 and vocab size
print(Xtr.min(), Xtr.max())
overfit
# device = get_device()
device = 'cpu'
lm.to(device)

# overfit on one big batch
optim = SGD(lm.parameters(), lr=0.01, momentum=0.9)
train_loss = []
ITER_MAX = 1000
for i in tqdm(range(ITER_MAX)):
    # for batch in dm.train_dataloader():
    #     Xtr, Ytr = batch
    #     Ytr = Ytr[:, -1]
    Xtr = Xtr.to(device)
    Ytr = Ytr.to(device)

    optim.zero_grad()
    logits = lm(Xtr)
    loss = F.cross_entropy(logits, Ytr)

    loss.backward()
    optim.step()

    train_loss.append(loss.item())
    if not(i % 250):
        print(loss.item())
plt.plot(train_loss)
# infer on CPU
lm.to('cpu')
prompt = "The country of "
sequences = lm.sample(prompt, v, max_new_tokens=250, temperature=0.6)
print(sequences)
Batching with dataloaders
# mini batch gradient descent with datamodule
cfg = OmegaConf.load("../config/text/data/tinyshakespeare.yaml")
cfg.train_val_test_split = [0.8, 0.1, 0.1]
cfg.data_path = "../data/text/tiny_shakespeare.txt"
cfg.context_size = CONTEXT_LEN
cfg.batch_size = 2048
cfg.random_split = False
cfg.specials = ['<unk>', '<pad>']
cfg.add_sentence_tokens = False
dm = instantiate(cfg)
dm.setup()
conf = NNLMConfig(n_vocab=len(v), n_context=CONTEXT_LEN)
lm = NNLM(**asdict(conf))
optim = SGD(lm.parameters(), lr=0.01, momentum=0.9)
train_loss = []
# device = get_device()
device = 'cpu'
lm.to(device)

i = 0
EPOCHS = 1
for epoch in tqdm(range(EPOCHS)):
    print(f"epoch {epoch}")
    for batch in tqdm(dm.train_dataloader()):
        Xtr, Ytr = batch
        # target is last token in sequence: (B, T) -> (B,)
        Ytr = Ytr[:, -1]
        Xtr = Xtr.to(device)
        Ytr = Ytr.to(device)

        logits = lm(Xtr)
        loss = F.cross_entropy(logits, Ytr)

        optim.zero_grad()
        loss.backward()
        optim.step()
        train_loss.append(loss.item())
        if not(i % 1000):
            print(loss.item())
        i += 1
plt.plot(train_loss)
Sample
# infer on CPU
lm.to('cpu')
prompt = "The country of "
sequences = lm.sample(prompt, v, max_new_tokens=500, temperature=0.9)
print(sequences)
MLP LM X Model
NNLM_X
NNLM_X (nnet:NNLM, num_classes:int, optimizer:torch.optim.Optimizer, scheduler:torch.optim.lr_scheduler)
Helper class that provides a standard way to create an ABC using inheritance.
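A rough sketch of what such a Lightning wrapper typically contains (forward delegation, cross-entropy on the last-token target, optimizer configuration) is shown below; it assumes `lightning` is imported as `L` and is not the library's actual `NNLM_X` implementation.

```python
import torch
import torch.nn.functional as F
import lightning as L

class NNLMWrapperSketch(L.LightningModule):
    # illustrative only, not the library's NNLM_X
    def __init__(self, nnet, lr=1e-3):
        super().__init__()
        self.nnet = nnet
        self.lr = lr

    def forward(self, x):
        return self.nnet(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y = y[:, -1]                      # predict the last token of each target window
        loss = F.cross_entropy(self(x), y)
        self.log("train/loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=self.lr)
```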
Usage
# omegaconf
cfg = OmegaConf.load("../config/text/model/nnlm.yaml")
cfg.num_classes = len(v)
print(len(v))
# have to convert omegaconf dict to dict for pprint
opt = instantiate(cfg.optimizer)
print(opt.keywords['lr'])
pprint.pprint(dict(cfg))
lm = instantiate(cfg)
print(lm.hparams.optimizer)
n_samples = 25
x = torch.randint(conf.n_vocab, (n_samples, cfg.nnet.n_context))
print("X:", x.shape)
y = lm(x)
print("Y_hat logits:", y.shape)
# v = Vocab(data_path='../data/text/tiny_shakespeare.txt', specials=['<unk>','<pad>'])
lm.sample("The country of ", v, max_new_tokens=500, temperature=0.9)
Lightning (L) Training Loop
# vocab
print(len(v))

# data
cfg = OmegaConf.load("../config/text/data/tinyshakespeare.yaml")
cfg.context_size = CONTEXT_LEN
cfg.specials = ["<pad>", "<unk>"]
cfg.batch_size = 2048
cfg.random_split = False
dm = instantiate(cfg)
dm.setup()
# model
cfg = OmegaConf.load("../config/text/model/nnlm.yaml")
lm = instantiate(cfg)
print(lm.__dict__)
The model can easily be trained with the Lightning trainer (cf. recipes/text/ for examples).
trainer = Trainer(
    accelerator="auto",
    max_epochs=1,
    logger=CSVLogger("logs", name="nnlm")
)
trainer.fit(lm, dm.train_dataloader(), dm.val_dataloader())
= f"{trainer.logger.log_dir}/metrics.csv"
csv_path = pd.read_csv(csv_path)
metrics metrics.head()
'step'], metrics['train/loss_step'],'b.-')
plt.plot(metrics['step'], metrics['val/loss'],'r.-')
plt.plot(metrics[ plt.show()
trainer.test(lm, dm.test_dataloader())
# infer on CPU
lm.to('cpu')
prompt = "The country of "
sequences = lm.sample(prompt, v, max_new_tokens=500, temperature=0.9)
print(sequences)
Learning Rate Finder
lm.hparams
trainer = L.Trainer(
    accelerator="auto",
    max_epochs=1,
)
tuner = Tuner(trainer)
lr_finder = tuner.lr_find(
    lm,
    datamodule=dm,
    min_lr=1e-6,
    max_lr=1.0,
    num_training=100,  # number of iterations
    # attr_name="optimizer.lr",
)
fig = lr_finder.plot(suggest=True)
plt.show()
print(f"Suggested learning rate: {lr_finder.suggestion()}")
Re-train with the new learning rate
new_lr = lr_finder.suggestion()
lm.lr = new_lr
trainer = L.Trainer(
    accelerator="auto",
    max_epochs=1,
    logger=CSVLogger("logs", name="nnlm"),
)
trainer.fit(lm, dm.train_dataloader(), dm.val_dataloader())
trainer.test(lm, dm.test_dataloader())
= f"{trainer.logger.log_dir}/metrics.csv"
csv_path = pd.read_csv(csv_path)
metrics 'step'], metrics['train/loss_step'],'.-')
plt.plot(metrics[# plt.figure()
# plot_classifier_metrics_from_csv(csv_path)
# infer on CPU
lm.to('cpu')
prompt = "The country of "
sequences = lm.sample(prompt, v, max_new_tokens=500, temperature=0.9)
print(sequences)
NN Bigram
NNBigram
NNBigram (vocab_size:int)
Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes:

    import torch.nn as nn
    import torch.nn.functional as F

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 20, 5)
            self.conv2 = nn.Conv2d(20, 20, 5)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call `to()`, etc.

Note: as per the example above, an `__init__()` call to the parent class must be made before assignment on the child.

training (bool): whether this module is in training or evaluation mode.
Usage
B, T, C = 32, 8, 65
vocab_size = C
model = NNBigram(vocab_size)
print("vocab size: ", model.vocab_size)
X = torch.randint(0, C, (B, T))
Y = torch.randint(0, C, (B, T))
batch = (X, Y)
logits = model(X)  # (B, T, C)
print("X: ", X.shape, "Y: ", Y.shape, "logits: ", logits.shape)
# generate
model.predict(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0]
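Conceptually, a bigram language model is just a `vocab_size x vocab_size` table of logits: row i holds the scores for the token that follows token i. The sketch below illustrates the idea, including autoregressive sampling one token at a time; it is only an illustration, not the library's `NNBigram` (whose generation method is `predict`, as used above).

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramSketch(nn.Module):
    # illustrative only, not the library's NNBigram
    def __init__(self, vocab_size):
        super().__init__()
        self.logits_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx):               # idx: (B, T)
        return self.logits_table(idx)     # (B, T, vocab_size) logits

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits = self(idx)[:, -1, :]              # logits at the last position
            probs = F.softmax(logits, dim=-1)
            nxt = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, nxt], dim=1)
        return idx
```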
# #| export
# class NNBigramL(ModelModule):
# def __init__(self, vocab_size:int, lr:float=1e-3):
# model = NNBigram(vocab_size)
# super().__init__(model, lr)
# self.accuracy = Accuracy(task='multiclass', num_classes=model.vocab_size)
# def _step(self, batch:torch.tensor, batch_idx:int):
# x, y = batch
# logits = self.model(x) # (B,T,C)
# B, T, C = logits.shape
# logits = logits.view(B*T, C)
# y = y.view(B*T)
# loss = self.loss(logits, y)
# acc = self.accuracy(logits, y)
# return loss, acc
# def predict(self,idx:torch.IntTensor, max_new_tokens:int):
# return self.model.predict(idx, max_new_tokens)
# model_pl = NNBigramL(vocab_size)
# logits = model_pl(X) # (B, T, C)
# print(logits.shape)
# model_pl.training_step(batch, 0)
# model_pl._step(batch, 0)
Data
with open('../data/text/tiny_shakespeare.txt') as f:
    text = f.read()
# dataset
block_size = 8
ds = CharDataset(data_path='../data/text/tiny_shakespeare.txt', context_length=block_size)
X, Y = ds[0]
print("x:", ds.from_tokens(X), "\ny:", ds.from_tokens(Y))
# dataloader
dl = DataLoader(ds, batch_size=32, num_workers=0)
X, Y = next(iter(dl))
print("x:", X.shape, "\ny:", Y.shape)
Training
model = NNBigram(ds.vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
device = torch.device('cpu')
ITER_MAX = 1000
train_loss = []
for epoch in tqdm(range(ITER_MAX)):
    model.train()
    X = X.to(device)  # (B, T)
    Y = Y.to(device)  # (B, T)
    logits = model(X)
    B, T, C = logits.shape
    loss = criterion(logits.view(B*T, C), Y.view(B*T))
    optimizer.zero_grad()  # reset gradients so they don't accumulate across iterations
    loss.backward()
    optimizer.step()
    train_loss.append(loss.item())
    if not(epoch % 1000):
        print(loss.item())
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    logits = model(X).view(B*T, C)
    # _, predicted = torch.max(logits.data, 1)
    probs = F.softmax(logits, dim=-1)
    # print("probs: ", probs.shape)
    preds = torch.argmax(probs, dim=1)
    # print("pred:", preds.shape)
    # print("Y:", Y.shape)
    # print(predicted)
    # total += Y.size(0)
    # correct += (predicted == Y).sum()
    # print(f"Epoch {epoch + 1}: Accuracy = {100 * correct / total:.2f}%")
plt.plot(train_loss)
print(ds.from_tokens(model.predict(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=500)[0].tolist()))
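The accuracy bookkeeping in the evaluation cell is left commented out. Completing it along the following lines (again only a sketch, reusing the batch `X`, `Y` and the shapes `B`, `T`, `C` from above) would report next-token accuracy on that batch:

```python
with torch.no_grad():
    logits = model(X).view(B * T, C)
    preds = logits.argmax(dim=-1)
    targets = Y.view(B * T)
    acc = (preds == targets).float().mean().item()
    print(f"next-token accuracy on this batch: {100 * acc:.2f}%")
```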
training from module
# %%time
# n_epochs = 1
# train_loss = []
# for epoch in range(n_epochs):
# model_pl.model.train()
# loss = model_pl.training_step(batch, None)
# loss.backward()
# optimizer.step()
# train_loss.append(loss.item())
# if not(epoch % 100):
# print(loss.item())
# print(ds.from_tokens(model_pl.predict(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=500)[0].tolist()))