Neural Net Language Models

Overview

The Nimrod Language Models (LM) module provides a comprehensive framework for developing, training, and deploying advanced natural language processing models, with a focus on flexibility, performance, and cutting-edge research.

Key Features

  • 🔤 Advanced Language Model Architectures
  • 🧠 Transformer-based Models
  • 🚀 High-Performance NLP Utilities
  • 🔧 Configurable Model Components
  • 🌐 Multi-Language Support

Core Components

Language Model Architectures

  • Transformer-based Models
  • Sequence-to-Sequence Models
  • Encoder-Decoder Architectures
  • Causal Language Models
  • Masked Language Models

Key Capabilities

  • Tokenization
  • Embedding Strategies
  • Attention Mechanisms
  • Transfer Learning
  • Few-Shot Learning
  • Prompt Engineering

Supported Model Types

  • BERT-like Models
  • GPT-style Architectures
  • T5 Variants
  • BART
  • RoBERTa
  • XLNet

Setup

# ITER_MAX: number of training iterations; set low here for quick debugging (raised in the training cells below)
ITER_MAX = 1
set_seed(42)
# reading with pandas
df = pd.read_csv('../data/text/names.txt', header=None, names=['name'])
data = list(df.name)
print("names: ", data[:3])

Data formatting

Given the last n tokens, we predict token n+1.

s = list("alexandra")
print(s)
bigram = [(x,y) for x, y in zip(s, s[1:])]
print(bigram)
trigram = [ (x,y,z) for x, y, z in zip(s, s[1:], s[2:])]
print(trigram)
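
More generally, n-grams of any order can be built with the same zip pattern; a small helper sketch (not part of the library):

def ngrams(seq, n):
    # slide a window of length n across the sequence
    return list(zip(*[seq[i:] for i in range(n)]))

print(ngrams(list("alexandra"), 4))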

Tiny Shakespeare char LM dataset

# reading directly in plain python
lines = []
with open('../data/text/tiny_shakespeare.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        if line.strip():
            # only append non-blank lines
            lines.append(line)

# add sentence tokens
# data = [['<bos>'] +list(line.strip()) + ['<eos>'] for line in lines]
# data = [list(line.strip()) for line in lines]
data = [list(line) for line in lines]
print("data: ", data[:3])
def make_dataset(
        words:List[str], # data is a list of sentences which are a list of words
        v:Vocab,# vocabulary class for mapping words to indices
        verbose:bool=False, # print debug info
        context_length=3 # number of words/tokens to use as context
        ):
    X = []
    y = []
    for word in words:
        s = list(word)
        if verbose:
            print('row: ', s)
        # init prefix with padding while len < context_length
        for i in range(context_length-1):
            sequence = v.stoi(s[:i+1])
            pad_len = context_length - len(sequence)
            pad = [v.stoi("<pad>")] * pad_len
            X.append(pad + sequence)
            y.append(v.stoi(s[i+1]))

            if verbose:
                print(["<pad>"]+ s[:i+1], s[i+1])

        # for length seq = context_length
        i = 0
        while i < (len(s) - context_length):
            X.append(v.stoi(s[i:context_length+i]))
            y.append(v.stoi(s[i+context_length]))
            if verbose:
                print(s[i:context_length+i], s[i+context_length])
            i += 1
    return torch.tensor(X),torch.tensor(y)

For each row in the dataset, we expand all the n-gram (context, target) combinations.
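
For example, with context_length=3 the word "alex" expands into three (context, target) pairs, with <pad> filling the short prefixes:

# expected expansion of "alex" with context_length=3:
# ['<pad>', '<pad>', 'a'] -> 'l'
# ['<pad>', 'a', 'l']     -> 'e'
# ['a', 'l', 'e']         -> 'x'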

v = Vocab(data_path='../data/text/tiny_shakespeare.txt', specials=['<unk>','<pad>'])
print("vocabulary: ", v.vocabulary)
print("vocabulary size: ", len(v))
CONTEXT_LEN = 3
X, y = make_dataset(data[:80], v, verbose=True, context_length=CONTEXT_LEN)
print("X: ", X.shape, "y:", y.shape)

MLP LM Model

https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

# F.one_hot(torch.tensor(5), num_classes=n_vocab).float()@C # == C[5]

source

NNLM

 NNLM (n_vocab:int=30, n_emb:int=10, n_context:int=3, n_h:int=100)

*Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

.. note:: As per the example above, an __init__() call to the parent class must be made before assignment on the child.

:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool*
            Type   Default   Details
n_vocab     int    30        vocabulary size
n_emb       int    10        embedding dimension
n_context   int    3         context size (bigram/trigram, etc.)
n_h         int    100       hidden layer size
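
The actual NNLM implementation lives in the linked source; as a rough mental model only, a minimal Bengio-style MLP LM with the same hyperparameters might look like this sketch (not the library code):

import torch
import torch.nn as nn

class NNLMSketch(nn.Module):
    # embed each context token, concatenate, tanh hidden layer, project to vocab logits
    def __init__(self, n_vocab: int = 30, n_emb: int = 10, n_context: int = 3, n_h: int = 100):
        super().__init__()
        self.C = nn.Embedding(n_vocab, n_emb)       # token embedding table
        self.hidden = nn.Linear(n_context * n_emb, n_h)
        self.out = nn.Linear(n_h, n_vocab)

    def forward(self, x):                           # x: (B, T) token indices
        e = self.C(x)                               # (B, T, n_emb)
        h = torch.tanh(self.hidden(e.flatten(1)))   # (B, n_h)
        return self.out(h)                          # (B, n_vocab) logits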

source

NNLMConfig

 NNLMConfig (n_vocab:int=30, n_emb:int=10, n_context:int=3, n_h:int=100)
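
The config is unpacked with asdict in the usage below, so it is presumably a plain dataclass; a minimal equivalent sketch:

from dataclasses import dataclass

@dataclass
class NNLMConfigSketch:
    n_vocab: int = 30
    n_emb: int = 10
    n_context: int = 3
    n_h: int = 100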

Usage

# config model
conf = NNLMConfig(n_vocab=len(v), n_context=CONTEXT_LEN)
lm = NNLM(**asdict(conf))

# test data
bs = 25
x = torch.randint(conf.n_vocab, (bs, conf.n_context)) # (B, T) with values between 0 and n_vocab
print("X (B, T):", x.shape)

# prediction
y = lm(x)
print("Y_hat logits (B, n_vocab):", y.shape)

Train MLP LM

handmade dataset

Xtr, Ytr = make_dataset(data[:80], v, context_length=CONTEXT_LEN)
Xdev, Ydev = make_dataset(data[80:90], v, context_length=CONTEXT_LEN)
Xte, Yte = make_dataset(data[90:100], v, context_length=CONTEXT_LEN)
print("Xtr (B, T): ", Xtr.shape, "Ytr (B): ", Ytr.shape, "data:", len(data[:80]))
print("len Xtr: ", len(Xtr))
print("CONTEXT_LEN: ", CONTEXT_LEN)

Overfit on a subset of the first 80 rows

device = get_device()
device = 'cpu'
# lm.to(device)

# overfit on one big batch
optim = SGD(lm.parameters(), lr=0.01, momentum=0.9)
train_loss = []
ITER_MAX = 1000
for i in tqdm(range(ITER_MAX)):
    # (mini-batch training with the dataloader is shown further below)
    Xtr = Xtr.to(device)
    Ytr = Ytr.to(device)
    optim.zero_grad()
    logits = lm(Xtr)
    loss = F.cross_entropy(logits, Ytr)
    loss.backward()
    optim.step()
    train_loss.append(loss.item())
    if i % 250 == 0:
        print(loss.item())
plt.plot(train_loss)

Sample

# infer on CPU
lm.to('cpu')
prompt = "The country of "
sequences = lm.sample(prompt, v, max_new_tokens=250, temperature=0.6)
print(sequences)
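
The sample method is provided by the library; conceptually, temperature sampling divides the logits by the temperature before the softmax, so lower temperatures sharpen the distribution. A rough sketch (assuming a vocab with stoi/itos helpers and a prompt at least context_length characters long; not the library implementation):

import torch
import torch.nn.functional as F

@torch.no_grad()
def sample_sketch(model, v, prompt, max_new_tokens=100, temperature=1.0, context_length=3):
    idx = v.stoi(list(prompt))                       # encode prompt to token ids
    for _ in range(max_new_tokens):
        ctx = torch.tensor([idx[-context_length:]])  # (1, context_length) rolling context
        logits = model(ctx) / temperature            # scale logits by temperature
        probs = F.softmax(logits, dim=-1)            # (1, n_vocab)
        idx.append(torch.multinomial(probs, num_samples=1).item())
    return "".join(v.itos(idx))                      # decode back to characters (itos assumed)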

Dataloader

cfg = OmegaConf.load("../config/text/data/tinyshakespeare.yaml")
# use <unk> and <pad> to be consistent with the manual data preprocessing and get the same vocabulary size
v = Vocab(data_path='../data/text/tiny_shakespeare.txt', specials=['<unk>','<pad>'])
print("vocabulary: ", v.vocabulary)
print("vocabulary size: ", len(v))
print(cfg)
cfg.train_val_test_split = [0.8, 0.1, 0.1]
# by default data_path is relative to the recipe folder so need to update for nbs
cfg.data_path = "../data/text/tiny_shakespeare.txt"
cfg.context_size = CONTEXT_LEN
cfg.batch_size = 2700 # large batch to mimic the order of magnitude of the handmade dataset
cfg.random_split = False
cfg.specials=['<unk>', '<pad>']
cfg.add_sentence_tokens = False
print(cfg)
dm = instantiate(cfg)
dm.setup()
print("vocab size: ", dm.vocab_size)
# set up one large batch to overfit / test the model
Xtr, Ytr= next(iter(dm.train_dataloader()))
# target is last token in sequence
Ytr = Ytr[:, -1]
print("Xtr (B, T): ", Xtr.shape, "Ytr (B): ", Ytr.shape)
X, Y = dm.train_ds[0]
print(dm.ds.from_tokens(X), dm.ds.from_tokens(Y))
conf = NNLMConfig(n_vocab=len(v), n_context=CONTEXT_LEN)
print(len(v), CONTEXT_LEN)
lm = NNLM(**asdict(conf))
bs = 10
x = torch.randint(conf.n_vocab, (bs, conf.n_context)) # (B, T) with values between 0 and n_vocab
print("X (B, T):", x.shape)
lm(x).shape
# checking data tokens are between 0 and vocab size
print(Xtr.min(),  Xtr.max())
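
The same check can be made to fail fast with asserts (a quick sanity check, not part of the library):

# token indices must lie in [0, vocab_size)
assert 0 <= Xtr.min().item() and Xtr.max().item() < dm.vocab_size
assert 0 <= Ytr.min().item() and Ytr.max().item() < dm.vocab_size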

overfit

# device = get_device()
device = 'cpu'
lm.to(device)

# overfit on one big batch
optim = SGD(lm.parameters(), lr=0.01, momentum=0.9)
train_loss = []
ITER_MAX = 1000
for i in tqdm(range(ITER_MAX)):
    # Xtr, Ytr here come from the single dm.train_dataloader() batch above
    Xtr = Xtr.to(device)
    Ytr = Ytr.to(device)
    optim.zero_grad()
    logits = lm(Xtr)
    loss = F.cross_entropy(logits, Ytr)
    loss.backward()
    optim.step()
    train_loss.append(loss.item())
    if i % 250 == 0:
        print(loss.item())
plt.plot(train_loss)
# infer on CPU
lm.to('cpu')
prompt = "The country of "
sequences = lm.sample(prompt, v, max_new_tokens=250, temperature=0.6)
print(sequences)

Batching with dataloaders

# mini batch gradient descent with datamodule
cfg = OmegaConf.load("../config/text/data/tinyshakespeare.yaml")
cfg.train_val_test_split = [0.8, 0.1, 0.1]
cfg.data_path = "../data/text/tiny_shakespeare.txt"
cfg.context_size = CONTEXT_LEN
cfg.batch_size = 2048
cfg.random_split = False
cfg.specials=['<unk>', '<pad>']
cfg.add_sentence_tokens = False
dm = instantiate(cfg)
dm.setup()

conf = NNLMConfig(n_vocab=len(v), n_context=CONTEXT_LEN)
lm = NNLM(**asdict(conf))
optim = SGD(lm.parameters(), lr=0.01, momentum=0.9)
train_loss = []
# device = get_device()
device = 'cpu'
lm.to(device)
i = 0
EPOCHS = 1
for epoch in tqdm(range(EPOCHS)):
    print(f"epoch {epoch}")
    for batch in tqdm(dm.train_dataloader()):
        Xtr, Ytr = batch
        # target is last token in sequence
        Ytr = Ytr[:, -1] # (B, T) -> (B,)
        Xtr = Xtr.to(device)
        Ytr = Ytr.to(device)
        
        logits = lm(Xtr)
        loss = F.cross_entropy(logits, Ytr)

        optim.zero_grad()
        loss.backward()
        optim.step()

        train_loss.append(loss.item())
        if i % 1000 == 0:
            print(loss.item())
        i += 1
plt.plot(train_loss)

Sample

# infer on CPU
lm.to('cpu')
prompt = "The country of "
sequences = lm.sample(prompt, v, max_new_tokens=500, temperature=0.9)
print(sequences)

MLP LM X Model


source

NNLM_X

 NNLM_X (nnet:__main__.NNLM, num_classes:int,
         optimizer:torch.optim.optimizer.Optimizer,
         scheduler:torch.optim.lr_scheduler)

Helper class that provides a standard way to create an ABC using inheritance.
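
The docstring shown above is just inherited ABC boilerplate. NNLM_X wraps the NNLM network together with an optimizer and scheduler, Lightning-style; a minimal sketch of what such a wrapper typically looks like (the names and details are illustrative, not the library's actual implementation):

import lightning as L
import torch.nn.functional as F

class NNLMWrapperSketch(L.LightningModule):
    def __init__(self, nnet, num_classes, optimizer, scheduler=None):
        super().__init__()
        self.nnet = nnet
        self.optimizer = optimizer        # e.g. a partial over the lr, applied to the parameters later
        self.scheduler = scheduler
        self.num_classes = num_classes

    def forward(self, x):
        return self.nnet(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y = y[:, -1]                      # target = last token of the shifted sequence
        loss = F.cross_entropy(self(x), y)
        self.log("train/loss", loss)
        return loss

    def configure_optimizers(self):
        return self.optimizer(self.parameters())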

Usage

# omegaconf
cfg = OmegaConf.load("../config/text/model/nnlm.yaml")
cfg.num_classes = len(v)
print(len(v))
# have to convert omegaconf dict to dict for pprint
opt = instantiate(cfg.optimizer)
print(opt.keywords['lr'])

pprint.pprint(dict(cfg))
lm  = instantiate(cfg)
print(lm.hparams.optimizer)
n_samples = 25
x = torch.randint(conf.n_vocab, (n_samples, cfg.nnet.n_context))
print("X:", x.shape)
y = lm(x)
print("Y_hat logits:", y.shape)
# v = Vocab(data_path='../data/text/tiny_shakespeare.txt', specials=['<unk>','<pad>'])
lm.sample("The country of ", v, max_new_tokens=500, temperature=0.9)

L Training Loop

# vocab
print(len(v))

# data
cfg = OmegaConf.load("../config/text/data/tinyshakespeare.yaml")
cfg.context_size = CONTEXT_LEN
cfg.specials = ["<unk>", "<pad>"]  # keep the same specials (and order) as the manual Vocab above
cfg.batch_size = 2048
cfg.random_split = False
dm = instantiate(cfg)
dm.setup()

# model
cfg = OmegaConf.load("../config/text/model/nnlm.yaml")
lm  = instantiate(cfg)
print(lm.__dict__)

The model can easily be trained with the Lightning Trainer (cf. recipes/text/ for examples).

trainer = Trainer(
    accelerator="auto",
    max_epochs=1,
    logger=CSVLogger("logs", name="nnlm")
    )
trainer.fit(lm, dm.train_dataloader(), dm.val_dataloader())
csv_path = f"{trainer.logger.log_dir}/metrics.csv"
metrics = pd.read_csv(csv_path)
metrics.head()
plt.plot(metrics['step'], metrics['train/loss_step'],'b.-')
plt.plot(metrics['step'], metrics['val/loss'],'r.-')
plt.show()
trainer.test(lm, dm.test_dataloader())
# infer on CPU
lm.to('cpu')
prompt = "The country of "
sequences = lm.sample(prompt, v, max_new_tokens=500, temperature=0.9)
print(sequences)

Learning Rate Finder

lm.hparams
trainer = L.Trainer(
    accelerator="auto",
    max_epochs=1,
)
tuner = Tuner(trainer)
lr_finder = tuner.lr_find(
    lm,
    datamodule=dm,
    min_lr=1e-6,
    max_lr=1.0,
    num_training=100,  # number of iterations
    # attr_name="optimizer.lr",
)
fig = lr_finder.plot(suggest=True)
plt.show()
print(f"Suggested learning rate: {lr_finder.suggestion()}")

re-train with new lr

new_lr = lr_finder.suggestion()
lm.lr = new_lr
trainer = L.Trainer(
    accelerator="auto",
    max_epochs=1,
    logger=CSVLogger("logs", name="nnlm"),
)
trainer.fit(lm, dm.train_dataloader(), dm.val_dataloader())
trainer.test(lm, dm.test_dataloader())
csv_path = f"{trainer.logger.log_dir}/metrics.csv"
metrics = pd.read_csv(csv_path)
plt.plot(metrics['step'], metrics['train/loss_step'],'.-')
# plt.figure()
# plot_classifier_metrics_from_csv(csv_path)
# infer on CPU
lm.to('cpu')
prompt = "The country of "
sequences = lm.sample(prompt, v, max_new_tokens=500, temperature=0.9)
print(sequences)

NN Bigram


source

NNBigram

 NNBigram (vocab_size:int)

*Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

.. note:: As per the example above, an __init__() call to the parent class must be made before assignment on the child.

:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool*
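
Conceptually, a neural bigram model is just a vocab_size x vocab_size table of next-token logits, and generation samples one token at a time from the last position. A minimal sketch (illustrative only; see the linked source for the actual NNBigram / predict implementation):

import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramSketch(nn.Module):
    def __init__(self, vocab_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        # row i holds the logits for the token following token i
        self.table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx):                       # idx: (B, T)
        return self.table(idx)                    # (B, T, vocab_size) logits

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits = self(idx)[:, -1, :]          # only the last position predicts the next token
            probs = F.softmax(logits, dim=-1)
            nxt = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, nxt], dim=1)
        return idx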

Usage

B, T, C = 32, 8, 65
vocab_size = C
model = NNBigram(vocab_size)
print("vocab size: ",  model.vocab_size)
X = torch.randint(0,C,(B,T))
Y = torch.randint(0,C,(B,T))
batch = (X,Y)
logits = model(X) # (B, T, C)
print("X: ", X.shape, "Y: ", Y.shape, "logits: ", logits.shape)
# generate
model.predict(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0]
# #| export
# class NNBigramL(ModelModule):
#     def __init__(self, vocab_size:int, lr:float=1e-3):
#         model = NNBigram(vocab_size)
#         super().__init__(model, lr)
#         self.accuracy = Accuracy(task='multiclass', num_classes=model.vocab_size)

#     def _step(self, batch:torch.tensor, batch_idx:int):
#         x, y = batch
#         logits = self.model(x) # (B,T,C)
#         B, T, C = logits.shape
#         logits = logits.view(B*T, C)
#         y = y.view(B*T)
#         loss = self.loss(logits, y)
#         acc = self.accuracy(logits, y)
#         return loss, acc
    
#     def predict(self,idx:torch.IntTensor, max_new_tokens:int):
#         return self.model.predict(idx, max_new_tokens)
# model_pl = NNBigramL(vocab_size)
# logits = model_pl(X) # (B, T, C)
# print(logits.shape)
# model_pl.training_step(batch, 0)
# model_pl._step(batch, 0)

Data

with open('../data/text/tiny_shakespeare.txt') as f:
    text = f.read()
# dataset
block_size = 8
ds = CharDataset(data_path='../data/text/tiny_shakespeare.txt', context_length=block_size)
X,Y = ds[0]
print("x:",  ds.from_tokens(X), "\ny:", ds.from_tokens(Y))
# dataloader
dl = DataLoader(ds, batch_size=32, num_workers=0)
X, Y = next(iter(dl))
print("x:", X.shape, "\ny:", Y.shape)

Training

model = NNBigram(ds.vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
device = torch.device('cpu')
ITER_MAX = 1000
train_loss = []
for epoch in tqdm(range(ITER_MAX)):
    model.train()
    X = X.to(device) # (B,T)
    Y = Y.to(device) # (B,T)
    logits = model(X)
    B, T, C = logits.shape
    loss = criterion(logits.view(B*T, C), Y.view(B*T))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    train_loss.append(loss.item())
    if epoch % 1000 == 0:
        print(loss.item())

    model.eval()
    with torch.no_grad():
        logits = model(X).view(B*T, C)
        probs = F.softmax(logits, dim=-1)
        preds = torch.argmax(probs, dim=1)
        correct = (preds == Y.view(B*T)).sum().item()
        total = Y.numel()
        if epoch % 250 == 0:
            print(f"Epoch {epoch + 1}: Accuracy = {100 * correct / total:.2f}%")
plt.plot(train_loss)
print(ds.from_tokens(model.predict(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=500)[0].tolist()))

training from module

# %%time
# n_epochs = 1
# train_loss = []
# for epoch in range(n_epochs):
#     model_pl.model.train()
#     loss = model_pl.training_step(batch, None)
#     loss.backward()
#     optimizer.step()
#     train_loss.append(loss.item())
#     if not(epoch % 100):
#         print(loss.item())
# print(ds.from_tokens(model_pl.predict(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=500)[0].tolist()))