# Seed value handed to set_seed() below.
SEED = 42
set_seed(SEED)
Seed set to 42
Each row is a list of words (a sentence). For each row, extract the unique characters and add them to the vocabulary. Special tokens (e.g. `<pad>`, `<unk>`) are handled too.
Vocab (data_path:str|os.PathLike='../data/text/tiny_shakespeare.txt', specials=['<pad>', '<unk>', '<bos>', '<eos>'])
Initialize self. See help(type(self)) for accurate signature.
Type | Default | Details | |
data_path | str | os.PathLike | ../data/text/tiny_shakespeare.txt | path to text data file |
specials | list | ['<pad>', '<unk>', '<bos>', '<eos>'] | encode special characters |
Read the text file into a pandas DataFrame, with each line of the file as one row.
# Build the vocabulary from the text file; the four special tokens are passed explicitly via `specials`.
v = Vocab('../data/text/tiny_shakespeare.txt', specials=['<pad>', '<unk>', '<bos>', '<eos>'])
[17:27:39] INFO - Vocab: read text file
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '<bos>', '<eos>', '<pad>', '<unk>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# Map a token sequence to indices. "*" does not appear in the vocabulary listed on this page;
# the recorded round-trip output shows '<unk>' in its place.
s = v.stoi(["<bos>","h", "e", "l", "l", "o", "*", "<eos>"])
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '<bos>', '<eos>', '<pad>', '<unk>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
[22, 41, 10, 37, 37, 67, 43, 49]
['<bos>', 'h', 'e', 'l', 'l', 'o', '<unk>', '<eos>']
Cf. https://karpathy.github.io/char-rnn/ — the text is treated as one long continuous string.
CharDataset (data_path:str|os.PathLike='../data/text/tiny_shakespeare.txt', context_length:int=3, specials=['<pad>', '<unk>', '<bos>', '<eos>'], add_sentence_tokens:bool=True)
Type | Default | Details | |
data_path | str | os.PathLike | ../data/text/tiny_shakespeare.txt | path to the data file |
context_length | int | 3 | context length |
specials | list | ['<pad>', '<unk>', '<bos>', '<eos>'] | encode special characters |
add_sentence_tokens | bool | True | add special tokens to the data |
block_size = 3  # context_length

# Variant 1: all four special tokens, with sentence tokens enabled.
ds = CharDataset(data_path='../data/text/tiny_shakespeare.txt', context_length=block_size, specials=['<pad>', '<unk>', '<bos>', '<eos>'], add_sentence_tokens=True)
# just encode <unk> in case unknown characters are encountered in test set
# NOTE: this rebinds `ds`; the dataset built just above is discarded and only
# this second variant is used by the statements that follow.
ds = CharDataset(data_path='../data/text/tiny_shakespeare.txt', context_length=block_size, specials=['<unk>', '<pad>'], add_sentence_tokens=False)
print("vocab size: ", ds.vocab_size)

# Show the first two (x, y) samples; for y only the last decoded character is printed.
for i in range(2):
    x, y = ds[i]
    print("x:", x, "itos: ", ds.from_tokens(x), "\ny:", y, "itos: ", ds.from_tokens(y)[-1])
[17:33:29] INFO - CharDataset: init
[17:33:29] INFO - Vocab: read text file
[17:33:29] INFO - CharDataset: init
[17:33:29] INFO - Vocab: read text file
vocab size: 67
x: tensor([29, 18, 49]) itos: Fir
y: tensor([18, 49, 11]) itos: s
x: tensor([18, 49, 11]) itos: irs
y: tensor([49, 11, 44]) itos: t
CPU times: user 160 ms, sys: 12 ms, total: 172 ms
Wall time: 176 ms
# Fetch the first sample and print both tensors together with their decoded text.
x,y = ds[0]
print("x:", x, "itos: ", ds.from_tokens(x), "\ny:", y, "itos: ", ds.from_tokens(y))
# Report the vocabulary size and the vocabulary itself as exposed by the dataset.
print("vocab size: ", ds.vocab_size)
print("vocabulary: ", ds.vocabulary)
x: tensor([29, 18, 49]) itos: Fir
y: tensor([18, 49, 11]) itos: irs
vocab size: 67
vocabulary: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '<pad>', '<unk>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# Fractional split sizes as a float tensor; `t` is not used again within this cell.
t = len(ds)*torch.tensor((0.8, 0.1, 0.1))
# Integer split sizes: int() truncates each fractional size toward zero.
lengths = [int(p * len(ds)) for p in (0.8, 0.1, 0.1)]
# Give the truncation remainder to the last split so the sizes sum to exactly len(ds).
lengths[-1] = len(ds) - sum(lengths[:-1])
# The result of random_split is not bound to a name here.
random_split(ds, lengths)
[892315, 111539, 111540]
CharDataModule (data_path:str|os.PathLike='../data/text/tiny_shakespeare.txt', specials=['<pad>', '<unk>', '<bos>', '<eos>'], add_sentence_tokens:bool=False, train_val_test_split:Tuple[int,int,int]=(0.8, 0.1, 0.1), context_size:int=3, batch_size:int=32, num_workers:int=1, pin_memory:bool=False, persistent_workers:bool=False, random_split:bool=True)
Type | Default | Details | |
data_path | str | os.PathLike | ../data/text/tiny_shakespeare.txt | dataset |
specials | list | ['<pad>', '<unk>', '<bos>', '<eos>'] | |
add_sentence_tokens | bool | False | |
train_val_test_split | Tuple | (0.8, 0.1, 0.1) | data module |
context_size | int | 3 | |
batch_size | int | 32 | |
num_workers | int | 1 | |
pin_memory | bool | False | |
persistent_workers | bool | False | |
random_split | bool | True |
dm = CharDataModule(
specials=['<unk>', '<pad>'],
train_val_test_split = (0.8, 0.1, 0.1),
[17:46:46] INFO - CharDataModule: init
[17:46:46] INFO - CharDataModule: setup, split datasets
[17:46:46] INFO - CharDataset: init
[17:46:46] INFO - Vocab: read text file
[17:46:47] INFO - Split dataset into train/val/test. Keep sequence order.
len(dm.train_ds), len(dm.val_ds), len(dm.test_ds)
(tensor([51, 59, 59]), tensor([59, 59, 16]))
# Pull a single batch from the test dataloader and inspect its first element.
test_dl = dm.test_dataloader()
X,Y = next(iter(test_dl))
print("X (B,T): ", X.shape, "X: ", X[0], "chars: ", dm.ds.from_tokens(X[0]))
# NOTE(review): the label below says "(B)", but the recorded output shows Y with the
# same two-dimensional shape as X (torch.Size([64, 3])) — confirm the intended label.
print( "Y (B): ", Y.shape, "Y: ", Y[0], "chars: ", dm.ds.from_tokens(Y[0]))
X (B,T): torch.Size([64, 3]) X: tensor([51, 59, 59]) chars: ?
Y (B): torch.Size([64, 3]) Y: tensor([59, 59, 16]) chars:
# Load the pretrained GPT-2 tokenizer and report its size via len().
tokenizer = AutoTokenizer.from_pretrained('gpt2')
print("vocab size: ", len(tokenizer))
# `shake` is defined outside this excerpt; its 'test' split is indexed by row and each row has a 'text' field.
print("text row 0: ", shake['test'][0]['text'])
tokens = tokenizer.tokenize(shake['test'][0]['text'])
print("tokens of row 0: ", tokens)
context_length = 10
# Despite the name `padded`, no padding argument is passed here: the call requests
# truncation to max_length, the overflowing tokens, and the lengths.
padded = tokenizer(shake['test'][0]['text'], max_length=context_length, truncation=True, return_length=True, return_overflowing_tokens=True)
print("context block & padding for lm: ", padded)
# print(padded.keys())
print('decode single input_id: ', tokenizer.decode(849))
# Decode every entry of padded['input_ids'] back to text.
print([tokenizer.decode(x) for x in padded['input_ids']])
cfg = {
"context_length": 10,
"truncation": True,
"return_length": True,
"return_overflowing_tokens": True,
cfg = OmegaConf.create(cfg)
# tokenizer function called via dataset map
def tokenize_function(examples:List[dict[str,str]], cfg:OmegaConf=cfg) -> dict[str, List[List[int]]]:
result = tokenizer(
if tokenizer.is_fast:
result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
return result
# Reuse the end-of-sequence token as the padding token on this tokenizer instance.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False selects the non-masked language-modeling mode of the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
# Collate the 'input_ids' of the first five rows of the tokenized test split
# (`shake_toked` is defined outside this excerpt).
out = data_collator([shake_toked["test"]['input_ids'][i] for i in range(5)])
# out = default_data_collator(shake_toked['test']['input_ids'][0])
# for key in out:
# print(f"{key} shape: {out[key].shape}")
# print('inputs: ', out['input_ids'])
# print('labels: ', out['labels'])
# data_collator = DefaultDataCollator(tokenizer)
# out = data_collator([shake_toked["test"][i] for i in range(5)])
# print(out)
def my_collate(examples, block_size: int, **kwargs):
    """Concatenate each column of a batch and re-cut it into fixed-size blocks.

    Args:
        examples: mapping from column name (must include ``"input_ids"``) to a
            list of per-example lists.
        block_size: length of every output chunk.
        **kwargs: accepted but not used.

    Returns:
        A dict with the same keys as ``examples``, each mapped to a list of
        ``block_size``-long chunks, plus a ``"labels"`` entry that is a shallow
        copy of the ``"input_ids"`` chunk list.

    Raises:
        IndexError: if ``examples`` has no columns.
        KeyError: if there is no ``"input_ids"`` column.
    """
    from itertools import chain

    # Concatenate all texts. chain.from_iterable is linear in the total length,
    # whereas sum(list_of_lists, []) re-copies the accumulator at every step.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # The total length is measured on the first column.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
def batchify(data: torch.Tensor, bsz: int) -> torch.Tensor:
    """Divides the data into ``bsz`` separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Arguments:
        data: Tensor, shape ``[N]``
        bsz: int, batch size

    Returns:
        Tensor of shape ``[N // bsz, bsz]``

    Raises:
        ValueError: if ``bsz`` is not a positive integer.
    """
    if bsz <= 0:
        # Fail with a clear message instead of a ZeroDivisionError / view error below.
        raise ValueError(f"bsz must be a positive integer, got {bsz}")
    seq_len = data.size(0) // bsz
    # Drop the trailing elements that do not fill a complete column.
    data = data[:seq_len * bsz]
    # Lay the stream out as bsz rows, then transpose so each column is one contiguous sequence.
    data = data.view(bsz, seq_len).t().contiguous()
    return data
bptt = 35  # maximum number of time steps taken per batch by get_batch


def get_batch(source: torch.Tensor, i: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Slice one (data, target) pair out of a batchified tensor.

    Args:
        source: Tensor, shape ``[full_seq_len, batch_size]``
        i: int, index of the first row of the window

    Returns:
        tuple (data, target), where both have shape ``[seq_len, batch_size]``.
        ``target`` is the same window as ``data`` moved one row ahead; it is
        NOT flattened here, so reshape it at the call site if a 1-D target
        is needed.
    """
    # Cap the window at bptt rows, leaving one row of look-ahead for the target.
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len]
    return data, target
# tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
# vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# # vocab.set_default_index(vocab['<unk>'])
# tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}
# tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'],
# fn_kwargs={'tokenizer': tokenizer})
# print(tokenized_dataset['train'][88]['tokens'])
# directly without intermediary steps
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
text = ["this is a text.", "or so it seems"]
# Truncate each text to max_length tokens and also return the overflowing tokens;
# no padding argument is passed to the tokenizer here.
padded = tokenizer(text, max_length=4, truncation=True, return_length=True, return_overflowing_tokens=True)
print([tokenizer.decode(x) for x in padded['input_ids']])
# mlm=False selects the non-masked language-modeling mode of the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = data_collator(padded['input_ids'])
# Report the shape of every entry the collator produced.
for key in out:
    print(f"{key} shape: {out[key].shape}")
print(out['input_ids'], out['labels'])
# Shifting the inputs and labels to align them happens inside the model, so the data collator just copies the inputs to create the labels.
Concatenate all data into one large string of text and then split it into context-length chunks - https://towardsdatascience.com/language-modeling-with-lstms-in-pytorch-381a26badcbf - https://www.youtube.com/watch?v=ma1TrR7gE7I&t=273s
def get_data(dataset, vocab, batch_size):
    """Flatten a tokenized dataset into one id stream and reshape it into rows.

    Args:
        dataset: iterable of examples, each a mapping with a ``'tokens'`` list.
            Examples whose token list is empty are skipped.
        vocab: object indexable by token string (``vocab[token]``) that yields
            an integer id; it must also contain ``'<eos>'``.
        batch_size: number of rows in the returned tensor.

    Returns:
        LongTensor of shape ``[batch_size, num_batches]`` where
        ``num_batches = total_ids // batch_size``; trailing ids that do not
        fill a complete column are dropped.
    """
    data = []
    for example in dataset:
        if example['tokens']:
            # Build a new list rather than calling .append() on the example:
            # append() returns None and would mutate the caller's data in place.
            tokens = [*example['tokens'], '<eos>']
            # Accumulate the ids; without this step the stream stays empty.
            data.extend(vocab[token] for token in tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches)
    return data
Basically, concatenate all data into one big array of ids, then cut it into inputs of block_size length. Shift the inputs to obtain the corresponding labels.
def tokenize_function(examples: dict[str, List[str]]) -> dict[str, List[List[int]]]:
    """Tokenize the ``"text"`` column of a batch with the module-level ``tokenizer``.

    Args:
        examples: mapping of column name to a list of values; only the
            ``"text"`` entry is read.

    Returns:
        The tokenizer's output for ``examples["text"]``. When the tokenizer is
        a fast tokenizer, a ``"word_ids"`` entry is added holding
        ``result.word_ids(i)`` for every row ``i`` of ``"input_ids"``.
    """
    result = tokenizer(examples["text"])  # , max_length=context_length, truncation=True, return_length=True, return_overflowing_tokens=True)
    if tokenizer.is_fast:
        # word_ids() is only called when is_fast is true.
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result