Text datasets

SEED = 42
set_seed(SEED)
Seed set to 42

Vocab

Each row is a list of words (a sentence). For each row, extract the unique characters and add them to the vocabulary. Special tokens are handled as well.
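A rough sketch of the idea (not the actual Vocab implementation): collect the unique characters across rows, add the special tokens, and build the string-to-index mapping.

# sketch only: unique characters across rows plus special tokens
rows = ["First Citizen:", "Before we proceed any further, hear me speak."]
specials = ['<pad>', '<unk>', '<bos>', '<eos>']
vocabulary = sorted(set("".join(rows)) | set(specials))  # exact ordering may differ in Vocab
stoi = {ch: i for i, ch in enumerate(vocabulary)}
itos = {i: ch for ch, i in stoi.items()}
print(stoi['e'], itos[stoi['e']])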


source

Vocab

 Vocab (data_path:str|os.PathLike='../data/text/tiny_shakespeare.txt',
        specials=['<pad>', '<unk>', '<bos>', '<eos>'])

Initialize self. See help(type(self)) for accurate signature.

Type Default Details
data_path str | os.PathLike ../data/text/tiny_shakespeare.txt path to text data file
specials list ['<pad>', '<unk>', '<bos>', '<eos>'] encode special characters

Usage

Read the text file into a pandas DataFrame with each line as a new row.
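A rough sketch of that read step, assuming pandas (not the actual Vocab code):

import pandas as pd

# sketch: one line of the text file per DataFrame row
with open('../data/text/tiny_shakespeare.txt') as f:
    lines = f.read().splitlines()
df = pd.DataFrame({'text': lines})
print(df.head())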

v = Vocab('../data/text/tiny_shakespeare.txt', specials=['<pad>', '<unk>', '<bos>', '<eos>'])
print(v.vocabulary)
[17:27:39] INFO - Vocab: read text file
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '<bos>', '<eos>', '<pad>', '<unk>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# egs where token * is not in vocab
print(v.stoi('*'))
print(v.itos(61))
43

print(v.vocabulary)
s = v.stoi(["<bos>","h", "e", "l", "l", "o", "*", "<eos>"])
print(s)
print(v.itos(s))
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '<bos>', '<eos>', '<pad>', '<unk>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
[22, 41, 10, 37, 37, 67, 43, 49]
['<bos>', 'h', 'e', 'l', 'l', 'o', '<unk>', '<eos>']

Tiny Shakespeare

Char Dataset

Cf. https://karpathy.github.io/char-rnn/. The text is treated as one long continuous string.
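Each sample is a context_length-wide window over that string: x is a block of characters and y is the same block shifted right by one character. A rough sketch of the indexing (not the actual CharDataset code):

# sketch of the sliding-window indexing
text = "First Citizen: Before we proceed"
context_length = 3
i = 0
x = text[i : i + context_length]          # 'Fir'
y = text[i + 1 : i + 1 + context_length]  # 'irs' (shifted right by one character)
print(x, y)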


source

CharDataset

 CharDataset
              (data_path:str|os.PathLike='../data/text/tiny_shakespeare.tx
              t', context_length:int=3, specials=['<pad>', '<unk>',
              '<bos>', '<eos>'], add_sentence_tokens:bool=True)

*An abstract class representing a Dataset.

All datasets that represent a map from keys to data samples should subclass it. All subclasses should overwrite __getitem__, supporting fetching a data sample for a given key. Subclasses could also optionally overwrite __len__, which is expected to return the size of the dataset by many Sampler implementations and the default options of DataLoader. Subclasses could also optionally implement __getitems__, to speed up batched sample loading; this method accepts a list of batch sample indices and returns a list of samples.

Note: DataLoader by default constructs an index sampler that yields integral indices. To make it work with a map-style dataset with non-integral indices/keys, a custom sampler must be provided.*
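As a reminder of that contract, a minimal map-style dataset only needs __len__ and __getitem__ (a generic sketch, unrelated to the CharDataset internals):

import torch
from torch.utils.data import Dataset

class ToyCharDataset(Dataset):
    # minimal map-style dataset: __len__ and __getitem__ are all that is required
    def __init__(self, text: str, context_length: int = 3):
        self.ids = [ord(c) for c in text]  # toy numericalization
        self.context_length = context_length
    def __len__(self):
        return len(self.ids) - self.context_length
    def __getitem__(self, idx):
        x = torch.tensor(self.ids[idx : idx + self.context_length])
        y = torch.tensor(self.ids[idx + 1 : idx + 1 + self.context_length])
        return x, y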

Type Default Details
data_path str | os.PathLike ../data/text/tiny_shakespeare.txt path to the data file
context_length int 3 context length
specials list ['<pad>', '<unk>', '<bos>', '<eos>'] encode special characters
add_sentence_tokens bool True add special tokens to the data

Usage

block_size = 3 #context_length
ds = CharDataset(data_path='../data/text/tiny_shakespeare.txt', context_length=block_size, specials=['<pad>', '<unk>', '<bos>', '<eos>'], add_sentence_tokens=True)
# just encode <unk> in case unknown characters are encountered in test set
ds = CharDataset(data_path='../data/text/tiny_shakespeare.txt', context_length=block_size, specials=['<unk>', '<pad>'], add_sentence_tokens=False)
print("vocab size: ", ds.vocab_size)
print(len(ds))
for i in range(2):
    x, y = ds[i]
    print("x:", x,  "itos: ", ds.from_tokens(x), "\ny:", y, "itos: ", ds.from_tokens(y)[-1])
[17:33:29] INFO - CharDataset: init
[17:33:29] INFO - Vocab: read text file
[17:33:29] INFO - CharDataset: init
[17:33:29] INFO - Vocab: read text file
vocab size:  67
1115394
x: tensor([29, 18, 49]) itos:  Fir 
y: tensor([18, 49, 11]) itos:  s
x: tensor([18, 49, 11]) itos:  irs 
y: tensor([49, 11, 44]) itos:  t
CPU times: user 160 ms, sys: 12 ms, total: 172 ms
Wall time: 176 ms
x,y = ds[0]
print("x:", x,  "itos: ", ds.from_tokens(x), "\ny:", y, "itos: ", ds.from_tokens(y))
print("vocab size: ", ds.vocab_size)
print("vocabulary: ", ds.vocabulary)
x: tensor([29, 18, 49]) itos:  Fir 
y: tensor([18, 49, 11]) itos:  irs
vocab size:  67
vocabulary:  ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '<pad>', '<unk>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
print(len(ds))
t = len(ds)*torch.tensor((0.8, 0.1, 0.1))
lengths = [int(p * len(ds)) for p in (0.8, 0.1, 0.1)]
lengths[-1] = len(ds) - sum(lengths[:-1])
print(lengths)

random_split(ds, lengths)
1115394
[892315, 111539, 111540]
[<torch.utils.data.dataset.Subset>,
 <torch.utils.data.dataset.Subset>,
 <torch.utils.data.dataset.Subset>]

Char Data Module


source

CharDataModule

 CharDataModule
                 (data_path:str|os.PathLike='../data/text/tiny_shakespeare
                 .txt', specials=['<pad>', '<unk>', '<bos>', '<eos>'],
                 add_sentence_tokens:bool=False,
                 train_val_test_split:Tuple[int,int,int]=(0.8, 0.1, 0.1),
                 context_size:int=3, batch_size:int=32, num_workers:int=1,
                 pin_memory:bool=False, persistent_workers:bool=False,
                 random_split:bool=True)

Helper class that provides a standard way to create an ABC using inheritance.

Type Default Details
data_path str | os.PathLike ../data/text/tiny_shakespeare.txt dataset
specials list ['<pad>', '<unk>', '<bos>', '<eos>']
add_sentence_tokens bool False
train_val_test_split Tuple (0.8, 0.1, 0.1) data module
context_size int 3
batch_size int 32
num_workers int 1
pin_memory bool False
persistent_workers bool False
random_split bool True

Usage

dm = CharDataModule(
    data_path="../data/text/tiny_shakespeare.txt",
    add_sentence_tokens=False,
    specials=['<unk>', '<pad>'],
    context_size=3,
    train_val_test_split = (0.8, 0.1, 0.1),
    random_split=False,
    batch_size=64,
    num_workers=0,
    pin_memory=False,
    persistent_workers=False,
    )
dm.prepare_data()
dm.setup()
[17:46:46] INFO - CharDataModule: init
[17:46:46] INFO - CharDataModule: setup, split datasets
[17:46:46] INFO - CharDataset: init
[17:46:46] INFO - Vocab: read text file
[17:46:47] INFO - Split dataset into train/val/test. Keep sequence order.
print(len(dm.ds))
1115394
X, Y = dm.train_ds[0]
print(dm.ds.from_tokens(X), dm.ds.from_tokens(Y), dm.vocab_size)
Fir irs 67
len(dm.train_ds), len(dm.val_ds), len(dm.test_ds)
print(dm.test_ds[0])
print(len(dm.test_dataloader()))
(tensor([51, 59, 59]), tensor([59, 59, 16]))
1743
test_dl = dm.test_dataloader()
X,Y = next(iter(test_dl))
print("X (B,T): ", X.shape, "X: ", X[0], "chars: ", dm.ds.from_tokens(X[0]))
print( "Y (B): ", Y.shape, "Y: ", Y[0], "chars: ", dm.ds.from_tokens(Y[0]))
X (B,T):  torch.Size([64, 3]) X:  tensor([51, 59, 59]) chars:  ?


Y (B):  torch.Size([64, 3]) Y:  tensor([59, 59, 16]) chars:  

G

Init from config file

cfg = OmegaConf.load('../config/text/data/tinyshakespeare.yaml')
print(cfg)
dm = instantiate(cfg)
dm.setup()
test_dl = dm.test_dataloader()
X,Y = next(iter(test_dl))
print("X (B,T): ", X.shape, "X: ", X[0], "chars: ", dm.ds.from_tokens(X[0]))
print( "Y (B): ", Y.shape, "Y: ", Y[0], "chars: ", dm.ds.from_tokens(Y[0]))

Hugging Face

https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt

Load text file

dataset = load_dataset("text", data_files="../data/text/tiny_shakespeare.txt") #, split=['train','dev','test'])
print(dataset)
full = dataset['train']
train_test = full.train_test_split(train_size=0.8)
test_valid = train_test['test'].train_test_split(train_size=0.5)
shake = DatasetDict({
    'train': train_test['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
print(shake)
shake['test'][0]

Tokenization / Numericalization

Tokenize single element
tokenizer = AutoTokenizer.from_pretrained('gpt2')

print("vocab size: ", len(tokenizer))
print("text row 0: ", shake['test'][0]['text'])
tokens = tokenizer.tokenize(shake['test'][0]['text'])
print("tokens of row 0: ", tokens)

context_length = 10
padded = tokenizer(shake['test'][0]['text'], max_length=context_length, truncation=True, return_length=True, return_overflowing_tokens=True)
print("context block & padding for lm: ", padded)
# print(padded.keys())
print('decode single input_id: ', tokenizer.decode(849))
print([tokenizer.decode(x) for x in padded['input_ids']])
Tokenize whole dataset using map
from omegaconf import OmegaConf
cfg = {
    "context_length": 10,
    "truncation": True,
    "return_length": True,
    "return_overflowing_tokens": True,
}

cfg = OmegaConf.create(cfg)

# tokenizer function called via dataset map
def tokenize_function(examples:List[dict[str,str]], cfg:OmegaConf=cfg) -> dict[str, List[List[int]]]:
    result = tokenizer(
        examples["text"],
        max_length=cfg.context_length,
        truncation=cfg.truncation,
        return_length=cfg.return_length,
        return_overflowing_tokens=cfg.return_overflowing_tokens
        )
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result
tokenize_function(shake['test'][0])
shake_toked = shake.map(
    tokenize_function, batched=True,
    remove_columns=["text"],
    num_proc = 1
)
print(shake_toked['test'][0])
print([tokenizer.decode(x) for x in shake_toked['test'][0]['input_ids']])
print(tokenizer.decode(shake_toked['test'][0]['input_ids']))
for split, dset in shake_toked.items():
    print(split, dset)
    arr_len = np.sum(dset['length'], dtype=np.uint64)

Data Collator

print(tokenizer.pad_token, tokenizer.eos_token)
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = data_collator([shake_toked["test"]['input_ids'][i] for i in range(5)])
# out = default_data_collator(shake_toked['test']['input_ids'][0])
print(out)
# for key in out:
#     print(f"{key} shape: {out[key].shape}")
# print('inputs: ', out['input_ids'])
# print('labels: ', out['labels'])

# data_collator = DefaultDataCollator(tokenizer)
# out = data_collator([shake_toked["test"][i] for i in range(5)])
# print(out)
def my_collate(examples, block_size: int, **kwargs):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could pad instead if the model supported it.
    # Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
example = shake['test'][0]
# concatenated_examples = {k: sum(example[k], []) for k in example.keys()}
print([example[k] for k in example.keys()])
out = data_collator([shake_toked['test']['input_ids'][i] for i in range(4)])
print(out)
for i in range(4):
    print([tokenizer.decode(x) for x in out['input_ids'][i]])

Dataloader

test_dl = DataLoader(
    shake_toked['test']['input_ids'],
    batch_size=128,
    collate_fn=data_collator,
    num_workers=0,
)
#!head ../data/text/tiny_shakespeare.txt
b = next(iter(test_dl))
print(b['input_ids'].shape)
print(b['input_ids'][1])
print(b['labels'][1])
# for i in range(128):
#     print([tokenizer.decode(x) for x in b['input_ids'][i]])

Wikitext-2

Data source from Hugging Face

dataset = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
print(len(dataset), type(dataset), dataset)
print(dataset[100])
dataset[100]

Data source from torchtext

https://pytorch.org/tutorials/beginner/transformer_tutorial.html

# train_iter = WikiText2(root='../data/text', split='test')
# tokenizer = get_tokenizer('basic_english')
# vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# vocab.set_default_index(vocab['<unk>'])
# len(vocab)
# vocab['the']
# vocab(tokenizer('this is a test'))
# # concatenate all sentences together
def data_process(raw_text_iter) -> torch.Tensor:
    """Converts raw text into a flat Tensor."""
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
# for idx, i in enumerate(train_iter):
#     print(idx, i)
#     print(vocab(tokenizer(i)))
# train_iter, val_iter, test_iter = WikiText2()
# train_data = data_process(train_iter)
# val_data = data_process(val_iter)
# test_data = data_process(test_iter)
def batchify(data: torch.Tensor, bsz: int) -> torch.Tensor:
    """Divides the data into ``bsz`` separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Arguments:
        data: Tensor, shape ``[N]``
        bsz: int, batch size

    Returns:
        Tensor of shape ``[N // bsz, bsz]``
    """
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data
# test_data = batchify(test_data, 10)
# print(test_data)
bptt = 35
def get_batch(source: torch.Tensor, i: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Args:
        source: Tensor, shape ``[full_seq_len, batch_size]``
        i: int

    Returns:
        tuple (data, target), where data has shape ``[seq_len, batch_size]`` and
        target has shape ``[seq_len * batch_size]``
    """
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len]
    return data, target
# x, y = get_batch(test_data, 0)
# print("x: ", x[:2])
# print("y: ", y[:2])

Word-based tokenization

torchtext tokenizer

# tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
# vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# # vocab.set_default_index(vocab['<unk>'])
# tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}  
# tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], 
# fn_kwargs={'tokenizer': tokenizer})
# print(tokenized_dataset['train'][88]['tokens'])

Hugging Face tokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokens = tokenizer.tokenize(dataset[100]['text'])
print(tokens)

Numericalization

torchtext

# vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], 
# min_freq=3) 
# vocab.insert_token('<unk>', 0)           
# vocab.insert_token('<eos>', 1)            
# vocab.set_default_index(vocab['<unk>'])   
# print(len(vocab))                         
# print(vocab.get_itos()[:10])

Hugging Face

ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
print(tokenizer.decode(ids))

Hugging Face for LM without intermediary steps

https://huggingface.co/course/chapter7/6?fw=pt

# directly without intermediary steps
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
text = ["this is a text.", "or so it seems"]
padded = tokenizer(text, max_length=4, truncation=True, return_length=True, return_overflowing_tokens=True)

print(padded)
print(padded.keys())
print([tokenizer.decode(x) for x in padded['input_ids']])
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
out = data_collator(padded['input_ids'])
for key in out:
    print(f"{key} shape: {out[key].shape}")

print(out['input_ids'], out['labels'])
# Shifting the inputs and labels to align them happens inside the model, so the data collator just copies the inputs to create the labels.

Data loader

Concatenate all data into one large string of text and then chunk it into context-length blocks. See https://towardsdatascience.com/language-modeling-with-lstms-in-pytorch-381a26badcbf and https://www.youtube.com/watch?v=ma1TrR7gE7I&t=273s

def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            example['tokens'].append('<eos>')                       # mark end of example (append is in-place)
            tokens = [vocab[token] for token in example['tokens']]  # numericalize
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]                          # drop the remainder
    data = data.view(batch_size, num_batches)
    return data
# batch_size = 1024
# train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
# valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
# test_data = get_data(tokenized_dataset['test'], vocab, batch_size)
# print(tokenized_dataset.shape)
# print(tokenized_dataset['train']['tokens'][88])

Language modeling dataset

Basically, concatenate all the data into one big array of ids, slice it into block-length inputs, and shift by one position for the corresponding labels.
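A tiny worked example of that idea, with toy ids and a block length of 4:

import numpy as np

ids = np.arange(12)          # stand-in for the concatenated token ids
block_size = 4
x = ids[0 : block_size]      # [0 1 2 3]   input block
y = ids[1 : block_size + 1]  # [1 2 3 4]   labels: shifted right by one position
print(x, y)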

print(shake)

Tokenization

tokenizer = AutoTokenizer.from_pretrained('gpt2')
def tokenize_function(examples:List[dict[str,str]]) -> dict[str, List[List[int]]]:
    result = tokenizer(examples["text"]) #, max_length=context_length, truncation=True, return_length=True, return_overflowing_tokens=True)
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result
tokenized = shake.map(
    tokenize_function, batched=True,
    remove_columns=["text"],
    num_proc = 1
)
print(tokenized['train'], type(tokenized['train']))

Sentences concatenation

all_ids = []  # avoid shadowing the built-in `all`
for k, v in tokenized.items():
    print(k, v)
    for x in v['input_ids']:
        all_ids += x
    print(len(all_ids))
print(all_ids[:15])

Batchify

def get_batch(data, batch_size, block_size):
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    return x, y
x, y = get_batch(np.array(all_ids), 16, 10)
print(x.shape, y.shape)
print(x[0], y[0])
print(tokenizer.decode(x[0]), tokenizer.decode(y[0]))