# ---
# skip_exec: true
# skip_showdoc: true
# ---
Text tokenizers
# torchtext
import torchtext
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import AG_NEWS
# supporting imports; assumption: @dispatch comes from plum (pip install plum-dispatch)
from pathlib import Path
from typing import List, Iterable
from plum import dispatch
Char-based Tokenizer
CharTokenizer
CharTokenizer (vocabulary:List[str])
Initialize self. See help(type(self)) for accurate signature.
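CharTokenizer comes from the library this page documents, so its source is not shown here. As a rough sketch only, an implementation consistent with the signature above and the usage below (from_text, ctoi, itoc, encode, and decode are taken from that usage; everything else is an assumption) might look like:
import torch
from typing import List

class CharTokenizer:
    "Character-level tokenizer built from a fixed character vocabulary"
    def __init__(self, vocabulary: List[str]):
        self.itoc = {i: c for i, c in enumerate(vocabulary)}  # index -> char
        self.ctoi = {c: i for i, c in enumerate(vocabulary)}  # char -> index
    @classmethod
    def from_text(cls, text: str):
        # vocabulary = sorted set of unique characters in the corpus
        return cls(sorted(set(text)))
    def __len__(self):
        return len(self.ctoi)
    def encode(self, text: str) -> torch.Tensor:
        return torch.tensor([self.ctoi[c] for c in text], dtype=torch.long)
    def decode(self, indices) -> str:
        return ''.join(self.itoc[int(i)] for i in indices)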
text = Path('../data/text/tiny_shakespeare.txt').read_text()
print(text[:25])

tokenizer = CharTokenizer.from_text(text)
print(len(tokenizer))
print(tokenizer.ctoi['a'])
print(tokenizer.itoc[39])

enc = tokenizer.encode("this is swell")
print(enc)
print(tokenizer.decode(enc))
First Citizen:
Before we
65
39
a
tensor([58, 46, 47, 57, 1, 47, 57, 1, 57, 61, 43, 50, 50])
this is swell
Torchtext
https://pytorch.org/text/0.16.0/data_utils.html
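The Tokenizer class below wraps torchtext's get_tokenizer, which turns a backend name (and, for the spacy backend, a language model name) into a callable that splits a string into tokens. A quick standalone illustration of that underlying API:
from torchtext.data.utils import get_tokenizer

basic = get_tokenizer('basic_english')   # rule-based: lowercases and splits off punctuation
print(basic("Oh, yeah I'm not sure..."))
# spacy backend requires the model: python -m spacy download en_core_web_sm
spacy_tok = get_tokenizer('spacy', language='en_core_web_sm')
print(spacy_tok("Oh, yeah I'm not sure..."))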
class Tokenizer:
    def __init__(self,
                 backend:str='spacy',   # backend tokenizer, defaults to spacy
                 language:str='en',     # language on which tokenization is applied
                 bos:bool=False,        # add beginning of sentence tag <bos>
                 eos:bool=False,        # add end of sentence tag <eos>
                 ):
        if backend == 'spacy' and language == 'en':
            language = 'en_core_web_sm'
        if backend == 'character_based':
            self.tokenizer = self.character_tokenizer
        else:
            self.tokenizer = get_tokenizer(backend, language=language)
        self.bos = bos
        self.eos = eos
        self.backend = backend
        print(f"# Tokenizer uses {self.backend} backend")

    @staticmethod
    def character_tokenizer(text:str)->List[str]:
        return [c for c in text]

    @dispatch
    def __call__(self, text:str)->List[str]:
        res = self.tokenizer(text)
        if self.bos:
            res = ['<bos>'] + res
        if self.eos:
            res = res + ['<eos>']
        return res

    @dispatch
    def __call__(self, texts:List[str])->List[List[str]]:
        return [self(text) for text in texts]

    @dispatch  # TODO: replace the broad Iterable annotation with a more precise type
    # works with agnews-type datasets [(index, text)]
    def __call__(self, data_iter:Iterable)->Iterable:
        for _, text in data_iter:
            yield self(text)

    @dispatch
    def inverse(self, tokens:List[str])->str:
        if self.backend == 'character_based':
            return ''.join(tokens)
        else:
            # TODO: take care of white spaces
            return ' '.join(tokens)

    @dispatch
    def inverse(self, list_of_tokens:List[List[str]])->List[str]:
        s = []
        for tokens in list_of_tokens:
            s.append(self.inverse(tokens))
        return s
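The overloaded __call__ and inverse methods rely on multiple dispatch. Assuming @dispatch is plum's (as in the imports above), a stripped-down illustration of the pattern:
from typing import List
from plum import dispatch

class Shout:
    @dispatch
    def __call__(self, x: str) -> str:
        return x.upper()                  # chosen when called with a single string
    @dispatch
    def __call__(self, xs: List[str]) -> List[str]:
        return [self(x) for x in xs]      # chosen when called with a list of strings

shout = Shout()
print(shout("hi"))          # HI
print(shout(["a", "b"]))    # ['A', 'B']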
Usage
String
tok = Tokenizer(backend='character_based', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))

tok = Tokenizer(backend='spacy', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))

tok = Tokenizer(backend='basic_english', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))
# Tokenizer uses character_based backend
original sentence: Oh, yeah I'm not sure...
tokenized: ['<bos>', 'O', 'h', ',', ' ', 'y', 'e', 'a', 'h', ' ', 'I', "'", 'm', ' ', 'n', 'o', 't', ' ', 's', 'u', 'r', 'e', '.', '.', '.', '<eos>']
un-tokenized: <bos>Oh, yeah I'm not sure...<eos>
List of strings
# List[str] -> List[List[str]]
s = ["Oh, yeah I don't know dude...", "this is a test"]
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))
Iterable
# Iterable -> Iterable
tok = Tokenizer()
ds = AG_NEWS(split='test')   # data pipe
sample = next(iter(ds))      # (label, text)
print(sample)
it = tok(ds)
tokens = [token for token in it]
print(len(tokens))
print(tokens[:2])
Numericalizer
https://pytorch.org/text/stable/vocab.html
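The class below wraps torchtext's build_vocab_from_iterator, which builds the token-to-index mapping from an iterator over lists of tokens; set_default_index decides which index unknown tokens map to. A minimal standalone illustration:
from torchtext.vocab import build_vocab_from_iterator

v = build_vocab_from_iterator([["a", "b"], ["b", "c"]], specials=["<unk>"])
v.set_default_index(v["<unk>"])   # out-of-vocabulary tokens map to <unk>
print(v["b"], v["zzz"])           # index of "b", default index for OOV
print(v.get_itos())               # index -> token list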
# TODO: add more special characters
class Numericalizer():
    def __init__(self, tokens_iter:Iterable, specials=["<pad>", "<unk>", "<bos>", "<eos>"]):
        self._vocab = self.build_map_from_iter(tokens_iter, specials)

    def build_map_from_iter(self, data_iter:Iterable, specials=None):
        self._vocab = torchtext.vocab.build_vocab_from_iterator(data_iter, specials=specials)
        if "<unk>" in specials:
            self._vocab.set_default_index(self._vocab["<unk>"])
        return self._vocab

    @dispatch
    def __call__(self, texts:List[str])->List[int]:
        # TODO: check self._vocab has been built
        return [self._vocab[text] for text in texts]

    @dispatch
    def __call__(self, texts:List[List[str]])->List[List[int]]:
        # TODO: use a nested list comprehension
        res = []
        for row in texts:
            res.append([self._vocab[text] for text in row])
        return res

    @dispatch
    def __call__(self, text:str)->int:
        return self._vocab[text]

    @property
    def vocab(self):
        return self._vocab

    @dispatch
    def inverse(self, idx:int)->str:
        return self._vocab.get_itos()[idx]

    @dispatch
    def inverse(self, indices:List[int])->List[str]:
        return [self._vocab.get_itos()[i] for i in indices]
Usage
tok = Tokenizer()
# In the case of agnews, the dataset is: [(index, text)]
def token_iterator(data_iter:Iterable)->Iterable:
    for _, text in data_iter:
        yield tok(text)

tok_it = token_iterator(ds)
# initialize the numericalizer from the token iterator
num = Numericalizer(tok_it)
print(num('<pad>'), num('<unk>'))
print(num.vocab['the'])
print(num('the'))
print(num(['<bos>', '<pad>', '<unk>', 'a', 'this', 'the', 'lkjsdf']))
print(num.inverse(0))
print(num.inverse([6, 55]))
print(num([['<bos>', '<pad>'], ['<unk>', 'a', 'this', 'the', 'lkjsdf']]))

tokens = tok(["here we go. asdflkj", "it was time..."])
print(tokens)
print([num(tok) for tok in tokens])
print(num(tokens))
Hugging Face tokenizers
https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
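The reverse direction uses the same tokenizer object: convert_ids_to_tokens recovers the byte-pair pieces, and decode merges them back into a plain string. A short follow-up (the printed values depend on the gpt2 vocabulary):
# ids -> token pieces, then ids -> string
print(tokenizer.convert_ids_to_tokens(ids))
print(tokenizer.decode(ids))
# the usual one-step path: __call__ returns input_ids plus attention_mask
print(tokenizer(sequence))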