# ---
# skip_exec: true
# skip_showdoc: true
# ---
Text tokenizers
# torchtext
from pathlib import Path
from typing import Iterable, List

from plum import dispatch  # multiple-dispatch decorator used below (assumed backend)
import torchtext
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import AG_NEWS
Char-based Tokenizer
CharTokenizer
CharTokenizer (vocabulary:List[str])
Character-level tokenizer built from a list of characters (the vocabulary).
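The class body is not shown above, so here is a minimal sketch consistent with the calls that follow; the sorted-character vocabulary and the tensor return type are inferred from the printed outputs below, not taken from the source:

```python
import torch
from typing import List

class CharTokenizer:
    "Character-level tokenizer built from a fixed character vocabulary."
    def __init__(self, vocabulary: List[str]):
        self.itoc = dict(enumerate(vocabulary))           # index -> char
        self.ctoi = {c: i for i, c in self.itoc.items()}  # char -> index

    @classmethod
    def from_text(cls, text: str):
        "Build the vocabulary from the sorted set of characters in `text`."
        return cls(sorted(set(text)))

    def __len__(self):
        return len(self.ctoi)

    def encode(self, text: str) -> torch.Tensor:
        return torch.tensor([self.ctoi[c] for c in text])

    def decode(self, indices) -> str:
        return ''.join(self.itoc[int(i)] for i in indices)
```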
text = Path('../data/text/tiny_shakespeare.txt').read_text()
print(text[:25])
tokenizer = CharTokenizer.from_text(text)
print(len(tokenizer))
print(tokenizer.ctoi['a'])
print(tokenizer.itoc[39])
enc = tokenizer.encode("this is swell")
print(enc)
print(tokenizer.decode(enc))
First Citizen:
Before we
65
39
a
tensor([58, 46, 47, 57, 1, 47, 57, 1, 57, 61, 43, 50, 50])
this is swell
Torchtext
https://pytorch.org/text/0.16.0/data_utils.html
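As a quick standalone check of the torchtext API the wrapper class below relies on, `get_tokenizer` returns a callable that maps a string to a list of tokens:

```python
from torchtext.data.utils import get_tokenizer

basic = get_tokenizer('basic_english')    # rule-based: lower-cases and splits out punctuation
print(basic("Oh, yeah I'm not sure..."))  # returns a list of lowercase tokens
```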
class Tokenizer:
    def __init__(self,
                 backend:str='spacy',  # backend tokenizer, defaults to spacy
                 language:str='en',    # language the tokenizer is applied to
                 bos:bool=False,       # prepend a beginning-of-sentence tag <bos>
                 eos:bool=False,       # append an end-of-sentence tag <eos>
                ):
        if backend == 'spacy' and language == 'en':
            language = 'en_core_web_sm'
        if backend == 'character_based':
            self.tokenizer = self.character_tokenizer
        else:
            self.tokenizer = get_tokenizer(backend, language=language)
        self.bos = bos
        self.eos = eos
        self.backend = backend
        print(f"# Tokenizer uses {self.backend} backend")

    @staticmethod
    def character_tokenizer(text:str)->List[str]:
        return list(text)

    @dispatch
    def __call__(self, text:str)->List[str]:
        res = self.tokenizer(text)
        if self.bos:
            res = ['<bos>'] + res
        if self.eos:
            res = res + ['<eos>']
        return res

    @dispatch
    def __call__(self, texts:List[str])->List[List[str]]:
        return [self(text) for text in texts]

    @dispatch
    def __call__(self, data_iter:Iterable)->Iterable:
        # works with AG_NEWS-style datasets of (label, text) pairs
        for _, text in data_iter:
            yield self(text)

    @dispatch
    def inverse(self, tokens:List[str])->str:
        if self.backend == 'character_based':
            return ''.join(tokens)
        # TODO: take care of white space around punctuation
        return ' '.join(tokens)

    @dispatch
    def inverse(self, list_of_tokens:List[List[str]])->List[str]:
        return [self.inverse(tokens) for tokens in list_of_tokens]
Usage
String
tok = Tokenizer(backend='character_based', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))
tok = Tokenizer(backend='spacy', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))
tok = Tokenizer(backend='basic_english', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))# Tokenizer uses character_based backend
original sentence: Oh, yeah I'm not sure...
tokenized: ['<bos>', 'O', 'h', ',', ' ', 'y', 'e', 'a', 'h', ' ', 'I', "'", 'm', ' ', 'n', 'o', 't', ' ', 's', 'u', 'r', 'e', '.', '.', '.', '<eos>']
un-tokenized: <bos>Oh, yeah I'm not sure...<eos>
List of strings
# List[str]->List[List[str]]
s = ["Oh, yeah I don't know dude...", "this is a test"]
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))Iterable
# Iterable -> Iterable
tok = Tokenizer()
ds = AG_NEWS(split='test') # data pipe
sample = next(iter(ds)) # (label, text)
print(sample)
it = tok(ds)
tokens = [token for token in it]
print(len(tokens))
print(tokens[:2])
Numericalizer
https://pytorch.org/text/stable/vocab.html
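Before the wrapper class, a quick standalone look at the torchtext vocab factory it relies on (the toy token lists here are illustrative only):

```python
from torchtext.vocab import build_vocab_from_iterator

v = build_vocab_from_iterator([["the", "cat"], ["the", "dog"]], specials=["<unk>"])
v.set_default_index(v["<unk>"])  # out-of-vocabulary lookups fall back to <unk>
print(v["the"])                  # id of a known token
print(v["platypus"])             # unseen token -> id of <unk>
```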
# TODO: add more special characters
class Numericalizer():
    def __init__(self, tokens_iter:Iterable, specials=["<pad>", "<unk>", "<bos>", "<eos>"]):
        self._vocab = self.build_map_from_iter(tokens_iter, specials)

    def build_map_from_iter(self, data_iter:Iterable, specials=None):
        self._vocab = torchtext.vocab.build_vocab_from_iterator(data_iter, specials=specials)
        if "<unk>" in specials:
            self._vocab.set_default_index(self._vocab["<unk>"])
        return self._vocab

    @dispatch
    def __call__(self, texts:List[str])->List[int]:
        # TODO: check self._vocab has been built
        return [self._vocab[text] for text in texts]

    @dispatch
    def __call__(self, texts:List[List[str]])->List[List[int]]:
        return [[self._vocab[text] for text in row] for row in texts]

    @dispatch
    def __call__(self, text:str)->int:
        return self._vocab[text]

    @property
    def vocab(self):
        return self._vocab

    @dispatch
    def inverse(self, idx:int)->str:
        return self._vocab.get_itos()[idx]

    @dispatch
    def inverse(self, indices:List[int])->List[str]:
        return [self._vocab.get_itos()[i] for i in indices]
Usage
tok = Tokenizer()
# In the case of agnews, dataset is: [(index, text)]
def token_iterator(data_iter:Iterable)->Iterable:
for _, text in data_iter:
yield tok(text)
tok_it = token_iterator(ds)
# initialize numericalizer based on token iterator
num = Numericalizer(tok_it)
print(num('<pad>'), num('<unk>'))
print(num.vocab['the'])
print(num('the'))
print(num(['<bos>', '<pad>', '<unk>', 'a', 'this', 'the', 'lkjsdf']))
print(num.inverse(0))
print(num.inverse([6,55]))
print(num([['<bos>', '<pad>'], ['<unk>', 'a', 'this', 'the', 'lkjsdf']]))

tokens = tok(["here we go. asdflkj", "it was time..."])
print(tokens)
print([num(tok) for tok in tokens])
print(num(tokens))
Hugging Face tokenizers
https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
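Going the other way, `decode` maps ids back to text; a short round trip, plus the one-call form of the tokenizer, which returns the ids along with an attention mask:

```python
# ids -> text: decode reverses the mapping
print(tokenizer.decode(ids))

# the tokenizer object is also callable and returns input_ids plus attention_mask
encoded = tokenizer(sequence)
print(encoded["input_ids"])
```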