Text tokenizers

# ---
# skip_exec: true
# skip_showdoc: true
# ---
# torchtext
import torchtext
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import AG_NEWS

# shared imports used by the cells below
from pathlib import Path
from typing import List, Iterable
from plum import dispatch  # assumption: @dispatch below is plum's annotation-based multiple dispatch

Char-based Tokenizer


source

CharTokenizer

 CharTokenizer (vocabulary:List[str])

Character-level tokenizer built from a fixed vocabulary of characters.
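
The implementation itself sits behind the source link above. As a reference, here is a minimal sketch of a class with this interface, inferred from the usage below; building the vocabulary as the sorted set of unique characters is an assumption, but it is consistent with the printed indices (65 entries, ctoi['a'] == 39):

import torch
from typing import List

class CharTokenizer:
    "Character-level tokenizer over a fixed character vocabulary."
    def __init__(self, vocabulary:List[str]):
        self.ctoi = {c: i for i, c in enumerate(vocabulary)}  # char -> index
        self.itoc = {i: c for i, c in enumerate(vocabulary)}  # index -> char

    @classmethod
    def from_text(cls, text:str):
        return cls(sorted(set(text)))  # assumption: vocab = sorted unique characters

    def __len__(self): return len(self.ctoi)

    def encode(self, text:str)->torch.Tensor:
        return torch.tensor([self.ctoi[c] for c in text])

    def decode(self, indices)->str:
        return ''.join(self.itoc[int(i)] for i in indices)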

text = Path('../data/text/tiny_shakespeare.txt').read_text()
print(text[:25])

tokenizer = CharTokenizer.from_text(text)
print(len(tokenizer))
print(tokenizer.ctoi['a'])
print(tokenizer.itoc[39])
enc = tokenizer.encode("this is swell")
print(enc)
print(tokenizer.decode(enc))
First Citizen:
Before we 
65
39
a
tensor([58, 46, 47, 57,  1, 47, 57,  1, 57, 61, 43, 50, 50])
this is swell

Torchtext

https://pytorch.org/text/0.16.0/data_utils.html
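The Tokenizer class below wraps torchtext's get_tokenizer and optionally adds <bos>/<eos> tags. The @dispatch decorator (from plum in the import cell above, an assumption) picks an overload based on the argument type: a single string, a list of strings, or an iterable of (label, text) pairs.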

class Tokenizer:
    def __init__(self,
                backend:str='spacy', # backend tokenizer, defaults to spacy
                language:str='en', # language on which tokenization is applied
                bos:bool=False, # add beginning-of-sentence tag <bos>
                eos:bool=False, # add end-of-sentence tag <eos>
                ):
        if backend == 'spacy' and language == 'en':
            language = 'en_core_web_sm'
        if backend == 'character_based':
            self.tokenizer = self.character_tokenizer
        else:
            self.tokenizer = get_tokenizer(backend, language=language)
        self.bos = bos
        self.eos = eos
        self.backend = backend
        print(f"# Tokenizer uses {self.backend} backend")

    @staticmethod
    def character_tokenizer(text:str)->List[str]:
        return list(text)

    @dispatch
    def __call__(self, text:str)->List[str]:
        res = self.tokenizer(text)
        if self.bos:
            res = ['<bos>'] + res
        if self.eos:
            res = res + ['<eos>']
        return res

    @dispatch
    def __call__(self, texts:List[str])->List[List[str]]:
        return [self(text) for text in texts]

    @dispatch
    def __call__(self, data_iter:Iterable)->Iterable:
        # works with AG_NEWS-style datasets yielding (label, text) pairs
        for _, text in data_iter:
            yield self(text)

    @dispatch
    def inverse(self, tokens:List[str])->str:
        if self.backend == 'character_based':
            return ''.join(tokens)
        # TODO: handle whitespace properly for word-level backends
        return ' '.join(tokens)

    @dispatch
    def inverse(self, list_of_tokens:List[List[str]])->List[str]:
        return [self.inverse(tokens) for tokens in list_of_tokens]
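
Note that the Iterable overload is a generator, so tokenization is lazy: texts are processed one (label, text) pair at a time, which is the shape build_vocab_from_iterator consumes further down.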

Usage

String

tok = Tokenizer(backend='character_based', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))

tok = Tokenizer(backend='spacy', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))

tok = Tokenizer(backend='basic_english', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))
# Tokenizer uses character_based backend
original sentence:  Oh, yeah I'm not sure...
tokenized:  ['<bos>', 'O', 'h', ',', ' ', 'y', 'e', 'a', 'h', ' ', 'I', "'", 'm', ' ', 'n', 'o', 't', ' ', 's', 'u', 'r', 'e', '.', '.', '.', '<eos>']
un-tokenized:  <bos>Oh, yeah I'm not sure...<eos>

List of strings

# List[str]->List[List[str]]
s = ["Oh, yeah I don't know dude...", "this is a test"]
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))

Iterable

# Iterable -> Iterable
tok = Tokenizer()
ds = AG_NEWS(split='test') # data pipe
sample = next(iter(ds)) # (label, text)
print(sample)
it = tok(ds)
tokens = [token for token in it]
print(len(tokens))
print(tokens[:2])

Numericalizer

https://pytorch.org/text/stable/vocab.html
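
The Numericalizer below is a thin wrapper around torchtext's build_vocab_from_iterator. As a reference, a minimal standalone use of that API:

from torchtext.vocab import build_vocab_from_iterator

v = build_vocab_from_iterator([["hello", "world"], ["hello"]], specials=["<unk>"])
v.set_default_index(v["<unk>"])     # out-of-vocabulary tokens map to <unk>
print(v["hello"], v["never-seen"])  # id of a known token, then the <unk> id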

# TODO: add more special tokens
class Numericalizer:
    def __init__(self, tokens_iter:Iterable, specials=["<pad>", "<unk>", "<bos>", "<eos>"]):
        self._vocab = self.build_map_from_iter(tokens_iter, specials)

    def build_map_from_iter(self, data_iter:Iterable, specials=None):
        self._vocab = torchtext.vocab.build_vocab_from_iterator(data_iter, specials=specials)
        if specials and "<unk>" in specials:
            self._vocab.set_default_index(self._vocab["<unk>"])  # unknown tokens map to <unk>
        return self._vocab

    @dispatch
    def __call__(self, texts:List[str])->List[int]:
        return [self._vocab[text] for text in texts]

    @dispatch
    def __call__(self, texts:List[List[str]])->List[List[int]]:
        return [[self._vocab[token] for token in row] for row in texts]

    @dispatch
    def __call__(self, text:str)->int:
        return self._vocab[text]

    @property
    def vocab(self):
        return self._vocab

    @dispatch
    def inverse(self, idx:int)->str:
        return self._vocab.get_itos()[idx]

    @dispatch
    def inverse(self, indices:List[int])->List[str]:
        itos = self._vocab.get_itos()
        return [itos[i] for i in indices]

Usage

tok = Tokenizer()
# AG_NEWS yields (label, text) pairs, so iterate accordingly
def token_iterator(data_iter:Iterable)->Iterable:
    for _, text in data_iter:
        yield tok(text)
tok_it = token_iterator(ds)
# initialize the numericalizer from the token iterator
num = Numericalizer(tok_it)
print(num('<pad>'), num('<unk>'))
print(num.vocab['the'])
print(num('the'))
print(num(['<bos>', '<pad>', '<unk>', 'a', 'this', 'the', 'lkjsdf']))
print(num.inverse(0))
print(num.inverse([6,55]))
print(num([['<bos>', '<pad>'], ['<unk>', 'a', 'this', 'the', 'lkjsdf']]))
tokens = tok(["here we go. asdflkj", "it was time..."])
print(tokens)
print([num(t) for t in tokens])
print(num(tokens))
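
Putting the two pieces together gives a minimal text-to-indices pipeline; a sketch reusing the tok and num objects defined above:

# sketch: raw text -> tokens -> ids, with tok and num from the cells above
def encode_text(text:str)->List[int]:
    return num(tok(text))

print(encode_text("the market rallied today"))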

Hugging Face tokenizers

https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
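
The round trip back to text uses the same tokenizer:

print(tokenizer.decode(ids))  # ids -> string
print(tokenizer(sequence))    # calling the tokenizer directly returns model-ready inputs (input_ids, attention_mask)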