Text tokenizers

Torchtext

https://pytorch.org/text/0.16.0/data_utils.html


source

Tokenizer

 Tokenizer (backend:str='spacy', language:str='en', bos:bool=False,
            eos:bool=False)

Tokenize text with a configurable backend (spacy, basic_english, character_based), optionally wrapping the output with <bos>/<eos> tags.

          Type  Default  Details
backend   str   spacy    backend tokenizer; defaults to spacy
language  str   en       language on which tokenization is applied
bos       bool  False    add a beginning-of-sentence tag
eos       bool  False    add an end-of-sentence tag
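
For intuition, here is a minimal sketch of how such a backend-dispatching tokenizer could be assembled from torchtext's get_tokenizer. SimpleTokenizer is illustrative only, not nimrod's actual implementation.

from torchtext.data.utils import get_tokenizer

class SimpleTokenizer:
    def __init__(self, backend:str='spacy', language:str='en',
                 bos:bool=False, eos:bool=False):
        if backend == 'character_based':
            self._tok = list  # split the string into single characters
        else:
            # torchtext dispatches to spacy, basic_english, moses, ...
            # (for spacy, language is a pipeline name, e.g. 'en_core_web_sm')
            self._tok = get_tokenizer(backend, language=language)
        self.bos, self.eos = bos, eos

    def __call__(self, text:str) -> list[str]:
        tokens = self._tok(text)
        if self.bos:
            tokens = ['<bos>'] + tokens
        if self.eos:
            tokens = tokens + ['<eos>']
        return tokens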

Usage

String

tok = Tokenizer(backend='character_based', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))

# Tokenizer uses character_based backend
original sentence:  Oh, yeah I'm not sure...
tokenized:  ['<bos>', 'O', 'h', ',', ' ', 'y', 'e', 'a', 'h', ' ', 'I', "'", 'm', ' ', 'n', 'o', 't', ' ', 's', 'u', 'r', 'e', '.', '.', '.', '<eos>']
un-tokenized:  <bos>Oh, yeah I'm not sure...<eos>

tok = Tokenizer(backend='spacy', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))

# Tokenizer uses spacy backend
original sentence:  Oh, yeah I'm not sure...
tokenized:  ['<bos>', 'Oh', ',', 'yeah', 'I', "'m", 'not', 'sure', '...', '<eos>']
un-tokenized:  <bos> Oh , yeah I 'm not sure ... <eos>

tok = Tokenizer(backend='basic_english', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))

# Tokenizer uses basic_english backend
original sentence:  Oh, yeah I'm not sure...
tokenized:  ['<bos>', 'oh', ',', 'yeah', 'i', "'", 'm', 'not', 'sure', '.', '.', '.', '<eos>']
un-tokenized:  <bos> oh , yeah i ' m not sure . . . <eos>
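
Note that inverse is only approximate for the word-level backends: tokens are re-joined with single spaces, so "I'm" comes back as "I 'm". The character_based backend joins on the empty string and round-trips the input exactly.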

List of strings

# List[str] -> List[List[str]]
# tok is still the basic_english tokenizer from the previous cell
s = ["Oh, yeah I don't know dude...", "this is a test"]
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))
original sentence:  ["Oh, yeah I don't know dude...", 'this is a test']
tokenized:  [['<bos>', 'oh', ',', 'yeah', 'i', 'don', "'", 't', 'know', 'dude', '.', '.', '.', '<eos>'], ['<bos>', 'this', 'is', 'a', 'test', '<eos>']]
un-tokenized:  ["<bos> oh , yeah i don ' t know dude . . . <eos>", '<bos> this is a test <eos>']

Iterable

from torchtext.datasets import AG_NEWS

# Iterable -> Iterable
tok = Tokenizer()
ds = AG_NEWS(split='test') # data pipe of (label, text) pairs
sample = next(iter(ds)) # (label, text)
print(sample)
it = tok(ds)
tokens = [token for token in it]
print(len(tokens))
print(tokens[:2])
# Tokenizer uses spacy backend
(3, "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.")
7600
[['Fears', 'for', 'T', 'N', 'pension', 'after', 'talks', 'Unions', 'representing', 'workers', 'at', 'Turner', '  ', 'Newall', 'say', 'they', 'are', "'", 'disappointed', "'", 'after', 'talks', 'with', 'stricken', 'parent', 'firm', 'Federal', 'Mogul', '.'], ['The', 'Race', 'is', 'On', ':', 'Second', 'Private', 'Team', 'Sets', 'Launch', 'Date', 'for', 'Human', 'Spaceflight', '(', 'SPACE.com', ')', 'SPACE.com', '-', 'TORONTO', ',', 'Canada', '--', 'A', 'second\\team', 'of', 'rocketeers', 'competing', 'for', 'the', ' ', '#', '36;10', 'million', 'Ansari', 'X', 'Prize', ',', 'a', 'contest', 'for\\privately', 'funded', 'suborbital', 'space', 'flight', ',', 'has', 'officially', 'announced', 'the', 'first\\launch', 'date', 'for', 'its', 'manned', 'rocket', '.']]
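Note that when the input is a datapipe of (label, text) pairs, as with AG_NEWS, only the text field ends up tokenized: the 7600 token lists above correspond to the 7600 test samples, with the labels dropped.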

Numericalizer

https://pytorch.org/text/stable/vocab.html


source

Numericalizer

 Numericalizer (tokens_iter:Iterable, specials=['<pad>', '<unk>', '<bos>',
                '<eos>'])

Build a vocabulary from an iterable of token lists and map tokens to integer ids, with inverse for mapping ids back to tokens.
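
As a rough sketch, a numericalizer with this interface can be built on torchtext's build_vocab_from_iterator. SimpleNumericalizer is illustrative only, not nimrod's actual implementation.

from typing import Iterable
from torchtext.vocab import build_vocab_from_iterator

class SimpleNumericalizer:
    def __init__(self, tokens_iter:Iterable,
                 specials=['<pad>', '<unk>', '<bos>', '<eos>']):
        # specials come first in the vocab: <pad> -> 0, <unk> -> 1, <bos> -> 2, <eos> -> 3
        self.vocab = build_vocab_from_iterator(tokens_iter, specials=specials)
        # map out-of-vocabulary tokens to <unk> instead of raising an error
        self.vocab.set_default_index(self.vocab['<unk>'])

    def __call__(self, tokens:list[str]) -> list[int]:
        return self.vocab(tokens)

    def inverse(self, ids:list[int]) -> list[str]:
        return self.vocab.lookup_tokens(ids)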

Usage

tok = Tokenizer()
# each AG News sample is a (label, text) pair
def token_iterator(data_iter:Iterable)->Iterable:
    for _, text in data_iter:
        yield tok(text)
tok_it = token_iterator(ds)
# initialize numericalizer based on token iterator
num = Numericalizer(tok_it)
print(num('<pad>'), num('<unk>'))
0 1
print(num.vocab['the'])
print(num('the'))
print(num(['<bos>', '<pad>', '<unk>', 'a', 'this', 'the', 'lkjsdf']))
print(num.inverse(0))
print(num.inverse([6,55]))
print(num([['<bos>', '<pad>'], ['<unk>', 'a', 'this', 'the', 'lkjsdf']]))
4
4
[2, 0, 1, 9, 58, 4, 1]
<pad>
['.', 'Monday']
[[2, 0], [1, 9, 58, 4, 1]]
tokens = tok(["here we go. asdflkj", "it was time..."])
print(tokens)
print([num(tok) for tok in tokens])
print(num(tokens))
[['here', 'we', 'go', '.', 'asdflkj'], ['it', 'was', 'time', '...']]
[[534, 1040, 310, 6, 1], [34, 40, 101, 67]]
[[534, 1040, 310, 6, 1], [34, 40, 101, 67]]
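
Out-of-vocabulary tokens such as 'lkjsdf' and 'asdflkj' map to the <unk> index (1), and num accepts either a single token list or a batch of token lists, as the two identical outputs above show.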

Hugging Face tokenizers

https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
print(tokens)
['Using', 'Ġa', 'ĠTrans', 'former', 'Ġnetwork', 'Ġis', 'Ġsimple']
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
[12814, 257, 3602, 16354, 3127, 318, 2829]
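
To get back from ids to text, the tokenizer's decode method inverts both steps at once:

decoded = tokenizer.decode(ids)
print(decoded)
# Using a Transformer network is simple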