Text tokenizers

Torchtext

https://pytorch.org/text/0.16.0/data_utils.html
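
The spacy and basic_english backends presumably delegate to torchtext's get_tokenizer from the data utils linked above. A minimal sketch of the underlying calls (the en_core_web_sm model name is an assumption; spaCy and that model must be installed separately):

from torchtext.data.utils import get_tokenizer

# basic_english lower-cases the input and splits punctuation off as tokens
basic = get_tokenizer('basic_english')
print(basic("Oh, yeah I'm not sure..."))

# 'spacy' wraps a spaCy pipeline; assumes en_core_web_sm is installed
spacy_tok = get_tokenizer('spacy', language='en_core_web_sm')
print(spacy_tok("Oh, yeah I'm not sure..."))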

Usage

String

tok = Tokenizer(backend='character_based', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))

tok = Tokenizer(backend='spacy', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))

tok = Tokenizer(backend='basic_english', bos=True, eos=True)
# str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))

List of strings

# List[str] -> List[List[str]] (tok is still the basic_english tokenizer from above)
s = ["Oh, yeah I don't know dude...", "this is a test"]
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))

Iterable

from torchtext.datasets import AG_NEWS

# Iterable -> Iterable
tok = Tokenizer()
ds = AG_NEWS(split='test')  # data pipe
sample = next(iter(ds)) # (label, text)
print(sample)
it = tok(ds)
tokens = list(it)
print(len(tokens))
print(tokens[:2])

Numericalizer

https://pytorch.org/text/stable/vocab.html
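
A Numericalizer like this one is presumably backed by torchtext's Vocab (linked above). A minimal sketch of how the vocabulary could be built with build_vocab_from_iterator; the choice and order of the special tokens are assumptions:

from torchtext.vocab import build_vocab_from_iterator

def build_vocab(token_iter):
    vocab = build_vocab_from_iterator(
        token_iter,  # yields one list of tokens per sample
        specials=['<pad>', '<unk>', '<bos>', '<eos>'],  # assumed specials
    )
    vocab.set_default_index(vocab['<unk>'])  # OOV tokens map to <unk>
    return vocab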

Usage

from typing import Iterable

tok = Tokenizer()
# for AG_NEWS, the dataset yields (label, text) pairs
def token_iterator(data_iter: Iterable) -> Iterable:
    for _, text in data_iter:
        yield tok(text)
tok_it = token_iterator(ds)
# initialize numericalizer based on token iterator
num = Numericalizer(tok_it)
print(num('<pad>'), num('<unk>'))
print(num.vocab['the'])
print(num('the'))
print(num(['<bos>', '<pad>', '<unk>', 'a', 'this', 'the', 'lkjsdf']))
print(num.inverse(0))
print(num.inverse([6,55]))
print(num([['<bos>', '<pad>'], ['<unk>', 'a', 'this', 'the', 'lkjsdf']]))
tokens = tok(["here we go. asdflkj", "it was time..."])
print(tokens)
print([num(t) for t in tokens])  # t, not tok: avoid shadowing the tokenizer
print(num(tokens))
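
To turn these numericalized lists into a model-ready batch they still need padding to a common length. A hedged sketch with torch.nn.utils.rnn.pad_sequence, assuming num(...) returns plain id lists and num('<pad>') is the pad id:

import torch
from torch.nn.utils.rnn import pad_sequence

ids = [torch.tensor(num(t)) for t in tokens]  # one 1-D tensor per sentence
batch = pad_sequence(ids, batch_first=True, padding_value=num('<pad>'))
print(batch.shape)  # (batch_size, max_len)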

Hugging Face tokenizers

https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
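
tokenize and convert_tokens_to_ids make the two steps explicit; in everyday use the tokenizer is called directly, and decode inverts the ids (standard transformers API):

# one call does tokenization + numericalization and adds an attention mask
encoded = tokenizer(sequence)
print(encoded["input_ids"])
print(encoded["attention_mask"])
# decode maps ids back to a string
print(tokenizer.decode(encoded["input_ids"]))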