Text tokenizers

Torchtext
https://pytorch.org/text/0.16.0/data_utils.html

Usage

String

tok = Tokenizer(backend='character_based', bos=True, eos=True)  # str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))
tok = Tokenizer(backend='spacy', bos=True, eos=True)  # str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))
tok = Tokenizer(backend='basic_english', bos=True, eos=True)  # str -> List[str]
s = "Oh, yeah I'm not sure..."
tokenized = tok(s)
print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))
List of strings
# List[str]->List[List[str]]
= ["Oh, yeah I don't know dude...", "this is a test"]
s = tok(s)
tokenized print("original sentence: ", s)
print("tokenized: ", tokenized)
print("un-tokenized: ", tok.inverse(tokenized))
Iterable
# Iterable -> Iterable
= Tokenizer()
tok = AG_NEWS(split='test') # data pipe
ds = next(iter(ds)) # (label, text)
sample print(sample)
= tok(ds)
it = [token for token in it]
tokens print(len(tokens))
print(tokens[:2])
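Since AG_NEWS returns a DataPipe, tokenization can also be chained lazily onto the pipe itself. A small sketch, assuming the standard DataPipe .map API:

# Tokenize only the text field of each (label, text) pair, lazily
tokenized_pipe = ds.map(lambda pair: tok(pair[1]))
print(next(iter(tokenized_pipe))[:10])  # first ten tokens of the first sample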
Numericalizer
https://pytorch.org/text/stable/vocab.html
Usage
from typing import Iterable

tok = Tokenizer()

# In the case of AG_NEWS, the dataset yields (label, text) pairs
def token_iterator(data_iter: Iterable) -> Iterable:
    for _, text in data_iter:
        yield tok(text)

tok_it = token_iterator(ds)
# initialize the numericalizer from the token iterator
num = Numericalizer(tok_it)
print(num('<pad>'), num('<unk>'))
print(num.vocab['the'])
print(num('the'))
print(num(['<bos>', '<pad>', '<unk>', 'a', 'this', 'the', 'lkjsdf']))
print(num.inverse(0))
print(num.inverse([6, 55]))
print(num([['<bos>', '<pad>'], ['<unk>', 'a', 'this', 'the', 'lkjsdf']]))
tokens = tok(["here we go. asdflkj", "it was time..."])
print(tokens)
print([num(t) for t in tokens])
print(num(tokens))
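Like Tokenizer, Numericalizer appears to be a custom wrapper rather than a torchtext class. A sketch of how it could be built on torchtext.vocab.build_vocab_from_iterator follows; the specials list and the recursion over nested lists are assumptions for illustration:

from typing import Iterable
from torchtext.vocab import build_vocab_from_iterator

class Numericalizer:
    # Hypothetical wrapper around torchtext's Vocab: tokens -> indices and back
    def __init__(self, token_iter: Iterable, specials=('<unk>', '<pad>', '<bos>', '<eos>')):
        self.vocab = build_vocab_from_iterator(token_iter, specials=list(specials))
        self.vocab.set_default_index(self.vocab['<unk>'])  # OOV tokens map to <unk>

    def __call__(self, tokens):
        if isinstance(tokens, str):
            return self.vocab[tokens]
        if tokens and isinstance(tokens[0], list):
            return [self(t) for t in tokens]  # nested lists of tokens
        return self.vocab(list(tokens))

    def inverse(self, indices):
        if isinstance(indices, int):
            return self.vocab.lookup_token(indices)
        return self.vocab.lookup_tokens(list(indices))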
Hugging Face tokenizers
https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
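The two steps above can be reversed with decode, and calling the tokenizer object directly performs tokenization and id conversion in one go:

# ids -> text: the inverse of tokenize + convert_tokens_to_ids
print(tokenizer.decode(ids))

# One-call interface: returns a dict with model-ready fields
encoded = tokenizer(sequence)
print(encoded["input_ids"])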