N-gram Language Models

“Old school” language modeling based on counting tokens in data.
https://github.com/karpathy/makemore

Unigram

Usage

# without pandas
with open('../data/text/names.txt', 'r') as f:
    list_of_words = f.read().splitlines()

# with pandas
df = pd.read_csv('../data/text/names.txt', names=['name'], header=None)
list_of_words = list(df.head().name)  # note: head() keeps only the first 5 names

unigram = CharUnigram(list_of_words)
print("sorted counts: ", unigram.counts)
print("sorted probs: ", unigram.probs)
print(len(unigram))
print(unigram.chars)
print(unigram._stoi)
print(unigram.stoi('a'))
print(unigram.itos(0))
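The CharUnigram class itself is not defined in this section. The following is a minimal sketch of the interface the usage above relies on (sorted counts and probs, chars, stoi/itos, sample); it is an assumption about the implementation, not the original class.

import random
from collections import Counter

class CharUnigram:
    """Sketch of a character unigram model (assumed interface)."""
    def __init__(self, words):
        # character counts, sorted by frequency (highest first)
        self.counts = dict(Counter(c for w in words for c in w).most_common())
        total = sum(self.counts.values())
        self.probs = {c: n / total for c, n in self.counts.items()}
        self.chars = sorted(self.counts)
        self._stoi = {c: i for i, c in enumerate(self.chars)}
        self._itos = {i: c for i, c in enumerate(self.chars)}

    def __len__(self):
        return len(self.chars)

    def stoi(self, c):
        return self._stoi[c]

    def itos(self, i):
        return self._itos[i]

    def sample(self):
        # draw one character from the unigram distribution
        return random.choices(list(self.probs), weights=list(self.probs.values()), k=1)[0]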
df = pd.DataFrame.from_dict(unigram.counts, orient='index')
df.plot(kind='bar')
samples = []
for i in range(10000):
    s = unigram.sample()
    samples.append(s)

# sampled
count = Counter([c for w in samples for c in w])
df = pd.DataFrame.from_dict(count, orient='index')
df[0].sort_values(ascending=False).plot(kind='bar')
Bigram
class CharBigram():
    def __init__(self):
        pass  # placeholder: the bigram model is built step by step below
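The stub above is left empty; one way it could be filled in, mirroring the counting code that follows, is sketched below. The method names and structure are assumptions, not the original implementation.

class CharBigram():
    """Sketch: wraps the bigram counting logic shown below."""
    def __init__(self, stop_token='<stop>'):
        self.stop = stop_token
        self.counts = {}

    def fit(self, words):
        # count adjacent character pairs, padding each word with the stop token
        for w in words:
            line = [self.stop] + list(w) + [self.stop]
            for a, b in zip(line, line[1:]):
                self.counts[(a, b)] = self.counts.get((a, b), 0) + 1
        return self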
Usage
# data
with open('../data/text/names.txt', 'r') as f:
    data = f.read().splitlines()
print("first lines of text: ", data[:10])
# data = ["this is a text"]

# bigram counts
bigrams = {}
unique_tokens = set()
for name in data:
    line = list(name)
    unique_tokens.update(line)
    line.append('<stop>')
    line.insert(0, '<stop>')
    for i in range(len(line) - 1):
        bigram = (line[i], line[i+1])
        if bigram in bigrams:
            bigrams[bigram] += 1
        else:
            bigrams[bigram] = 1

# print("unsorted: ", list(bigrams)[:10])
# print("sorted: ", sort_dict_by_value(bigrams))
Numericalization
tokens = sorted(unique_tokens)
# use the same token for start & stop in this case (separate lines of names)
# tokens.append('<start>')
tokens.append('<stop>')
print(tokens)

stoi = {v: i for i, v in enumerate(tokens)}
itos = {i: v for i, v in enumerate(tokens)}
print(stoi, itos)
Matrix representation
n_toks = len(tokens)
print(n_toks)

N = torch.zeros((n_toks, n_toks)).long()
print(N.shape)

for bigram, value in bigrams.items():
    idx1, idx2 = stoi[bigram[0]], stoi[bigram[1]]
    N[idx1, idx2] = value

plt.xlabel('char_t+1')
plt.ylabel('char_t')
i = [i for i, v in itos.items()]
v = [v for i, v in itos.items()]
plt.xticks(i, v)
plt.yticks(i, v)
plt.imshow(N, origin='lower')
From counts to probabilities
print(N)

# smoothing avoids log(0) = -inf when computing the NLL loss
smoothing = 1
P = N.float() + smoothing
P = P / P.sum(1, keepdim=True)  # normalize the smoothed counts so each row sums to 1
plt.imshow(P, origin='lower')

row_6 = N[6, :] / N[6, :].sum()
print(row_6)
print(row_6.sum())

p = P[6, :]
print(p.sum(), p.max(), torch.argmax(p))
Sampling
for i in range(10):
    res = []
    prev = stoi['<stop>']
    while True:
        # max prob (greedy) sampling alternative:
        # next = int(torch.argmax(P[prev, :]))
        # multinomial sampling
        next = int(torch.multinomial(P[prev, :], num_samples=1, replacement=True))
        if next == stoi['<stop>']:
            print(''.join(res))
            break
        else:
            res.append(itos[next])
            prev = next
Log likelihood loss function
bigram_p = {}
for bigram, value in bigrams.items():
    idx1, idx2 = stoi[bigram[0]], stoi[bigram[1]]
    bigram_p[bigram] = P[idx1, idx2]
print(bigram_p)

bigram_p_sorted = {k: v.float() for k, v in sorted(bigram_p.items(), reverse=True, key=lambda x: x[1])}
print(bigram_p_sorted)

# likelihood = product of bigram probs (here each distinct bigram is counted once)
l = 0
for bigram, prob in bigram_p_sorted.items():
    l += torch.log(prob)

# negative log likelihood loss (nll)
nll = -l / len(bigram_p_sorted)
print(nll)
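Note that the loop above scores each distinct bigram once. Weighting each bigram's log probability by how often it occurs gives the average NLL over the whole corpus; a small variant along those lines (not in the original):

# corpus-level NLL: weight each bigram's log prob by its count
log_likelihood, n = 0.0, 0
for bigram, count in bigrams.items():
    idx1, idx2 = stoi[bigram[0]], stoi[bigram[1]]
    log_likelihood += count * torch.log(P[idx1, idx2])
    n += count
print(-log_likelihood / n)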
Generate training data
= "this"
word = [(word[i], word[i+1]) for i,c in enumerate(word) if i < len(word)-1]
sample print(list(zip(*sample)))
= [], []
xs, ys for word in data:
= [(stoi[word[i]], stoi[word[i+1]]) for i,c in enumerate(word) if i < len(word)-1]
sample = list(zip(*sample)) # inverse of zip
x, y
xs.append(torch.tensor(x)) ys.append(torch.tensor(y))
print('x:', xs[:3])
print('y', ys[:3])
1-hot encoded input
enc = [F.one_hot(x, num_classes=len(tokens)).float() for x in xs]
print(enc[:3])
plt.imshow(enc[0])

X = enc[0]
print(X.shape)
‘Neural net’ modeling

We model the transition probability matrix with neural net activations: a linear layer on the one-hot inputs, exponentiated and normalized row-wise (softmax).
W = torch.randn(27, 27)  # 27 = 26 letters + '<stop>'
logits = X @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
print(probs)
KenLM

For larger n-gram models usable in production, we refer to the efficient KenLM implementation.
Preprocess data into kenlm format

Tokens separated by spaces, one sentence per line.
df = pd.read_csv('../data/text/names.txt', header=None, names=['name'])
df = df.name.apply(lambda x: list(x))  # str into list of chars
# df.apply(lambda x: x.append('<eos>'))  # if eos needed
print(df.head())

df_toks = df.str.join(' ')  # for the kenlm input format, tokens are separated by spaces
print(df_toks.head())
Unique tokens
df.head()
# for row in df.iterrows():
#     print(row)

tokens = set()
for k, v in df.items():
    tokens.update(list(v))

print(tokens)
len(tokens)
Save data to kenlm format for training
data_file = df_toks.to_csv('../data/text/names.kenlm.txt', header=None, index=None)  # df_toks holds the space-separated tokens
! bzip2 -kz ../data/text/names.kenlm.txt
! bzcat ../data/text/names.kenlm.txt.bz2 | head
Train KenLM n-gram model
https://lukesalamone.github.io/posts/running-simple-language-model/
KenLM requires the data to be one sentence per line, lowercase.
# ! if [ ! -f "../data/text/names.2gram.arpa" ]; then lmplz --discount_fallback -o 2 < ../data/text/names.kenlm.txt.bz2>../data/text/names.2gram.arpa; fi
# ! if [ ! -f "../data/text/names.2gram.kenlm" ]; then build_binary ../data/text/names.2gram.arpa ../data/text/names.2gram.kenlm; fi
Test the original kenlm Python API probabilities
model = kenlm.LanguageModel('../data/text/names.2gram.kenlm')
sentence = "emma"
tokenized = "e m m a"
# model.score("emma", bos=False, eos=False)

words = ['<s>'] + list(sentence) + ['</s>']
print(words)

final = 0
# full_scores yields (log10 prob, ngram length, oov flag) per token, including </s>
for i, (prob, length, oov) in enumerate(model.full_scores(tokenized)):
    print(f'words: {words[i:i+length]} index:{i}, prob:{prob}, length:{length}, oov:{oov}')
    final += prob
print(final)
print(model.score("e m m a"))

print(f'prob <s> e: {model.score("e", bos=True, eos=False)}')
print(f'prob e: {model.score("e", bos=False, eos=False)}')
print(f'prob <s> e m: {model.score("e m", bos=True, eos=False)}')
print(f'prob e m: {model.score("e m", bos=False, eos=False)}')
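Since kenlm scores are log10 probabilities, they can be converted back into plain probabilities; a small illustrative addition:

# kenlm returns log10 probabilities; convert back to a plain probability
print(10 ** model.score("e m m a"))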
state = kenlm.State()
state2 = kenlm.State()
model.BeginSentenceWrite(state)

accum = 0
accum += model.BaseScore(state, "e", state2)
print(f'prob <s> e: {accum}')

state, state2 = state2, state
accum += model.BaseScore(state, "m", state2)
print(f'prob <s> e m: {accum}')
Define LM vocabulary
# add special tokens to vocabulary
tokens.add('<s>')
tokens.add('</s>')
tokens.add('<unk>')
print(tokens, len(tokens))

vocab = list(tokens)
Inference / Sampling from prob distributions
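The sampling loop below relies on a small KenLM wrapper class (new_sentence_init, append, nbest, text) that is not defined in this section. A minimal sketch of what such a wrapper might look like, built on the kenlm state API used above; the method names and behaviour are assumptions, not the original class:

class KenLM:
    """Sketch of the wrapper assumed by the sampling loop below."""
    def __init__(self, model_path, vocab):
        self.model = kenlm.LanguageModel(model_path)
        self.vocab = vocab
        self.new_sentence_init()

    def new_sentence_init(self):
        # fresh state; the context is set by the tokens we append
        self.state = kenlm.State()
        self.model.NullContextWrite(self.state)
        self.tokens = []

    def append(self, text):
        # feed one or more space-separated tokens into the LM state
        for tok in text.split():
            if tok == '<s>':
                self.model.BeginSentenceWrite(self.state)
            else:
                out_state = kenlm.State()
                self.model.BaseScore(self.state, tok, out_state)
                self.state = out_state
            self.tokens.append(tok)

    @property
    def text(self):
        return ' '.join(self.tokens)

    def nbest(self, n, log_prob=True):
        # score every vocabulary entry in the current state, best first
        scored = []
        for tok in self.vocab:
            out_state = kenlm.State()
            score = self.model.BaseScore(self.state, tok, out_state)  # log10 prob
            scored.append((tok, score if log_prob else 10 ** score))
        scored.sort(key=lambda kv: kv[1], reverse=True)
        return scored[:n]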
lm = KenLM('../data/text/names.2gram.kenlm', vocab)
init_char = '<s> e m m'
# probs = lm.nbest(len(vocab), log_prob=False)
# print(np.sum([p for char, p in probs]))
# res = [init_char]
# next = int(torch.multinomial(P[prev,:],num_samples=1,replacement=True))

for i in range(50):
    lm.new_sentence_init()
    lm.append(init_char)
    while True:
        # nbest probs at the current state
        probs = lm.nbest(len(vocab), log_prob=False)
        # print(probs)
        # print(np.sum(probs))
        # sample from the prob distribution
        try:
            index_next = int(torch.multinomial(torch.tensor([prob for char, prob in probs]), num_samples=1, replacement=True))
        except:
            print("probs too small")
            break
        char_next = probs[index_next][0]
        lm.append(char_next)
        # print(init_char + '<s>')
        if char_next == '</s>' or (char_next == '<s>' and lm.text != init_char and lm.text != init_char + ' <s>'):
            print(lm.text.replace(' ', ''))
            break