N-gram Language Models

“Old school” language modeling: estimate next-token probabilities by counting token occurrences in data.

https://github.com/karpathy/makemore
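
As a toy illustration of the counting idea: estimate token probabilities from relative frequencies in a made-up six-token corpus (the sections below do the same at character level).

from collections import Counter

corpus = "the cat sat on the mat".split()
counts = Counter(corpus)                               # token -> count
total = sum(counts.values())
probs = {tok: n / total for tok, n in counts.items()}  # token -> relative frequency
print(probs['the'])  # 2/6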

Unigram
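
The CharUnigram class used below is not defined in this section; a minimal sketch consistent with the usage (sorted counts and probs, chars, stoi/itos lookups, sampling) could look like the following. Names and details here are assumptions, not the original implementation.

import random
from collections import Counter

class CharUnigram():
    """Character-level unigram model built from a list of words (a sketch)."""
    def __init__(self, words):
        counts = Counter(c for w in words for c in w)
        total = sum(counts.values())
        # counts and probabilities, sorted by frequency (descending)
        self.counts = dict(sorted(counts.items(), key=lambda kv: kv[1], reverse=True))
        self.probs = {c: n / total for c, n in self.counts.items()}
        # character vocabulary and index mappings
        self.chars = sorted(self.counts)
        self._stoi = {c: i for i, c in enumerate(self.chars)}
        self._itos = {i: c for i, c in enumerate(self.chars)}

    def __len__(self):
        return len(self.chars)

    def stoi(self, c):
        return self._stoi[c]

    def itos(self, i):
        return self._itos[i]

    def sample(self):
        # draw a single character from the unigram distribution
        return random.choices(list(self.probs), weights=list(self.probs.values()), k=1)[0]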

Usage

import pandas as pd
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import Counter

# without pandas
with open('../data/text/names.txt', 'r') as f:
    list_of_words = f.read().splitlines()
# with pandas
df = pd.read_csv('../data/text/names.txt', names=['name'], header=None)
list_of_words = list(df.name)

unigram = CharUnigram(list_of_words)
print("sorted counts: ", unigram.counts)
print("sorted probs: ", unigram.probs)
print(len(unigram))
print(unigram.chars)
print(unigram._stoi)
print(unigram.stoi('a'))
print(unigram.itos(0))
df = pd.DataFrame.from_dict(unigram.counts, orient='index')
df.plot(kind='bar')
samples = []
for i in range(10000):
    s = unigram.sample()
    samples.append(s)

# sampled
count = Counter([c for w in samples for c in w])
df = pd.DataFrame.from_dict(count, orient='index')
df[0].sort_values(ascending=False).plot(kind='bar')

Bigram

class CharBigram():
    def __init__(self):
        # placeholder: see the sketch below and the step-by-step construction under Usage
        pass
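
The class above is left as a stub; the bigram model is built step by step in the cells that follow. One way to fill it in, wrapping those same counting steps (a sketch, assuming <stop> is used as both start and end boundary as below):

class CharBigram():
    """Character-level bigram model: counts adjacent character pairs (a sketch)."""
    def __init__(self, words, boundary='<stop>'):
        self.boundary = boundary
        self.counts = {}
        self.vocab = set()
        for word in words:
            chars = [boundary] + list(word) + [boundary]
            self.vocab.update(chars)
            for a, b in zip(chars, chars[1:]):
                self.counts[(a, b)] = self.counts.get((a, b), 0) + 1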

Usage

# data
with open('../data/text/names.txt', 'r') as f:
    data = f.read().splitlines()
print("first lines of text: ", data[:10])

# data = ["this is a text"]
# bigram counts
bigrams = {}
unique_tokens = set()
for name in data:
    line = list(name)
    unique_tokens.update(line)
    line.append('<stop>')
    line.insert(0, '<stop>')
    for i in range(len(line) - 1):
        bigram = (line[i], line[i+1])
        if bigram in bigrams:
            bigrams[bigram] += 1
        else:
            bigrams[bigram] = 1

# print("unsorted: ", list(bigrams)[:10])
# print("sorted: ", sort_dict_by_value(bigrams))

Numericalization

tokens = sorted(unique_tokens)
# use same for start & stop in this case (separate lines of names)
# tokens.append('<start>')
tokens.append('<stop>')
print(tokens)
stoi = {v:i for i,v in enumerate(tokens)}
itos = {i:v for i, v in enumerate(tokens)}
print(stoi, itos)

Matrix representation

n_toks = len(tokens)
print(n_toks)
N = torch.zeros((n_toks, n_toks)).long()  # bigram count matrix: N[i, j] = count of (itos[i], itos[j])
print(N.shape)
for bigram, value in bigrams.items():
    idx1, idx2 = stoi[bigram[0]], stoi[bigram[1]]
    N[idx1, idx2] = value

plt.xlabel('char_t+1')
plt.ylabel('char_t')
i = [i for i, v in itos.items()]
v = [v for i,v in itos.items()]
plt.xticks(i, v)
plt.yticks(i, v)
plt.imshow(N, origin='lower')

From counts to probabilities

print(N)
# add-one smoothing avoids log(0) = -inf when computing the NLL loss
smoothing = 1
P = N.float() + smoothing
P = P / P.sum(1, keepdim=True)  # normalize each row into a probability distribution
plt.imshow(P, origin='lower')
row_6 = (N[6,:]/N[6,:].sum())
print(row_6)
print(row_6.sum())
p = P[6, :]
print(p.sum(), p.max(), torch.argmax(p))
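
With the smoothing term included in both the counts and the row sums, every row of P is a proper probability distribution; a quick sanity check:

print(torch.allclose(P.sum(1), torch.ones(n_toks)))  # expect True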

Sampling

for i in range(10):
    res = []
    prev = stoi['<stop>']
    while True:
        # greedy alternative (always pick the most likely next char):
        # next = int(torch.argmax(P[prev, :]))
        # multinomial sampling from the row's probability distribution
        next = int(torch.multinomial(P[prev,:],num_samples=1,replacement=True))
        if next == stoi['<stop>']:
            print(''.join(res))
            break
        else:
            res.append(itos[next])
            prev = next

Negative log likelihood (NLL) loss

bigram_p = {}
for bigram, value in bigrams.items():
    idx1, idx2 = stoi[bigram[0]], stoi[bigram[1]]
    bigram_p[bigram] = P[idx1,idx2]

print(bigram_p)
bigram_p_sorted = {k: v.float() for k, v in sorted(bigram_p.items(), reverse=True, key=lambda x: x[1])}
print(bigram_p_sorted)
# log likelihood of the full corpus = sum of log(prob) over all bigram occurrences,
# i.e. each bigram's log prob weighted by how often it occurs in the data
l = 0
n_total = 0
for bigram, count in bigrams.items():
    idx1, idx2 = stoi[bigram[0]], stoi[bigram[1]]
    l += count * torch.log(P[idx1, idx2])
    n_total += count

# negative log likelihood loss (nll), averaged per bigram occurrence
nll = -l / n_total
print(nll)
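
For reference, exponentiating the average negative log likelihood gives the model's per-bigram perplexity:

print(torch.exp(nll))  # perplexity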

Generate training data

word = "this"
sample = [(word[i], word[i+1]) for i,c in enumerate(word) if i < len(word)-1]
print(list(zip(*sample)))
xs, ys = [], []
for word in data:
    sample = [(stoi[word[i]], stoi[word[i+1]]) for i,c in enumerate(word) if i < len(word)-1]
    x, y = list(zip(*sample)) # inverse of zip
    xs.append(torch.tensor(x))
    ys.append(torch.tensor(y))
print('x:', xs[:3])
print('y', ys[:3])
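
The pair construction above is equivalent to zipping each word with itself shifted by one; a compact alternative using the same stoi mapping:

xs2, ys2 = [], []
for word in data:
    pairs = [(stoi[a], stoi[b]) for a, b in zip(word, word[1:])]
    x, y = zip(*pairs)
    xs2.append(torch.tensor(x))
    ys2.append(torch.tensor(y))
print(torch.equal(xs2[0], xs[0]), torch.equal(ys2[0], ys[0]))  # expect True True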

1-hot encoded input

enc = [F.one_hot(x, num_classes=len(tokens)).float() for x in xs]
print(enc[:3])
plt.imshow(enc[0])
X = enc[0]
print(X.shape)

‘Neural net’ modeling

We model the transition probability matrix with the activations of a single linear layer: each one-hot input row selects a row of logits, and exponentiating and normalizing them (a softmax) yields the transition probabilities for the next character.

W = torch.randn(len(tokens), len(tokens))     # one weight (logit) per (current char, next char) pair
logits = X @ W                                # one-hot rows simply select rows of W
counts = logits.exp()                         # interpret logits as log counts
probs = counts / counts.sum(1, keepdim=True)  # normalize rows: a softmax over next chars
print(probs)
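
Two quick sanity checks on the cell above: a one-hot input row just selects the matching row of W, and the exponentiate-and-normalize step is exactly a row-wise softmax.

print(torch.allclose(X @ W, W[xs[0]]))                 # one-hot matmul == row selection
print(torch.allclose(probs, F.softmax(X @ W, dim=1)))  # exp + normalize == softmax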

KenLM

For larger n-gram models usable in production, we refer to the efficient KenLM implementation (https://github.com/kpu/kenlm).

Preprocess data into kenlm format

Tokens are separated by spaces, with one sentence (here: one name) per line.

df = pd.read_csv('../data/text/names.txt', header=None, names=['name']) 
df = df.name.apply(lambda x: list(x)) # str into list of char
# df.apply(lambda x: x.append('<eos>')) # if eos needed
print(df.head())
df_toks = df.str.join(' ') # for kenlm input format tokens are separated by space
print(df_toks.head())

Unique tokens

df.head()
# for row in df.iterrows():
#     print(row)
tokens = set()
for k,v in df.items():
    tokens.update(list(v))

print(tokens)
len(tokens)

Save data to kenlm format for training

# write the space-separated tokens (kenlm input format), one name per line
df_toks.to_csv('../data/text/names.kenlm.txt', header=None, index=None)
! bzip2 -kz ../data/text/names.kenlm.txt
! bzcat ../data/text/names.kenlm.txt.bz2 | head

Train KenLM n-gram model

https://lukesalamone.github.io/posts/running-simple-language-model/

KenLM expects the training data as plain text with one sentence per line, lowercased.

# ! if [ ! -f "../data/text/names.2gram.arpa" ]; then lmplz --discount_fallback -o 2 < ../data/text/names.kenlm.txt.bz2>../data/text/names.2gram.arpa; fi
# ! if [ ! -f "../data/text/names.2gram.kenlm" ]; then build_binary ../data/text/names.2gram.arpa ../data/text/names.2gram.kenlm; fi

Test the original KenLM Python API probabilities

import kenlm

model = kenlm.LanguageModel('../data/text/names.2gram.kenlm')
sentence = "emma"
tokenized = "e m m a"
# model.score("emma", bos = False, eos = False)
words = ['<s>'] + list(sentence) + ['</s>']
print(words)
final = 0
# full_scores yields (log10 prob, matched ngram length, oov flag) for each word plus </s>
for i, (prob, length, oov) in enumerate(model.full_scores(tokenized)):
    print(f'ngram: {words[i+2-length:i+2]} index:{i}, prob:{prob}, length:{length}, oov:{oov}')
    final += prob

print(final)
print(model.score("e m m a"))
print(f'prob <s> e: {model.score("e", bos=True, eos=False)}')
print(f'prob e: {model.score("e", bos=False, eos=False)}')
print(f'prob <s> e m: {model.score("e m", bos=True, eos=False)}')
print(f'prob e m: {model.score("e m", bos=False, eos=False)}')
state = kenlm.State()
state2 = kenlm.State()
model.BeginSentenceWrite(state)  # initialize the state with <s> context
accum = 0
# BaseScore returns log10 P(word | state) and writes the new context into state2
accum += model.BaseScore(state, "e", state2)
print(f'prob <s> e: {accum}')
state, state2 = state2, state
accum += model.BaseScore(state, "m", state2)
print(f'prob <s> e m: {accum}')
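
KenLM scores are base-10 log probabilities; converting back to a probability (using the model loaded above):

print(10 ** model.score("e m m a"))  # probability of the whole tokenized name, with <s> and </s>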

Define LM vocabulary

# add special tokens to vocabulary
tokens.add('<s>')
tokens.add('</s>')
tokens.add('<unk>')
print(tokens, len(tokens))
vocab = list(tokens)

Inference / Sampling from prob distributions
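
The KenLM wrapper class used below is not shown in this section. A minimal sketch with the same interface (new_sentence_init, append, nbest, text), built on the kenlm state API already used above (kenlm.State, BeginSentenceWrite, BaseScore); the details are assumptions, not the original implementation.

import kenlm

class KenLM():
    """Thin stateful wrapper around a kenlm model (a sketch)."""
    def __init__(self, model_path, vocab):
        self.model = kenlm.LanguageModel(model_path)
        self.vocab = vocab
        self.new_sentence_init()

    def new_sentence_init(self):
        # reset to the beginning of a sentence (<s> context)
        self.state = kenlm.State()
        self.model.BeginSentenceWrite(self.state)
        self.text = ''

    def append(self, tokens):
        # advance the state by one or more space-separated tokens
        for tok in tokens.split():
            out_state = kenlm.State()
            self.model.BaseScore(self.state, tok, out_state)
            self.state = out_state
            self.text = (self.text + ' ' + tok).strip()

    def nbest(self, n, log_prob=True):
        # score every vocabulary token from the current state and return the top n
        scores = []
        for tok in self.vocab:
            out_state = kenlm.State()
            logp = self.model.BaseScore(self.state, tok, out_state)  # log10 prob
            scores.append((tok, logp if log_prob else 10 ** logp))
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:n]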

lm = KenLM('../data/text/names.2gram.kenlm', vocab)
init_char = '<s> e m m'
# probs = lm.nbest(len(vocab), log_prob=False)
# print(np.sum([p for char, p in probs]))
# res = [init_char]
# next = int(torch.multinomial(P[prev,:],num_samples=1,replacement=True))
for i in range(50):
    lm.new_sentence_init()
    lm.append(init_char)
    while True:
        # nbest probs at current state
        probs = lm.nbest(len(vocab), log_prob=False)
        # print(probs)
        # print(np.sum(probs))
        # sample from prob distribution
        try:
            index_next = int(torch.multinomial(torch.tensor([prob for char, prob in probs]),num_samples=1,replacement=True))
        except RuntimeError:
            # multinomial fails when all candidate probabilities are (numerically) zero
            print("probs too small")
            break
        char_next = probs[index_next][0]
        lm.append(char_next)
        # print(init_char + '<s>')
        if char_next == '</s>' or (char_next == '<s>' and lm.text != init_char and lm.text != init_char + ' <s>'):
            print(lm.text.replace(' ', ''))
            break