
Bullshit generator: a case study for n-gram language models

Finding n-grams!

By now, you might have noticed that using a single word from the past to predict the next word feels wrong. This is because we choose words based on a long-term context, so using a single word is a large oversimplification.

A possible solution is to change our original equation $P(w_n \mid w_{n-1})$ to a less naive one in which the probability of a word is calculated based on the $L$ previous words ($L$ stands for "context length"): $P(w_n \mid w_{n-1}, w_{n-2}, \cdots, w_{n-L})$. To do so, we will need to use n-grams.

N-grams are simply sequences of N words that appear in the text. For example, in "these are nice n-grams", for n=2, we have the n-grams: "these are", "are nice", "nice n-grams". Note that now we can calculate $P(\text{nice}|\text{these are})$.
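This conditional probability can be estimated directly from counts: $P(w_n \mid w_{n-1}, \cdots, w_{n-L}) = \frac{C(w_{n-L} \cdots w_{n-1} w_n)}{C(w_{n-L} \cdots w_{n-1})}$, where $C(\cdot)$ denotes how many times a word sequence appears in the text. In the toy example above, "these are" appears once and is always followed by "nice", so this estimate gives $P(\text{nice} \mid \text{these are}) = 1$. This is exactly the estimate that the counting code below produces.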

We can get all n-grams and their continuations from a string using:

In [36]:
import re

def get_ngrams_and_continuations(input_str: str, L: int) -> tuple[list, list]:
    # Lowercase and keep only word characters, so punctuation does not become a token
    list_of_words = re.findall(r'\w+', input_str.lower())
    # Pair each n-gram of length L with the word that immediately follows it
    ngrams = [tuple(list_of_words[i:i+L]) for i in range(len(list_of_words)-L)]
    continuations = [list_of_words[i+L] for i in range(len(list_of_words)-L)]
    return ngrams, continuations

data = "this is my cat. This is my house. This is my dog. This is my computer."
ngrams, continuations = get_ngrams_and_continuations(data, 2)
for i in range(len(ngrams)):
    print(f"{ngrams[i]} -> {continuations[i]}")
('this', 'is') -> my
('is', 'my') -> cat
('my', 'cat') -> this
('cat', 'this') -> is
('this', 'is') -> my
('is', 'my') -> house
('my', 'house') -> this
('house', 'this') -> is
('this', 'is') -> my
('is', 'my') -> dog
('my', 'dog') -> this
('dog', 'this') -> is
('this', 'is') -> my
('is', 'my') -> computer

An N-Gram language model

We can now estimate the probability of a continuation given an n-gram. In the example above, "this is" is always followed by "my", whereas "is my" can be followed by "cat", "house", "dog", or "computer". We can convert our n-grams and their continuations into a language model by counting each continuation and normalizing the counts into probabilities.
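For instance, in the toy data, "is my" appears four times and is followed by "cat" exactly once, so the model assigns $P(\text{cat} \mid \text{is my}) = \frac{1}{4}$. The cell below computes exactly these relative frequencies: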

In [37]:
from collections import defaultdict

def ngram_language_model(ngrams, continuations):
    model = defaultdict(lambda: defaultdict(int))
    for ngram, continuation in zip(ngrams, continuations):
        model[ngram][continuation] += 1
    
    # Convert counts to probabilities
    for ngram, continuation_counts in model.items():
        total_count = sum(continuation_counts.values())
        for continuation in continuation_counts:
            continuation_counts[continuation] /= total_count
            
    return model

model = ngram_language_model(ngrams, continuations)
for ngram, continuation_counts in model.items():
    print(f"{ngram}: {dict(continuation_counts)}")
('this', 'is'): {'my': 1.0}
('is', 'my'): {'cat': 0.25, 'house': 0.25, 'dog': 0.25, 'computer': 0.25}
('my', 'cat'): {'this': 1.0}
('cat', 'this'): {'is': 1.0}
('my', 'house'): {'this': 1.0}
('house', 'this'): {'is': 1.0}
('my', 'dog'): {'this': 1.0}
('dog', 'this'): {'is': 1.0}

Generating some bullshit

Now, we can generate some bullshit by seeding our model with an initial n-gram and repeatedly sampling the next word:

In [38]:
import numpy as np
np.random.seed(41)  # For reproducibility

initial_text = "this is"

def generate_text(model, initial_text, n=2, length=10):
    words = initial_text.split()
    for _ in range(length):
        ngram = tuple(words[-n:])
        if ngram in model:
            continuations = list(model[ngram].keys())
            probabilities = list(model[ngram].values())
            next_word = np.random.choice(continuations, p=probabilities)
            words.append(next_word)
        else:
            # The current n-gram was never seen in the training data: stop generating.
            break
    return ' '.join(words)

generate_text(model, initial_text, n=2, length=40)
Out[38]:
'this is my cat this is my dog this is my house this is my house this is my house this is my dog this is my house this is my dog this is my dog this is my computer'

A fallback strategy

Also, by now, you have probably found out that larger n-grams become more and more uncommon. This effect is so strong that two texts sharing an n-gram with a context length $L$ larger than around 10 can be flagged as likely copy-paste plagiarism. Hence, with larger n-grams we will often run into situations in which the model has no information on how to proceed.

On the other hand, we might like larger context lengths because they can make our texts more cohesive.

How to deal with that?

One possibility is a weighting strategy in which the probabilities from models with different n-gram lengths are combined. However, the optimal combination can be hard to obtain.
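For reference, here is a minimal sketch of that weighting idea. It is not part of the original notebook; it assumes a models dictionary like the one built in the fallback example below (context length as key) and a hypothetical weights dictionary:

from collections import defaultdict

def interpolated_distribution(models, words, weights):
    # `models` maps a context length L to a model built by ngram_language_model;
    # `weights` maps L to a mixing weight (choosing good weights is the hard part).
    mixed = defaultdict(float)
    for L, weight in weights.items():
        ngram = tuple(words[-L:])
        if ngram in models[L]:
            for word, p in models[L][ngram].items():
                mixed[word] += weight * p
    # Renormalize so the mixed values form a probability distribution again
    total = sum(mixed.values())
    return {word: p / total for word, p in mixed.items()} if total > 0 else {}

# Hypothetical usage, once models for L = 1, 2, 3 exist:
# dist = interpolated_distribution(models, "this is my".split(), {1: 0.1, 2: 0.3, 3: 0.6})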

Another possibility is to use a fallback strategy: we try a model with context $L$. If it fails to find the n-gram, then we proceed to a model with context $L-1$, and so on.

We could implement such a model like this:

In [39]:
models = {} # the key is the n-gram length L and the value is the model
for L in range(1, 5):
    ngrams, continuations = get_ngrams_and_continuations(data, L)
    model = ngram_language_model(ngrams, continuations)
    models[L] = model

def generate_text_with_fallback(models, initial_text, max_length=40):
    model_lengths = sorted(models.keys())[::-1]  # Start with the largest n-gram
    words = initial_text.split()
    for _ in range(max_length):
        for L in model_lengths:
            ngram = tuple(words[-L:])
            if ngram in models[L]:
                continuations = list(models[L][ngram].keys())
                probabilities = list(models[L][ngram].values())
                next_word = np.random.choice(continuations, p=probabilities)
                words.append(next_word)
                break
        else:
            # No model of any order knows the current context: stop generating.
            break
    return ' '.join(words)

initial_text = "this is"
np.random.seed(41)  # For reproducibility
generated_text = generate_text_with_fallback(models, initial_text, max_length=40)
print(generated_text)
this is my cat this is my house this is my dog this is my computer

Generating some Shakespeare!

Well, now, let's get Shakespeare's complete works and do the same:

In [40]:
with open('shakespeare.txt', 'r', encoding='utf-8') as file:
    shakespeare_text = file.read()
    
models = {}
for L in range(1, 7):
    ngrams, continuations = get_ngrams_and_continuations(shakespeare_text, L)
    model = ngram_language_model(ngrams, continuations)
    models[L] = model
In [51]:
np.random.seed(45)  # For reproducibility
initial_text = "I believe"
generated_text = generate_text_with_fallback(models, initial_text, max_length=40)
print(generated_text)
I believe thyself than i will trust a sickly appetite that loathes even as it longs but sure my sister if i were ripe for your persuasion you have said enough to shake me from the arm of the all noble theseus

Activities

Questions

Remembering (Recall facts and basic concepts)

  1. What is the main task being performed in this case study?
  2. What machine learning algorithm is used for the classification task?
  3. What technique is used to convert the texts into models?
  4. What dataset is used for training and testing the model?
  5. Do we need labels for this type of model?

Understanding (Explain ideas or concepts)

  1. Explain in your own words the core idea behind the N-Gram text representation.
  2. What is the purpose of the fallback strategy in text generation?

Applying (Use information in new situations)

  1. How would you modify the code for text generation to incorporate concepts like temperature, as we have seen previously?
  2. How can we use this same idea to generate movie reviews?
  3. How can we use this same idea to generate movie reviews that have positive sentiments?
  4. How can we use this same idea to generate movie reviews that have positive sentiments and mention the cinematography as positive?

Analyzing (Draw connections among ideas, compare/contrast, break down)

  1. Analyze the outputs for Shakespeare. Can you find the generated material within "the complete works of Shakespeare"?
  2. Is the model able to generate novel material, that is, phrases that have never been seen before?
  3. Can the model be considered "creative"?

Evaluating (Justify a stand or decision, critique)

  1. Evaluate the author's statement: "we have a reasonable reproduction of shakespeare".
  2. Critique the interpretability of the model (predicting probability for single words). While insightful, what potential inaccuracies or simplifications does this method introduce compared to how words contribute within a text?

Expected answers

Remembering (Recall facts and basic concepts)

  1. What is the main task being performed in this case study? Prediction of the next word.
  2. What machine learning algorithm is used for the classification task? N-Gram Language Models, or Order-N Markov Chains.
  3. What technique is used to convert the texts into models? Simple counting and dividing.
  4. What dataset is used for training and testing the model? The Complete Works of Shakespeare.
  5. Do we need labels for this type of model? No.

Understanding (Explain ideas or concepts)

  1. Explain in your own words the core idea behind the N-Gram text representation. N-Grams are sequences of words that are considered a single "token". They help model sequences of words.
  2. What is the purpose of the fallback strategy in text generation? Larger N-Grams can become too rare, so it can be necessary to resort to a lower-order model in some situations. The fallback strategy selects the highest-order model that can be used in each situation.

Applying (Use information in new situations)

  1. How would you modify the code for text generation to incorporate concepts like temperature, as we have seen previously? One option is to apply a temperature to the continuation probabilities before sampling the next word (see the sketch after this list). Another idea could be to assume all words are possible, with a minimum probability of $p$, so that we increase the chance of generating diverse outcomes.
  2. How can we use this same idea to generate movie reviews? Train the model on the IMDB dataset.
  3. How can we use this same idea to generate movie reviews that have positive sentiments? Train the model on the positive-label part of IMDB.
  4. How can we use this same idea to generate movie reviews that have positive sentiments and mention the cinematography as positive? Train the model on the IMDB reviews that are positive and contain the word 'cinematography'.
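As a reference for the temperature question above, here is a minimal, hypothetical sketch that is not part of the original notebook: raise each probability to $1/T$ and renormalize before calling np.random.choice. A temperature below 1 sharpens the distribution; above 1 it flattens it.

import numpy as np

def apply_temperature(probabilities, temperature=1.0):
    # Rescale a probability vector: p_i^(1/T) / sum_j p_j^(1/T)
    p = np.asarray(probabilities, dtype=float) ** (1.0 / temperature)
    return p / p.sum()

# Inside generate_text, the sampling step could become:
# probabilities = apply_temperature(list(model[ngram].values()), temperature=1.5)
# next_word = np.random.choice(continuations, p=probabilities)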

Analyzing (Draw connections among ideas, compare/contrast, break down)

  1. Analyze the outputs for Shakespeare. Can you find the generated material within "the complete works of Shakespeare"? Some short parts of the output appear verbatim in the original, but larger chunks are more likely to be recombinations of smaller chunks of the original material, hence they cannot be found.
  2. Is the model able to generate novel material, that is, phrases that have never been seen before? More or less. It can create recombinations of known phrases and themes, but not entirely new themes and ideas.
  3. Can the model be considered "creative"? Not in a human sense. The generated novelties are simply the result of randomness. This is the same as flipping a coin many times - although that specific sequence of heads and tails could be new in the entire history of humanity, we would probably not argue that the coin was creative.

Evaluating (Justify a stand or decision, critique)

  2. Critique the interpretability of the model (predicting probability for single words). While insightful, what potential inaccuracies or simplifications does this method introduce compared to how words contribute within a text? The model is easy to interpret: at each step, we know what the model observes and what its possible outcomes are. In fact, we could use debug messages to track all of this.
