Linguistic Models: the XX-Century approach

So far, we have used probabilistic models to determine the likelihood of finding a word $w$ in any document within the collection $c$, that is: $P(w | c)$. Implicitly, this means that the order of words within a document does not impact its meaning. Metaphorically, it's as if we placed all the words in a big bag, and therefore this type of representation based on the presence or absence of words is called a bag-of-words.

The bag-of-words model is effective for many applications, but it can miss important characteristics of a text: on the one hand, a text that mentions "platypuses" and "kangaroos" is very likely to be about both; on the other hand, the text "platypuses are more dangerous than kangaroos" means something very different from "kangaroos are more dangerous than platypuses", even though both contain exactly the same words.

One way to model the order in which words appear in a text is called a generative linguistic model (a language model). In this type of model, we estimate the probability of finding the $n$-th word of a sequence given the previous word, that is:

$P(w_n \mid w_{n-1})$

We can create a small model for the phrase:

Pass one, pass two, pass three

In this case, our model gives us probabilities like:

$P(\text{pass} | \text{one}) = 1$

$P(\text{pass} | \text{two}) = 1$

$P(\text{two} | \text{pass}) = 1/3$

Note that these probabilities are estimated simply by counting occurrences in a training dataset!
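
As a quick illustration (this sketch is not part of the original exercise code), here is one way to estimate these bigram probabilities by counting, using the toy phrase above:

from collections import Counter, defaultdict

phrase = "pass one pass two pass three".split()

# Count how often each word follows each preceding word.
bigram_counts = defaultdict(Counter)
for prev, nxt in zip(phrase[:-1], phrase[1:]):
    bigram_counts[prev][nxt] += 1

# Normalize counts into conditional probabilities P(next | previous).
bigram_probs = {
    prev: {nxt: count / sum(counter.values()) for nxt, count in counter.items()}
    for prev, counter in bigram_counts.items()
}

print(bigram_probs['pass'])  # {'one': 0.33..., 'two': 0.33..., 'three': 0.33...}
print(bigram_probs['one'])   # {'pass': 1.0}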

Exercise 1: conditional probabilities for next word

In the excerpt below:

Joana went for a walk with some of her seven dogs on a sunny afternoon, and she met a friend. A person who was there also stopped to talk to them, and some other dogs also stopped to play with the dogs.

Calculate:

  • $P(\text{afternoon} | \text{sunny})$
  • $P(\text{some} | \text{with})$
  • $P(\text{on} | \text{dogs})$

Exercise 2: estimating a linguistic model

There are many libraries that can be used to estimate the conditional probabilities for all words in a text. We are going to build these probabilities from scratch.

The piece of code below splits a text into its individual words (we are not concerned with punctuation at this point).

Extend the code so that it generates a "dictionary of dictionaries", similar to an inverted index. This structure must represent conditional probabilities in the following way:

Suppose we have:

  • $P(\text{dogs} | \text{with}) = 0.2$
  • $P(\text{dogs} | \text{her}) = 0.3$
  • $P(\text{cats} | \text{her}) = 0.4$

the data structure should look like:

{
    'with' : { 'dogs' : 0.2 },
    'her' : { 'dogs' : 0.3,
              'cats' : 0.4}
}
import re

text = """Natural language processing (NLP) is a subfield of linguistics, computer science, information engineering, and artificial intelligence
concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze
large amounts of natural language data.
Challenges in natural language processing frequently involve speech recognition, natural language understanding,
and natural language generation. Nowadays, Artificial Intelligence is a highly trending technology and is gaining popularity among NLP developers.
Although Artificial Intelligence is a marvelous technology and has wonderful results, it is still a developing technology and its ethical use is a major concern.
"""

words = re.findall(r"\b\w+\b", text.upper())
print(words)

conditional_probabilities = {}
['NATURAL', 'LANGUAGE', 'PROCESSING', 'NLP', 'IS', 'A', 'SUBFIELD', 'OF', 'LINGUISTICS', 'COMPUTER', 'SCIENCE', 'INFORMATION', 'ENGINEERING', 'AND', 'ARTIFICIAL', 'INTELLIGENCE', 'CONCERNED', 'WITH', 'THE', 'INTERACTIONS', 'BETWEEN', 'COMPUTERS', 'AND', 'HUMAN', 'NATURAL', 'LANGUAGES', 'IN', 'PARTICULAR', 'HOW', 'TO', 'PROGRAM', 'COMPUTERS', 'TO', 'PROCESS', 'AND', 'ANALYZE', 'LARGE', 'AMOUNTS', 'OF', 'NATURAL', 'LANGUAGE', 'DATA', 'CHALLENGES', 'IN', 'NATURAL', 'LANGUAGE', 'PROCESSING', 'FREQUENTLY', 'INVOLVE', 'SPEECH', 'RECOGNITION', 'NATURAL', 'LANGUAGE', 'UNDERSTANDING', 'AND', 'NATURAL', 'LANGUAGE', 'GENERATION', 'NOWADAYS', 'ARTIFICIAL', 'INTELLIGENCE', 'IS', 'A', 'HIGHLY', 'TRENDING', 'TECHNOLOGY', 'AND', 'IS', 'GAINING', 'POPULARITY', 'AMONG', 'NLP', 'DEVELOPERS', 'ALTHOUGH', 'ARTIFICIAL', 'INTELLIGENCE', 'IS', 'A', 'MARVELOUS', 'TECHNOLOGY', 'AND', 'HAS', 'WONDERFUL', 'RESULTS', 'IT', 'IS', 'STILL', 'A', 'DEVELOPING', 'TECHNOLOGY', 'AND', 'ITS', 'ETHICAL', 'USE', 'IS', 'A', 'MAJOR', 'CONCERN']

Exercise 3: suggest a next word

Based on the model estimated in the previous exercise, write a function that takes a word and returns a possible next word. If the given word is not in the model's vocabulary, the function should return a random word from the vocabulary. Use np.random.choice to make choices with predefined probabilities, as shown below. Remember that you should use the probabilities calculated by your model.

import numpy as np
print(np.random.choice(['one', 'two', 'three'], p=[0.5, 0.2, 0.3]))
three
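
One possible shape for such a function is sketched below; it assumes the dictionary-of-dictionaries structure from Exercise 2 and a list vocabulary containing all known words (both names are illustrative, not fixed by the exercise):

def suggest_next_word(model: dict, word: str, vocabulary: list) -> str:
    # Unknown words fall back to a uniformly random word from the vocabulary.
    if word not in model:
        return np.random.choice(vocabulary)
    candidates = list(model[word].keys())
    probabilities = [model[word][c] for c in candidates]
    # Sample the next word according to the estimated conditional probabilities.
    return np.random.choice(candidates, p=probabilities)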

Exercise 4: make a text generator

Use the functionality you implemented above to suggest next words, and then incorporate them into your text. For example:

  1. Start with "artificial intelligence is"
  2. Suggest word "a"
  3. Incorporate "a" into the original sentence, so that we have: "artificial intelligence is a"
  4. Suggest next word based on "artificial intelligence is a"
  5. Keep going!
def generate_text(model: dict, starting_string: str, num_words: int) -> str:
    text = starting_string
    # Generate words based on your model (model should be the conditional probabilities!)
    return text
# Make your solution here
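
A hedged sketch of one way generate_text could be fleshed out, assuming the suggest_next_word function sketched in Exercise 3 (a bigram model only conditions on the last word of the running text, and remember that the model's keys were uppercased):

def generate_text(model: dict, starting_string: str, num_words: int) -> str:
    words = starting_string.upper().split()
    for _ in range(num_words):
        # The bigram model conditions only on the most recent word.
        next_word = suggest_next_word(model, words[-1], list(model.keys()))
        words.append(next_word)
    return ' '.join(words)

print(generate_text(conditional_probabilities, 'artificial intelligence is', 10))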

Exercise 5: generation techniques

At this point, we generate the "next word" in a sequence by sampling from a distribution estimated from data. We can make some tweaks here to make the generated text more interesting.

Temperature

One possibility is to use a parameter called temperature. The temperature parameter (often denoted $\tau$) is inspired by annealing processes: electrons in materials at high temperature jump more often, so they can be found in a probability cloud with larger variance.

When we use temperature, we sample from a distribution $P_z$ calculated by:

$$P_z(A \mid B) = \frac{P(A \mid B)^{e^{-\tau}}}{\sum_{A'} P(A' \mid B)^{e^{-\tau}}}$$

Note that, in this equation, a high temperature $(\tau \rightarrow \infty)$ makes $P_z(A \mid B)$ approach a uniform distribution, whereas low (negative) values make the distribution spikier.

Also, note that there are many formulations for temperature. I like this one because $\tau = 0$ implies $P_z(A \mid B) = P(A \mid B)$, so we have an anchor point. Check the figure:

import matplotlib.pyplot as plt
def apply_temperature(P, tau):
    z =  P**(np.exp(-tau))
    return z / np.sum(z)
P = np.array([0.5, 0.4, 0.1])
print(apply_temperature(P, 0))
plt.figure(figsize=(10,3))

plt.subplot(1,3,2)
plt.bar(range(3), apply_temperature(P, 0))
plt.ylim(0,1)
plt.title('$\\tau$ = 0')
plt.subplot(1,3,3)
plt.bar(range(3), apply_temperature(P, 2))
plt.ylim(0,1)
plt.title('$\\tau$ = 2')
plt.subplot(1,3,1)
plt.bar(range(3), apply_temperature(P, -2))
plt.ylim(0,1)
plt.title('$\\tau$ = -2')
plt.suptitle('Higher temperature makes the distribution closer to uniform!')
plt.tight_layout()
plt.show()
[0.5 0.4 0.1]
[Figure: bar plots of the same distribution with $\tau = -2$, $\tau = 0$, and $\tau = 2$; higher temperature makes the distribution closer to uniform.]

Top-K

Top-K generation is another interesting technique: we select the top $K$ most likely words from the distribution and then sample from them with uniform probabilities (a common variant renormalizes their original probabilities instead). The choice of $K$ controls the level of randomness in the generation, and this technique can avoid large deviations from a particular train of thought.
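
A minimal sketch of the uniform Top-K variant described above (the function name apply_top_k is illustrative, not part of the original notebook):

def apply_top_k(P, k):
    # Keep only the k most likely entries and give them uniform probability;
    # renormalizing their original probabilities is a common alternative.
    z = np.zeros_like(P)
    top_indices = np.argsort(P)[-k:]
    z[top_indices] = 1.0 / k
    return z

P = np.array([0.5, 0.4, 0.1])
print(apply_top_k(P, 2))  # [0.5 0.5 0. ]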

Implementing temperature or top-k

Implement either temperature or top-k generation in your word generation function!

Exercise 6: reading from real data

The code below loads The Complete Works of William Shakespeare from Project Gutenberg. Use this text to train your model and check whether you can generate some Shakespeare-like text with it!

# Make your solution here!
with open('shakespeare.txt', 'r') as f:
    shakespeare_text = f.read()

    
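If shakespeare.txt is not already on disk, one way to fetch it is sketched below; the URL assumes the plain-text edition of Project Gutenberg ebook #100 and may need adjusting if the link has moved:

import urllib.request

# Download the plain-text edition once and cache it locally.
# NOTE: URL assumed (Project Gutenberg ebook #100); verify it before relying on it.
url = 'https://www.gutenberg.org/files/100/100-0.txt'
urllib.request.urlretrieve(url, 'shakespeare.txt')

with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    shakespeare_text = f.read()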