In [2]:
Copied!
import numpy as np
import re
import numpy as np
import re
In [3]:
Copied!
# Sample corpus used to train the toy next-word model below.
text = """Natural language processing (NLP) is a subfield of linguistics, computer science, information engineering, and artificial intelligence
concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze
large amounts of natural language data.
Challenges in natural language processing frequently involve speech recognition, natural language understanding,
and natural language generation. Nowadays, Artificial Intelligence is a highly trending technology and is gaining popularity among NLP developers.
Although Artificial Intelligence is a marvelous technology and has wonderful results, it is still a developing technology and its ethical use is a major concern.
"""

# Tokenize: upper-case the corpus, then keep every run of word characters
# (punctuation and parentheses are dropped by the word-boundary pattern).
word_pattern = re.compile(r"\b\w+\b")
words = word_pattern.findall(text.upper())
Exercise 01¶
In [5]:
Copied!
# Show the tokenized, upper-cased word list produced by the regex above.
print(words)
print(words)
['NATURAL', 'LANGUAGE', 'PROCESSING', 'NLP', 'IS', 'A', 'SUBFIELD', 'OF', 'LINGUISTICS', 'COMPUTER', 'SCIENCE', 'INFORMATION', 'ENGINEERING', 'AND', 'ARTIFICIAL', 'INTELLIGENCE', 'CONCERNED', 'WITH', 'THE', 'INTERACTIONS', 'BETWEEN', 'COMPUTERS', 'AND', 'HUMAN', 'NATURAL', 'LANGUAGES', 'IN', 'PARTICULAR', 'HOW', 'TO', 'PROGRAM', 'COMPUTERS', 'TO', 'PROCESS', 'AND', 'ANALYZE', 'LARGE', 'AMOUNTS', 'OF', 'NATURAL', 'LANGUAGE', 'DATA', 'CHALLENGES', 'IN', 'NATURAL', 'LANGUAGE', 'PROCESSING', 'FREQUENTLY', 'INVOLVE', 'SPEECH', 'RECOGNITION', 'NATURAL', 'LANGUAGE', 'UNDERSTANDING', 'AND', 'NATURAL', 'LANGUAGE', 'GENERATION', 'NOWADAYS', 'ARTIFICIAL', 'INTELLIGENCE', 'IS', 'A', 'HIGHLY', 'TRENDING', 'TECHNOLOGY', 'AND', 'IS', 'GAINING', 'POPULARITY', 'AMONG', 'NLP', 'DEVELOPERS', 'ALTHOUGH', 'ARTIFICIAL', 'INTELLIGENCE', 'IS', 'A', 'MARVELOUS', 'TECHNOLOGY', 'AND', 'HAS', 'WONDERFUL', 'RESULTS', 'IT', 'IS', 'STILL', 'A', 'DEVELOPING', 'TECHNOLOGY', 'AND', 'ITS', 'ETHICAL', 'USE', 'IS', 'A', 'MAJOR', 'CONCERN']
In [9]:
Copied!
# Build a first-order (bigram) transition table:
# model[current][following] counts how many times `following` appears
# immediately after `current` in `words`.
unique_words = set(words)
model = {token: {} for token in unique_words}

# Pair each word with its successor. The final word has no successor, so it
# contributes no outgoing transition (its table may stay empty).
for current_word, next_word in zip(words, words[1:]):
    followers = model[current_word]
    followers[next_word] = followers.get(next_word, 0) + 1
In [10]:
Copied!
# Convert the raw follow-up counts into conditional probabilities, in place:
# each inner table is divided by the sum of its own counts.
for followers in model.values():
    total = sum(followers.values())
    # A word with no recorded successor has an empty table, so this inner loop
    # never runs for it and no division by zero takes place.
    for next_word in followers:
        followers[next_word] /= total
In [11]:
Copied!
# Display the transition table after normalization (word -> {next word: probability}).
model
model
Out[11]:
{'NOWADAYS': {'ARTIFICIAL': 1.0},
'ITS': {'ETHICAL': 1.0},
'DATA': {'CHALLENGES': 1.0},
'INVOLVE': {'SPEECH': 1.0},
'CONCERN': {},
'ARTIFICIAL': {'INTELLIGENCE': 1.0},
'COMPUTERS': {'AND': 0.5, 'TO': 0.5},
'BETWEEN': {'COMPUTERS': 1.0},
'ANALYZE': {'LARGE': 1.0},
'LARGE': {'AMOUNTS': 1.0},
'GENERATION': {'NOWADAYS': 1.0},
'HAS': {'WONDERFUL': 1.0},
'TRENDING': {'TECHNOLOGY': 1.0},
'DEVELOPING': {'TECHNOLOGY': 1.0},
'ALTHOUGH': {'ARTIFICIAL': 1.0},
'SCIENCE': {'INFORMATION': 1.0},
'TO': {'PROGRAM': 0.5, 'PROCESS': 0.5},
'GAINING': {'POPULARITY': 1.0},
'HUMAN': {'NATURAL': 1.0},
'POPULARITY': {'AMONG': 1.0},
'MARVELOUS': {'TECHNOLOGY': 1.0},
'PROCESSING': {'NLP': 0.5, 'FREQUENTLY': 0.5},
'STILL': {'A': 1.0},
'NLP': {'IS': 0.5, 'DEVELOPERS': 0.5},
'NATURAL': {'LANGUAGE': 0.8333333333333334, 'LANGUAGES': 0.16666666666666666},
'LINGUISTICS': {'COMPUTER': 1.0},
'INTERACTIONS': {'BETWEEN': 1.0},
'CHALLENGES': {'IN': 1.0},
'IS': {'A': 0.6666666666666666,
'GAINING': 0.16666666666666666,
'STILL': 0.16666666666666666},
'INFORMATION': {'ENGINEERING': 1.0},
'MAJOR': {'CONCERN': 1.0},
'DEVELOPERS': {'ALTHOUGH': 1.0},
'HOW': {'TO': 1.0},
'FREQUENTLY': {'INVOLVE': 1.0},
'A': {'SUBFIELD': 0.2,
'HIGHLY': 0.2,
'MARVELOUS': 0.2,
'DEVELOPING': 0.2,
'MAJOR': 0.2},
'IN': {'PARTICULAR': 0.5, 'NATURAL': 0.5},
'AMONG': {'NLP': 1.0},
'USE': {'IS': 1.0},
'AND': {'ARTIFICIAL': 0.14285714285714285,
'HUMAN': 0.14285714285714285,
'ANALYZE': 0.14285714285714285,
'NATURAL': 0.14285714285714285,
'IS': 0.14285714285714285,
'HAS': 0.14285714285714285,
'ITS': 0.14285714285714285},
'WONDERFUL': {'RESULTS': 1.0},
'PARTICULAR': {'HOW': 1.0},
'PROCESS': {'AND': 1.0},
'SPEECH': {'RECOGNITION': 1.0},
'RECOGNITION': {'NATURAL': 1.0},
'CONCERNED': {'WITH': 1.0},
'PROGRAM': {'COMPUTERS': 1.0},
'ETHICAL': {'USE': 1.0},
'HIGHLY': {'TRENDING': 1.0},
'SUBFIELD': {'OF': 1.0},
'INTELLIGENCE': {'CONCERNED': 0.3333333333333333, 'IS': 0.6666666666666666},
'COMPUTER': {'SCIENCE': 1.0},
'IT': {'IS': 1.0},
'WITH': {'THE': 1.0},
'OF': {'LINGUISTICS': 0.5, 'NATURAL': 0.5},
'LANGUAGE': {'PROCESSING': 0.4,
'DATA': 0.2,
'UNDERSTANDING': 0.2,
'GENERATION': 0.2},
'RESULTS': {'IT': 1.0},
'ENGINEERING': {'AND': 1.0},
'LANGUAGES': {'IN': 1.0},
'AMOUNTS': {'OF': 1.0},
'TECHNOLOGY': {'AND': 1.0},
'UNDERSTANDING': {'AND': 1.0},
'THE': {'INTERACTIONS': 1.0}}
2¶
In [16]:
Copied!
# Demo of weighted sampling: draw one item, where the i-th probability in
# `weights` belongs to the i-th entry of `vegetables`.
vegetables = ["batata", "alface", "cenoura"]
weights = [0.1, 0.5, 0.4]
np.random.choice(vegetables, p=weights)
Out[16]:
np.str_('alface')
In [44]:
Copied!
def suggestion(model, word, fallback="A", rng=None):
    """Sample the word that follows ``word`` according to ``model``.

    Parameters
    ----------
    model : dict
        Maps each known word to a dict ``{next_word: weight}``. Weights may be
        probabilities or raw counts; they are renormalized before sampling.
    word : hashable
        The current word; must be a key of ``model``.
    fallback : optional
        Returned when ``word`` has no recorded successor (an empty inner
        dict, e.g. the last word of the training text). Defaults to ``"A"``,
        the value the previous implementation returned in that situation.
    rng : optional
        Object exposing ``choice(n, p=...)``, such as
        ``np.random.default_rng(seed)``. When ``None`` the global
        ``np.random`` state is used, as before.

    Returns
    -------
    The sampled successor exactly as stored in ``model`` (a plain ``str`` for
    string keys), or ``fallback`` for a dead-end word.

    Raises
    ------
    KeyError
        If ``word`` is not a key of ``model``.
    ValueError
        If the weights for ``word`` are negative, non-finite, or sum to zero.
    """
    if word not in model:
        # KeyError is still an Exception, so `except Exception` callers keep working.
        raise KeyError("Word is not in model")

    followers = model[word]
    if not followers:
        # Dead end: handled explicitly instead of through a bare `except`,
        # which also used to hide unrelated errors and KeyboardInterrupt.
        return fallback

    next_words = list(followers)
    weights = np.array([followers[w] for w in next_words], dtype=float)
    total = weights.sum()
    if (weights < 0).any() or not np.isfinite(total) or total <= 0:
        raise ValueError(f"Invalid transition weights for word {word!r}")

    sampler = np.random if rng is None else rng
    # Sample an index rather than the word itself so the original key object
    # is returned (not a NumPy string scalar) and non-string keys also work.
    index = sampler.choice(len(next_words), p=weights / total)
    return next_words[index]
In [ ]:
Copied!
3¶
In [46]:
Copied!
# Generate text with the bigram model: start from a fixed seed word, echo it,
# then repeatedly sample a successor of the most recent word.
w = "A"
print(w)
wordlist = [w]
for _ in range(20):
    w = suggestion(model, w)
    wordlist.append(w)
print(' '.join(wordlist))
A A HIGHLY TRENDING TECHNOLOGY AND IS STILL A MAJOR CONCERN A MAJOR CONCERN A SUBFIELD OF LINGUISTICS COMPUTER SCIENCE INFORMATION ENGINEERING
In [47]:
Copied!
# Baseline for comparison: 20 independent draws from the token list, ignoring
# which word came before (repeated tokens are proportionally more likely).
wordlist = [np.random.choice(words) for _ in range(20)]
print(' '.join(wordlist))
INTELLIGENCE LANGUAGE POPULARITY DATA IS NATURAL NATURAL DEVELOPERS NLP ARTIFICIAL ETHICAL NLP BETWEEN A NATURAL AND ITS IN MAJOR IS
In [ ]:
Copied!