Lecture 10 - More Text Normalization and NLP¶
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import spacy
Announcements:¶
- Data Ethics 2 - two articles to read and a short reflection to write by Wednesday
Goals:¶
Know the meaning and purpose of some basic text normalization operations (from natural language processing):
- Sentence tokenization
- Lowercasing, contractions, punctuation, canonicalization
- Stemming
- Lemmatization
- Stopword removal
Know how to use a Counter to count frequencies
Understand the notion of a word vector and calculating similarity among words
Get some hands-on practice using the above
Tools for text normalization¶
- Python regular expressions (e.g., find and replace)
- Linux command-line tools: sed (stream editor) or tr (translate)
- NLP toolkits, e.g., spacy, nltk (support tokenizing, stemming, lemmatizing, etc.; a short example follows below)
nlp = spacy.load("en_core_web_sm")
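Here's a minimal sketch (my own example sentence, not from the lecture) of how spacy exposes several of these normalization steps on each token of the small model just loaded:
doc = nlp("The striped cats were running quickly toward two mice.")
for tok in doc:
    # lemma_ is the lemmatized form; is_stop and is_punct flag stopwords and punctuation
    print(f"{tok.text:10} lemma={tok.lemma_:10} stop={tok.is_stop} punct={tok.is_punct}")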
Counting Frequencies¶
quote = "You can please some of the people all of the time, you can please all of the people some of the time, but you can't please all of the people all of the time"
quote = quote.replace("can't", "can not")
quote
'You can please some of the people all of the time, you can please all of the people some of the time, but you can not please all of the people all of the time'
q = nlp(quote)
import collections
counts = collections.Counter([str(x) for x in q])
counts.most_common()
[('of', 6),
('the', 6),
('all', 4),
('can', 3),
('please', 3),
('people', 3),
('time', 3),
('some', 2),
(',', 2),
('you', 2),
('You', 1),
('but', 1),
('not', 1)]
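Notice that 'You' and 'you' are counted separately. Lowercasing (one of the normalization steps listed above) merges them; a quick sketch:
counts_lower = collections.Counter([str(x).lower() for x in q])
counts_lower.most_common(5)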
Word Vectors and Similarity¶
Language models often have a way to map a word or token to a vector that, somehow, encodes the word's meaning in a bunch of numbers. These are often called embedding vectors, and we'll be seeing more about them later in the course.
But for now, let's just take a look at one thing this enables: similarity calculations.
The small language model we've been using (en_core_web_sm) does not have word vectors built into it, so we'll reach for a larger one that does. You can still call similarity with the _sm model, but without real word vectors the results are much less meaningful.
The small model is now in the default JupyterHub environment, but the large one is not, so for the moment I have to do this:
!python -m spacy download en_core_web_lg
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-lg==3.8.0
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 400.7/400.7 MB 35.1 MB/s 0:00:09
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')
nlp = spacy.load("en_core_web_lg")
words = "cat dog tiger king queen castle waffle pancake"
ans = nlp(words)
Here's the length of the word vector for "cat":
len(ans[0].vector)
300
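To peek at a few of those 300 numbers (just the first ten components; the exact values depend on the model):
ans[0].vector[:10]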
That doesn't mean much to me, but if I use the similarity method, it will calculate how similar two words' vectors are.
Here's cat vs dog:
ans[0].similarity(ans[1])
0.8016854524612427
And here's dog vs pancake:
ans[1].similarity(ans[-1])
0.22979263961315155
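Under the hood, similarity is (essentially) the cosine similarity between the two vectors. Here's a sketch that computes it directly with numpy for cat and dog; it should land very close to the value reported above:
v1, v2 = ans[0].vector, ans[1].vector
# cosine similarity: dot product divided by the product of the vector lengths
np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))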
Let's visualize the similarities among all eight of the words in our list:
N = len(ans)
sim = np.zeros((N, N))
# fill in the matrix of pairwise similarities
for i in range(N):
    for j in range(N):
        sim[i, j] = ans[i].similarity(ans[j])
labels = words.split(" ")
sns.heatmap(sim, annot=True, xticklabels=labels, yticklabels=labels);