Lecture 10 - More Text Normalization and NLP¶

In [ ]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import spacy

Announcements:¶

  • Lab 4 pre-lab is available
  • Data Ethics 2 moved to Monday
    • Before class: Read an article and answer a few questions
    • In class: hands-on data analysis activity, reproducing some of the analysis/results from the article
  • This is a good time to be thinking about project topics

Goals:¶

  • Know the meaning and purpose of some basic text normalization operations (from natural language processing):

    • Sentence tokenization
    • Lowercasing, contractions, punctuation, canonicalization
    • Stemming
    • Lemmatization
    • Stopword removal
  • Know how to use a Counter to count frequencies

  • Understand the notion of a word vector and how to calculate similarity between words

  • Get some hands-on practice using the above

In [16]:
nlp = spacy.load("en_core_web_sm")

Counting Frequencies¶

In [17]:
quote = "You can please some of the people all of the time, you can please all of the people some of the time, but you can't please all of the people all of the time"
In [18]:
quote = quote.replace("can't", "can not")
quote
Out[18]:
'You can please some of the people all of the time, you can please all of the people some of the time, but you can not please all of the people all of the time'
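
The replace call above handles just this one contraction. Here's a minimal sketch, assuming a small hand-written mapping (the CONTRACTIONS dict below is illustrative, not a standard list), of extending the same idea to several contractions:

In [ ]:
# A small, illustrative mapping of contractions to expansions (an assumption for this sketch).
CONTRACTIONS = {"can't": "can not", "won't": "will not", "don't": "do not"}

def expand_contractions(text):
    for short, long in CONTRACTIONS.items():
        text = text.replace(short, long)
    return text

expand_contractions("You don't get what you can't ask for")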
In [19]:
q = nlp(quote)
In [20]:
import collections

counts = collections.Counter([str(x) for x in q])
In [23]:
counts.most_common()
Out[23]:
[('of', 6),
 ('the', 6),
 ('all', 4),
 ('can', 3),
 ('please', 3),
 ('people', 3),
 ('time', 3),
 ('some', 2),
 (',', 2),
 ('you', 2),
 ('You', 1),
 ('but', 1),
 ('not', 1)]
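
Notice that 'You' and 'you' are counted separately and the comma gets its own entry. Here's a minimal sketch of applying a few of the normalization steps from the goals (lowercasing, punctuation removal, stopword removal) before counting, using spaCy's built-in token attributes:

In [ ]:
# Lowercase each token and skip punctuation and stopwords before counting.
# tok.lower_, tok.is_punct, and tok.is_stop are built-in spaCy token attributes.
normalized = [tok.lower_ for tok in q if not tok.is_punct and not tok.is_stop]
collections.Counter(normalized).most_common()

Most of the tokens in this particular quote are stopwords, so only a handful of content words survive this filtering.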

Word Vectors and Similarity¶

Language models often have a way to map a word or token to a vector that, somehow, encodes the word's meaning in a bunch of numbers. These are often called embedding vectors, and we'll be seeing more about them later in the course.

But for now, let's just take a look at one thing this enables: similarity calculations.

The small language model we've been using (en_core_web_sm) does not have word vectors built into it, so we'll reach for a larger one that does. You can still compute similarities with the _sm model, but they won't be as meaningful.
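
If en_core_web_lg isn't installed in your environment, it can usually be fetched once with spaCy's command-line download tool (shown here as a notebook shell command, commented out so it doesn't re-run every session):

In [ ]:
# One-time download of the larger English model:
# !python -m spacy download en_core_web_lg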

The small model is now in the default JupyterHub environment, but the large one is not, so for the moment I have to do this:

In [25]:
nlp = spacy.load("en_core_web_lg")
In [26]:
words = "cat dog tiger king queen castle waffle pancake" 
ans = nlp(words)

Here's the word vector for "cat":

In [30]:
ans[0].vector
Out[30]:
array([-0.15067  , -0.024468 , -0.23368  , -0.23378  , -0.18382  ,
        0.32711  , -0.22084  , -0.28777  ,  0.12759  ,  1.1656   ,
       -0.64163  , -0.098455 , -0.62397  ,  0.010431 , -0.25653  ,
        0.31799  ,  0.037779 ,  1.1904   , -0.17714  , -0.2595   ,
       -0.31461  ,  0.038825 , -0.15713  , -0.13484  ,  0.36936  ,
       -0.30562  , -0.40619  , -0.38965  ,  0.3686   ,  0.013963 ,
       -0.6895   ,  0.004066 , -0.1367   ,  0.32564  ,  0.24688  ,
       -0.14011  ,  0.53889  , -0.80441  , -0.1777   , -0.12922  ,
        0.16303  ,  0.14917  , -0.068429 , -0.33922  ,  0.18495  ,
       -0.082544 , -0.46892  ,  0.39581  , -0.13742  , -0.35132  ,
        0.22223  , -0.144    , -0.048287 ,  0.3379   , -0.31916  ,
        0.20526  ,  0.098624 , -0.23877  ,  0.045338 ,  0.43941  ,
        0.030385 , -0.013821 , -0.093273 , -0.18178  ,  0.19438  ,
       -0.3782   ,  0.70144  ,  0.16236  ,  0.0059111,  0.024898 ,
       -0.13613  , -0.11425  , -0.31598  , -0.14209  ,  0.028194 ,
        0.5419   , -0.42413  , -0.599    ,  0.24976  , -0.27003  ,
        0.14964  ,  0.29287  , -0.31281  ,  0.16543  , -0.21045  ,
       -0.4408   ,  1.2174   ,  0.51236  ,  0.56209  ,  0.14131  ,
        0.092514 ,  0.71396  , -0.021051 , -0.33704  , -0.20275  ,
       -0.36181  ,  0.22055  , -0.25665  ,  0.28425  , -0.16968  ,
        0.058029 ,  0.61182  ,  0.31576  , -0.079185 ,  0.35538  ,
       -0.51236  ,  0.4235   , -0.30033  , -0.22376  ,  0.15223  ,
       -0.048292 ,  0.23532  ,  0.46507  , -0.67579  , -0.32905  ,
        0.08446  , -0.22123  , -0.045333 ,  0.34463  , -0.1455   ,
       -0.18047  , -0.17887  ,  0.96879  , -1.0028   , -0.47343  ,
        0.28542  ,  0.56382  , -0.33211  , -0.38275  , -0.2749   ,
       -0.22955  , -0.24265  , -0.37689  ,  0.24822  ,  0.36941  ,
        0.14651  , -0.37864  ,  0.31134  , -0.28449  ,  0.36948  ,
       -2.8174   , -0.38319  , -0.022373 ,  0.56376  ,  0.40131  ,
       -0.42131  , -0.11311  , -0.17317  ,  0.1411   , -0.13194  ,
        0.18494  ,  0.097692 , -0.097341 , -0.23987  ,  0.16631  ,
       -0.28556  ,  0.0038654,  0.53292  , -0.32367  , -0.38744  ,
        0.27011  , -0.34181  , -0.27702  , -0.67279  , -0.10771  ,
       -0.062189 , -0.24783  , -0.070884 , -0.20898  ,  0.062404 ,
        0.022372 ,  0.13408  ,  0.1305   , -0.19546  , -0.46849  ,
        0.77731  , -0.043978 ,  0.3827   , -0.23376  ,  1.0457   ,
       -0.14371  , -0.3565   , -0.080713 , -0.31047  , -0.57822  ,
       -0.28067  , -0.069678 ,  0.068929 , -0.16227  , -0.63934  ,
       -0.62149  ,  0.11222  , -0.16969  , -0.54637  ,  0.49661  ,
        0.46565  ,  0.088294 , -0.48496  ,  0.69263  , -0.068977 ,
       -0.53709  ,  0.20802  , -0.42987  , -0.11921  ,  0.1174   ,
       -0.18443  ,  0.43797  , -0.1236   ,  0.3607   , -0.19608  ,
       -0.35366  ,  0.18808  , -0.5061   ,  0.14455  , -0.024368 ,
       -0.10772  , -0.0115   ,  0.58634  , -0.054461 ,  0.0076487,
       -0.056297 ,  0.27193  ,  0.23096  , -0.29296  , -0.24325  ,
        0.10317  , -0.10014  ,  0.7089   ,  0.17402  , -0.0037509,
       -0.46304  ,  0.11806  , -0.16457  , -0.38609  ,  0.14524  ,
        0.098122 , -0.12352  , -0.1047   ,  0.39047  , -0.3063   ,
       -0.65375  , -0.0044248, -0.033876 ,  0.037114 , -0.27472  ,
        0.0053147,  0.30737  ,  0.12528  , -0.19527  , -0.16461  ,
        0.087518 , -0.051107 , -0.16323  ,  0.521    ,  0.10822  ,
       -0.060379 , -0.71735  , -0.064327 ,  0.37043  , -0.41054  ,
       -0.2728   , -0.30217  ,  0.015771 , -0.43056  ,  0.35647  ,
        0.17188  , -0.54598  , -0.21541  , -0.044889 , -0.10597  ,
       -0.54391  ,  0.53908  ,  0.070938 ,  0.097839 ,  0.097908 ,
        0.17805  ,  0.18995  ,  0.49962  , -0.18529  ,  0.051234 ,
        0.019574 ,  0.24805  ,  0.3144   , -0.29304  ,  0.54235  ,
        0.46672  ,  0.26017  , -0.44705  ,  0.28287  , -0.033345 ,
       -0.33181  , -0.10902  , -0.023324 ,  0.2106   , -0.29633  ,
        0.81506  ,  0.038524 ,  0.46004  ,  0.17187  , -0.29804  ],
      dtype=float32)
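
Each of these vectors has 300 entries (the 60 rows of 5 numbers above), which we can confirm directly:

In [ ]:
ans[0].vector.shape  # (300,)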

That doesn't mean much to me, but if I use the similarity method, it will calculate how similar two words' vectors are.

Here's cat vs dog:

In [31]:
ans[0].similarity(ans[1])
Out[31]:
0.8016854524612427

And here's dog vs pancake:

In [32]:
ans[1].similarity(ans[-1])
Out[32]:
0.22979263961315155
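
These scores are, by default, just the cosine similarity between the two vectors, so we can reproduce the cat-vs-dog number above by hand with NumPy:

In [ ]:
# Cosine similarity: dot product divided by the product of the vector lengths.
v1, v2 = ans[0].vector, ans[1].vector
float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))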

Let's visualize the similarities among all eight of the words in our list:

In [33]:
N = len(ans)
sim = np.zeros((N, N))
for i in range(N):
    for j in range(N):
        sim[i,j] = ans[i].similarity(ans[j])

labels = words.split(" ")
sns.heatmap(sim, annot=True, xticklabels=labels, yticklabels=labels);
[Heatmap of pairwise similarity scores for the eight words]

Word embeddings are a really useful primitive in NLP - and in fact, modern large language models start with the following two steps:

  1. Tokenization - break the input into (roughly word-sized) tokens
  2. Embedding - convert each token into a vector.

Everything between there and the conversion back to output tokens happens on the embedding vectors.
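
As a rough illustration of those two steps using the tools we already have (spaCy's tokenizer and word vectors stand in here for an LLM's own learned tokenizer and embeddings, which work differently in detail):

In [ ]:
# Step 1: tokenize the input; Step 2: look up an embedding vector for each token.
doc = nlp("please all of the people")
[(tok.text, tok.vector.shape) for tok in doc]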