Lecture 9 - Preprocessing and Cleaning: Text Normalization and Natural Language Processing¶

In [ ]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

Announcements:¶

  • Data Ethics 2 - next Wednesday, apparently! I'll get this out Soon.

Goals:¶

  • Know the meaning and purpose of some basic text normalization operations (from natural language processing):
    • Sentence tokenization
    • Lowercasing, contractions, punctuation, canonicalization
    • Stemming
    • Lemmatization
    • Stopword removal
  • Get some hands-on practice using the above

Start of Quarter Survey¶

Responses to the survey prompt (from Fall, when I had a bigger dataset):

Name one hobby or activity you enjoy outside of school.

In [94]:
hob = pd.read_csv("https://facultyweb.cs.wwu.edu/~wehrwes/courses/data311_25f/lectures/L09/hobbies.csv", header=None)
hob
Out[94]:
0
0 I love riding my bike, recently I have been en...
1 I like hanging out with friends by going on wa...
2 I love reading eastern fantasy/cultivation nov...
3 Hiking!
4 I love singing! It's technically in-school, bu...
5 I like video games
6 I manage a home media server in my downtime, i...
7 Gaming
8 I enjoy long distance running! I've been doing...
9 Rock climbing
10 Volleyball
11 playing bass guitar
12 I like to play the guitar and to cook
13 Video Games
14 I enjoy biking and hiking outdoors, and readin...
15 Basketball
16 Mountain biking
17 Reading
18 Archery. Painting. Reading.
19 I really love baseball. My parents and I watch...
20 Reading
21 Hiking
22 rock climbing
23 Golf and Swiming
24 Lifting!
25 I enjoy working out, hanging out with friends,...

What I'd like to do:¶

In [95]:
hob[0].plot.hist()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[95], line 1
----> 1 hob[0].plot.hist()

File /opt/miniforge/lib/python3.12/site-packages/pandas/plotting/_core.py:1694, in PlotAccessor.hist(self, by, bins, **kwargs)
   1639 def hist(
   1640     self, by: IndexLabel | None = None, bins: int = 10, **kwargs
   1641 ) -> PlotAccessor:
   1642     """
   1643     Draw one histogram of the DataFrame's columns.
   1644 
   (...)   1692         >>> ax = df.plot.hist(column=["age"], by="gender", figsize=(10, 8))
   1693     """
-> 1694     return self(kind="hist", by=by, bins=bins, **kwargs)

File /opt/miniforge/lib/python3.12/site-packages/pandas/plotting/_core.py:1185, in PlotAccessor.__call__(self, *args, **kwargs)
   1182             label_name = label_kw or data.columns
   1183             data.columns = label_name
-> 1185 return plot_backend.plot(data, kind=kind, **kwargs)

File /opt/miniforge/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__init__.py:71, in plot(data, kind, **kwargs)
     69         kwargs["ax"] = getattr(ax, "left_ax", ax)
     70 plot_obj = PLOT_CLASSES[kind](data, **kwargs)
---> 71 plot_obj.generate()
     72 plt.draw_if_interactive()
     73 return plot_obj.result

File /opt/miniforge/lib/python3.12/site-packages/pandas/plotting/_matplotlib/core.py:516, in MPLPlot.generate(self)
    514 @final
    515 def generate(self) -> None:
--> 516     self._compute_plot_data()
    517     fig = self.fig
    518     self._make_plot(fig)

File /opt/miniforge/lib/python3.12/site-packages/pandas/plotting/_matplotlib/core.py:716, in MPLPlot._compute_plot_data(self)
    714 # no non-numeric frames or series allowed
    715 if is_empty:
--> 716     raise TypeError("no numeric data to plot")
    718 self.data = numeric_data.apply(type(self)._convert_to_ndarray)

TypeError: no numeric data to plot

Text Normalization¶

Text normalization: transforming text into standard or canonical forms.

Often needed to convert text data into tabular data.

Tools for text normalization?

  • Built-in Python string processing functions (e.g., strip, lower)
  • Python regular expressions (e.g., find and replace)
  • Linux command-line tools sed (stream editor) and tr (translate)
  • NLP toolkits; e.g., spacy, nltk (support tokenizing, stemming, lemmatizing, etc.)

As a rule, natural language presents many challenges for seemingly simple tasks.

String functions - quick demo:

In [96]:
hours_responses = [
    "8",
    "6.5",
    "12 hours",
    "5h",
    " 4 "
]

import re

def normalize(s):
    # grab the first run of digits (note: "6.5" loses its fractional part)
    m = re.search(r"\d+", s)
    return m[0]

[normalize(s) for s in hours_responses]
Out[96]:
['8', '6', '12', '5', '4']
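One quirk of the pattern above: `re.search(r"\d+", "6.5")` matches only the "6", dropping the fractional part. A small variant (my own sketch; `normalize_decimal` is a made-up name) that keeps decimals:

```python
import re

def normalize_decimal(s):
    # Match digits with an optional fractional part, so "6.5" survives intact
    m = re.search(r"\d+(?:\.\d+)?", s)
    return float(m[0])

print([normalize_decimal(s) for s in ["8", "6.5", "12 hours", "5h", " 4 "]])
# -> [8.0, 6.5, 12.0, 5.0, 4.0]
```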
In [97]:
words = hob.iloc[-1].item().split()
print("\n".join(words))
I
enjoy
working
out,
hanging
out
with
friends,
and
walking.
In [98]:
words = [w.strip("., ").lower() for w in words]
print("\n".join(words))
i
enjoy
working
out
hanging
out
with
friends
and
walking

Tokenization¶

Roughly defined: splitting a string into linguistically meaningful pieces.

For each of the following, think of an example piece of text where the naive approach does not give the desired result.

Word tokenization: break into word-ish pieces

  • Naive: split on spaces. str.split(' ')
In [ ]:
 
  • Failure cases:
    • " The paragraph starts here."
    • "The paragraphstarts here."
    • "The paragraph starts here---yet it's not over yet!"
    • "I like rock-climbing."
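A quick sketch (mine, not from the lecture) of how naive space-splitting behaves on a couple of the failure cases above:

```python
# Leading whitespace yields an empty-string "token"
naive = " The paragraph starts here.".split(' ')
print(naive)

# Punctuation and dashes stay glued to their neighbors
hyphens = "The paragraph starts here---yet it's not over yet!".split(' ')
print(hyphens)  # "here---yet" and "yet!" each come out as one token
```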
  • Sentence tokenization: breaking up paragraphs (or larger) into sentences.
    • Naive: split on periods. str.split('. ')
In [ ]:
 
  • Failure cases:
    • "Who thought this was a good idea? I didn't."
    • "Oh no..."
    • "Mr. Rogers is the coolest."
    • "J.R.R. Tolkien wrote some stuff."
    • The sentence ends with "a quote."
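Sketching the naive sentence splitter on the abbreviation cases (again my own demo):

```python
text = "Mr. Rogers is the coolest. J.R.R. Tolkien wrote some stuff."

# Splitting on ". " treats every period+space as a sentence boundary
pieces = text.split('. ')
print(pieces)
# "Mr" and "J.R.R" get chopped off as if they ended sentences
```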
  • Lowercasing
    • Naive: str.lower()
In [ ]:
 
  • Failure cases:
    • Joy vs joy, River vs river
    • "I have a 20 MBps internet connection."
    • lol vs LOL
    • PIN vs pin
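A quick demo (my own) of the information that str.lower() throws away:

```python
# Case-folding collapses distinctions the original text carried
name = "River Phoenix waded into the river.".lower()
print(name)                         # "River" (name) and "river" (noun) now match
print("My PIN is secret.".lower())  # the acronym PIN reads as the word "pin"
print("LOL".lower() == "lol")       # shouting vs. mild amusement: lost
```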

Other Text Normalization Operations¶

  • Expand contractions; instead of tokenizing, you could preprocess these away
    • the tokenizer expands "doesn't" to "does", "n't"
    • Could instead first expand it to "does not"
  • Canonicalize language variants; e.g., color vs colour
  • Converting numerical representation of words to numbers
    • "two and a half" -> 2.5
    • "four million" -> 4,000,000 or 4000000
  • Stripping accents or unicode characters
    • E.g., résumé to resume.
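Accent stripping can be done with the standard library alone. A minimal sketch (strip_accents is my own helper name): decompose each accented character into a base character plus combining marks, then drop the marks.

```python
import unicodedata

def strip_accents(s):
    # NFKD decomposition separates "é" into "e" + a combining acute accent;
    # dropping the combining marks leaves plain ASCII-ish text
    return "".join(c for c in unicodedata.normalize("NFKD", s)
                   if not unicodedata.combining(c))

print(strip_accents("résumé"))  # -> resume
```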

Tokenization - Example:¶

In [100]:
import spacy

nlp = spacy.load('en_core_web_sm')
In [103]:
text = "Dr. Wehrwein doesn't work for the F.B.I. His résumé wouldn't qualify him for such a job."

ans = nlp(text)
list(ans)
Out[103]:
[Dr.,
 Wehrwein,
 does,
 n't,
 work,
 for,
 the,
 F.B.I.,
 His,
 résumé,
 would,
 n't,
 qualify,
 him,
 for,
 such,
 a,
 job,
 .]
In [104]:
list(ans.sents) # sentence tokenization
Out[104]:
[Dr. Wehrwein doesn't work for the F.B.I.,
 His résumé wouldn't qualify him for such a job.]
  • Stripping punctuation
    • If it isn't important for your task, could strip all punctuation out.
    • Beware of side effects; e.g., 192.168.1.1 -> 19216811.
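Here's a small demo (mine) of that side effect, stripping punctuation with str.translate:

```python
import string

# Blindly deleting every punctuation character mangles the IP address
naive = "Localhost is 127.0.0.1.".translate(
    str.maketrans('', '', string.punctuation))
print(naive)  # -> "Localhost is 127001"
```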
In [105]:
localhost = "Localhost is 127.0.0.1, whereas your home router is traditionally configured to be 192.168.0.1."
ans = nlp(localhost)
list(ans)
Out[105]:
[Localhost,
 is,
 127.0.0.1,
 ,,
 whereas,
 your,
 home,
 router,
 is,
 traditionally,
 configured,
 to,
 be,
 192.168.0.1,
 .]
In [108]:
tok = list(ans)
tok[-1].is_punct
Out[108]:
True
  • Removing stopwords
    • Stopwords: common function words like "to" "in" "the"
    • For some tasks they aren't important or relevant
      • E.g., topic detection
In [109]:
tok = [t for t in ans if (not t.is_stop and not t.is_punct)]
tok
Out[109]:
[Localhost, 127.0.0.1, home, router, traditionally, configured, 192.168.0.1]
In [ ]:
 

Stemming¶

Convert words to word stem (even if the stem itself isn't a whole word).

  • E.g., argue, argued, argues, arguing all replaced by argu.
  • Works without knowing the part of speech.
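As a toy illustration (my own, not a real stemmer -- Porter's algorithm, available in nltk, uses a much more careful rule cascade), crude suffix-stripping already reproduces the argue example:

```python
def toy_stem(word):
    # Strip the first matching suffix, keeping at least 3 characters of stem
    for suffix in ("ing", "ed", "es", "e", "s"):
        if word.endswith(suffix) and len(word) - len(suffix) >= 3:
            return word[: -len(suffix)]
    return word

for w in ["argue", "argued", "argues", "arguing"]:
    print(w, "->", toy_stem(w))
# all four reduce to the stem "argu"
```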

Lemmatization¶

Like stemming, but attempts to infer part of speech and use custom rules based on part of speech.

Part-of-speech tagging¶

In [127]:
hobby = hob.iloc[0,0]
hobby
Out[127]:
'I love riding my bike, recently I have been enjoying riding my dirt jumper (type of bike made for dirt jumps) to the bike park at the Civic sport complex area.\xa0'
In [128]:
ans = nlp(hobby)
tok = [t for t in ans if (not t.is_stop and not t.is_punct)]
pd.DataFrame({"Token" : [t for t in tok],
              "Lemma" : [t.lemma_ for t in tok], 
              "POS"   : [t.pos_ for t in tok]})
Out[128]:
Token Lemma POS
0 love love VERB
1 riding rid VERB
2 bike bike NOUN
3 recently recently ADV
4 enjoying enjoy VERB
5 riding rid VERB
6 dirt dirt NOUN
7 jumper jumper NOUN
8 type type NOUN
9 bike bike NOUN
10 dirt dirt NOUN
11 jumps jump NOUN
12 bike bike NOUN
13 park park NOUN
14 Civic Civic PROPN
15 sport sport NOUN
16 complex complex ADJ
17 area area NOUN
18 SPACE
In [123]:
text = "I got rid of my shoes and went riding."
riding = nlp(text)
riding[2].lemma_
Out[123]:
'rid'
In [124]:
riding[-2].lemma_
Out[124]:
'rid'
In [125]:
riding[-2] == riding[2]
Out[125]:
False

Noun phrase parsing¶

In [129]:
list(ans.noun_chunks)
Out[129]:
[I,
 my bike,
 I,
 my dirt jumper,
 type,
 bike,
 dirt jumps,
 the bike park,
 the Civic sport complex area]

Named entity recognition¶

In [130]:
ans.ents
Out[130]:
(Civic,)
In [131]:
nlp("Jude Law visited New York City. Air Force One happened to be parked at JFK.").ents
Out[131]:
(Jude Law, New York City, Air Force One, JFK)
In [132]:
nlp("Jude Law visited New York City. Air Force One happened to be parked at JFK.".lower()).ents
Out[132]:
(new york city, air force one, jfk)
In [133]:
nlp("Big Bird visited New York City. Air Force One happened to be parked at JFK.".lower()).ents
Out[133]:
(big bird, new york city, air force one, jfk)

Sentiment analysis¶

In [134]:
from spacytextblob.spacytextblob import SpacyTextBlob
nlp.add_pipe("spacytextblob")
Out[134]:
<spacytextblob.spacytextblob.SpacyTextBlob at 0x14a034eebe30>
In [135]:
yay = nlp("Today is a good day.")
boo = nlp("I'm feeling sad.")

print(yay._.blob.polarity)
print(boo._.blob.polarity)
0.7
-0.5
In [136]:
hob["Polarity"] = hob[0].apply(lambda x: nlp(x)._.blob.polarity)

sns.displot(data=hob, x="Polarity");
[displot: histogram of polarity scores for the hobby responses]
In [137]:
hob
Out[137]:
0 Polarity
0 I love riding my bike, recently I have been en... 0.175000
1 I like hanging out with friends by going on wa... 0.000000
2 I love reading eastern fantasy/cultivation nov... 0.500000
3 Hiking! 0.000000
4 I love singing! It's technically in-school, bu... 0.312500
5 I like video games 0.000000
6 I manage a home media server in my downtime, i... 0.500000
7 Gaming 0.000000
8 I enjoy long distance running! I've been doing... 0.171875
9 Rock climbing 0.000000
10 Volleyball 0.000000
11 playing bass guitar -0.150000
12 I like to play the guitar and to cook 0.000000
13 Video Games 0.000000
14 I enjoy biking and hiking outdoors, and readin... 0.400000
15 Basketball 0.000000
16 Mountain biking 0.000000
17 Reading 0.000000
18 Archery. Painting. Reading. 0.000000
19 I really love baseball. My parents and I watch... 0.262500
20 Reading 0.000000
21 Hiking 0.000000
22 rock climbing 0.000000
23 Golf and Swiming 0.000000
24 Lifting! 0.000000
25 I enjoy working out, hanging out with friends,... 0.400000
In [ ]: