Lecture 10 - Preprocessing and Cleaning: Outliers; Numerical Normalization; Text Normalization¶

In [26]:
import numpy as np
import seaborn as sns
import pandas as pd

Announcements:¶

Last talks (for a little while anyway) this week:

  • Tue 1/31 Hanxiang Du, 4pm CF 025 - Teaching Demo
  • Thu 2/2 Czilard Vajda, 4pm CF 105 - Research (machine learning/data science/etc.; wine quality estimation?)
  • Fri 2/3 Czilard Vajda, 4pm CF 316 - Teaching Demo

Data Ethics 1 due tonight

Goals:¶

  • One more bit of numpy magic:
    • Broadcasting (basic example)
  • Be aware of, and get practice deciding how to handle, common issues that arise before analysis:
    • Data types and units
    • Missing Data
    • Outliers
  • Know how and why to compute a few different numerical normalizations:
    • $z$-scores
    • 0-1 normalization
    • Exponential normalization
  • Know the meaning and purpose of some basic text normalization operations (from natural language processing):
    • Sentence tokenization
    • Lowercasing, contractions, punctuation, canonicalization
    • Stemming
    • Lemmatization
    • Stopword removal

Outliers¶

  • An outlier is a datapoint that is significantly separated from the main body of observations/data
  • Several causes:
    • They can be actual, valid observations/measurements.
      • The heavier-tailed the distribution the data comes from, the more likely such values are to appear; a heavy-tailed distribution puts more probability on values far from the mean.
    • Data entry errors; e.g., punching in the wrong numbers
    • Fraud; e.g., tampering with the data
    • Instrument error; e.g., malfunctioning sensor
    • Imputation gone awry

Dealing with outliers¶

  • Detection?
    • Visual inspection (e.g., make a histogram or other plot)
    • Look at the min/max values, verify them
    • Flag values more than $k$ standard deviations from the mean (e.g., $k=1,2,3$); see the sketch after this list.
  • Handling?
    • Use methods that are robust to outliers (e.g., median over mean)
    • Exclude/drop them (not preferred unless they were due to errors)
      • In some cases, outliers may be the most important (e.g., earthquakes and building standards, flood mitigation)
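
A minimal sketch of the "flag values more than $k$ standard deviations from the mean" rule; the series and threshold here are illustrative assumptions, not real data:

In [ ]:
import numpy as np
import pandas as pd

# Hypothetical measurements with one obvious outlier
s = pd.Series([9.8, 10.1, 10.0, 9.9, 10.2, 42.0])

k = 2  # flag anything more than k standard deviations from the mean
z = (s - s.mean()) / s.std()
print(s[np.abs(z) > k])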

A wee bit more numpy magic¶

In [1]:
import numpy as np
import imageio
import matplotlib.pyplot as plt

Broadcasting¶

Reminder: Array-Array elementwise operations require the arrays to have the same shape (and number of dimensions).

In [3]:
a = np.ones((3,2))
b = np.ones((3,3)) * 2
print(a.shape)
print(b.shape)
a * b
(3, 2)
(3, 3)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-3-5b52f5b3d735> in <module>
      3 print(a.shape)
      4 print(b.shape)
----> 5 a * b

ValueError: operands could not be broadcast together with shapes (3,2) (3,3) 
In [5]:
a
Out[5]:
array([[1., 1.],
       [1., 1.],
       [1., 1.]])
In [6]:
b
Out[6]:
array([[2., 2., 2.],
       [2., 2., 2.],
       [2., 2., 2.]])
In [7]:
b[:,:2]
Out[7]:
array([[2., 2.],
       [2., 2.],
       [2., 2.]])
In [4]:
a * b[:,:2]
Out[4]:
array([[2., 2.],
       [2., 2.],
       [2., 2.]])

Exception: if a corresponding dimension is 1 in one array, the values will be repeated ("broadcast") along that dimension.

In [8]:
c = np.ones((3, 1)) * 4
print(a)
print(c)
[[1. 1.]
 [1. 1.]
 [1. 1.]]
[[4.]
 [4.]
 [4.]]
In [10]:
print(a.shape)
print(c.shape)
(3, 2)
(3, 1)
In [9]:
print(a + c)
[[5. 5.]
 [5. 5.]
 [5. 5.]]

Missing a singleton dimension? Use np.reshape or np.newaxis.

In [11]:
d = np.ones((3,))*4
d
Out[11]:
array([4., 4., 4.])
In [13]:
print(a.shape)
print(d.shape)
a * d
(3, 2)
(3,)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-0647a4951320> in <module>
      1 print(a.shape)
      2 print(d.shape)
----> 3 a * d

ValueError: operands could not be broadcast together with shapes (3,2) (3,) 
In [14]:
e = d[:,np.newaxis]
e.shape
Out[14]:
(3, 1)
In [15]:
f = d.reshape((3, 1))
f.shape
Out[15]:
(3, 1)
In [16]:
a*e
Out[16]:
array([[4., 4.],
       [4., 4.],
       [4., 4.]])
In [17]:
a*f
Out[17]:
array([[4., 4.],
       [4., 4.],
       [4., 4.]])
In [18]:
beans = imageio.imread("https://facultyweb.cs.wwu.edu/~wehrwes/courses/data311_23w/data/beans_200.jpeg")
beans = beans.astype(np.float32) / 255.0
plt.imshow(beans)
Out[18]:
<matplotlib.image.AxesImage at 0x7fb76ae0e3a0>

Example task: Introduce a vertical "haze" gradient effect. In other words, make each row brighter by an amount that increases as you go down the image.

In [20]:
fade = np.linspace(0,0.4,num=200)
print(fade.shape)
print(beans.shape)
print(fade)
(200,)
(200, 200, 3)
[0.         0.00201005 0.0040201  0.00603015 0.0080402  0.01005025
 ...
 0.39798995 0.4       ]
In [21]:
fade[:,np.newaxis,np.newaxis].shape
Out[21]:
(200, 1, 1)
In [23]:
plt.imshow(np.clip(beans + fade[:, np.newaxis, np.newaxis],0,1))
Out[23]:
<matplotlib.image.AxesImage at 0x7fb7690d55b0>
In [24]:
plt.imshow(np.clip(beans + fade[np.newaxis, :, np.newaxis], 0, 1))
Out[24]:
<matplotlib.image.AxesImage at 0x7fb768eec6d0>

Numerical Normalization¶

Let's load up the NHANES body measurement dataset.

In [27]:
data_url = "https://fw.cs.wwu.edu/~wehrwes/courses/data311_21f/data/NHANES/NHANES.csv"
cols_renamed = {"SEQN": "SEQN",
                "RIAGENDR": "Gender", # 1 = M, 2 = F
                "RIDAGEYR": "Age", # years
                "BMXWT": "Weight", # kg
                "BMXHT": "Height", # cm
                "BMXLEG": "Leg", # cm
                "BMXARML": "Arm", # cm
                "BMXARMC": "Arm Cir", # cm
                "BMXWAIST": "Waist Cir"} # cm

df = pd.read_csv(data_url)
df = df.rename(cols_renamed, axis='columns')
df = df.drop("SEQN", axis='columns')
df = df[df["Age"] >= 21]

In the NHANES dataset, heights and other length measurements are given in centimeters.

In [28]:
ht_col = df["Height"]
ht_col
Out[28]:
2       158.3
5       150.2
6       151.1
8       170.6
10      178.6
        ...  
8697    180.1
8699    156.5
8700    164.9
8701    162.6
8703    175.8
Name: Height, Length: 5193, dtype: float64

Question: If you're 160 cm tall, are you short? tall? average? Answer: the raw number alone can't tell you -- you need context from the rest of the population, which is exactly what normalization provides.

$z$-scores¶

To compute a $z$-score:

  1. Subtract the mean
  2. Divide by the standard deviation.

In math: $$ \hat{x}_i = \frac{x_i -\mu}{\sigma}$$

In pandas:

In [29]:
df["Height-z"] = (ht_col - ht_col.mean()) / ht_col.std()
df["Height-z"]
Out[29]:
2      -0.787712
5      -1.589290
6      -1.500226
8       0.429499
10      1.221180
          ...   
8697    1.369621
8699   -0.965840
8700   -0.134575
8701   -0.362183
8703    0.944092
Name: Height-z, Length: 5193, dtype: float64
In [32]:
sns.histplot(x="Height-z", data=df)
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb76157d520>

Nice properties of $z$-scores:

  • $\hat{x}_i < 0$, smaller than average
  • $\hat{x}_i > 0$, greater than average
  • $\hat{x}_i > 1$, more than one standard deviation above average
  • etc.
  • Can give context to how normal or anomalous a datapoint is
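
Returning to the question above: a quick sketch that puts a hypothetical 160 cm person in context using the Height column loaded earlier (for these data the result comes out a bit below zero, i.e., slightly shorter than average):

In [ ]:
# z-score of a hypothetical 160 cm person relative to this population
print((160 - ht_col.mean()) / ht_col.std())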

0-1 normalization¶

$$ \hat{x}_i = \frac{x_i - x_{min}}{x_{max}-x_{min}}$$
  • Here $x_{max}$ and $x_{min}$ are the max/min values observed in the dataset -- *or* a theoretical min or max.
  • Warning: if a new datapoint comes along and you use the same mapping, you can get values that are $<0$ or $>1$.
  • We did this with images!
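
A one-line sketch on the NHANES height column from above, using the observed min/max (the new column name is an assumption):

In [ ]:
# 0-1 normalize Height using the observed min and max
df["Height-01"] = (ht_col - ht_col.min()) / (ht_col.max() - ht_col.min())
df["Height-01"].describe()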

Exponentiation¶

If we need to make values non-negative, we can exponentiate: $$ \hat{x}_i = e^{x_i}$$

  • $x_i \to -\infty$, normalized value approaches 0
  • $x_i \to \infty$, normalized value gets large quickly!
In [33]:
x = np.linspace(-5,5,num=10000)

sns.lineplot(x=x, y = np.exp(x))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb76136b400>

Text Normalization¶

Text normalization: transforming the various ways text can appear into standard or canonical forms. Often needed to convert text data into tabular data.

  • Sentence tokenization: breaking up paragraphs (or larger) into sentences.
    • A naive approach (e.g., splitting on periods) struggles with sentences like "Dr. Wehrwein doesn't work for the F.B.I." -- see the sketch after this list.
  • Word tokenization: break into word-ish pieces
  • Lowercasing
    • For many applications, the distinction between "Scott" and "scott" doesn't matter.
    • *Brainstorm: what is one example or situation where the case of the word matters? What is one where it doesn't?*
      • hope vs Hope, joy vs Joy
      • mb vs MB vs Mb
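
Here's a quick sketch of how the tricky sentence above fares with spaCy's sentence tokenizer (using the same en_core_web_sm model loaded later in the lecture; the second sentence is an illustrative addition):

In [ ]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("Dr. Wehrwein doesn't work for the F.B.I. He teaches data science.")
# A naive split on "." would break after "Dr." and "F.B.I.";
# spaCy uses its trained pipeline to decide sentence boundaries instead.
for sent in doc.sents:
    print(sent.text)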
In [34]:
import pandas as pd
import spacy

Responses to the survey prompt:

Name one hobby or activity you enjoy outside of school.

In [35]:
hob = pd.read_csv("https://facultyweb.cs.wwu.edu/~wehrwes/courses/data311_23w/lectures/L10/hobbies.csv", header=None)
hob
Out[35]:
0
0 I enjoy playing chess with people who are bett...
1 soccer
2 Running
3 I like to play Dungeons and Dragons
4 Building computers, cars, the WWU Racing team,...
... ...
119 I enjoy regularly going to the gym, as well as...
120 Skiing
121 Playing video games
122 I like skiing
123 Playing basketball and getting my nails painted

124 rows × 1 columns

In [36]:
hob.iloc[119,0]
Out[36]:
'I enjoy regularly going to the gym, as well as playing sports such as soccer and skiing over the winter. Ive also been playing video games since I was a child and managed to build my first PC about a year or two back, which was a hassle in itself. 3 hobbys but whatever.\xa0'
  • Expand contractions; instead of tokenizing, you could preprocess these away
    • a tokenizer might split "isn't" into isn | ' | t
    • Could instead expand it to "is not"
  • Canonicalize language variants; e.g., color vs colour
  • Converting numerical representation of words to numbers
    • "two and a half" -> 2.5
    • four million" -> 4,000,000 or 4000000
  • Stripping accents or unicode characters.
    • E.g., résumé to resume.
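
A minimal regex-based sketch covering a couple of these normalizations (the lookup table and sentence are toy assumptions; real normalizers need far more entries and care):

In [ ]:
import re

text = "It isn't colour-coded; they're two and a half metres apart."
# Toy replacement table: contractions and spelling variants
replacements = {"isn't": "is not", "they're": "they are",
                "colour": "color", "metres": "meters"}
for old, new in replacements.items():
    text = re.sub(re.escape(old), new, text)
print(text)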
In [37]:
nlp = spacy.load('en_core_web_sm')
ans = nlp(hob.iloc[119,0])
tok = [t for t in ans] # tokenization
print(tok)
[I, enjoy, regularly, going, to, the, gym, ,, as, well, as, playing, sports, such, as, soccer, and, skiing, over, the, winter, ., I, ve, also, been, playing, video, games, since, I, was, a, child, and, managed, to, build, my, first, PC, about, a, year, or, two, back, ,, which, was, a, hassle, in, itself, ., 3, hobbys, but, whatever, .,  ]
  • Stripping punctuation
    • If it isn't important for your task, could strip all punctuation out.
    • Beware of side effects; e.g., 192.168.1.1 -> 19216811.
  • Removing stopwords
    • Stopwords: common function words like "to", "in", "the"
    • For some tasks they aren't important or relevant
      • E.g., topic detection
In [38]:
tok = [t for t in ans if (not t.is_stop and not t.is_punct)] # stopword and punctuation removal
print(tok)
[enjoy, regularly, going, gym, playing, sports, soccer, skiing, winter, ve, playing, video, games, child, managed, build, PC, year, hassle, 3, hobbys,  ]
  • Stemming: convert words to a word stem (even if the stem itself isn't a valid word); see the nltk sketch below.
    • E.g., argue, argued, argues, arguing are all replaced by argu.
    • Works without knowing the part of speech.
  • Lemmatization: like stemming, but attempts to infer part of speech and use custom rules based on part of speech.
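
spaCy doesn't ship a stemmer, so here's a sketch of the classic argue example using nltk's PorterStemmer (nltk is an extra dependency, an assumption here):

In [ ]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
# All four forms collapse to the non-word stem "argu"
print([ps.stem(w) for w in ["argue", "argued", "argues", "arguing"]])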
In [ ]:
lem = [t.lemma_ for t in tok] # lemmatization
print([str(t) for t in tok])
print([str(t) for t in lem])
  • Part-of-speech tagging
In [ ]:
pd.DataFrame({"Token" : [t for t in ans],
              "Lemma" : [t.lemma_ for t in ans], 
              "POS"   : [t.pos_ for t in ans]})
  • Noun phrase parsing
In [ ]:
print(list(ans.noun_chunks))
  • Named entity recognition (see the sketch after this list)
  • Sentiment analysis
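
A quick sketch of named entity recognition using the nlp pipeline loaded above (the sentence is made up; labels like PERSON and ORG come from the model):

In [ ]:
doc = nlp("Scott Wehrwein teaches data science at Western Washington University.")
# Each entity is a text span with a predicted label (PERSON, ORG, GPE, ...)
print([(ent.text, ent.label_) for ent in doc.ents])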

Tools for text normalization?

  • Python regular expressions (e.g., find and replace; see the sketch below)
  • Linux command-line tools sed (stream editor) and tr (translate)
  • NLP toolkits; e.g., spacy, nltk (support tokenizing, stemming, lemmatizing, etc.)
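
For example, a tiny find-and-replace with Python's re module, cleaning up whitespace like the trailing \xa0 we saw in the hobbies data (the string is illustrative):

In [ ]:
import re

s = "3 hobbys  but\twhatever.\xa0"
# \s matches Unicode whitespace, including the non-breaking space \xa0
print(re.sub(r"\s+", " ", s).strip())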