import numpy as np
import seaborn as sns
import pandas as pd
Last talks (for a little while anyway) this week:
Data Ethics 1 due tonight
numpy magic
import numpy as np
import imageio
import matplotlib.pyplot as plt
Reminder: Array-Array elementwise operations require the arrays to have the same shape (and number of dimensions).
a = np.ones((3,2))
b = np.ones((3,3)) * 2
print(a.shape)
print(b.shape)
a * b
(3, 2)
(3, 3)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-3-5b52f5b3d735> in <module>
      3 print(a.shape)
      4 print(b.shape)
----> 5 a * b

ValueError: operands could not be broadcast together with shapes (3,2) (3,3)
a
array([[1., 1.],
       [1., 1.],
       [1., 1.]])
b
array([[2., 2., 2.],
       [2., 2., 2.],
       [2., 2., 2.]])
b[:,:2]
array([[2., 2.],
       [2., 2.],
       [2., 2.]])
a * b[:,:2]
array([[2., 2.],
       [2., 2.],
       [2., 2.]])
Exception: if a corresponding dimension is 1 in one array, the values will be repeated ("broadcast") along that dimension.
c = np.ones((3, 1)) * 4
print(a)
print(c)
[[1. 1.]
 [1. 1.]
 [1. 1.]]
[[4.]
 [4.]
 [4.]]
print(a.shape)
print(c.shape)
(3, 2)
(3, 1)
print(a + c)
[[5. 5.]
 [5. 5.]
 [5. 5.]]
Missing a singleton dimension? Use np.reshape or np.newaxis.
d = np.ones((3,))*4
d
array([4., 4., 4.])
print(a.shape)
print(d.shape)
a * d
(3, 2)
(3,)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-0647a4951320> in <module>
      1 print(a.shape)
      2 print(d.shape)
----> 3 a * d

ValueError: operands could not be broadcast together with shapes (3,2) (3,)
e = d[:,np.newaxis]
e.shape
(3, 1)
f = d.reshape((3, 1))
f.shape
(3, 1)
a*e
array([[4., 4.],
       [4., 4.],
       [4., 4.]])
a*f
array([[4., 4.],
       [4., 4.],
       [4., 4.]])
beans = imageio.imread("https://facultyweb.cs.wwu.edu/~wehrwes/courses/data311_23w/data/beans_200.jpeg")
beans = beans.astype(np.float32) / 255.0
plt.imshow(beans)
[Figure: the original beans image]
Example task: Introduce a vertical "haze" gradient effect. In other words, make each row brighter by an amount that increases as you go down the image.
fade = np.linspace(0,0.4,num=200)
print(fade.shape)
print(beans.shape)
print(fade)
(200,)
(200, 200, 3)
[0.         0.00201005 0.0040201  ... 0.3959799  0.39798995 0.4       ]
fade[:,np.newaxis,np.newaxis].shape
(200, 1, 1)
plt.imshow(np.clip(beans + fade[:, np.newaxis, np.newaxis],0,1))
[Figure: beans with a vertical haze gradient, brightening toward the bottom of the image]
plt.imshow(np.clip(beans + fade[np.newaxis, :, np.newaxis], 0, 1))
[Figure: beans with a horizontal gradient instead; fade now varies along the column axis, so the image brightens left to right]
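Since each singleton dimension broadcasts independently, the two gradients can also be combined in a single expression. A sketch (not from the original notebook):
# (200,1,1) + (1,200,1) broadcasts to (200,200,1), which then broadcasts
# against the (200,200,3) image: the haze grows toward the bottom-right corner
haze = fade[:, np.newaxis, np.newaxis] + fade[np.newaxis, :, np.newaxis]
plt.imshow(np.clip(beans + haze, 0, 1))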
Let's load up the NHANES body measurement dataset.
data_url = "https://fw.cs.wwu.edu/~wehrwes/courses/data311_21f/data/NHANES/NHANES.csv"
cols_renamed = {"SEQN": "SEQN",
"RIAGENDR": "Gender", # 1 = M, 2 = F
"RIDAGEYR": "Age", # years
"BMXWT": "Weight", # kg
"BMXHT": "Height", # cm
"BMXLEG": "Leg", # cm
"BMXARML": "Arm", # cm
"BMXARMC": "Arm Cir", # cm
"BMXWAIST": "Waist Cir"} # cm
df = pd.read_csv(data_url)
df = df.rename(cols_renamed, axis='columns')
df = df.drop("SEQN", axis='columns')
df = df[df["Age"] >= 21]
In the NHANES dataset, heights and other length measurements are given in centimeters.
ht_col = df["Height"]
ht_col
2       158.3
5       150.2
6       151.1
8       170.6
10      178.6
        ...  
8697    180.1
8699    156.5
8700    164.9
8701    162.6
8703    175.8
Name: Height, Length: 5193, dtype: float64
Question: If you're 160 cm tall, are you short? Tall? Average? Answer: put the height in context by computing a $z$-score, which measures how far a value is from the mean in units of standard deviations.
To compute a $z$-score:
In math: $$ \hat{x}_i = \frac{x_i -\mu}{\sigma}$$
In pandas:
df["Height-z"] = (ht_col - ht_col.mean()) / ht_col.std()
df["Height-z"]
2      -0.787712
5      -1.589290
6      -1.500226
8       0.429499
10      1.221180
          ...   
8697    1.369621
8699   -0.965840
8700   -0.134575
8701   -0.362183
8703    0.944092
Name: Height-z, Length: 5193, dtype: float64
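To answer the question above, plug 160 cm into the same formula. A minimal sketch (the exact value depends on the sample; here it comes out to roughly -0.6, a bit below average but well within one standard deviation):
z_160 = (160 - ht_col.mean()) / ht_col.std()  # z-score for a 160 cm height
print(z_160)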
sns.histplot(x="Height-z", data=df)
[Figure: histogram of Height-z, centered at 0]
Nice properties of $z$-scores:
- mean 0 and standard deviation 1
- unitless, so values are comparable across different measurements
- the value tells you directly how many standard deviations an observation is above or below the mean
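A quick sanity check of the first property, as a sketch against the df column computed above:
# Standardized heights should have mean ~0 (up to floating-point error)
# and standard deviation exactly 1, since we divided by the same std estimate
print(df["Height-z"].mean())
print(df["Height-z"].std())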
If we need to make values non-negative, we can exponentiate: $$ \hat{x}_i = e^{x_i}$$
x = np.linspace(-5,5,num=10000)
sns.lineplot(x=x, y = np.exp(x))
[Figure: line plot of e^x for x in [-5, 5]; all values positive, growing rapidly]
Text normalization: transforming the various ways text can appear into standard or canonical forms. Often needed to convert text data into tabular data.
import pandas as pd
import spacy
Responses to the survey prompt:
Name one hobby or activity you enjoy outside of school.
hob = pd.read_csv("https://facultyweb.cs.wwu.edu/~wehrwes/courses/data311_23w/lectures/L10/hobbies.csv", header=None)
hob
|     | 0 |
|-----|---|
| 0   | I enjoy playing chess with people who are bett... |
| 1   | soccer |
| 2   | Running |
| 3   | I like to play Dungeons and Dragons |
| 4   | Building computers, cars, the WWU Racing team,... |
| ... | ... |
| 119 | I enjoy regularly going to the gym, as well as... |
| 120 | Skiing |
| 121 | Playing video games |
| 122 | I like skiing |
| 123 | Playing basketball and getting my nails painted |

124 rows × 1 columns
hob.iloc[119,0]
'I enjoy regularly going to the gym, as well as playing sports such as soccer and skiing over the winter. Ive also been playing video games since I was a child and managed to build my first PC about a year or two back, which was a hassle in itself. 3 hobbys but whatever.\xa0'
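Note the trailing \xa0 (a non-breaking space) at the end of that response. Cleaning up such characters is one small piece of text normalization; a sketch using Python's standard unicodedata module (my addition, not part of the original lecture):
import unicodedata
s = hob.iloc[119, 0]
# NFKC normalization maps compatibility characters such as the non-breaking
# space (\xa0) to their canonical equivalents; strip() then removes it
clean = unicodedata.normalize("NFKC", s).strip()
print(repr(clean[-20:]))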
nlp = spacy.load('en_core_web_sm')
ans = nlp(hob.iloc[119,0])
tok = [t for t in ans] # tokenization
print(tok)
[I, enjoy, regularly, going, to, the, gym, ,, as, well, as, playing, sports, such, as, soccer, and, skiing, over, the, winter, ., I, ve, also, been, playing, video, games, since, I, was, a, child, and, managed, to, build, my, first, PC, about, a, year, or, two, back, ,, which, was, a, hassle, in, itself, ., 3, hobbys, but, whatever, ., ]
tok = [t for t in ans if (not t.is_stop and not t.is_punct)] # stopword and punctuation removal
print(tok)
[enjoy, regularly, going, gym, playing, sports, soccer, skiing, winter, ve, playing, video, games, child, managed, build, PC, year, hassle, 3, hobbys, ]
lem = [t.lemma_ for t in tok]  # lemmatization: reduce each token to its base form
print([str(t) for t in tok])
print(lem)  # lemmas are already plain strings
pd.DataFrame({"Token" : [t for t in ans],
"Lemma" : [t.lemma_ for t in ans],
"POS" : [t.pos_ for t in ans]})
print(list(ans.noun_chunks))
Tools for text normalization?
- sed (stream editor) or tr (translate)
- spacy, nltk (support tokenizing, stemming, lemmatizing, etc.)
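spacy is demonstrated above; for comparison, here is a minimal nltk sketch of the same steps (it assumes the punkt and stopwords resources have been fetched with nltk.download, and uses a made-up sample sentence):
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# nltk.download("punkt"); nltk.download("stopwords")  # one-time setup
text = "I like skiing and playing video games"
tokens = nltk.word_tokenize(text)                       # tokenization
stops = set(stopwords.words("english"))
tokens = [t for t in tokens if t.lower() not in stops]  # stopword removal
print([PorterStemmer().stem(t) for t in tokens])        # stemming, a cruder cousin of lemmatization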