Lecture 13 - Introduction to Exploratory Data Analysis¶
Hypothesis-Driven vs Exploratory Analysis¶
- Previously: Hypothesis- or question-driven analysis (e.g., Labs 2, 4)
Hypothesis or Question -> Dataset -> Insight
- Today: Exploratory Data Analysis:
Dataset -> ?? -> Insight
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
The Dataset¶
There's some data at this URL:
data_url = "https://facultyweb.cs.wwu.edu/~wehrwes/courses/data311_25f/data/NHANES/NHANES.csv"
So you have a dataset. What now?¶
What do you want to know about a dataset before you even look at it?
Brainstorm (worksheet):
- How big is it?
- What is it called, and what is it about?
- Is it legal / ethical?
- How was it collected?
- Where did it come from? Can you trust the source?
- Who collected it? Why?
- Why are we looking at it?
- What are the data types?
- What variables are in the data?
- Long or wide?
- Is it well-formatted?
- What is the structure?
Some ideas:
- Who collected this data? When? Why?
- What was the collection methodology? If the data is collected from a sample of a population, how was sampling done?
- How big is the data?
- What do the fields (columns) mean?
This data comes from the 2017-2018 edition of the National Health and Nutrition Examination Survey (NHANES), a survey conducted by the CDC in which participants are examined for a variety of health and nutrition attributes. Many details of the who, why, and how are included on the linked webpage; see the "Examination Data" link > "Doc File".
Quoting from the website: In 2017-2018, 16,211 persons were selected for NHANES from 30 different survey locations. Of those selected, 9,254 completed the interview and 8,704 were examined.
There's lots more info on the website, but a couple points of interest: the population is noninstitutionalized civilian US residents, and some groups are oversampled, including Hispanic, Black, Asian, and low-income people, and those aged 80+.
Since the original dataset has a lot of columns, I did a little preprocessing to pare it down. You can find my preprocessing notebook here. I've selected a subset of columns related to body measurements (height, weight, arm length, etc.) and included age and gender from demographics.
To help us out when analyzing the data, I'm going to build a dictionary that maps original column names to friendlier ones. Now that we've answered our basic questions, let's also load up the data.
cols_renamed = {"SEQN": "SEQN",
                "RIAGENDR": "Gender",    # 1 = M, 2 = F
                "RIDAGEYR": "Age",       # years
                "BMXWT": "Weight",       # kg
                "BMXHT": "Height",       # cm
                "BMXLEG": "Leg",         # cm
                "BMXARML": "Arm",        # cm
                "BMXARMC": "Arm Cir",    # cm
                "BMXWAIST": "Waist Cir"} # cm
df = pd.read_csv(data_url)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8704 entries, 0 to 8703
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   SEQN      8704 non-null   float64
 1   RIAGENDR  8704 non-null   float64
 2   RIDAGEYR  8704 non-null   float64
 3   BMXWT     8580 non-null   float64
 4   BMXHT     8016 non-null   float64
 5   BMXLEG    6703 non-null   float64
 6   BMXARML   8177 non-null   float64
 7   BMXARMC   8173 non-null   float64
 8   BMXWAIST  7601 non-null   float64
dtypes: float64(9)
memory usage: 612.1 KB
df
| SEQN | RIAGENDR | RIDAGEYR | BMXWT | BMXHT | BMXLEG | BMXARML | BMXARMC | BMXWAIST | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 93703.0 | 2.0 | 2.0 | 13.7 | 88.6 | NaN | 18.0 | 16.2 | 48.2 |
| 1 | 93704.0 | 1.0 | 2.0 | 13.9 | 94.2 | NaN | 18.6 | 15.2 | 50.0 |
| 2 | 93705.0 | 2.0 | 66.0 | 79.5 | 158.3 | 37.0 | 36.0 | 32.0 | 101.8 |
| 3 | 93706.0 | 1.0 | 18.0 | 66.3 | 175.7 | 46.6 | 38.8 | 27.0 | 79.3 |
| 4 | 93707.0 | 1.0 | 13.0 | 45.4 | 158.4 | 38.1 | 33.8 | 21.5 | 64.1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8699 | 102952.0 | 2.0 | 70.0 | 49.0 | 156.5 | 34.4 | 32.6 | 25.1 | 82.2 |
| 8700 | 102953.0 | 1.0 | 42.0 | 97.4 | 164.9 | 38.2 | 36.6 | 40.6 | 114.8 |
| 8701 | 102954.0 | 2.0 | 41.0 | 69.1 | 162.6 | 39.2 | 35.2 | 26.8 | 86.4 |
| 8702 | 102955.0 | 2.0 | 14.0 | 111.9 | 156.6 | 39.2 | 35.0 | 44.5 | 113.5 |
| 8703 | 102956.0 | 1.0 | 38.0 | 111.5 | 175.8 | 42.5 | 38.0 | 40.0 | 122.0 |
8704 rows × 9 columns
Let's rename those columns:
df = df.rename(cols_renamed, axis='columns')
df = df.drop("SEQN", axis='columns') # we don't care about sequence # - it's just an ID
df
| Gender | Age | Weight | Height | Leg | Arm | Arm Cir | Waist Cir | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2.0 | 2.0 | 13.7 | 88.6 | NaN | 18.0 | 16.2 | 48.2 |
| 1 | 1.0 | 2.0 | 13.9 | 94.2 | NaN | 18.6 | 15.2 | 50.0 |
| 2 | 2.0 | 66.0 | 79.5 | 158.3 | 37.0 | 36.0 | 32.0 | 101.8 |
| 3 | 1.0 | 18.0 | 66.3 | 175.7 | 46.6 | 38.8 | 27.0 | 79.3 |
| 4 | 1.0 | 13.0 | 45.4 | 158.4 | 38.1 | 33.8 | 21.5 | 64.1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8699 | 2.0 | 70.0 | 49.0 | 156.5 | 34.4 | 32.6 | 25.1 | 82.2 |
| 8700 | 1.0 | 42.0 | 97.4 | 164.9 | 38.2 | 36.6 | 40.6 | 114.8 |
| 8701 | 2.0 | 41.0 | 69.1 | 162.6 | 39.2 | 35.2 | 26.8 | 86.4 |
| 8702 | 2.0 | 14.0 | 111.9 | 156.6 | 39.2 | 35.0 | 44.5 | 113.5 |
| 8703 | 1.0 | 38.0 | 111.5 | 175.8 | 42.5 | 38.0 | 40.0 | 122.0 |
8704 rows × 8 columns
So you loaded a dataset. What now?¶
Brainstorm:
- Histogram of the age column
- use describe() to get general information
- Organize age categories (child vs adult)
- Plot height vs leg length
- Adjust datatypes (see the sketch after this list)
  - categorical labels like gender
  - age -> integers
- Subset each gender, analyze independently
- Investigate how much data is missing
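One quick win from that brainstorm is recoding Gender into readable categorical labels. A minimal sketch, done on a copy (df_labeled is an illustrative name) so the numeric 1/2 coding assumed by the cells below stays intact:
# Map the numeric codes to labels; 1 = M, 2 = F per the codebook comment
# in the rename dictionary above.
df_labeled = df.copy()
df_labeled["Gender"] = df_labeled["Gender"].map({1.0: "Male", 2.0: "Female"}).astype("category")
df_labeled["Gender"].value_counts()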
Some ideas:
- Look at a few rows
- Compute summary statistics (of numerical columns)
- Look at the distribution of each column
- Look at pairwise scatter plots of all pairs of (numerical) columns
Look at some rows¶
Which ones?
- Maybe the first few, or a few random ones, to get a look at some arbitrary data.
- Maybe some extremes - what does the tallest person look like? Longest-armed?
# A random row (re-run the cell for a different one); df.head() shows the first few:
import random
df.iloc[random.randint(0, df.shape[0] - 1), :]
Gender       1.000000e+00
Age          5.397605e-79
Weight       8.900000e+00
Height                NaN
Leg                   NaN
Arm          1.500000e+01
Arm Cir      1.550000e+01
Waist Cir             NaN
Name: 5752, dtype: float64
# Two tallest people:
df.sort_values("Height", ascending=False).iloc[:2,:]
| Gender | Age | Weight | Height | Leg | Arm | Arm Cir | Waist Cir | |
|---|---|---|---|---|---|---|---|---|
| 2614 | 1.0 | 65.0 | 97.5 | 197.7 | 44.0 | 44.3 | 30.9 | 100.4 |
| 8247 | 1.0 | 34.0 | 89.9 | 195.8 | 46.0 | 49.9 | 32.5 | 88.2 |
# What's that in feet?
df.sort_values("Height", ascending=False).iloc[:2,:]["Height"] / 2.54 / 12
2614    6.486220
8247    6.423885
Name: Height, dtype: float64
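The worksheet also asked about the longest-armed person. idxmax returns the row label of the largest value, so that's a one-liner:
# Row with the longest upper-arm length (NaNs are skipped by default):
df.loc[df["Arm"].idxmax()]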
# How many standard deviations above the mean are those? Are they outliers?
# compute z-scores
ht = df["Height"]
ht_z = (ht - ht.mean()) / ht.std()
ht_z.sort_values(ascending=False)
2614 1.846835
8247 1.761472
8130 1.752487
3828 1.747994
2455 1.747994
...
8649 NaN
8670 NaN
8678 NaN
8685 NaN
8690 NaN
Name: Height, Length: 8704, dtype: float64
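So: are the tallest people outliers? By the rough |z| > 3 rule of thumb, no: the largest z-score is only about 1.85. It's worth checking who does get flagged by that rule; since we're pooling ages 0 to 80, we'd expect it to be the very short end (young children). A quick sketch:
# Rows whose height is more than 3 standard deviations from the mean:
flagged = df[ht_z.abs() > 3]
print(len(flagged))
print(flagged[["Age", "Height"]].head())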
Compute summary statistics (of numerical columns)¶
Tukey's 5-number summary: minimum, 25th percentile, median (50th percentile), 75th percentile, maximum
Mean and standard deviation are nice too.
# summary statistics
df.describe()
| Gender | Age | Weight | Height | Leg | Arm | Arm Cir | Waist Cir | |
|---|---|---|---|---|---|---|---|---|
| count | 8704.000000 | 8.704000e+03 | 8580.000000 | 8016.000000 | 6703.000000 | 8177.000000 | 8173.000000 | 7601.000000 |
| mean | 1.509076 | 3.443865e+01 | 65.138508 | 156.593401 | 38.643980 | 33.667996 | 29.193589 | 89.928851 |
| std | 0.499946 | 2.537904e+01 | 32.890754 | 22.257858 | 4.158013 | 7.229185 | 7.970648 | 22.805093 |
| min | 1.000000 | 5.397605e-79 | 3.200000 | 78.300000 | 24.800000 | 9.400000 | 11.200000 | 40.000000 |
| 25% | 1.000000 | 1.100000e+01 | 43.100000 | 151.400000 | 35.800000 | 32.000000 | 23.800000 | 73.900000 |
| 50% | 2.000000 | 3.100000e+01 | 67.750000 | 161.900000 | 38.800000 | 35.800000 | 30.100000 | 91.200000 |
| 75% | 2.000000 | 5.800000e+01 | 85.600000 | 171.200000 | 41.500000 | 38.400000 | 34.700000 | 105.300000 |
| max | 2.000000 | 8.000000e+01 | 242.600000 | 197.700000 | 55.000000 | 49.900000 | 56.300000 | 169.500000 |
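Note that describe gives exactly Tukey's five numbers (min, 25%, 50%, 75%, max) plus count, mean, and standard deviation. To pull just the five-number summary for a single column, quantile works directly:
# Tukey's five-number summary for Height:
df["Height"].quantile([0.0, 0.25, 0.5, 0.75, 1.0])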
Maybe take stock of missing data?¶
# how many rows and columns does the table have?
df.shape
(8704, 8)
# what percent of each column is missing?
df.isna().sum() / df.shape[0] * 100
Gender        0.000000
Age           0.000000
Weight        1.424632
Height        7.904412
Leg          22.989430
Arm           6.054688
Arm Cir       6.100643
Waist Cir    12.672335
dtype: float64
# what does the pattern of missing data look like?
plt.imshow(df.isna().iloc[:800,:].T.to_numpy(), aspect='auto', interpolation='none')
<matplotlib.image.AxesImage at 0x1542e7f515e0>
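Leg is missing far more often than anything else, and rows 0 and 1 of the table above (two-year-olds with NaN Leg) hint that some measurements may simply not be taken on young children. A quick sketch to check whether Leg missingness is concentrated at low ages:
# Compare the ages of rows with and without a Leg measurement:
print(df.loc[df["Leg"].isna(), "Age"].describe())
print(df.loc[df["Leg"].notna(), "Age"].describe())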
Look at distribution of each column¶
Histograms! I love histograms!
print(len(df.columns))
8
# make a figure with a 4-row, 2-column grid of axes:
fig, axes = plt.subplots(4, 2, figsize=(8, 16))
for col, ax in zip(df.columns, axes.flatten()):
    sns.histplot(data=df[col].values, ax=ax)
    ax.set_title(col)
Look at pairwise scatter plots of all pairs of (numerical) columns¶
sns.pairplot(data=df);
What if we consider only adults (21+)? Through childhood, nearly every body measurement grows with age, so including kids makes almost every pair of variables look strongly related.
sns.pairplot(data=df[df["Age"]>=21])
<seaborn.axisgrid.PairGrid at 0x1542e493f5f0>
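A numeric companion to the pairplot is the correlation matrix. A quick sketch for the adults-only subset (adults is an illustrative name):
# Pairwise Pearson correlations among the numeric columns, adults only:
adults = df[df["Age"] >= 21]
adults.corr(numeric_only=True).round(2)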
To-do list - did we do all of this?
- Basic meta info:
- who/what/why/when was the data collected?
- how much data is there? if too much, can you sensibly subsample?
- what are the columns and what do they mean?
- Diving into the data:
- Look at a few rows
- Summary stats; Tukey's five-number summary
- Distributions of each column
- Pairwise scatter plots