Lecture 12 - Introduction to Exploratory Data Analysis¶

Goals:¶

  • Know some strategies for approaching a new dataset:
    • Know what basic (meta-)information you should find out before looking at the data itself.
    • Have a toolbox of first steps in summarizing, visualizing, and observing interesting or odd features about the data.

Exploratory Analysis¶

  • Previously (e.g., weather data in lab 2): Hypothesis-driven analysis
    Hypothesis -> Dataset -> Insight
  • Today: Exploratory Data Analysis:
    Dataset -> ?? -> Insight

The Dataset¶

In [1]:
import pandas as pd

Demo Dataset¶

There's some data at this url:

In [2]:
data_url = "https://facultyweb.cs.wwu.edu/~wehrwes/courses/data311_23w/data/NHANES/NHANES.csv"

So you have a dataset. What now?¶

What do you want to know about a dataset before you even look at it? Brainstorm:

  • What are the units of measurement?

  • Are all the units the same?

  • What is the dataset about?

  • Why was it collected?

  • Where did it come from?

  • What data types are in it?

  • When was it collected?

  • How was it collected?

  • How reliable is it?

  • Does it have a name?

  • Is there documentation?

  • How big is it?

  • Has it been cleaned or preprocessed in any way?

  • How "clean" is it?

  • How long do I have to analyze it?

  • What are the column names / what do the columns mean?

  • Is there missing data?

  • What do the first five lines look like?

Some ideas:

  • Who collected this data? When? Why?

This data originates from the 2017-2018 edition of the National Health and Nutrition Examination Survey (NHANES), a CDC survey whose participants are interviewed and examined for a wide range of health and nutrition attributes. Many details of the who, why, and how are included on the linked webpage.

  • What was the collection methodology? If the data is collected from a sample of a population, how was sampling done?

Again, lots of info on the website, but a couple points of interest: the population is noninstitutionalized civilian US residents, and some groups are oversampled, including Hispanic, Black, and Asian persons, low-income persons, and those aged 80+. Because of the oversampling, raw summary statistics from the sample won't exactly reflect the overall US population unless survey weights are applied.

  • How big is the data?

Quoting from the website: "In 2017-2018, 16,211 persons were selected for NHANES from 30 different survey locations. Of those selected, 9,254 completed the interview and 8,704 were examined."

  • What do the fields (columns) mean?

See the "Examination Data" link > "Doc File".

Since the original dataset has a lot of columns, I did a little preprocessing to pare it down. You can find my preprocessing notebook here. I've selected a subset of columns related to body measurements (height, weight, arm length, etc.) and included age and gender from demographics.
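
If you're curious what that paring-down involves, here's a minimal sketch, assuming the 2017-2018 NHANES file names BMX_J.XPT and DEMO_J.XPT (the linked preprocessing notebook has the real details; NHANES distributes its tables as SAS transport files, which pandas can read directly):

In [ ]:
import pandas as pd

# Assumed file names for the 2017-2018 body-measures and demographics tables
body = pd.read_sas("BMX_J.XPT")   # body measurements
demo = pd.read_sas("DEMO_J.XPT")  # demographics (gender, age, ...)

# Keep only the columns of interest, joined on the respondent ID (SEQN)
body_cols = ["SEQN", "BMXWT", "BMXHT", "BMXLEG", "BMXARML", "BMXARMC", "BMXWAIST"]
subset = demo[["SEQN", "RIAGENDR", "RIDAGEYR"]].merge(body[body_cols], on="SEQN")
subset.to_csv("NHANES.csv", index=False)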

To help us out when analyzing the data, I'm going to build a dictionary that maps original column names to friendlier ones. Now that we've answered our basic questions, let's also load up the data.

In [3]:
cols_renamed = {"SEQN": "SEQN",
                "RIAGENDR": "Gender", # 1 = M, 2 = F
                "RIDAGEYR": "Age", # years
                "BMXWT": "Weight", # kg
                "BMXHT": "Height", # cm
                "BMXLEG": "Leg", # cm
                "BMXARML": "Arm", # cm
                "BMXARMC": "Arm Cir", # cm
                "BMXWAIST": "Waist Cir"} # cm

df = pd.read_csv(data_url)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8704 entries, 0 to 8703
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEQN      8704 non-null   float64
 1   RIAGENDR  8704 non-null   float64
 2   RIDAGEYR  8704 non-null   float64
 3   BMXWT     8580 non-null   float64
 4   BMXHT     8016 non-null   float64
 5   BMXLEG    6703 non-null   float64
 6   BMXARML   8177 non-null   float64
 7   BMXARMC   8173 non-null   float64
 8   BMXWAIST  7601 non-null   float64
dtypes: float64(9)
memory usage: 612.1 KB
In [4]:
df
Out[4]:
SEQN RIAGENDR RIDAGEYR BMXWT BMXHT BMXLEG BMXARML BMXARMC BMXWAIST
0 93703.0 2.0 2.0 13.7 88.6 NaN 18.0 16.2 48.2
1 93704.0 1.0 2.0 13.9 94.2 NaN 18.6 15.2 50.0
2 93705.0 2.0 66.0 79.5 158.3 37.0 36.0 32.0 101.8
3 93706.0 1.0 18.0 66.3 175.7 46.6 38.8 27.0 79.3
4 93707.0 1.0 13.0 45.4 158.4 38.1 33.8 21.5 64.1
... ... ... ... ... ... ... ... ... ...
8699 102952.0 2.0 70.0 49.0 156.5 34.4 32.6 25.1 82.2
8700 102953.0 1.0 42.0 97.4 164.9 38.2 36.6 40.6 114.8
8701 102954.0 2.0 41.0 69.1 162.6 39.2 35.2 26.8 86.4
8702 102955.0 2.0 14.0 111.9 156.6 39.2 35.0 44.5 113.5
8703 102956.0 1.0 38.0 111.5 175.8 42.5 38.0 40.0 122.0

8704 rows × 9 columns

Let's rename those columns:

In [5]:
df = df.rename(cols_renamed, axis='columns')
df = df.drop("SEQN", axis='columns') # we don't care about sequence # - it's just an ID
df
Out[5]:
Gender Age Weight Height Leg Arm Arm Cir Waist Cir
0 2.0 2.0 13.7 88.6 NaN 18.0 16.2 48.2
1 1.0 2.0 13.9 94.2 NaN 18.6 15.2 50.0
2 2.0 66.0 79.5 158.3 37.0 36.0 32.0 101.8
3 1.0 18.0 66.3 175.7 46.6 38.8 27.0 79.3
4 1.0 13.0 45.4 158.4 38.1 33.8 21.5 64.1
... ... ... ... ... ... ... ... ...
8699 2.0 70.0 49.0 156.5 34.4 32.6 25.1 82.2
8700 1.0 42.0 97.4 164.9 38.2 36.6 40.6 114.8
8701 2.0 41.0 69.1 162.6 39.2 35.2 26.8 86.4
8702 2.0 14.0 111.9 156.6 39.2 35.0 44.5 113.5
8703 1.0 38.0 111.5 175.8 42.5 38.0 40.0 122.0

8704 rows × 8 columns
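
Note that Gender is still a numeric code (1 = M, 2 = F, per the codebook). We'll leave it that way so it shows up in the numerical summaries below, but if you want explicit labels, a sketch like this (not applied to df here) would do it:

In [ ]:
# Map the numeric codes to labels and store the result as a categorical
gender_labels = df["Gender"].map({1.0: "M", 2.0: "F"}).astype("category")
gender_labels.value_counts()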

So you loaded a dataset. What now?¶

Brainstorm:

  • Plot things
  • Assess missing data (clean or fill or drop or something?)
  • Summary statistics
  • Look at what's numerical vs categorical
  • Look for correlations / relationships among variables

Some ideas:

  • look at a few rows
  • compute summary statistics (of numerical columns)
  • Look at distribution of each column
  • Look at pairwise scatter plots of all pairs of (numerical) columns

Look at some rows¶

Which ones?

  • Maybe the first few, or a few random ones, to get a look at some arbitrary data.
  • Maybe some extremes - what does the tallest person look like? Longest-armed?
In [8]:
# First few rows:
df.iloc[:5]
Out[8]:
Gender Age Weight Height Leg Arm Arm Cir Waist Cir
0 2.0 2.0 13.7 88.6 NaN 18.0 16.2 48.2
1 1.0 2.0 13.9 94.2 NaN 18.6 15.2 50.0
2 2.0 66.0 79.5 158.3 37.0 36.0 32.0 101.8
3 1.0 18.0 66.3 175.7 46.6 38.8 27.0 79.3
4 1.0 13.0 45.4 158.4 38.1 33.8 21.5 64.1
In [9]:
# Two tallest people:
df.sort_values(by="Height", ascending=False).iloc[:2]
Out[9]:
Gender Age Weight Height Leg Arm Arm Cir Waist Cir
2614 1.0 65.0 97.5 197.7 44.0 44.3 30.9 100.4
8247 1.0 34.0 89.9 195.8 46.0 49.9 32.5 88.2
In [10]:
# What's that in feet? (cm -> inches -> feet: divide by 2.54, then by 12)
df.sort_values(by="Height", ascending=False).iloc[:2]["Height"] / 2.54 / 12
Out[10]:
2614    6.486220
8247    6.423885
Name: Height, dtype: float64
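
If you just want the extremes, nlargest / nsmallest are handy shortcuts equivalent to sorting and taking the head (missing values are excluded):

In [ ]:
df.nlargest(2, "Height")   # two tallest
df.nsmallest(2, "Height")  # two shortest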

Compute summary statistics (of numerical columns)¶

In [11]:
df.describe()
Out[11]:
Gender Age Weight Height Leg Arm Arm Cir Waist Cir
count 8704.000000 8.704000e+03 8580.000000 8016.000000 6703.000000 8177.000000 8173.000000 7601.000000
mean 1.509076 3.443865e+01 65.138508 156.593401 38.643980 33.667996 29.193589 89.928851
std 0.499946 2.537904e+01 32.890754 22.257858 4.158013 7.229185 7.970648 22.805093
min 1.000000 5.397605e-79 3.200000 78.300000 24.800000 9.400000 11.200000 40.000000
25% 1.000000 1.100000e+01 43.100000 151.400000 35.800000 32.000000 23.800000 73.900000
50% 2.000000 3.100000e+01 67.750000 161.900000 38.800000 35.800000 30.100000 91.200000
75% 2.000000 5.800000e+01 85.600000 171.200000 41.500000 38.400000 34.700000 105.300000
max 2.000000 8.000000e+01 242.600000 197.700000 55.000000 49.900000 56.300000 169.500000
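
describe() pools everyone together; a natural follow-up is to split summaries by a grouping variable. For example, summarizing Height by the gender code (1 = M, 2 = F):

In [ ]:
# Count, mean, std, and five-number summary of Height, per gender code
df.groupby("Gender")["Height"].describe()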

Maybe take stock of missing data?¶

In [16]:
len(df) # how many rows are there?
Out[16]:
8704
In [17]:
df.isna().sum() / len(df) * 100 # what percent of each column is missing?
Out[17]:
Gender        0.000000
Age           0.000000
Weight        1.424632
Height        7.904412
Leg          22.989430
Arm           6.054688
Arm Cir       6.100643
Waist Cir    12.672335
dtype: float64
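
What to do about missing values depends on the analysis. Two common options, sketched here without applying either to df:

In [ ]:
# Option 1: keep only rows with no missing values at all
complete = df.dropna()

# Option 2: drop rows only where a specific column of interest is missing
has_height = df.dropna(subset=["Height"])

print(len(df), len(complete), len(has_height))
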
In [19]:
import matplotlib.pyplot as plt
# Visualize the missingness pattern: one image row per column, one pixel per
# observation (first 800 rows); bright pixels mark missing values.
plt.imshow(df.isna().iloc[:800,:].T.to_numpy(), aspect='auto', interpolation='none')
Out[19]:
<matplotlib.image.AxesImage at 0x7f1377aeb520>

Look at distribution of each column¶

Histograms! I love histograms!

In [20]:
print(len(df.columns))
8
In [21]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# One histogram per column, in a 4x2 grid
fig, axes = plt.subplots(4, 2, figsize=(12, 24))

for col, ax in zip(df.columns, axes.flatten()):
    sns.histplot(data=df[col].values, ax=ax)
    ax.set_title(col)  # label each panel with its column name
plt.show()

Look at pairwise scatter plots of all pairs of (numerical) columns¶

In [22]:
sns.pairplot(data=df)
Out[22]:
<seaborn.axisgrid.PairGrid at 0x7f1377ab6fa0>

What if we consider only adults (21+)? For children, most body measurements grow with age, so the pairwise relationships above are dominated by growth; restricting to adults lets other structure show through.

In [23]:
sns.pairplot(data=df[df["Age"]>=21])
Out[23]:
<seaborn.axisgrid.PairGrid at 0x7f1377a9e910>
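
The brainstorm above also mentioned looking for correlations. A compact numerical companion to the pairplot, sketched here, is the correlation matrix:

In [ ]:
# Pearson correlation between every pair of numeric columns
df.corr()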

To-do list - did we do all of this?

  • Basic meta info:
    • who/what/why/when was the data collected?
    • how much data is there? if too much, can you sensibly subsample?
    • what are the columns and what do they mean?
  • Diving into the data:
    • look at a few rows
    • summary stats; Tukey's five-number summary (see the sketch after this list)
    • Distributions of each column
    • Pairwise scatter plots
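
For reference, here's a minimal way to pull out just Tukey's five-number summary (min, Q1, median, Q3, max) for a single column:

In [ ]:
# Quantiles 0 and 1 are the min and max, so this is the five-number summary
df["Height"].quantile([0, 0.25, 0.5, 0.75, 1])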