import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

yellow_url = "/cluster/academic/DATA311/202620/yellow_tripdata_2018-06_small.csv"

flights_url = "/cluster/academic/DATA311/202620/nycflights.csv"

flights = pd.read_csv(flights_url)
flights

flights.info()

<class 'pandas.DataFrame'>
RangeIndex: 32735 entries, 0 to 32734
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   year       32735 non-null  int64
 1   month      32735 non-null  int64
 2   day        32735 non-null  int64
 3   dep_time   32735 non-null  int64
 4   dep_delay  32735 non-null  int64
 5   arr_time   32735 non-null  int64
 6   arr_delay  32735 non-null  int64
 7   carrier    32735 non-null  str  
 8   tailnum    32735 non-null  str  
 9   flight     32735 non-null  int64
 10  origin     32735 non-null  str  
 11  dest       32735 non-null  str  
 12  air_time   32735 non-null  int64
 13  distance   32735 non-null  int64
 14  hour       32735 non-null  int64
 15  minute     32735 non-null  int64
dtypes: int64(12), str(4)
memory usage: 4.4 MB

flights.sample(n=10)

sns.displot(data=flights, x="dep_time", col="origin")

<seaborn.axisgrid.FacetGrid at 0x14b8dfa12900>

flights.groupby("carrier")["arr_delay"].mean().sort_values(ascending=False)

carrier
HA    28.088235
FL    19.553746
OO    16.666667
EV    16.210424
YV    15.094340
F9    12.478261
MQ    10.092142
B6     9.725632
WN     8.883426
9E     8.038325
UA     4.358232
VX     2.414487
US     1.803970
AA     1.398683
DL     0.906756
AS   -11.333333
Name: arr_delay, dtype: float64

sns.relplot(data=flights, x="distance", y="arr_delay")

<seaborn.axisgrid.FacetGrid at 0x14b8deeb6cf0>

	year	month	day	dep_time	dep_delay	arr_time	arr_delay	carrier	tailnum	flight	origin	dest	air_time	distance	hour	minute
0	2013	6	30	940	15	1216	-4	VX	N626VA	407	JFK	LAX	313	2475	9	40
1	2013	5	7	1657	-3	2104	10	DL	N3760C	329	JFK	SJU	216	1598	16	57
2	2013	12	8	859	-1	1238	11	DL	N712TW	422	JFK	LAX	376	2475	8	59
3	2013	5	14	1841	-4	2122	-34	DL	N914DL	2391	JFK	TPA	135	1005	18	41
4	2013	7	21	1102	-3	1230	-8	9E	N823AY	3652	LGA	ORF	50	296	11	2
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
32730	2013	10	8	752	-8	921	-28	9E	N8505Q	3611	JFK	PIT	63	340	7	52
32731	2013	7	7	812	-3	1043	8	DL	N6713Y	1429	JFK	LAS	286	2248	8	12
32732	2013	9	3	1057	-1	1319	-19	UA	N77871	1545	EWR	IAH	180	1400	10	57
32733	2013	10	15	844	56	1045	60	B6	N258JB	1273	JFK	CHS	93	636	8	44
32734	2013	3	28	1813	-3	1942	-23	UA	N36272	1053	EWR	CLE	59	404	18	13

	year	month	day	dep_time	dep_delay	arr_time	arr_delay	carrier	tailnum	flight	origin	dest	air_time	distance	hour	minute
7898	2013	11	3	1930	-5	2102	-30	EV	N708EV	5287	LGA	MSN	118	812	19	30
12189	2013	1	14	1011	71	1312	52	AA	N5BSAA	647	JFK	MIA	150	1089	10	11
4401	2013	7	21	1443	-3	1708	-6	UA	N14228	1222	EWR	LAS	303	2227	14	43
830	2013	7	25	658	-2	1012	3	DL	N905DE	1879	LGA	FLL	154	1076	6	58
2215	2013	12	29	905	-3	1211	7	B6	N828JB	653	JFK	PBI	154	1028	9	5
7033	2013	4	25	1341	26	1457	24	EV	N12540	5793	EWR	RIC	58	277	13	41
18186	2013	11	11	2043	-2	2335	-9	UA	N38268	1115	EWR	TPA	147	997	20	43
25194	2013	3	23	746	1	1032	22	DL	N784NC	807	EWR	ATL	138	746	7	46
23982	2013	1	16	1715	75	2024	69	AA	N3FAAA	565	JFK	DFW	226	1391	17	15
20614	2013	11	18	2115	-10	2306	-22	EV	N13903	3845	EWR	GSP	95	594	21	15

Lecture 13 - Exploratory Data Analysis Activity¶

The Data¶

Option 1: Yellow Cab Data¶

Option 2: Flight Data¶

The Plan¶

Exploratory Analysis - 20 minutes¶

Generalization of Questions/Strategies - 5 minutes¶