We look at how certain features of a videogame affect its sales figures. The data was obtained from Kaggle, which sourced it from a VGChartz web scrape. Additional data was added from Google Trends using the gtab library, which works around several limitations of the Google Trends website on its own.
Along with our standard imports of regression models and other familiar tools, we import the Google Trends AnchorBank (GTAB) library found on GitHub. Google Trends scales its results relative to both the timeframe AND the set of queries being compared, and it only allows 5 searches in a single comparison. GTAB avoids both issues by calibrating every query onto a "universal scale", so we can grab trend data for any number of searches and still compare them.
import pandas as pd
import seaborn as sns
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
# GoogleTrends AnchorBank Library from github
import gtab
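For reference, here is a rough sketch of how a GTAB query works, based on our understanding of the library; the anchorbank-building step is shown only as a commented-out illustration (we rely on the default anchorbank later on), and the example query "Minecraft" is purely illustrative.
# Rough GTAB sketch (illustrative only; the real queries happen further below)
t_example = gtab.GTAB()
# Restrict queries to a fixed timeframe so results are comparable
t_example.set_options(pytrends_config={"timeframe": "2004-01-01 2021-01-01"})
# t_example.create_anchorbank()  # would build a fresh anchorbank for these options (slow)
res = t_example.new_query("Minecraft")   # calibrated trend data for one title
print(res["max_ratio"].max())            # peak search interest on the universal scale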
Our data is stored locally, so we load it into a pandas DataFrame.
The dataset contains around 16,000 entries, but many of them are duplicate titles spread across multiple consoles.
We keep roughly the first 400 entries, since a significant portion of the remaining data is massively skewed to one side (an example follows shortly). To shrink the data further, we collapse duplicate titles into a single row and sum their sales across all platforms.
The dataset also had some column-alignment issues, likely caused by the web-scraping method used. If a game title included a ";", the row would break and the remaining values would shift over by one column. We mostly ignore this, as the affected titles fall outside the data we actually use (their sales were extremely limited).
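As a quick illustration (not part of our actual pipeline), one way to flag such shifted rows is to look for entries whose Year no longer parses as a number, assuming the offset pushes text into that column:
# Illustrative check for rows shifted by a stray delimiter in the title:
# a Year that fails to parse as a number suggests the row is misaligned
# (pandas is already imported above)
raw = pd.read_csv('vgsales_12_4_2021.csv')
shifted = raw[pd.to_numeric(raw['Year'], errors='coerce').isna() & raw['Year'].notna()]
print(f"{len(shifted)} potentially misaligned rows")
print(shifted[['Rank', 'Name', 'Year']].head())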
Our response variable is Global_Sales, which we interpret as the number of physical game copies sold (in millions of units).
data = pd.read_csv('vgsales_12_4_2021.csv')
data = data.iloc[:400]
# Cast NA_Sales to float; the other sales columns keep whatever dtype read_csv
# inferred, which is likely why a few summed values below look like two numbers
# glued together
data['NA_Sales'] = data["NA_Sales"].astype('float64')
aggregation_functions = {'Rank':'first', 'Name': 'first', 'Platform':'first', 'Year':'first', 'Genre':'first', 'Publisher':'first','NA_Sales': 'sum', 'EU_Sales':'sum', 'JP_Sales':'sum', 'Other_Sales':'sum','Global_Sales': 'sum'}
df_new = data.groupby(data['Name']).aggregate(aggregation_functions)
df_new = df_new.sort_values(by="Rank")
df_new
| Rank | Name | Platform | Year | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | Wii Sports | Wii | 2006 | Sports | Nintendo | 41.49 | 29.02 | 3.77 | 8.46 | 82.74 |
| 2 | Super Mario Bros. | NES | 1985 | Platform | Nintendo | 32.48 | 3.581.3 | 6.96 | 0.99 | 45.31 |
| 3 | Mario Kart Wii | Wii | 2008 | Racing | Nintendo | 15.85 | 12.88 | 3.79 | 3.31 | 35.82 |
| 4 | Wii Sports Resort | Wii | 2009 | Sports | Nintendo | 15.75 | 11.01 | 3.28 | 2.96 | 33.00 |
| 5 | Pokemon Red and Blue | GB | 1996 | Role-Playing | Nintendo | 11.27 | 8.89 | 10.22 | 1.00 | 31.37 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 396 | The Legend of Zelda: Spirit Tracks | PS4 | 2015 | Action | Konami Digital Entertainment | 1.08 | 1.35 | 0.48 | 0.47 | 3.38 |
| 397 | WWF War Zone | DS | 2009 | Action | Nintendo | 1.43 | 0.94 | 0.74 | 0.27 | 3.38 |
| 398 | The Legend of Zelda: Majora's Mask | PS | 1998 | Fighting | Acclaim Entertainment | 2.47 | 0.76 | 0.00 | 0.13 | 3.36 |
| 399 | Professor Layton and the Unwound Future | N64 | 2000 | Action | Nintendo | 1.90 | 0.67 | 0.73 | 0.06 | 3.36 |
| 400 | Rugrats: Search For Reptar | DS | 2008 | Puzzle | Nintendo | 0.65 | 1.61 | 0.82 | 0.28 | 3.36 |

343 rows × 11 columns
sns.displot(data, x = "Global_Sales")
plt.title("Heavily Skewed Data Distribution")
Text(0.5, 1.0, 'Heavily Skewed Data Distribution')
However, for our dataset, we only have the following features: Platform, Year, Genre, and Publisher.
Not only do we have a limited number of features, they are all categorical. So we want some additional quantitative data that could potentially improve our model. We decided to quantify the "hype" of a game by how often it is searched for on Google. Google Trends provides this as relative search popularity over a timeframe, and GTAB expedites the process by letting us run many queries on a single, comparable scale. For each title, we take the maximum ratio of searches (max_ratio) and use that as the game's "hype".
Querying takes some time and may even hit a rate limit from too many requests, so we save the query results into a DataFrame and write them out to a CSV that we can reload whenever we need them.
t = gtab.GTAB()
t.set_options(pytrends_config= {"timeframe": "2004-01-01 2021-01-01"})
## Grab hype values from queries here: Create list of names
# Google has a cap on how many queries per day we can make, so if we
# need to pull a lot frequently for testing we should save it in a csv
names_lst = df_new['Name']
hype_lst = []
# Query GTAB for each game and record its peak search ratio ("hype")
for name in names_lst:
    hype = t.new_query(name)
    hype = hype['max_ratio'].max()
    hype_lst.append(hype)
#Save data to CSV
df_hype = pd.DataFrame(hype_lst)
df_hype.to_csv('Hype_vals.csv',index=False)
# Import hype data that was pulled using GTAB
hype_lst = pd.read_csv('Hype_vals.csv', index_col=False)
hype_lst = hype_lst.set_index(df_new.index)
# Add hype data to DataFrame
df_new['hype'] = hype_lst
df_new
| Rank | Name | Platform | Year | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | hype |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | Wii Sports | Wii | 2006 | Sports | Nintendo | 41.49 | 29.02 | 3.77 | 8.46 | 82.74 | 5.000000 |
| 2 | Super Mario Bros. | NES | 1985 | Platform | Nintendo | 32.48 | 3.581.3 | 6.96 | 0.99 | 45.31 | 1.250000 |
| 3 | Mario Kart Wii | Wii | 2008 | Racing | Nintendo | 15.85 | 12.88 | 3.79 | 3.31 | 35.82 | 14.434786 |
| 4 | Wii Sports Resort | Wii | 2009 | Sports | Nintendo | 15.75 | 11.01 | 3.28 | 2.96 | 33.00 | 2.702703 |
| 5 | Pokemon Red and Blue | GB | 1996 | Role-Playing | Nintendo | 11.27 | 8.89 | 10.22 | 1.00 | 31.37 | 0.290000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 396 | The Legend of Zelda: Spirit Tracks | PS4 | 2015 | Action | Konami Digital Entertainment | 1.08 | 1.35 | 0.48 | 0.47 | 3.38 | 1.428571 |
| 397 | WWF War Zone | DS | 2009 | Action | Nintendo | 1.43 | 0.94 | 0.74 | 0.27 | 3.38 | 0.062400 |
| 398 | The Legend of Zelda: Majora's Mask | PS | 1998 | Fighting | Acclaim Entertainment | 2.47 | 0.76 | 0.00 | 0.13 | 3.36 | 0.210000 |
| 399 | Professor Layton and the Unwound Future | N64 | 2000 | Action | Nintendo | 1.90 | 0.67 | 0.73 | 0.06 | 3.36 | 0.560000 |
| 400 | Rugrats: Search For Reptar | DS | 2008 | Puzzle | Nintendo | 0.65 | 1.61 | 0.82 | 0.28 | 3.36 | 0.005678 |

343 rows × 12 columns
We tried cube root, square root, logarithm, and inverse transformations.
Given that our data is heavily skewed, we transform the response variable (Global_Sales) by taking its inverse (1/x).
This gives us a more normally distributed response, in the hope of a decent model score (previous iterations showed poor results).
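As a rough sketch of how the candidates could be compared (scipy is an extra dependency used only for this illustration, not elsewhere in the notebook), one can look at the skewness each transformation leaves behind:
# Compare candidate transformations by the skewness of the transformed response.
# All Global_Sales values in our subset are positive, so log and inverse are safe.
from scipy.stats import skew

sales = df_new['Global_Sales']
candidates = {
    'raw':         sales,
    'cube root':   np.cbrt(sales),
    'square root': np.sqrt(sales),
    'log':         np.log(sales),
    'inverse':     1 / sales,
}
for name, vals in candidates.items():
    print(f"{name:12s} skew = {skew(vals):.3f}")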
#Transformations of response variable
df_new["Global_Sales"] = df_new["Global_Sales"].transform(lambda x: 1/x).astype(float)
sns.displot(df_new['Global_Sales'])
<seaborn.axisgrid.FacetGrid at 0x7f75e38a70f0>
Now we use an OrdinalEncoder to transform our categorical features of Platform, Genre and Publisher to numerical values the model can understand.
# Ordinal encoding for our categorical variables platform, genre, and publisher
enc = OrdinalEncoder()
df_new[['Platform']]= enc.fit_transform(df_new[['Platform']])
df_new[['Genre']]= enc.fit_transform(df_new[['Genre']])
df_new[['Publisher']]= enc.fit_transform(df_new[['Publisher']])
df_new.head(10)
| Rank | Name | Platform | Year | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | hype |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | Wii Sports | 16.0 | 2006 | 10.0 | 15.0 | 41.49 | 29.02 | 3.77 | 8.46 | 0.012086 | 5.000000 |
| 2 | Super Mario Bros. | 8.0 | 1985 | 4.0 | 15.0 | 32.48 | 3.581.3 | 6.96 | 0.99 | 0.022070 | 1.250000 |
| 3 | Mario Kart Wii | 16.0 | 2008 | 6.0 | 15.0 | 15.85 | 12.88 | 3.79 | 3.31 | 0.027917 | 14.434786 |
| 4 | Wii Sports Resort | 16.0 | 2009 | 10.0 | 15.0 | 15.75 | 11.01 | 3.28 | 2.96 | 0.030303 | 2.702703 |
| 5 | Pokemon Red and Blue | 3.0 | 1996 | 7.0 | 15.0 | 11.27 | 8.89 | 10.22 | 1.00 | 0.031878 | 0.290000 |
| 6 | Tetris | 3.0 | 1989 | 5.0 | 15.0 | 26.17 | 2.260.69 | 6.03 | 0.69 | 0.027902 | 23.620559 |
| 7 | New Super Mario Bros. | 2.0 | 2006 | 4.0 | 15.0 | 11.38 | 9.23 | 6.50 | 2.90 | 0.033322 | 0.500000 |
| 8 | Wii Play | 16.0 | 2006 | 3.0 | 15.0 | 14.03 | 9.2 | 2.93 | 2.85 | 0.034459 | 5.263158 |
| 9 | New Super Mario Bros. Wii | 16.0 | 2009 | 4.0 | 15.0 | 14.59 | 7.06 | 4.70 | 2.26 | 0.034941 | 0.460000 |
| 10 | Duck Hunt | 8.0 | 1984 | 8.0 | 15.0 | 26.93 | 0.63 | 0.28 | 0.47 | 0.035323 | 0.470000 |
Now we drop all columns except Publisher and the hype values, then standardize both the features and the response variable.
scalar = StandardScaler()
x = df_new.drop(columns = ['Name','NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Rank','Genre','Platform','Year','Global_Sales'])
x = scalar.fit_transform(x)
y = np.asarray(df_new['Global_Sales'])
y = scalar.fit_transform(y.reshape(-1,1))
#Split training and test data into 80/20 split, then split validation off of training data.
XtrainZ, XtestZ, YtrainZ, YtestZ = train_test_split(x, y,train_size=0.8,test_size=0.2)
XtrainZ, XvalZ, YtrainZ, YvalZ = train_test_split(XtrainZ, YtrainZ,test_size=0.2)
Here, we set up multiple regression models.
We start with two DummyRegressors to establish baselines using the median and the mean.
As we can see, the resulting scores are... very poor.
This sets up our evaluation environment, but based on these scores we aren't expecting anything impressive.
# Model baseline that uses median to predict values
baseline_reg = DummyRegressor(strategy="median")
baseline_reg.fit(XtrainZ,YtrainZ)
baseline_reg.score(XvalZ, YvalZ)
-0.06581935060585375
# Model baseline that uses mean to predict values, not as good due to the skewness of the distribution of our response variable
baseline_reg = DummyRegressor(strategy="mean")
baseline_reg.fit(XtrainZ,YtrainZ)
baseline_reg.score(XvalZ, YvalZ)
-0.001925256166505518
# Polynomial expansion of the features (degree 4). Note that these expanded
# matrices are generated here, but the linear regression below is fit on the
# untransformed (standardized) features.
poly = PolynomialFeatures(4)
transTrainx = poly.fit_transform(XtrainZ)
transValx = poly.fit_transform(XvalZ)
transTestx = poly.fit_transform(XtestZ)
# Plain linear regression: print the train and validation R^2 scores,
# then compute the validation RMSE
lr = LinearRegression().fit(XtrainZ, YtrainZ)
print(lr.score(XtrainZ, YtrainZ))
print(lr.score(XvalZ, YvalZ))
pred = lr.predict(XvalZ)
sklearn.metrics.mean_squared_error(YvalZ, pred, squared = False)
0.007907021627495125
0.07877507674375972
0.8857204689307384
Next, we set up a regression model using a decision tree.
This gives somewhat better, though still very poor, scores. We then hunt for the best hyperparameters to see whether the score can improve further.
# Base Decision Tree regressor implementation
DT = DecisionTreeRegressor(max_depth=2)
regr = DT.fit(XtrainZ, YtrainZ)
print(regr.score(XtrainZ,YtrainZ))
print(regr.score(XvalZ, YvalZ))
0.13459630557278024
0.0004551252969434705
# Dictionary of parameters of DecisionTreeRegressor for hyperparameter tuning
parameters={
"splitter":["best","random"],
"max_depth":[1,3,5,7,9,11,12],
"min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
"min_weight_fraction_leaf":[0.1,0.2,0.3,0.4,0.5],
"max_features":["auto","log2","sqrt",None],
"max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90]}
# Used GridSearchCV for hyperparameter tuning
hypertune = GridSearchCV(DT,param_grid=parameters,scoring='neg_mean_squared_error',cv=5,verbose=3);
hypertune.fit(x,y)
# Implement new DT model with parameters provided by hyperparameter tuning
tuned_DT = DecisionTreeRegressor()
# Automatically set params of DT regressor provided by hypertuning
tuned_DT.set_params(**hypertune.best_params_)
# Refit model and check the score/MSE
tuned_regr = tuned_DT.fit(XtrainZ, YtrainZ)
print(tuned_regr.score(XtrainZ,YtrainZ))
print(tuned_regr.score(XvalZ, YvalZ))
pred = tuned_regr.predict(XvalZ)
sklearn.metrics.mean_squared_error(YvalZ, pred, squared = False)
0.19043501161699483
0.09038778887738508
0.8801201877564524
As we can see above, all models produce very poor scores. This is not entirely unexpected given the limited scope of our data and features. Successful games are hard to define, and the dataset focuses on "popular" games that are easy to find in the mainstream and on a handful of specific platforms. There is an additional layer of issues caused by the growing digital sales market and the sheer number of games available: more and more sales numbers are obscured as time passes.
The original dataset likely counts only physical units sold, and even that data is sometimes not made available or not easily obtainable.
Before wrapping up, there are a few things left to discuss: how our features relate to one another visually, what we could have done to improve the score, the problems we faced with this data, and what this kind of research could be used for in practice.
First, let's make a pair plot to visualize the bivariate relationships between our predictor variables and our response variable.
sns.pairplot(data = df_new, vars = ["Publisher", "Genre", "hype", "Global_Sales"])
<seaborn.axisgrid.PairGrid at 0x7f75d8b25d30>
As we can see from the pair plot above, there is a very weak linear relationship between every feature and Global Sales. Hype is heavily affected by how the data is collected, and GTAB works in mysterious ways. The spread of sales is roughly even across most feature values. This may also be a result of how small our sample size is, since we only took around 400 entries from the top of the list.
When we first saw the dataset, we made some assumptions about which features would affect the model the most, based on our own experiences with videogame marketing.
We thought Publisher would be one of the stronger features affecting sales, since a given publisher carries a certain reputation with consumers. However, our dataset featured many publishers we did not even recognize, which threw our perception off, as we sometimes confuse developers with publishers.
The genre of a game is very important to how it appeals to an audience. Every genre has its own audience, so each can be successful in its own way. In addition, not every genre is equally populated. We figured Action would be the most "successful" since it is an oversaturated category (e.g. "Call of Duty", "Grand Theft Auto", "Fortnite"). However, just like movies, there are very good action games and very crummy ones.
We had the least faith in the hype values, primarily because hype is a very nebulous quantity to measure. Hype could be expressed in different ways, but we chose search volume. Games are often discussed in news articles and reports, but there is an almost infinite number of news outlets on the internet, covering both the positives and the negatives of any given game. Google Trends was also difficult to work with, both because of how the system is built and because of the library we needed to use. In the end, the hype values betrayed our expectations.
There are some things we could have done that may or may not have improved our scores, such as finding other datasets related to videogames. We relied mainly on a single dataset, and its structure limited how good our scores could be. We could also have found more numerical data rather than relying only on categorical variables. Some other features we considered are below:
So... what can we pull from all this?
Videogame data is difficult to work with. There are very few good publicly available datasets about videogames themselves, and when they do exist, they often come in piecemeal form with the full version requiring payment. Much of the available data relates to sales, and the aim is to maximize them. Since videogames are also a popular medium of entertainment, it isn't surprising that companies keep their data to themselves.
They want to be the ones on top.