Sales prediction#
Version with data splitting.
Setup#
import numpy as np
import pandas as pd
import altair as alt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
Data#
Import data#
# load the advertising dataset: one row per market with TV/radio/newspaper
# ad budgets and the resulting sales (200 rows, see df.info() below)
df = pd.read_csv('https://raw.githubusercontent.com/kirenz/datasets/master/advertising.csv')
Data structure#
# display the raw dataframe to eyeball the data
df
Market | TV | radio | newspaper | sales | |
---|---|---|---|---|---|
0 | 1 | 230.1 | 37.8 | 69.2 | 22.1 |
1 | 2 | 44.5 | 39.3 | 45.1 | 10.4 |
2 | 3 | 17.2 | 45.9 | 69.3 | 9.3 |
3 | 4 | 151.5 | 41.3 | 58.5 | 18.5 |
4 | 5 | 180.8 | 10.8 | 58.4 | 12.9 |
... | ... | ... | ... | ... | ... |
195 | 196 | 38.2 | 3.7 | 13.8 | 7.6 |
196 | 197 | 94.2 | 4.9 | 8.1 | 9.7 |
197 | 198 | 177.0 | 9.3 | 6.4 | 12.8 |
198 | 199 | 283.6 | 42.0 | 66.2 | 25.5 |
199 | 200 | 232.1 | 8.6 | 8.7 | 13.4 |
200 rows × 5 columns
# column dtypes and non-null counts (checks for missing values)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Market 200 non-null int64
1 TV 200 non-null float64
2 radio 200 non-null float64
3 newspaper 200 non-null float64
4 sales 200 non-null float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB
Data corrections#
# Market is an identifier, not a numeric quantity: store it as categorical
df = df.astype({'Market': 'category'})
Variable lists#
# name of the outcome column
y_label = 'sales'

# features: every column except the outcome and the Market identifier
features = df.columns.drop([y_label, 'Market'])

# feature matrix and response vector
X = df[features]
y = df[y_label]
Data splitting#
# hold out 20% of the markets as a test set; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# training frame (features + outcome) used for data exploration only,
# so no information from the test set leaks into the analysis
df_train = pd.concat([X_train, y_train], axis=1)
Analysis#
# summary statistics of the training data (transposed: one row per variable)
df_train.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
TV | 160.0 | 150.019375 | 84.418857 | 0.7 | 77.750 | 150.65 | 218.825 | 296.4 |
radio | 160.0 | 22.875625 | 14.805216 | 0.0 | 9.825 | 21.20 | 36.425 | 49.6 |
newspaper | 160.0 | 29.945625 | 20.336449 | 0.3 | 12.875 | 25.60 | 44.500 | 100.9 |
sales | 160.0 | 14.100000 | 5.108754 | 1.6 | 10.475 | 13.20 | 17.325 | 27.0 |
# histogram of each variable in the training set, one small panel per column
hist = alt.Chart(df_train).mark_bar().encode(
    alt.X(alt.repeat("column"), type="quantitative", bin=True),
    y='count()',
).properties(width=150, height=150)

hist.repeat(column=['sales', 'TV', 'radio', 'newspaper'])
# pairwise scatterplot matrix of all training variables (zoom/pan enabled)
scatter = alt.Chart(df_train).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative')
).properties(width=150, height=150)

fields = ['sales', 'TV', 'radio', 'newspaper']
scatter.repeat(row=fields, column=fields).interactive()
# correlation of the outcome with each candidate predictor,
# computed on the training data only
corr = df_train.corr()
corr.loc[:, y_label].sort_values(ascending=False)
sales 1.000000
TV 0.768874
radio 0.592373
newspaper 0.237874
Name: sales, dtype: float64
# take a look at all correlations
# (shade the matrix so strong pairwise correlations stand out)
corr.style.background_gradient(cmap='Blues')
TV | radio | newspaper | sales | |
---|---|---|---|---|
TV | 1.000000 | 0.053872 | 0.019084 | 0.768874 |
radio | 0.053872 | 1.000000 | 0.388074 | 0.592373 |
newspaper | 0.019084 | 0.388074 | 1.000000 | 0.237874 |
sales | 0.768874 | 0.592373 | 0.237874 | 1.000000 |
Model#
Select model#
# select the linear regression model (ordinary least squares)
reg = LinearRegression()
Training and validation#
# 5-fold cross-validation on the training data; scikit-learn reports the
# *negated* MSE (higher is better), so flip the sign to get plain MSE
scores = -cross_val_score(
    reg,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)

# collect the per-fold scores in a dataframe and display them
df_scores = pd.DataFrame({"lr": scores})
df_scores
lr | |
---|---|
1 | 4.192954 |
2 | 1.500644 |
3 | 2.109080 |
4 | 2.541355 |
5 | 4.372931 |
# renumber the rows 1..k so they line up with the fold numbers
df_scores = df_scores.set_axis(df_scores.index + 1)

# display the scores with a colour gradient (darker = larger MSE)
df_scores.style.background_gradient(cmap='Blues')
# MSE per fold as a line with point markers
folds = df_scores.reset_index()
alt.Chart(folds).mark_line(point=alt.OverlayMarkDef()).encode(
    x=alt.X("index", bin=False, title="Fold", axis=alt.Axis(tickCount=5)),
    y=alt.Y("lr", aggregate="mean", title="Mean squared error (MSE)")
)
# distribution of the cross-validation MSE across the 5 folds
df_scores.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
lr | 5.0 | 2.943393 | 1.279083 | 1.500644 | 2.10908 | 2.541355 | 4.192954 | 4.372931 |
Fit model#
# Fit the model to the complete training data
# (cross-validation above only estimated performance; this is the final fit)
reg.fit(X_train, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
Coefficients#
# one-row table for the intercept estimate
intercept = pd.DataFrame(
    {"Name": ["Intercept"], "Coefficient": [reg.intercept_]}
)

# one row per feature for the slope estimates
slope = pd.DataFrame(
    {"Name": features, "Coefficient": reg.coef_}
)

# stack intercept and slopes into a single tidy table
table = pd.concat([intercept, slope], ignore_index=True, sort=False)

# show estimates rounded to three decimals
table.round(3)
Name | Coefficient | |
---|---|---|
0 | Intercept | 2.979 |
1 | TV | 0.045 |
2 | radio | 0.189 |
3 | newspaper | 0.003 |
Evaluation on test set#
# obtain predictions for the held-out test set
y_pred = reg.predict(X_test)
# R squared: share of the variance in sales explained by the model
r2_score(y_test, y_pred).round(3)
0.899
# MSE: mean squared error on the test set (in squared sales units)
mean_squared_error(y_test, y_pred).round(3)
3.174
# RMSE: the `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# explicitly (same value, works on all versions)
np.sqrt(mean_squared_error(y_test, y_pred)).round(3)
1.782
# MAE: mean absolute error, in the same units as sales
mean_absolute_error(y_test, y_pred).round(3)
1.461