#General.
import numpy as np
import pandas as pd
#EDA Visualizations.
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter as SMF
import seaborn as sns
#Statistical Analysis.
from scipy.stats import f_oneway as F
from scipy.stats import levene as L
from scipy.stats import pearsonr as PR
from scipy.stats import ttest_ind as TT
import statsmodels.api as SM
# Location of the Boston housing CSV that every later cell works from.
boston_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ST0151EN-SkillsNetwork/labs/boston_housing.csv'
# Read the CSV and discard the 'Unnamed: 0' column in one chained expression.
boston_df = pd.read_csv(boston_url).drop(columns='Unnamed: 0')
# Let pandas render up to 100 rows and 100 columns before truncating.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
# Bare expression: shows the frame when run as a notebook cell.
boston_df
| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | LSTAT | MEDV |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 5.33 | 36.2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
501 | 0.06263 | 0.0 | 11.93 | 0.0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1.0 | 273.0 | 21.0 | 9.67 | 22.4 |
502 | 0.04527 | 0.0 | 11.93 | 0.0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1.0 | 273.0 | 21.0 | 9.08 | 20.6 |
503 | 0.06076 | 0.0 | 11.93 | 0.0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1.0 | 273.0 | 21.0 | 5.64 | 23.9 |
504 | 0.10959 | 0.0 | 11.93 | 0.0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1.0 | 273.0 | 21.0 | 6.48 | 22.0 |
505 | 0.04741 | 0.0 | 11.93 | 0.0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1.0 | 273.0 | 21.0 | 7.88 | 11.9 |
506 rows × 13 columns
# Switch seaborn to its dark grid theme.
sns.set_style("darkgrid")
fig, ax = plt.subplots(figsize=(15, 7))
# Horizontal box plot of MEDV multiplied by 1000
# (presumably MEDV is stored in thousands of dollars; confirm against the data dictionary).
sns.boxplot(x=boston_df['MEDV'] * 1000, color='#FDE725', ax=ax)
# Title.
ax.set_title('Distribution of Median Value of Owner-Occupied Homes', fontsize=24)
# X axis: label, then ticks rendered as "$" plus a comma-grouped whole number.
ax.set_xlabel("Median Value of Home", fontsize=22)
ax.xaxis.set_major_formatter(SMF('${x:,.0f}'))
ax.tick_params(axis='both', which='major', labelsize=14, rotation=25)
# Write the figure to disk.
fig.savefig('Stats1.png')
fig, ax = plt.subplots(figsize=(15, 7))
# Horizontal histogram of CHAS using bins that are 0.5 wide.
sns.histplot(y=boston_df['CHAS'], binwidth=0.5, color='#440154', ax=ax)
# Title.
ax.set_title('Charles River Proximity ', fontsize=24)
# X axis.
ax.set_xlabel("Count", fontsize=22)
ax.tick_params(axis='x', which='major', labelsize=14)
# Y axis: replace the numeric ticks with two text labels placed at 0.25 and 0.75.
ax.set_ylabel("Charles River Proximity", fontsize=22)
plt.yticks([0.25, 0.75], ["Other", "Tract Bounds \n River "])
ax.tick_params(axis='y', which='major', labelsize=14, rotation=25)
# Write the figure to disk.
fig.savefig('Stats2.png')
#Create Age Groups
def age_group(x):
    """Return the age-band label for the percentage ``x``.

    Bands:
        x <= 35        -> "35% or Less"
        35 < x < 70    -> "Between 35% and 70%"
        x >= 70        -> "70% or More"

    A missing value (NaN/None) is returned as NaN rather than being given
    a label.
    """
    # NaN fails both comparisons below, so without this guard a missing
    # value would silently land in the "70% or More" band.
    if pd.isna(x):
        return np.nan
    if x <= 35:
        return "35% or Less"
    if x < 70:
        return "Between 35% and 70%"
    return "70% or More"
# Label every row with its age band, derived from the AGE column.
boston_df['AGEG'] = boston_df['AGE'].apply(age_group)
# Fixed left-to-right order of the bands on the x axis.
age_band_order = ["35% or Less", "Between 35% and 70%", "70% or More"]
fig, ax = plt.subplots(figsize=(15, 11.5))
# One box per age band; the y values are MEDV multiplied by 1000.
sns.boxplot(x=boston_df['AGEG'],
            y=boston_df['MEDV'] * 1000,
            order=age_band_order,
            palette='viridis_r',
            ax=ax)
# Title.
ax.set_title('Distribution of Median Value of Owner-Occupied Homes vs Home Ages', fontsize=24)
# X axis.
ax.set_xlabel("% of Homes Built Prior to 1940", fontsize=22)
# Y axis: label, then ticks rendered as "$" plus a comma-grouped whole number.
ax.set_ylabel("Median Value of Home", fontsize=22)
ax.yaxis.set_major_formatter(SMF('${x:,.0f}'))
ax.tick_params(axis='both', which='major', labelsize=14, rotation=25)
# Write the figure to disk.
fig.savefig('Stats3.png')
fig, ax = plt.subplots(1, 1, figsize = (15, 7.5))
# Scatter of NOX (x) against INDUS (y), drawn on the axes created above.
sns.scatterplot(x = boston_df['NOX'],
                y = boston_df['INDUS'],
                s = 75,
                color = '#404788',
                ax = ax)
#Title setup.
ax.set_title('Nitric Oxide Concentrations vs Proportion of Non-Retail Business Acres', fontsize = 24)
#X-axis setup.
# The data set's variable description gives NOX in parts per 10 million,
# so a "(ppm)" label would misstate the unit by a factor of ten.
ax.set_xlabel("Nitric Oxide Concentration (parts per 10 million)", fontsize = 22)
#Y-axis setup.
ax.set_ylabel("% of Non-Retail Business Acres", fontsize = 22)
# Render y ticks as whole numbers followed by a percent sign.
ax.yaxis.set_major_formatter(SMF('{x:,.0f}%'))
ax.tick_params(axis = 'both', which = 'major', labelsize = 14)
# Write the figure to disk.
fig.savefig('Stats4.png')
fig, ax = plt.subplots(figsize=(15, 7.5))
# Histogram of PTRATIO: ten equal bins spanning 12.5 to 22.5 (one unit per bin).
sns.histplot(x=boston_df['PTRATIO'],
             bins=10,
             binrange=(12.5, 22.5),
             color='#238A8D',
             ax=ax)
# Title.
ax.set_title('Distribution of Pupil to Teacher Ratio', fontsize=24)
# Axis labels and tick size.
ax.set_xlabel("Pupil to Teacher Ratio", fontsize=22)
ax.set_ylabel("Count", fontsize=22)
ax.tick_params(axis='both', which='major', labelsize=14)
# Write the figure to disk.
fig.savefig('Stats5.png')
# Levene's test (centered on the mean) for equal MEDV variance between
# rows with CHAS == 1 and rows with CHAS == 0.
river_medv = boston_df.loc[boston_df['CHAS'] == 1, 'MEDV']
other_medv = boston_df.loc[boston_df['CHAS'] == 0, 'MEDV']
L(river_medv, other_medv, center='mean')
LeveneResult(statistic=8.751904896045998, pvalue=0.003238119367639829)
# Welch's t-test comparing mean MEDV between rows with CHAS == 1 and rows
# with CHAS == 0.
# equal_var must be the boolean False: SciPy only checks the truthiness of
# this argument, so a non-empty string such as 'FALSE' counts as True and
# silently selects the pooled-variance (Student) test instead.
# NOTE: any result recorded while the string was passed reflects the pooled
# test and has to be regenerated after this change.
TT(boston_df[boston_df['CHAS'] == 1]['MEDV'],
   boston_df[boston_df['CHAS'] == 0]['MEDV'],
   equal_var = False)
Ttest_indResult(statistic=3.996437466090509, pvalue=7.390623170519905e-05)
# Levene's test (centered on the mean) for equal MEDV variance across the
# three AGEG bands.
age_bands = ["35% or Less", "Between 35% and 70%", "70% or More"]
medv_by_band = [boston_df.loc[boston_df['AGEG'] == band, 'MEDV'] for band in age_bands]
L(*medv_by_band, center='mean')
LeveneResult(statistic=2.780620029374844, pvalue=0.06295337343259205)
# One-way ANOVA on MEDV across the three AGEG bands.
age_bands = ["35% or Less", "Between 35% and 70%", "70% or More"]
medv_by_band = [boston_df.loc[boston_df['AGEG'] == band, 'MEDV'] for band in age_bands]
F(*medv_by_band)
F_onewayResult(statistic=36.40764999196599, pvalue=1.7105011022702984e-15)
# Pearson correlation between the NOX and INDUS columns.
nox, indus = boston_df['NOX'], boston_df['INDUS']
PR(nox, indus)
(0.7636514469209159, 7.913361061232396e-98)
# Ordinary least squares fit of MEDV (multiplied by 1000) on DIS.
y = boston_df['MEDV'] * 1000
# add_constant supplies the column that the fit reports as 'const'.
X = SM.add_constant(boston_df['DIS'])
model = SM.OLS(y, X).fit()
# Fitted values for the same rows the model was trained on.
pred = model.predict(X)
# Bare expression: shows the regression summary when run as a notebook cell.
model.summary()
Dep. Variable: | MEDV | R-squared: | 0.062 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.061 |
Method: | Least Squares | F-statistic: | 33.58 |
Date: | Sat, 03 Apr 2021 | Prob (F-statistic): | 1.21e-08 |
Time: | 11:18:40 | Log-Likelihood: | -5319.2 |
No. Observations: | 506 | AIC: | 1.064e+04 |
Df Residuals: | 504 | BIC: | 1.065e+04 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 1.839e+04 | 817.389 | 22.499 | 0.000 | 1.68e+04 | 2e+04 |
DIS | 1091.6130 | 188.378 | 5.795 | 0.000 | 721.509 | 1461.717 |
Omnibus: | 139.779 | Durbin-Watson: | 0.570 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 305.104 |
Skew: | 1.466 | Prob(JB): | 5.59e-67 |
Kurtosis: | 5.424 | Cond. No. | 9.32 |