#General.
import numpy as np
import pandas as pd

#EDA Visualizations.
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter as SMF
import seaborn as sns

#Statistical Analysis.
from scipy.stats import f_oneway as F
from scipy.stats import levene as L
from scipy.stats import pearsonr as PR
from scipy.stats import ttest_ind as TT
import statsmodels.api as SM


#Location of the Boston housing CSV used for every plot and test below.
boston_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ST0151EN-SkillsNetwork/labs/boston_housing.csv'

#Read the CSV and discard the 'Unnamed: 0' column in a single chained expression.
#NOTE(review): 'Unnamed: 0' is presumably a row index written out with the file - confirm.
boston_df = pd.read_csv(boston_url).drop(columns = 'Unnamed: 0')

#Let pandas show up to 100 rows and 100 columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)


#Bare expression: displays the DataFrame when it is the last statement of a notebook cell.
boston_df


#Switch seaborn to its "darkgrid" style.
sns.set_style("darkgrid")

#Single-axes figure holding a horizontal box plot of MEDV multiplied by 1000.
fig, ax = plt.subplots(figsize = (15, 7))
sns.boxplot(x = boston_df['MEDV'] * 1000, color = '#FDE725', ax = ax)

#Title and x-axis label; x ticks are rendered with a leading $, thousands separators and no decimals.
ax.set_title('Distribution of Median Value of Owner-Occupied Homes', fontsize = 24)
ax.set_xlabel("Median Value of Home", fontsize = 22)
ax.xaxis.set_major_formatter(SMF('${x:,.0f}'))

#Enlarge and tilt the major tick labels on both axes.
ax.tick_params(axis = 'both', which = 'major', labelsize = 14, rotation = 25)

#Save the figure next to the script.
fig.savefig('Stats1.png')


fig, ax = plt.subplots(figsize = (15, 7))

#Horizontal histogram of the CHAS values, drawn with bins 0.5 wide.
sns.histplot(y = boston_df['CHAS'], binwidth = 0.5, color = '#440154', ax = ax)

ax.set_title('Charles River Proximity ', fontsize = 24)

#Axis labels.
ax.set_xlabel("Count", fontsize = 22)
ax.set_ylabel("Charles River Proximity", fontsize = 22)

#Place two text labels on the y-axis at 0.25 and 0.75 instead of numeric ticks.
ax.set_yticks([0.25, 0.75])
ax.set_yticklabels(["Other", "Tract Bounds \n River       "])

#Tick label sizes; only the y labels are tilted.
ax.tick_params(axis = 'x', which = 'major', labelsize = 14)
ax.tick_params(axis = 'y', which = 'major', labelsize = 14, rotation = 25)

fig.savefig('Stats2.png')


#Create Age Groups
def age_group(x):
    """Return the age-group label for the percentage ``x``.

    Mapping:
        x <= 35      -> "35% or Less"
        35 < x < 70  -> "Between 35% and 70%"
        x >= 70      -> "70% or More"

    A NaN input is returned as NaN instead of a label. Every comparison
    with NaN is False, so without this guard a missing value would fall
    through to the final branch and be counted as "70% or More".
    """
    #NaN is the only value that compares unequal to itself.
    if x != x:
        return float("nan")
    if x <= 35:
        return "35% or Less"
    if x < 70:
        return "Between 35% and 70%"
    return "70% or More"

#Label every row with the age group of its AGE value.
boston_df['AGEG'] = boston_df['AGE'].apply(age_group)

fig, ax = plt.subplots(figsize = (15, 11.5))

#One box per age group, in a fixed left-to-right order, showing MEDV multiplied by 1000.
sns.boxplot(x = boston_df['AGEG'],
            y = boston_df['MEDV'] * 1000,
            order = ["35% or Less", "Between 35% and 70%", "70% or More"],
            palette = 'viridis_r',
            ax = ax)

ax.set_title('Distribution of Median Value of Owner-Occupied Homes vs Home Ages', fontsize = 24)

#Axis labels; y ticks are rendered with a leading $, thousands separators and no decimals.
ax.set_xlabel("% of Homes Built Prior to 1940", fontsize = 22)
ax.set_ylabel("Median Value of Home", fontsize = 22)
ax.yaxis.set_major_formatter(SMF('${x:,.0f}'))

#Enlarge and tilt the major tick labels on both axes.
ax.tick_params(axis = 'both', which = 'major', labelsize = 14, rotation = 25)

fig.savefig('Stats3.png')


fig, ax = plt.subplots(figsize = (15, 7.5))

#Scatter of INDUS (y) against NOX (x) with enlarged markers.
sns.scatterplot(x = boston_df['NOX'], y = boston_df['INDUS'], s = 75, color = '#404788', ax = ax)

ax.set_title('Nitric Oxide Concentrations vs Proportion of Non-Retail Business Acres', fontsize = 24)

#Axis labels; y ticks are rendered as whole numbers followed by a % sign.
ax.set_xlabel("Nitric Oxide Concentration (ppm)", fontsize = 22)
ax.set_ylabel("% of Non-Retail Business Acres", fontsize = 22)
ax.yaxis.set_major_formatter(SMF('{x:,.0f}%'))

ax.tick_params(axis = 'both', which = 'major', labelsize = 14)

fig.savefig('Stats4.png')


fig, ax = plt.subplots(figsize = (15, 7.5))

#Histogram of PTRATIO: 10 bins spanning 12.5 to 22.5.
sns.histplot(x = boston_df['PTRATIO'], bins = 10, binrange = (12.5, 22.5), color = '#238A8D', ax = ax)

ax.set_title('Distribution of Pupil to Teacher Ratio', fontsize = 24)

#Axis labels.
ax.set_xlabel("Pupil to Teacher Ratio", fontsize = 22)
ax.set_ylabel("Count", fontsize = 22)

ax.tick_params(axis = 'both', which = 'major', labelsize = 14)

fig.savefig('Stats5.png')


#Levene test (centered on the mean) of MEDV for the CHAS == 1 rows against the CHAS == 0 rows.
L(boston_df.loc[boston_df['CHAS'] == 1, 'MEDV'],
  boston_df.loc[boston_df['CHAS'] == 0, 'MEDV'],
  center = 'mean')

LeveneResult(statistic=8.751904896045998, pvalue=0.003238119367639829)


#Independent-samples t-test of MEDV for the CHAS == 1 rows against the CHAS == 0 rows.
#equal_var has to be the boolean False to get Welch's unequal-variance test. The
#string 'FALSE' used previously is truthy, so the pooled-variance test was run
#instead. Any result recorded before this fix came from the pooled test and must
#be regenerated.
TT(boston_df[boston_df['CHAS'] == 1]['MEDV'],
   boston_df[boston_df['CHAS'] == 0]['MEDV'],
   equal_var = False)

Ttest_indResult(statistic=3.996437466090509, pvalue=7.390623170519905e-05)


#Levene test (centered on the mean) of MEDV across the three age groups, passed in label order.
L(*[boston_df.loc[boston_df['AGEG'] == label, 'MEDV']
    for label in ("35% or Less", "Between 35% and 70%", "70% or More")],
  center = 'mean')

LeveneResult(statistic=2.780620029374844, pvalue=0.06295337343259205)


#One-way ANOVA of MEDV across the three age groups, passed in label order.
F(*[boston_df.loc[boston_df['AGEG'] == label, 'MEDV']
    for label in ("35% or Less", "Between 35% and 70%", "70% or More")])

F_onewayResult(statistic=36.40764999196599, pvalue=1.7105011022702984e-15)


#Pearson correlation between the NOX and INDUS columns.
PR(boston_df.loc[:, 'NOX'], boston_df.loc[:, 'INDUS'])

(0.7636514469209159, 7.913361061232396e-98)


#Response: MEDV multiplied by 1000. Predictor: DIS plus a constant column for the intercept.
y = boston_df['MEDV'] * 1000
X = SM.add_constant(boston_df['DIS'])

#Fit ordinary least squares, then predict on the same design matrix used for the fit.
model = SM.OLS(endog = y, exog = X).fit()
pred = model.predict(X)

#Bare expression: shows the regression summary when run as the last line of a notebook cell.
model.summary()

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	LSTAT	MEDV
0	0.00632	18.0	2.31	0.0	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	4.98	24.0
1	0.02731	0.0	7.07	0.0	0.469	6.421	78.9	4.9671	2.0	242.0	17.8	9.14	21.6
2	0.02729	0.0	7.07	0.0	0.469	7.185	61.1	4.9671	2.0	242.0	17.8	4.03	34.7
3	0.03237	0.0	2.18	0.0	0.458	6.998	45.8	6.0622	3.0	222.0	18.7	2.94	33.4
4	0.06905	0.0	2.18	0.0	0.458	7.147	54.2	6.0622	3.0	222.0	18.7	5.33	36.2
...	...	...	...	...	...	...	...	...	...	...	...	...	...
501	0.06263	0.0	11.93	0.0	0.573	6.593	69.1	2.4786	1.0	273.0	21.0	9.67	22.4
502	0.04527	0.0	11.93	0.0	0.573	6.120	76.7	2.2875	1.0	273.0	21.0	9.08	20.6
503	0.06076	0.0	11.93	0.0	0.573	6.976	91.0	2.1675	1.0	273.0	21.0	5.64	23.9
504	0.10959	0.0	11.93	0.0	0.573	6.794	89.3	2.3889	1.0	273.0	21.0	6.48	22.0
505	0.04741	0.0	11.93	0.0	0.573	6.030	80.8	2.5050	1.0	273.0	21.0	7.88	11.9

Dep. Variable:	MEDV	R-squared:	0.062
Model:	OLS	Adj. R-squared:	0.061
Method:	Least Squares	F-statistic:	33.58
Date:	Sat, 03 Apr 2021	Prob (F-statistic):	1.21e-08
Time:	11:18:40	Log-Likelihood:	-5319.2
No. Observations:	506	AIC:	1.064e+04
Df Residuals:	504	BIC:	1.065e+04
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[0.025	0.975]
const	1.839e+04	817.389	22.499	0.000	1.68e+04	2e+04
DIS	1091.6130	188.378	5.795	0.000	721.509	1461.717

Omnibus:	139.779	Durbin-Watson:	0.570
Prob(Omnibus):	0.000	Jarque-Bera (JB):	305.104
Skew:	1.466	Prob(JB):	5.59e-67
Kurtosis:	5.424	Cond. No.	9.32

Visualization and Statistical Analysis of Housing Data for Boston, MA

Final for Statistics for Data Science with Python

Rohan Lewis

2021.04.03

I. Setup

1) Packages

2) Load Data

II. EDA Visualizations

1) Median Value of Owner-Occupied Homes

2) Charles River

3) Median Value of Owner-Occupied Homes vs Home Ages

4) Nitric Oxide Concentrations vs Proportion of Non-Retail Business Acres

5) Pupil to Teacher Ratio

III. Statistical Analysis

1) Median Value of Owner-Occupied Homes and Charles River

Levene Test

T-Test for Independent Samples

2) Median Value of Owner-Occupied Homes and Home Ages

Levene Test

ANOVA

3) Nitric Oxide Concentrations and Proportion of Non-Retail Business Acres

Pearson Correlation Test

4) Weighted Distance to Boston Employment Centres and Median Value of Owner-Occupied Homes

Regression Analysis