For my final project, I chose the prostate cancer data from the datasets for "The Elements of Statistical Learning."
import math
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter as SMF
import numpy as np
import pandas as pd
import seaborn as sns
from warnings import filterwarnings as fw
fw("ignore")
prostate = pd.read_table("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/prostate.data", sep = "\t", index_col = 0)
prostate = prostate[['age', 'lbph', 'lcp',
                     'lcavol', 'lweight', 'pgg45',
                     'gleason', 'lpsa', 'svi']].rename(
    columns = {"age": "Age",
               "lbph": "logBPH",
               "lcp": "logCP",
               "lcavol": "logCV",
               "lweight": "logPW",
               "pgg45": "Gleason2",
               "gleason": "Gleason1",
               "lpsa": "logPSA",
               "svi": "SVI"})
prostate
 | Age | logBPH | logCP | logCV | logPW | Gleason2 | Gleason1 | logPSA | SVI
---|---|---|---|---|---|---|---|---|---
1 | 50 | -1.386294 | -1.386294 | -0.579818 | 2.769459 | 0 | 6 | -0.430783 | 0
2 | 58 | -1.386294 | -1.386294 | -0.994252 | 3.319626 | 0 | 6 | -0.162519 | 0
3 | 74 | -1.386294 | -1.386294 | -0.510826 | 2.691243 | 20 | 7 | -0.162519 | 0
4 | 58 | -1.386294 | -1.386294 | -1.203973 | 3.282789 | 0 | 6 | -0.162519 | 0
5 | 62 | -1.386294 | -1.386294 | 0.751416 | 3.432373 | 0 | 6 | 0.371564 | 0
... | ... | ... | ... | ... | ... | ... | ... | ... | ...
93 | 68 | -1.386294 | 1.321756 | 2.830268 | 3.876396 | 60 | 7 | 4.385147 | 1
94 | 44 | -1.386294 | 2.169054 | 3.821004 | 3.896909 | 40 | 7 | 4.684443 | 1
95 | 52 | -1.386294 | 2.463853 | 2.907447 | 3.396185 | 10 | 7 | 5.143124 | 1
96 | 68 | 1.558145 | 1.558145 | 2.882564 | 3.773910 | 80 | 7 | 5.477509 | 1
97 | 68 | 0.438255 | 2.904165 | 3.471966 | 3.974998 | 20 | 7 | 5.582932 | 1
97 rows × 9 columns
The objective of this study is to predict three outcomes from the remaining variables: the Gleason score, the risk level associated with prostate-specific antigen (PSA), and whether seminal vesicle invasion (SVI) occurred.
There are no missing values, and several variables have already been log transformed.
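As a quick sanity check of that claim (a minimal sketch of my own, not part of the original analysis), the missing-value counts and column types can be printed directly:
#Sanity check (not in the original analysis): confirm no missing values and inspect column types.
print(prostate.isna().sum())
print(prostate.dtypes)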
sns.set_style('darkgrid')
#Plots a histogram of df[var] on axis ax with b bins and color c; custom x-tick labels for SVI and Gleason1.
def hist_helper(df, ax, var, name, b, c):
    ax.hist(df[var],
            bins = b,
            color = [c],
            stacked = True)
    ax.set_title("Distribution of " + name, fontsize = 18)
    ax.set_xlabel(xlabel = name, fontsize = 13)
    if (var == 'SVI'):
        plt.sca(ax)
        plt.xticks(ticks = [0.25, 0.75], labels = ['No', 'Yes'], ha = 'center', rotation = 45)
    elif (var == 'Gleason1'):
        plt.sca(ax)
        plt.xticks(ticks = [6.375, 7.125, 7.875, 8.625], labels = [6, 7, 8, 9])
    ax.set_ylabel(ylabel = "Frequency", fontsize = 14)
    ax.yaxis.set_major_formatter(SMF('{x: ,.0f}'))
    ax.tick_params(axis = 'both', labelsize = 12)
fig, ax = plt.subplots(3, 3, figsize = (16, 12))
c1 = '#39568CFF'
c2 = '#29AF7FFF'
hist_helper(prostate, ax[0][0], 'Age', 'Age', 10, c1)
hist_helper(prostate, ax[0][1], 'logBPH', 'log(Benign Prostatic Hyperplasia)', 10, c2)
hist_helper(prostate, ax[0][2], 'logCP', 'log(Capsular Penetration)', 10, c1)
hist_helper(prostate, ax[1][0], 'logCV', 'log(Cancer Volume)', 10, c2)
hist_helper(prostate, ax[1][1], 'logPW', 'log(Prostate Weight)', 10, c1)
hist_helper(prostate, ax[1][2], 'Gleason2', '% that is Gleason 4 or 5', 10, c2)
hist_helper(prostate, ax[2][0], 'Gleason1', 'Gleason Score', 4, c1)
hist_helper(prostate, ax[2][1], 'logPSA', 'log(Prostate-Specific Antigen)', 10, c2)
hist_helper(prostate, ax[2][2], 'SVI', 'Seminal Vesicle Invasion', 2, c1)
fig.tight_layout()
fig.savefig('Supervised1 - Histograms.png');
float_columns = [x for x in prostate.columns if x not in ['Gleason1', 'SVI']]
prostate[float_columns].skew().sort_values()
Age        -0.828476
logCV      -0.250304
logPSA     -0.000434
logPW       0.063541
logBPH      0.133813
logCP       0.728634
Gleason2    0.968105
dtype: float64
Age is left skewed. logCV, logPSA, and logPW have been appropriately log transformed and are close to normally distributed. logBPH, logCP, and Gleason2 each have over 40% of their observations in the lowest-value bin. Gleason1 and SVI are both categorical and unbalanced.
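The 40% figure can be verified against the same 10 equal-width bins used in the histograms; a minimal sketch (my own check, not part of the original analysis):
#Share of observations falling in the lowest of 10 equal-width bins (my own check, not in the original).
for col in ['logBPH', 'logCP', 'Gleason2']:
    counts, edges = np.histogram(prostate[col], bins = 10)
    print(col, round(counts[0] / counts.sum(), 3))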
#https://seaborn.pydata.org/examples/many_pairwise_correlations.html
sns.set_theme(style = "white")
# Compute the correlation matrix.
corr = prostate.corr()
#Maximum and minimum.
vmax = corr[corr < 1].max().max()
vmin = corr.min().min()
# Generate a mask for the upper triangle.
mask = np.triu(np.ones_like(corr, dtype = bool))
# Set up the matplotlib figure.
fig, ax = plt.subplots(figsize = (20, 15))
# Draw the heatmap with the mask and correct aspect ratio.
sns.heatmap(corr,
            mask = mask,
            cmap = 'viridis_r',
            vmax = vmax,
            vmin = vmin,
            square = True,
            linewidths = .5,
            cbar_kws = {"shrink": .7})
plt.title("Correlation of Prostate Predictors", x = .6, y = .9, fontsize = 24)
xl, xt = plt.xticks()
plt.xticks(ticks = xl[0:-1],
           labels = xt[0:-1],
           fontsize = 18,
           ha = 'center',
           rotation = 45);
yl, yt = plt.yticks()
plt.yticks(ticks = yl[1:],
           labels = yt[1:],
           fontsize = 18,
           rotation = 45)
fig.savefig('Supervised2 - Correlations.png', bbox_inches = 'tight', pad_inches = 0.05);
Gleason1 and Gleason2 have the highest correlation, at around 0.75. logCV and logPSA, logCP and logCV, and logCP and SVI also have relatively high correlations. No variables will be removed, as multicollinearity is not a major concern for the classification methods used here.
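As a check on the pairs called out above (a sketch of my own that reuses the corr matrix computed earlier, not part of the original analysis), the strongest off-diagonal correlations can be listed directly:
#Five strongest pairwise correlations, upper triangle only (my own check, not in the original).
upper = corr.where(np.triu(np.ones(corr.shape, dtype = bool), k = 1))
print(upper.stack().sort_values(ascending = False).head(5))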
Each outcome variable must first be converted into classification labels. When Gleason1 is set as the outcome variable, scores of 8 and 9 are grouped into a single class. When logPSA is set as the outcome variable, it is binned into clinically motivated risk categories; the chart below summarizes the corresponding labels created. SVI is already binary and is used as-is.
Prostate-Specific Antigen (ng/mL) | log(PSA) | Output Classification Label |
---|---|---|
$< 4$ | $< 1.3863$ | Low Risk |
$4 - 10$ | $1.3863 - 2.3026$ | Borderline Risk |
$> 10$ | $> 2.3026$ | High Risk |
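The log thresholds in the table are just the natural logs of the clinical PSA cutoffs of 4 and 10 ng/mL, which can be confirmed in one line (my own check, not in the original):
#ln(4) ≈ 1.3863 and ln(10) ≈ 2.3026, matching the thresholds above.
print(round(math.log(4), 4), round(math.log(10), 4))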
prostate['y_Gleason'] = prostate['Gleason1'].replace([6,7,8,9], [0,1,2,2])
#Maps logPSA to a risk category: 0 = Low, 1 = Borderline, 2 = High.
def psa_risk(p):
    if (p < 1.3863):
        v = 0
    elif (p > 2.3026):
        v = 2
    else:
        v = 1
    return(v)
prostate['y_PSA'] = prostate['logPSA'].apply(lambda p: psa_risk(p))
prostate['y_SVI'] = prostate['SVI']
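Before splitting, the balance of the three constructed labels is worth a quick look (a sketch of my own, not part of the original analysis):
#Class counts for each constructed outcome label (my own check, not in the original).
for col in ['y_Gleason', 'y_PSA', 'y_SVI']:
    print(prostate[col].value_counts().sort_index())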
The data will be split into training (70%) and test (30%) sets. Then, for each of the three outcome variables, the appropriate columns will be kept as predictors and the corresponding label will be set as the outcome.
from sklearn.model_selection import train_test_split as TTS
train, test = TTS(prostate, test_size = 0.3, random_state = 31313)
non_pred = ['Age', 'logBPH', 'logCP', 'logCV', 'logPW', 'Gleason2']
XG_train = train[non_pred + ['logPSA', 'SVI']]
XG_test = test[non_pred + ['logPSA', 'SVI']]
yG_train = train['y_Gleason'].values
yG_test = test['y_Gleason'].values
XP_train = train[non_pred + ['Gleason1', 'SVI']]
XP_test = test[non_pred + ['Gleason1', 'SVI']]
yP_train = train['y_PSA'].values
yP_test = test['y_PSA'].values
XS_train = train[non_pred + ['Gleason1', 'logPSA']]
XS_test = test[non_pred + ['Gleason1', 'logPSA']]
yS_train = train['y_SVI'].values
yS_test = test['y_SVI'].values
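A quick shape check confirms the 70/30 split (my own sketch, not part of the original analysis):
#With 97 rows, a 30% test size gives 67 training rows and 30 test rows.
print(train.shape, test.shape)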
I am using three supervised learning algorithms: Random Forest, K-Nearest Neighbors (KNN), and a Support Vector Machine (SVM) with a polynomial kernel.
Each model will be run through a validation curve to tune its key parameter/hyperparameter: maximum depth for the Random Forest, the number of neighbors for KNN, and C for the SVM.
Data passed to KNN will always be standardized beforehand.
In addition, each model will be fit to all three training sets.
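Since KNN is distance-based, the code below standardizes its inputs by hand each time. An equivalent approach (a sketch under my own assumptions, not what the original code does) is to wrap the scaler and the classifier in a scikit-learn Pipeline, so that cross-validation refits the scaler on each training fold:
#Illustrative alternative (not used below): let a Pipeline handle the scaling for KNN.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
knn_pipeline = Pipeline([('scale', StandardScaler()),
                         ('knn', KNeighborsClassifier())])
#knn_pipeline.fit(XG_train, yG_train) would then be called on the unscaled predictors directly.
With a Pipeline, the validation-curve parameter name would become 'knn__n_neighbors' instead of 'n_neighbors'.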
from sklearn.model_selection import validation_curve as VC
#Graphs the validation curve of the train and validation sets given a model, X, y, the model name, and a figure number f.
def model_vc(model, X, y, name, f):
    #Random Forest.
    if (name == "Random Forest"):
        param_name = "max_depth"
        param_range = list(range(2, 11))
        proper = "Maximum Depth"
    #K-Nearest Neighbor.
    elif name == "K-Nearest Neighbor":
        param_name = "n_neighbors"
        param_range = [n for n in range(1, 44)]
        proper = "# of Neighbors"
    #SVM.
    else:
        param_name = "C"
        param_range = [10**n for n in range(-5, 6)]
        proper = "C"
    #Validation Curve with 3-fold.
    train_scores, valid_scores = VC(model,
                                    X,
                                    y,
                                    param_name = param_name,
                                    param_range = param_range,
                                    cv = 3)
    #Get Accuracies.
    train_list = []
    valid_list = []
    for p in range(len(param_range)):
        train_list.append(sum(train_scores[p]) / 3)
        valid_list.append(sum(valid_scores[p]) / 3)
    #Plot.
    fig, ax = plt.subplots(figsize = (10, 7))
    ax.plot(param_range, train_list, c = 'b', lw = 3, label = "Train Set")
    ax.plot(param_range, valid_list, c = 'g', lw = 3, label = "Validation Set")
    #Title.
    ax.set_title("Train and Validation Set Accuracies", fontsize = 22)
    #X-Axis.
    ax.set_xlabel(xlabel = proper, fontsize = 18)
    if (name == "Random Forest"):
        ax.set_xticks(param_range)
    elif (name == "K-Nearest Neighbor"):
        ax.set_xticks([5*x for x in range(10)])
    else:
        ax.set_xscale('log')
        proper = "Hyperparameter C Value"
        ax.set_xlabel(xlabel = proper, fontsize = 18)
    #Y-Axis.
    ax.set_ylabel(ylabel = "Accuracy", fontsize = 18)
    ax.yaxis.set_major_formatter(SMF('{x: 0.5f}'))
    ax.tick_params(axis = 'both', labelsize = 16)
    #Legend and Footnote.
    ax.legend(fontsize = 18, loc = 'center right')
    plt.figtext(0.9, 0, "3-fold cross-validation values were averaged.", ha = 'right', fontsize = 12)
    fig.savefig("Supervised" + f + ' - ' + name + '.png');
from sklearn.ensemble import RandomForestClassifier as RFC
model = RFC(n_estimators = 10,
random_state = 31313)
model_vc(model, XG_train, yG_train, "Random Forest", '3')
A maximum depth of 2 will be used.
dtrf_gs_model = RFC(n_estimators = 10,
max_depth = 2,
random_state = 31313).fit(XG_train, yG_train)
model_vc(model, XP_train, yP_train, "Random Forest", '4')
A maximum depth of 6 will be used.
dtrf_psa_model = RFC(n_estimators = 10,
max_depth = 6,
random_state = 31313).fit(XP_train, yP_train)
model_vc(model, XS_train, yS_train, "Random Forest", '5')
A maximum depth of 3 will be used.
dtrf_svi_model = RFC(n_estimators = 10,
max_depth = 3,
random_state = 31313).fit(XS_train, yS_train)
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.preprocessing import StandardScaler as SS
model = KNC()
model_vc(model, SS().fit_transform(XG_train), yG_train, "K-Nearest Neighbor", '6')
$N = 11$ will be used.
knn_gs_model = KNC(n_neighbors = 11).fit(SS().fit_transform(XG_train), yG_train)
model_vc(model, SS().fit_transform(XP_train), yP_train, "K-Nearest Neighbor", '7')
$N = 15$ will be used.
knn_psa_model = KNC(n_neighbors = 15).fit(SS().fit_transform(XP_train), yP_train)
model_vc(model, SS().fit_transform(XS_train), yS_train, "K-Nearest Neighbor", '8')
$N = 3$ will be used.
knn_svi_model = KNC(n_neighbors = 3).fit(SS().fit_transform(XS_train), yS_train)
from sklearn.svm import SVC
model = SVC(kernel = 'poly', degree = 3)
model_vc(model, XG_train, yG_train, "SVM (Polynomial Degree 3)", '9')
$C = 10^{2}$ will be used.
poly_gs_model = SVC(kernel = 'poly',
degree = 3,
C = 1e2).fit(XG_train, yG_train)
model_vc(model, XP_train, yP_train, "SVM (Polynomial Degree 3)", '10')
$C = 10^{2}$ will be used.
poly_psa_model = SVC(kernel = 'poly',
degree = 3,
C = 1e2).fit(XP_train, yP_train)
model_vc(model, XS_train, yS_train, "SVM (Polynomial Degree 3)", '11')
$C = 10^{2}$ will be used.
poly_svi_model = SVC(kernel = 'poly',
degree = 3,
C = 1e2).fit(XS_train, yS_train)
from sklearn import metrics as m
from sklearn.metrics import accuracy_score as ac
from sklearn.metrics import precision_score as ps
from sklearn.metrics import recall_score as rs
from sklearn.metrics import f1_score as fs
#Returns the accuracy, precision, recall, and F1 score for each model on the train and test sets.
def summary(Xys, models, outcome):
    accuracy = []
    precision = []
    recall = []
    f1s = []
    #Calculate scores from each model and append to lists.
    for i in range(3):
        for Xy in Xys:
            X = Xy[0]
            #The KNN model (index 1) expects standardized inputs.
            if (i == 1):
                X = SS().fit_transform(X)
            y_pred = models[i].predict(X)
            accuracy.append(ac(Xy[1], y_pred))
            precision.append(ps(Xy[1], y_pred, average = 'weighted'))
            recall.append(rs(Xy[1], y_pred, average = 'weighted'))
            f1s.append(fs(Xy[1], y_pred, average = 'weighted'))
    #Names.
    dtrf = "Decision Tree/Random Forest"
    knn = "K-Nearest Neighbor"
    svm = "Support Vector Machine"
    train = ", Train Set"
    test = ", Test Set"
    #Output dataframe.
    data = {"Algorithm": [dtrf + train, dtrf + test,
                          knn + train, knn + test,
                          svm + train, svm + test],
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1s}
    report = pd.DataFrame(data = data).set_index("Algorithm")
    return(report.style.set_caption("Predicting " + outcome))
summary([(XG_train, yG_train),
(XG_test, yG_test)],
[dtrf_gs_model, knn_gs_model, poly_gs_model],
"Gleason Score")
Predicting Gleason Score

Algorithm | Accuracy | Precision | Recall | F1-Score
---|---|---|---|---
Decision Tree/Random Forest, Train Set | 0.940299 | 0.886720 | 0.940299 | 0.912061
Decision Tree/Random Forest, Test Set | 0.933333 | 0.872464 | 0.933333 | 0.901515
K-Nearest Neighbor, Train Set | 0.761194 | 0.723105 | 0.761194 | 0.735301
K-Nearest Neighbor, Test Set | 0.666667 | 0.715556 | 0.666667 | 0.654040
Support Vector Machine, Train Set | 0.970149 | 0.971763 | 0.970149 | 0.965589
Support Vector Machine, Test Set | 0.966667 | 0.968182 | 0.966667 | 0.961499
summary([(XP_train, yP_train),
(XP_test, yP_test)],
[dtrf_psa_model, knn_psa_model, poly_psa_model],
"Prostate-Specific Antigen")
Predicting Prostate-Specific Antigen

Algorithm | Accuracy | Precision | Recall | F1-Score
---|---|---|---|---
Decision Tree/Random Forest, Train Set | 0.955224 | 0.956522 | 0.955224 | 0.955366
Decision Tree/Random Forest, Test Set | 0.766667 | 0.772222 | 0.766667 | 0.767273
K-Nearest Neighbor, Train Set | 0.716418 | 0.702927 | 0.716418 | 0.697149
K-Nearest Neighbor, Test Set | 0.700000 | 0.764975 | 0.700000 | 0.720939
Support Vector Machine, Train Set | 0.761194 | 0.765320 | 0.761194 | 0.761288
Support Vector Machine, Test Set | 0.666667 | 0.754497 | 0.666667 | 0.696667
summary([(XS_train, yS_train),
(XS_test, yS_test)],
[dtrf_svi_model, knn_svi_model, poly_svi_model],
"Seminal Vesicle Invasion")
Predicting Seminal Vesicle Invasion

Algorithm | Accuracy | Precision | Recall | F1-Score
---|---|---|---|---
Decision Tree/Random Forest, Train Set | 0.970149 | 0.971215 | 0.970149 | 0.969177
Decision Tree/Random Forest, Test Set | 0.833333 | 0.828157 | 0.833333 | 0.829630
K-Nearest Neighbor, Train Set | 0.910448 | 0.917260 | 0.910448 | 0.912801
K-Nearest Neighbor, Test Set | 0.900000 | 0.905820 | 0.900000 | 0.901778
Support Vector Machine, Train Set | 0.925373 | 0.924038 | 0.925373 | 0.921515
Support Vector Machine, Test Set | 0.833333 | 0.828157 | 0.833333 | 0.829630
from sklearn.metrics import plot_confusion_matrix as PCM
import matplotlib
#Plots a 3x2 grid of confusion matrices: one row per model, with train and test set columns.
def confusion(titles, outcome, models, Xs, ys, labels, f):
    data = ["Train Set", "Test Set"]
    titles = ["Random Forest (10 Trees, Maximum Depth = " + str(titles[0]) + ")",
              "K-Nearest Neighbors (N = " + str(titles[1]) + ")",
              "Support Vector Machine (Polynomial Kernel, Degree 3)"]
    fig, axes = plt.subplots(nrows = 3, ncols = 2, figsize = (15, 23))
    fig.suptitle(outcome, fontsize = 30, y = 0.92)
    plt.figtext(x = 0.5, y = 0.88, ha = 'center', s = titles[0], fontsize = 26)
    plt.figtext(x = 0.5, y = 0.62, ha = 'center', s = titles[1], fontsize = 26)
    plt.figtext(x = 0.5, y = 0.35, ha = 'center', s = titles[2], fontsize = 26)
    matplotlib.rcParams.update({'font.size': 30})
    for r in range(3):
        for c in range(2):
            ax = axes[r][c]
            X = Xs[c]
            #The KNN model (row 1) expects standardized inputs.
            if (r == 1):
                X = SS().fit_transform(X)
            PCM(estimator = models[r],
                X = X,
                y_true = ys[c],
                ax = ax,
                display_labels = labels,
                include_values = True)
            ax.set_title(data[c], fontsize = 20)
            ax.set_xlabel(xlabel = "Predicted Value", fontsize = 18)
            ax.set_ylabel(ylabel = "True Value", fontsize = 18)
            ax.tick_params(axis = 'both', labelsize = 16)
            ax.images[-1].colorbar.remove()
            ax.grid(False)
    fig.savefig("Supervised" + f + " Matrix.png");
confusion(titles = (2, 11),
outcome = "Gleason Score",
models = [dtrf_gs_model, knn_gs_model, poly_gs_model],
Xs = (XG_train, XG_test),
ys = (yG_train, yG_test),
labels = ["6", "7", "8 or 9"],
f = "15 - Gleason")
confusion(titles = (6, 15),
outcome = "Risk Associated with Prostate-Specific Antigen",
models = [dtrf_psa_model, knn_psa_model, poly_psa_model],
Xs = (XP_train, XP_test),
ys = (yP_train, yP_test),
labels = ["Low", "Borderline", "High"],
f = "16 - PSA")
confusion(titles = (3, 3),
outcome = "Seminal Vesicle Invasion",
models = [dtrf_svi_model, knn_svi_model, poly_svi_model],
Xs = (XS_train, XS_test),
ys = (yS_train, yS_test),
labels = ["No", "Yes"],
f = "17 - SVI")