For my final project, I chose the prostate cancer data from the datasets for "The Elements of Statistical Learning."
import math
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter as SMF
import numpy as np
import pandas as pd
import seaborn as sns
from warnings import filterwarnings as fw
fw("ignore")
prostate = pd.read_table("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/prostate.data", sep = "\t", index_col = 0)
prostate = prostate[['age', 'lbph', 'lcp',
                     'lcavol', 'lweight', 'pgg45',
                     'gleason', 'lpsa', 'svi']].rename(
    columns = {"age": "Age",
               "lbph": "logBPH",
               "lcp": "logCP",
               "lcavol": "logCV",
               "lweight": "logPW",
               "pgg45": "Gleason2",
               "gleason": "Gleason1",
               "lpsa": "logPSA",
               "svi": "SVI"})
prostate
 | Age | logBPH | logCP | logCV | logPW | Gleason2 | Gleason1 | logPSA | SVI
---|---|---|---|---|---|---|---|---|---
1 | 50 | -1.386294 | -1.386294 | -0.579818 | 2.769459 | 0 | 6 | -0.430783 | 0
2 | 58 | -1.386294 | -1.386294 | -0.994252 | 3.319626 | 0 | 6 | -0.162519 | 0
3 | 74 | -1.386294 | -1.386294 | -0.510826 | 2.691243 | 20 | 7 | -0.162519 | 0
4 | 58 | -1.386294 | -1.386294 | -1.203973 | 3.282789 | 0 | 6 | -0.162519 | 0
5 | 62 | -1.386294 | -1.386294 | 0.751416 | 3.432373 | 0 | 6 | 0.371564 | 0
... | ... | ... | ... | ... | ... | ... | ... | ... | ...
93 | 68 | -1.386294 | 1.321756 | 2.830268 | 3.876396 | 60 | 7 | 4.385147 | 1
94 | 44 | -1.386294 | 2.169054 | 3.821004 | 3.896909 | 40 | 7 | 4.684443 | 1
95 | 52 | -1.386294 | 2.463853 | 2.907447 | 3.396185 | 10 | 7 | 5.143124 | 1
96 | 68 | 1.558145 | 1.558145 | 2.882564 | 3.773910 | 80 | 7 | 5.477509 | 1
97 | 68 | 0.438255 | 2.904165 | 3.471966 | 3.974998 | 20 | 7 | 5.582932 | 1
97 rows × 9 columns
The objective of this study is to predict three outcomes from the remaining variables: the Gleason score, the risk level associated with prostate-specific antigen (PSA), and whether seminal vesicle invasion (SVI) occurred.
There are no missing values, and several variables have already been log transformed.
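As a quick sanity check of that claim (a minimal sketch of my own, not part of the original analysis), the missing-value counts and column types can be printed directly:
#Sanity check (not in the original analysis): confirm no missing values and inspect column types.
print(prostate.isna().sum())
print(prostate.dtypes)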
sns.set_style('darkgrid')
#Plots a histogram of df[var] on axis ax with b bins and color c; custom x-tick labels for SVI and Gleason1.
def hist_helper(df, ax, var, name, b, c):
    ax.hist(df[var],
            bins = b,
            color = [c],
            stacked = True)
    ax.set_title("Distribution of " + name, fontsize = 18)
    ax.set_xlabel(xlabel = name, fontsize = 13)
    if (var == 'SVI'):
        plt.sca(ax)
        plt.xticks(ticks = [0.25, 0.75], labels = ['No', 'Yes'], ha = 'center', rotation = 45)
    elif (var == 'Gleason1'):
        plt.sca(ax)
        plt.xticks(ticks = [6.375, 7.125, 7.875, 8.625], labels = [6, 7, 8, 9])
    ax.set_ylabel(ylabel = "Frequency", fontsize = 14)
    ax.yaxis.set_major_formatter(SMF('{x: ,.0f}'))
    ax.tick_params(axis = 'both', labelsize = 12)
fig, ax = plt.subplots(3, 3, figsize = (16, 12))
c1 = '#39568CFF'
c2 = '#29AF7FFF'
hist_helper(prostate, ax[0][0], 'Age', 'Age', 10, c1)
hist_helper(prostate, ax[0][1], 'logBPH', 'log(Benign Prostatic Hyperplasia)', 10, c2)
hist_helper(prostate, ax[0][2], 'logCP', 'log(Capsular Penetration)', 10, c1)
hist_helper(prostate, ax[1][0], 'logCV', 'log(Cancer Volume)', 10, c2)
hist_helper(prostate, ax[1][1], 'logPW', 'log(Prostate Weight)', 10, c1)
hist_helper(prostate, ax[1][2], 'Gleason2', '% that is Gleason 4 or 5', 10, c2)
hist_helper(prostate, ax[2][0], 'Gleason1', 'Gleason Score', 4, c1)
hist_helper(prostate, ax[2][1], 'logPSA', 'log(Prostate-Specific Antigen)', 10, c2)
hist_helper(prostate, ax[2][2], 'SVI', 'Seminal Vesicle Invasion', 2, c1)
fig.tight_layout()
fig.savefig('Supervised1 - Histograms.png');
float_columns = [x for x in prostate.columns if x not in ['Gleason1', 'SVI']]
prostate[float_columns].skew().sort_values()
Age        -0.828476
logCV      -0.250304
logPSA     -0.000434
logPW       0.063541
logBPH      0.133813
logCP       0.728634
Gleason2    0.968105
dtype: float64
Age is left skewed. logCV, logPSA, and logPW have been appropriately log transformed and are close to normally distributed. logBPH, logCP, and Gleason2 each have over 40% of their observations in the lowest-value bin. Gleason1 and SVI are both categorical and unbalanced.
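The 40% figure can be verified against the same 10 equal-width bins used in the histograms; a minimal sketch (my own check, not part of the original analysis):
#Share of observations falling in the lowest of 10 equal-width bins (my own check, not in the original).
for col in ['logBPH', 'logCP', 'Gleason2']:
    counts, edges = np.histogram(prostate[col], bins = 10)
    print(col, round(counts[0] / counts.sum(), 3))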
#https://seaborn.pydata.org/examples/many_pairwise_correlations.html
sns.set_theme(style = "white")
# Compute the correlation matrix.
corr = prostate.corr()
#Maximum and minimum.
vmax = corr[corr < 1].max().max()
vmin = corr.min().min()
# Generate a mask for the upper triangle.
mask = np.triu(np.ones_like(corr, dtype = bool))
# Set up the matplotlib figure.
fig, ax = plt.subplots(figsize = (20, 15))
# Draw the heatmap with the mask and correct aspect ratio.
sns.heatmap(corr,
            mask = mask,
            cmap = 'viridis_r',
            vmax = vmax,
            vmin = vmin,
            square = True,
            linewidths = .5,
            cbar_kws = {"shrink": .7})
plt.title("Correlation of Prostate Predictors", x = .6, y = .9, fontsize = 24)
xl, xt = plt.xticks()
plt.xticks(ticks = xl[0:-1],
           labels = xt[0:-1],
           fontsize = 18,
           ha = 'center',
           rotation = 45);
yl, yt = plt.yticks()
plt.yticks(ticks = yl[1:],
           labels = yt[1:],
           fontsize = 18,
           rotation = 45)
fig.savefig('Supervised2 - Correlations.png', bbox_inches = 'tight', pad_inches = 0.05);
Gleason1 and Gleason2 have the highest correlation, at around 0.75. logCV and logPSA, logCP and logCV, and logCP and SVI also have relatively high correlations. No variables will be removed, as multicollinearity is not a major concern for the classification methods used here.
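As a check on the pairs called out above (a sketch of my own that reuses the corr matrix computed earlier, not part of the original analysis), the strongest off-diagonal correlations can be listed directly:
#Five strongest pairwise correlations, upper triangle only (my own check, not in the original).
upper = corr.where(np.triu(np.ones(corr.shape, dtype = bool), k = 1))
print(upper.stack().sort_values(ascending = False).head(5))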
Each outcome variable must first be converted into classification labels. When Gleason1 is set as the outcome variable, scores of 8 and 9 are grouped into a single class. When logPSA is set as the outcome variable, it is binned into clinically motivated risk categories; the chart below summarizes the corresponding labels created. SVI is already binary and is used as-is.
Prostate-Specific Antigen (ng/mL) | log(PSA) | Output Classification Label |
---|---|---|
$< 4$ | $< 1.3863$ | Low Risk |
$4 - 10$ | $1.3863 - 2.3026$ | Borderline Risk |
$> 10$ | $> 2.3026$ | High Risk |
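The log thresholds in the table are just the natural logs of the clinical PSA cutoffs of 4 and 10 ng/mL, which can be confirmed in one line (my own check, not in the original):
#ln(4) ≈ 1.3863 and ln(10) ≈ 2.3026, matching the thresholds above.
print(round(math.log(4), 4), round(math.log(10), 4))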
prostate['y_Gleason'] = prostate['Gleason1'].replace([6,7,8,9], [0,1,2,2])
#Maps logPSA to a risk category: 0 = Low, 1 = Borderline, 2 = High.
def psa_risk(p):
    if (p < 1.3863):
        v = 0
    elif (p > 2.3026):
        v = 2
    else:
        v = 1
    return(v)
prostate['y_PSA'] = prostate['logPSA'].apply(lambda p: psa_risk(p))
prostate['y_SVI'] = prostate['SVI']
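Before splitting, the balance of the three constructed labels is worth a quick look (a sketch of my own, not part of the original analysis):
#Class counts for each constructed outcome label (my own check, not in the original).
for col in ['y_Gleason', 'y_PSA', 'y_SVI']:
    print(prostate[col].value_counts().sort_index())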
The data will be split into training (70%) and test (30%) sets. Then, for each of the three outcome variables, the appropriate columns will be kept as predictors and the corresponding label will be set as the outcome.
from sklearn.model_selection import train_test_split as TTS
train, test = TTS(prostate, test_size = 0.3, random_state = 31313)
non_pred = ['Age', 'logBPH', 'logCP', 'logCV', 'logPW', 'Gleason2']
XG_train = train[non_pred + ['logPSA', 'SVI']]
XG_test = test[non_pred + ['logPSA', 'SVI']]
yG_train = train['y_Gleason'].values
yG_test = test['y_Gleason'].values
XP_train = train[non_pred + ['Gleason1', 'SVI']]
XP_test = test[non_pred + ['Gleason1', 'SVI']]
yP_train = train['y_PSA'].values
yP_test = test['y_PSA'].values
XS_train = train[non_pred + ['Gleason1', 'logPSA']]
XS_test = test[non_pred + ['Gleason1', 'logPSA']]
yS_train = train['y_SVI'].values
yS_test = test['y_SVI'].values
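A quick shape check confirms the 70/30 split (my own sketch, not part of the original analysis):
#With 97 rows, a 30% test size gives 67 training rows and 30 test rows.
print(train.shape, test.shape)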
I am using three supervised learning algorithms: Random Forest, K-Nearest Neighbors (KNN), and a Support Vector Machine (SVM) with a polynomial kernel.
Each model will be run through a validation curve to tune its key parameter/hyperparameter: maximum depth for the Random Forest, the number of neighbors for KNN, and C for the SVM.
Data passed to KNN will always be standardized beforehand.
In addition, each model will be fit to all three training sets.
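Since KNN is distance-based, the code below standardizes its inputs by hand each time. An equivalent approach (a sketch under my own assumptions, not what the original code does) is to wrap the scaler and the classifier in a scikit-learn Pipeline, so that cross-validation refits the scaler on each training fold:
#Illustrative alternative (not used below): let a Pipeline handle the scaling for KNN.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
knn_pipeline = Pipeline([('scale', StandardScaler()),
                         ('knn', KNeighborsClassifier())])
#knn_pipeline.fit(XG_train, yG_train) would then be called on the unscaled predictors directly.
With a Pipeline, the validation-curve parameter name would become 'knn__n_neighbors' instead of 'n_neighbors'.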
from sklearn.model_selection import validation_curve as VC
#Graphs the validation curve of the train and validation sets given a model, X, y, the model name, and a figure number f.
def model_vc(model, X, y, name, f):
    #Random Forest.
    if (name == "Random Forest"):
        param_name = "max_depth"
        param_range = list(range(2, 11))
        proper = "Maximum Depth"
    #K-Nearest Neighbor.
    elif name == "K-Nearest Neighbor":
        param_name = "n_neighbors"
        param_range = [n for n in range(1, 44)]
        proper = "# of Neighbors"
    #SVM.
    else:
        param_name = "C"
        param_range = [10**n for n in range(-5, 6)]
        proper = "C"
    #Validation Curve with 3-fold.
    train_scores, valid_scores = VC(model,
                                    X,
                                    y,
                                    param_name = param_name,
                                    param_range = param_range,
                                    cv = 3)
    #Get Accuracies.
    train_list = []
    valid_list = []
    for p in range(len(param_range)):
        train_list.append(sum(train_scores[p]) / 3)
        valid_list.append(sum(valid_scores[p]) / 3)
    #Plot.
    fig, ax = plt.subplots(figsize = (10, 7))
    ax.plot(param_range, train_list, c = 'b', lw = 3, label = "Train Set")
    ax.plot(param_range, valid_list, c = 'g', lw = 3, label = "Validation Set")
    #Title.
    ax.set_title("Train and Validation Set Accuracies", fontsize = 22)
    #X-Axis.
    ax.set_xlabel(xlabel = proper, fontsize = 18)
    if (name == "Random Forest"):
        ax.set_xticks(param_range)
    elif (name == "K-Nearest Neighbor"):
        ax.set_xticks([5*x for x in range(10)])
    else:
        ax.set_xscale('log')
        proper = "Hyperparameter C Value"
        ax.set_xlabel(xlabel = proper, fontsize = 18)
    #Y-Axis.
    ax.set_ylabel(ylabel = "Accuracy", fontsize = 18)
    ax.yaxis.set_major_formatter(SMF('{x: 0.5f}'))
    ax.tick_params(axis = 'both', labelsize = 16)
    #Legend and Footnote.
    ax.legend(fontsize = 18, loc = 'center right')
    plt.figtext(0.9, 0, "3-fold cross-validation values were averaged.", ha = 'right', fontsize = 12)
    fig.savefig("Supervised" + f + ' - ' + name + '.png');
from sklearn.ensemble import RandomForestClassifier as RFC
model = RFC(n_estimators = 10,
random_state = 31313)
model_vc(model, XG_train, yG_train, "Random Forest", '3')
A maximum depth of 2 will be used.
dtrf_gs_model = RFC(n_estimators = 10,
max_depth = 2,
random_state = 31313).fit(XG_train, yG_train)
model_vc(model, XP_train, yP_train, "Random Forest", '4')
A maximum depth of 6 will be used.
dtrf_psa_model = RFC(n_estimators = 10,
max_depth = 6,
random_state = 31313).fit(XP_train, yP_train)
model_vc(model, XS_train, yS_train, "Random Forest", '5')
A maximum depth of 3 will be used.
dtrf_svi_model = RFC(n_estimators = 10,
max_depth = 3,
random_state = 31313).fit(XS_train, yS_train)
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.preprocessing import StandardScaler as SS
model = KNC()
model_vc(model, SS().fit_transform(XG_train), yG_train, "K-Nearest Neighbor", '6')
$N = 11$ will be used.
knn_gs_model = KNC(n_neighbors = 11).fit(SS().fit_transform(XG_train), yG_train)
model_vc(model, SS().fit_transform(XP_train), yP_train, "K-Nearest Neighbor", '7')
$N = 15$ will be used.
knn_psa_model = KNC(n_neighbors = 15).fit(SS().fit_transform(XP_train), yP_train)
model_vc(model, SS().fit_transform(XS_train), yS_train, "K-Nearest Neighbor", '8')
$N = 3$ will be used.
knn_svi_model = KNC(n_neighbors = 3).fit(SS().fit_transform(XS_train), yS_train)
from sklearn.svm import SVC
model = SVC(kernel = 'poly', degree = 3)
model_vc(model, XG_train, yG_train, "SVM (Polynomial Degree 3)", '9')
$C = 10^{2}$ will be used.
poly_gs_model = SVC(kernel = 'poly',
degree = 3,
C = 1e2).fit(XG_train, yG_train)
model_vc(model, XP_train, yP_train, "SVM (Polynomial Degree 3)", '10')
$C = 10^{2}$ will be used.
poly_psa_model = SVC(kernel = 'poly',
degree = 3,
C = 1e2).fit(XP_train, yP_train)
model_vc(model, XS_train, yS_train, "SVM (Polynomial Degree 3)", '11')
$C = 10^{2}$ will be used.
poly_svi_model = SVC(kernel = 'poly',
degree = 3,
C = 1e2).fit(XS_train, yS_train)
from sklearn import metrics as m
from sklearn.metrics import accuracy_score as ac
from sklearn.metrics import precision_score as ps
from sklearn.metrics import recall_score as rs
from sklearn.metrics import f1_score as fs
#Returns the accuracy, precision, recall, and F1 score for each model on the train and test sets.
def summary(Xys, models, outcome):
    accuracy = []
    precision = []
    recall = []
    f1s = []
    #Calculate scores from each model and append to lists.
    for i in range(3):
        for Xy in Xys:
            X = Xy[0]
            #The KNN model (index 1) expects standardized inputs.
            if (i == 1):
                X = SS().fit_transform(X)
            y_pred = models[i].predict(X)
            accuracy.append(ac(Xy[1], y_pred))
            precision.append(ps(Xy[1], y_pred, average = 'weighted'))
            recall.append(rs(Xy[1], y_pred, average = 'weighted'))
            f1s.append(fs(Xy[1], y_pred, average = 'weighted'))
    #Names.
    dtrf = "Decision Tree/Random Forest"
    knn = "K-Nearest Neighbor"
    svm = "Support Vector Machine"
    train = ", Train Set"
    test = ", Test Set"
    #Output dataframe.
    data = {"Algorithm": [dtrf + train, dtrf + test,
                          knn + train, knn + test,
                          svm + train, svm + test],
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1s}
    report = pd.DataFrame(data = data).set_index("Algorithm")
    return(report.style.set_caption("Predicting " + outcome))
summary([(XG_train, yG_train),
(XG_test, yG_test)],
[dtrf_gs_model, knn_gs_model, poly_gs_model],
"Gleason Score")
Predicting Gleason Score

Algorithm | Accuracy | Precision | Recall | F1-Score
---|---|---|---|---
Decision Tree/Random Forest, Train Set | 0.940299 | 0.886720 | 0.940299 | 0.912061
Decision Tree/Random Forest, Test Set | 0.933333 | 0.872464 | 0.933333 | 0.901515
K-Nearest Neighbor, Train Set | 0.761194 | 0.723105 | 0.761194 | 0.735301
K-Nearest Neighbor, Test Set | 0.666667 | 0.715556 | 0.666667 | 0.654040
Support Vector Machine, Train Set | 0.970149 | 0.971763 | 0.970149 | 0.965589
Support Vector Machine, Test Set | 0.966667 | 0.968182 | 0.966667 | 0.961499
summary([(XP_train, yP_train),
(XP_test, yP_test)],
[dtrf_psa_model, knn_psa_model, poly_psa_model],
"Prostate-Specific Antigen")
Predicting Prostate-Specific Antigen

Algorithm | Accuracy | Precision | Recall | F1-Score
---|---|---|---|---
Decision Tree/Random Forest, Train Set | 0.955224 | 0.956522 | 0.955224 | 0.955366
Decision Tree/Random Forest, Test Set | 0.766667 | 0.772222 | 0.766667 | 0.767273
K-Nearest Neighbor, Train Set | 0.716418 | 0.702927 | 0.716418 | 0.697149
K-Nearest Neighbor, Test Set | 0.700000 | 0.764975 | 0.700000 | 0.720939
Support Vector Machine, Train Set | 0.761194 | 0.765320 | 0.761194 | 0.761288
Support Vector Machine, Test Set | 0.666667 | 0.754497 | 0.666667 | 0.696667
summary([(XS_train, yS_train),
(XS_test, yS_test)],
[dtrf_svi_model, knn_svi_model, poly_svi_model],
"Seminal Vesicle Invasion")
Predicting Seminal Vesicle Invasion

Algorithm | Accuracy | Precision | Recall | F1-Score
---|---|---|---|---
Decision Tree/Random Forest, Train Set | 0.970149 | 0.971215 | 0.970149 | 0.969177
Decision Tree/Random Forest, Test Set | 0.833333 | 0.828157 | 0.833333 | 0.829630
K-Nearest Neighbor, Train Set | 0.910448 | 0.917260 | 0.910448 | 0.912801
K-Nearest Neighbor, Test Set | 0.900000 | 0.905820 | 0.900000 | 0.901778
Support Vector Machine, Train Set | 0.925373 | 0.924038 | 0.925373 | 0.921515
Support Vector Machine, Test Set | 0.833333 | 0.828157 | 0.833333 | 0.829630
from sklearn.metrics import plot_confusion_matrix as PCM
import matplotlib
#Plots a 3x2 grid of confusion matrices: one row per model, with train and test set columns.
def confusion(titles, outcome, models, Xs, ys, labels, f):
    data = ["Train Set", "Test Set"]
    titles = ["Random Forest (10 Trees, Maximum Depth = " + str(titles[0]) + ")",
              "K-Nearest Neighbors (N = " + str(titles[1]) + ")",
              "Support Vector Machine (Polynomial Kernel, Degree 3)"]
    fig, axes = plt.subplots(nrows = 3, ncols = 2, figsize = (15, 23))
    fig.suptitle(outcome, fontsize = 30, y = 0.92)
    plt.figtext(x = 0.5, y = 0.88, ha = 'center', s = titles[0], fontsize = 26)
    plt.figtext(x = 0.5, y = 0.62, ha = 'center', s = titles[1], fontsize = 26)
    plt.figtext(x = 0.5, y = 0.35, ha = 'center', s = titles[2], fontsize = 26)
    matplotlib.rcParams.update({'font.size': 30})
    for r in range(3):
        for c in range(2):
            ax = axes[r][c]
            X = Xs[c]
            #The KNN model (row 1) expects standardized inputs.
            if (r == 1):
                X = SS().fit_transform(X)
            PCM(estimator = models[r],
                X = X,
                y_true = ys[c],
                ax = ax,
                display_labels = labels,
                include_values = True)
            ax.set_title(data[c], fontsize = 20)
            ax.set_xlabel(xlabel = "Predicted Value", fontsize = 18)
            ax.set_ylabel(ylabel = "True Value", fontsize = 18)
            ax.tick_params(axis = 'both', labelsize = 16)
            ax.images[-1].colorbar.remove()
            ax.grid(False)
    fig.savefig("Supervised" + f + " Matrix.png");
confusion(titles = (2, 11),
outcome = "Gleason Score",
models = [dtrf_gs_model, knn_gs_model, poly_gs_model],
Xs = (XG_train, XG_test),
ys = (yG_train, yG_test),
labels = ["6", "7", "8 or 9"],
f = "15 - Gleason")
confusion(titles = (6, 15),
outcome = "Risk Associated with Prostate-Specific Antigen",
models = [dtrf_psa_model, knn_psa_model, poly_psa_model],
Xs = (XP_train, XP_test),
ys = (yP_train, yP_test),
labels = ["Low", "Borderline", "High"],
f = "16 - PSA")
confusion(titles = (3, 3),
outcome = "Seminal Vesicle Invasion",
models = [dtrf_svi_model, knn_svi_model, poly_svi_model],
Xs = (XS_train, XS_test),
ys = (yS_train, yS_test),
labels = ["No", "Yes"],
f = "17 - SVI")