For my final project, I chose the prostate cancer data from the datasets accompanying "The Elements of Statistical Learning."
import math
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter as SMF
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
import seaborn as sns
from warnings import filterwarnings as fw
fw("ignore")
prostate = pd.read_table("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/prostate.data", sep = "\t", index_col = 0)
prostate = prostate[['age', 'lbph', 'lcp', 'lcavol', 'lweight',
                     'pgg45', 'gleason', 'lpsa', 'svi']]
prostate = prostate.rename(columns = {"age": "Age",
                                      "lbph": "logBPH",
                                      "lcp": "logCP",
                                      "lcavol": "logCV",
                                      "lweight": "logPW",
                                      "pgg45": "Gleason2",
                                      "gleason": "Gleason1",
                                      "lpsa": "logPSA",
                                      "svi": "SVI"})
prostate
| | Age | logBPH | logCP | logCV | logPW | Gleason2 | Gleason1 | logPSA | SVI |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 50 | -1.386294 | -1.386294 | -0.579818 | 2.769459 | 0 | 6 | -0.430783 | 0 |
| 2 | 58 | -1.386294 | -1.386294 | -0.994252 | 3.319626 | 0 | 6 | -0.162519 | 0 |
| 3 | 74 | -1.386294 | -1.386294 | -0.510826 | 2.691243 | 20 | 7 | -0.162519 | 0 |
| 4 | 58 | -1.386294 | -1.386294 | -1.203973 | 3.282789 | 0 | 6 | -0.162519 | 0 |
| 5 | 62 | -1.386294 | -1.386294 | 0.751416 | 3.432373 | 0 | 6 | 0.371564 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 93 | 68 | -1.386294 | 1.321756 | 2.830268 | 3.876396 | 60 | 7 | 4.385147 | 1 |
| 94 | 44 | -1.386294 | 2.169054 | 3.821004 | 3.896909 | 40 | 7 | 4.684443 | 1 |
| 95 | 52 | -1.386294 | 2.463853 | 2.907447 | 3.396185 | 10 | 7 | 5.143124 | 1 |
| 96 | 68 | 1.558145 | 1.558145 | 2.882564 | 3.773910 | 80 | 7 | 5.477509 | 1 |
| 97 | 68 | 0.438255 | 2.904165 | 3.471966 | 3.974998 | 20 | 7 | 5.582932 | 1 |

97 rows × 9 columns
The objective of this study is to cluster men into groups based on metrics related to prostate and reproductive health.
Similarities within clusters and differences between clusters will be discussed.
There are no missing values and several variables have already been log transformed.
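As a quick check (not part of the original analysis), the missing-value claim can be verified directly:
#Count missing values across the whole table; expect 0.
print(prostate.isna().sum().sum())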
sns.set_style('darkgrid')
def hist_helper(df, ax, var, name, b, c):
    #Draw a histogram of one variable on the given axis.
    ax.hist(df[var], bins = b, color = c)
    ax.set_title("Distribution of " + name, fontsize = 18)
    ax.set_xlabel(xlabel = name, fontsize = 13)
    #For the categorical variables, center the tick labels on the bins.
    if (var == 'SVI'):
        plt.sca(ax)
        plt.xticks(ticks = [0.25, 0.75], labels = ['No', 'Yes'], ha = 'center', rotation = 45)
    elif (var == 'Gleason1'):
        plt.sca(ax)
        plt.xticks(ticks = [6.375, 7.125, 7.875, 8.625], labels = [6, 7, 8, 9])
    ax.set_ylabel(ylabel = "Frequency", fontsize = 14)
    ax.yaxis.set_major_formatter(SMF('{x: ,.0f}'))
    ax.tick_params(axis = 'both', labelsize = 12)
fig, ax = plt.subplots(3, 3, figsize = (16, 12))
c1 = '#39568CFF'
c2 = '#29AF7FFF'
hist_helper(prostate, ax[0][0], 'Age', 'Age', 10, c1)
hist_helper(prostate, ax[0][1], 'logBPH', 'log(Benign Prostatic Hyperplasia)', 10, c2)
hist_helper(prostate, ax[0][2], 'logCP', 'log(Capsular Penetration)', 10, c1)
hist_helper(prostate, ax[1][0], 'logCV', 'log(Cancer Volume)', 10, c2)
hist_helper(prostate, ax[1][1], 'logPW', 'log(Prostate Weight)', 10, c1)
hist_helper(prostate, ax[1][2], 'Gleason2', '% that is Gleason 4 or 5', 10, c2)
hist_helper(prostate, ax[2][0], 'Gleason1', 'Gleason Score', 4, c1)
hist_helper(prostate, ax[2][1], 'logPSA', 'log(Prostate-Specific Antigen)', 10, c2)
hist_helper(prostate, ax[2][2], 'SVI', 'Seminal Vesicle Invasion', 2, c1)
fig.tight_layout()
fig.savefig('Unsupervised1 - Histograms.png');
float_columns = [x for x in prostate.columns if x not in ['Gleason1', 'SVI']]
prostate[float_columns].skew().sort_values()
Age        -0.828476
logCV      -0.250304
logPSA     -0.000434
logPW       0.063541
logBPH      0.133813
logCP       0.728634
Gleason2    0.968105
dtype: float64
Age is left skewed. logCV, logPSA, and logPW have been appropriately log transformed and are close to normally distributed. logBPH, logCP, and Gleason2 all have over 40% of their observations in the lowest-valued bin. Gleason1 and SVI are both categorical and unbalanced.
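To double-check the 40% figure, here is a quick sketch (not in the original notebook) using the same ten equal-width bins as the histograms above:
#Share of observations that fall in the lowest of 10 equal-width bins.
for var in ['logBPH', 'logCP', 'Gleason2']:
    counts, edges = np.histogram(prostate[var], bins = 10)
    print(var, round(counts[0] / counts.sum(), 2))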
#https://seaborn.pydata.org/examples/many_pairwise_correlations.html
sns.set_theme(style = "white")
#Compute the correlation matrix.
corr = prostate.corr()
#Color limits: exclude the diagonal of ones when finding the maximum.
vmax = corr[corr < 1].max().max()
vmin = corr.min().min()
#Generate a mask for the upper triangle.
mask = np.triu(np.ones_like(corr, dtype = bool))
#Set up the matplotlib figure.
fig, ax = plt.subplots(figsize = (16, 16))
#Draw the heatmap with the mask and correct aspect ratio.
sns.heatmap(corr,
            mask = mask,
            cmap = 'viridis_r',
            vmax = vmax,
            vmin = vmin,
            square = True,
            linewidths = .5,
            cbar_kws = {"shrink": .7})
plt.title("Correlation of Prostate Predictors", fontsize = 24)
xl, xt = plt.xticks()
plt.xticks(ticks = xl[0:-1],
           labels = xt[0:-1],
           fontsize = 18,
           ha = 'center',
           rotation = 45);
yl, yt = plt.yticks()
plt.yticks(ticks = yl[1:],
           labels = yt[1:],
           fontsize = 18,
           rotation = 45)
fig.savefig('Unsupervised2 - Correlations.png');
Gleason1 and Gleason2 have the highest correlation, at around 0.75. logCV and logPSA, logCP and logCV, and logCP and SVI also have high correlations. No variables will be removed, as multicollinearity is not a concern for this clustering analysis.
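The pairs above can also be pulled out programmatically. This is a convenience sketch I am adding, not part of the original analysis; it reuses the corr matrix from the heatmap cell:
#Keep the strictly lower triangle, flatten to (pair, value), and sort.
lower = corr.where(np.tril(np.ones(corr.shape, dtype = bool), k = -1))
print(lower.stack().sort_values(ascending = False).head(5))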
The data will be split into training (70%) and validation (30%) sets. I am using three unsupervised learning algorithms: K-Means, Hierarchical Agglomerative Clustering, and Mean Shift.
For each of the three models, the cluster results on the training set will be represented as the color in a grid of pair plots for all nine variables.
from sklearn.model_selection import train_test_split as TTS
train, validation = TTS(prostate, test_size = 0.3, random_state = 31313)
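A quick look at the resulting split sizes, added here as a sanity check (not in the original):
#Expect a roughly 70/30 row split of the 97 observations.
print(train.shape, validation.shape)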
#Create a grid of pair plots, colored by cluster labels.
def pair_plot(df, model, name, palette, f):
    #Append cluster labels to the input data.
    temp = pd.DataFrame.copy(df)
    temp[name] = model.labels_
    #Setup.
    sns.set_style('darkgrid')
    sns.set(font_scale = 1.75)
    #Pair plot setup.
    g = sns.pairplot(temp,
                     hue = name,
                     palette = palette,
                     markers = 'd',
                     corner = True,
                     height = 2,
                     plot_kws = {'s': 40,
                                 'alpha': 0.8,
                                 'lw': 0.5,
                                 'edgecolor': 'k'})
    #Set the main title.
    g.fig.suptitle("Pairwise Scatter Plots by Cluster (" + name + ")",
                   y = .98,
                   size = 32)
    #Replace the default legend with a repositioned one.
    g._legend.remove()
    l = g.fig.legend(title = 'Cluster',
                     handles = g._legend_data.values(),
                     labels = g._legend_data.keys(),
                     fontsize = 24,
                     title_fontsize = 28,
                     ncol = 1,
                     frameon = False)
    #Legend location and background.
    l.set_bbox_to_anchor((0.88, 0.65))
    l.get_frame().set_facecolor('white')
    g.savefig("Unsupervised" + f + ' - ' + name + '.png');
from sklearn.cluster import KMeans as KM
distortions = []
inertias = []
centers = []
K = range(1, 11)
for k in K:
    #Build and fit the model.
    model_km = KM(n_clusters = k, init = 'k-means++').fit(train)
    #Distortion: mean Euclidean distance to the nearest center.
    distortions.append(sum(np.min(cdist(train, model_km.cluster_centers_, 'euclidean'), axis = 1)) / train.shape[0])
    #Inertia: sum of squared distances to the nearest center.
    inertias.append(model_km.inertia_)
    centers.append(model_km.cluster_centers_)
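Distortion here is the mean Euclidean distance to the nearest center, while inertia is the sum of squared distances. As a sanity check (not in the original notebook), inertia can be recomputed from the same cdist call; K starts at 1, so index 2 corresponds to the three-cluster fit:
#Inertia should equal the sum of squared nearest-center distances.
d = cdist(train, centers[2], 'euclidean')
print(np.isclose((d.min(axis = 1) ** 2).sum(), inertias[2]))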
#Graph helper.
def elbow_graph(axis, title, y, c):
    sns.set_theme(style = "darkgrid")
    #Lines.
    axis.plot(K, y, lw = 3, color = c)
    #Subtitle.
    axis.set_title(title, fontsize = 24)
    #Axes.
    axis.set_xlabel("# of Clusters", fontsize = 20)
    axis.set_ylabel("Distance Metric", fontsize = 20)
    for tick in axis.get_yticklabels():
        tick.set_rotation(25)
    if (title == "Inertia"):
        axis.yaxis.set_major_formatter(SMF('{x: ,.0f}'))
    axis.tick_params(axis = 'both', labelsize = 14)
    #Mark the elbow at three clusters.
    axis.annotate('Elbow Point at n = 3',
                  xy = (3, 1.02 * y[2]),
                  xytext = (6, 2 * y[2]),
                  fontsize = 16,
                  color = c,
                  arrowprops = dict(facecolor = c,
                                    connectionstyle = "arc3, rad = 0"))
#Full graph.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (16, 7))
elbow_graph(ax1, "Distortion", distortions, '#1F968B')
elbow_graph(ax2, "Inertia", inertias, '#481567')
fig.savefig('Unsupervised3 - Elbow.png');
model_k = KM(n_clusters = 3, init = 'k-means++').fit(train)
pair_plot(train, model_k, 'KMeans', 'viridis', '4')
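For reference, the cluster sizes from the k = 3 fit; this is a quick summary I am adding, not part of the original write-up:
#Number of training points assigned to each K-Means cluster.
print(pd.Series(model_k.labels_).value_counts().sort_index())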
from scipy.cluster.hierarchy import dendrogram as D
from scipy.cluster.hierarchy import set_link_color_palette as SLCP
from sklearn.cluster import AgglomerativeClustering as AC
#https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
def plot_dendrogram(model, **kwargs):
    #Create the linkage matrix and then plot the dendrogram.
    #First, count the samples under each node.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  #Leaf node.
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    linkage_matrix = np.column_stack([model.children_, model.distances_,
                                      counts]).astype(float)
    #Plot the corresponding dendrogram.
    SLCP(("#FDE725",
          "#2D708E",
          "#73D055",
          "#453781",
          "#20A387",
          "#440154"))
    D(linkage_matrix, color_threshold = 45, above_threshold_color = 'grey', **kwargs)
model_a = AC(distance_threshold = 0, n_clusters = None, linkage = 'ward').fit(train)
sns.set_style('white')
fig = plt.figure(figsize = (16, 9))
#https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
plt.title('Hierarchical Agglomerative Clustering Dendrogram', fontsize = 24)
plot_dendrogram(model_a, truncate_mode = 'level', p = 9)
plt.xlabel("Index of Point", fontsize = 20)
#Remove axes and ticks.
ax = plt.gca()
for side in ["top", "right", "bottom", "left"]:
    ax.spines[side].set_visible(False)
plt.yscale("symlog")
ax.set_yticks([])
ax.tick_params(axis = 'both', labelsize = 12)
fig.savefig('Unsupervised5 - Dendrogram.png');
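The 45-distance color threshold in the dendrogram motivates the six-cluster cut below. As a cross-check (a sketch, not in the original), the same cut can be counted with SciPy, since AgglomerativeClustering with linkage = 'ward' matches SciPy's Ward linkage on Euclidean distances:
from scipy.cluster.hierarchy import fcluster, linkage
#Cut the Ward tree at distance 45 and count the resulting flat clusters.
Z = linkage(train, method = 'ward')
print(len(np.unique(fcluster(Z, t = 45, criterion = 'distance'))))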
model_h = AC(n_clusters = 6, linkage = 'ward').fit(train)
pair_plot(train, model_h, 'HAC', 'viridis', '6')
from sklearn.cluster import MeanShift as MS
model_m = MS().fit(train)
#sns.set_palette returns None, so build the palette list directly.
palette = sns.color_palette(['#1F968B', '#481567'])
pair_plot(train, model_m, 'MeanShift', palette, '7')
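Unlike the other two models, Mean Shift infers the number of clusters from a bandwidth parameter; the fit above leaves it at the default, which scikit-learn estimates internally. A quick look at the estimate and the resulting cluster count (added as a check, not in the original):
from sklearn.cluster import estimate_bandwidth
#Default bandwidth estimate and the number of clusters Mean Shift found.
print(estimate_bandwidth(train))
print(len(np.unique(model_m.labels_)))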
Hierarchical Agglomerative Clustering was chosen as the final model.
model_h_final = model_h.fit(validation)
pair_plot(validation, model_h_final, 'HAC - Validation', 'viridis', '8')
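Finally, the cluster sizes on the validation refit; a quick summary I am adding (note that .fit() refits the same estimator object in place, so model_h_final and model_h are the same object):
#Number of validation points per HAC cluster.
print(pd.Series(model_h_final.labels_).value_counts().sort_index())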