For my final project, I chose the prostate cancer data from the datasets accompanying "The Elements of Statistical Learning."
import math
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter as SMF
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
import seaborn as sns
from warnings import filterwarnings as fw
fw("ignore")
prostate = pd.read_table("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/prostate.data", sep = "\t", index_col = 0)
prostate = prostate[['age', 'lbph', 'lcp', 'lcavol', 'lweight',
                     'pgg45', 'gleason', 'lpsa', 'svi']]
prostate = prostate.rename(columns = {"age": "Age",
                                      "lbph": "logBPH",
                                      "lcp": "logCP",
                                      "lcavol": "logCV",
                                      "lweight": "logPW",
                                      "pgg45": "Gleason2",
                                      "gleason": "Gleason1",
                                      "lpsa": "logPSA",
                                      "svi": "SVI"})
prostate
| | Age | logBPH | logCP | logCV | logPW | Gleason2 | Gleason1 | logPSA | SVI |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 50 | -1.386294 | -1.386294 | -0.579818 | 2.769459 | 0 | 6 | -0.430783 | 0 |
| 2 | 58 | -1.386294 | -1.386294 | -0.994252 | 3.319626 | 0 | 6 | -0.162519 | 0 |
| 3 | 74 | -1.386294 | -1.386294 | -0.510826 | 2.691243 | 20 | 7 | -0.162519 | 0 |
| 4 | 58 | -1.386294 | -1.386294 | -1.203973 | 3.282789 | 0 | 6 | -0.162519 | 0 |
| 5 | 62 | -1.386294 | -1.386294 | 0.751416 | 3.432373 | 0 | 6 | 0.371564 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 93 | 68 | -1.386294 | 1.321756 | 2.830268 | 3.876396 | 60 | 7 | 4.385147 | 1 |
| 94 | 44 | -1.386294 | 2.169054 | 3.821004 | 3.896909 | 40 | 7 | 4.684443 | 1 |
| 95 | 52 | -1.386294 | 2.463853 | 2.907447 | 3.396185 | 10 | 7 | 5.143124 | 1 |
| 96 | 68 | 1.558145 | 1.558145 | 2.882564 | 3.773910 | 80 | 7 | 5.477509 | 1 |
| 97 | 68 | 0.438255 | 2.904165 | 3.471966 | 3.974998 | 20 | 7 | 5.582932 | 1 |

97 rows × 9 columns
The objective of this study is to cluster men into groups based on metrics related to prostate and reproductive health.
Similarities within clusters and differences between clusters will be discussed.
There are no missing values and several variables have already been log transformed.
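As a quick check (not part of the original analysis), the missing-value claim can be verified directly:
#Count missing values across the whole table; expect 0.
print(prostate.isna().sum().sum())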
sns.set_style('darkgrid')
def hist_helper(df, ax, var, name, b, c):
    #Draw a histogram of one variable on the given axis.
    ax.hist(df[var], bins = b, color = c)
    ax.set_title("Distribution of " + name, fontsize = 18)
    ax.set_xlabel(xlabel = name, fontsize = 13)
    #For the categorical variables, center the tick labels on the bins.
    if (var == 'SVI'):
        plt.sca(ax)
        plt.xticks(ticks = [0.25, 0.75], labels = ['No', 'Yes'], ha = 'center', rotation = 45)
    elif (var == 'Gleason1'):
        plt.sca(ax)
        plt.xticks(ticks = [6.375, 7.125, 7.875, 8.625], labels = [6, 7, 8, 9])
    ax.set_ylabel(ylabel = "Frequency", fontsize = 14)
    ax.yaxis.set_major_formatter(SMF('{x: ,.0f}'))
    ax.tick_params(axis = 'both', labelsize = 12)
fig, ax = plt.subplots(3, 3, figsize = (16, 12))
c1 = '#39568CFF'
c2 = '#29AF7FFF'
hist_helper(prostate, ax[0][0], 'Age', 'Age', 10, c1)
hist_helper(prostate, ax[0][1], 'logBPH', 'log(Benign Prostatic Hyperplasia)', 10, c2)
hist_helper(prostate, ax[0][2], 'logCP', 'log(Capsular Penetration)', 10, c1)
hist_helper(prostate, ax[1][0], 'logCV', 'log(Cancer Volume)', 10, c2)
hist_helper(prostate, ax[1][1], 'logPW', 'log(Prostate Weight)', 10, c1)
hist_helper(prostate, ax[1][2], 'Gleason2', '% that is Gleason 4 or 5', 10, c2)
hist_helper(prostate, ax[2][0], 'Gleason1', 'Gleason Score', 4, c1)
hist_helper(prostate, ax[2][1], 'logPSA', 'log(Prostate-Specific Antigen)', 10, c2)
hist_helper(prostate, ax[2][2], 'SVI', 'Seminal Vesicle Invasion', 2, c1)
fig.tight_layout()
fig.savefig('Unsupervised1 - Histograms.png');
float_columns = [x for x in prostate.columns if x not in ['Gleason1', 'SVI']]
prostate[float_columns].skew().sort_values()
Age        -0.828476
logCV      -0.250304
logPSA     -0.000434
logPW       0.063541
logBPH      0.133813
logCP       0.728634
Gleason2    0.968105
dtype: float64
Age is left skewed. logCV, logPSA, and logPW have been appropriately log transformed and are close to normally distributed. logBPH, logCP, and Gleason2 all have over 40% of their observations in the lowest-valued bin. Gleason1 and SVI are both categorical and unbalanced.
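To double-check the 40% figure, here is a quick sketch (not in the original notebook) using the same ten equal-width bins as the histograms above:
#Share of observations that fall in the lowest of 10 equal-width bins.
for var in ['logBPH', 'logCP', 'Gleason2']:
    counts, edges = np.histogram(prostate[var], bins = 10)
    print(var, round(counts[0] / counts.sum(), 2))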
#https://seaborn.pydata.org/examples/many_pairwise_correlations.html
sns.set_theme(style = "white")
#Compute the correlation matrix.
corr = prostate.corr()
#Color limits: exclude the diagonal of ones when finding the maximum.
vmax = corr[corr < 1].max().max()
vmin = corr.min().min()
#Generate a mask for the upper triangle.
mask = np.triu(np.ones_like(corr, dtype = bool))
#Set up the matplotlib figure.
fig, ax = plt.subplots(figsize = (16, 16))
#Draw the heatmap with the mask and correct aspect ratio.
sns.heatmap(corr,
            mask = mask,
            cmap = 'viridis_r',
            vmax = vmax,
            vmin = vmin,
            square = True,
            linewidths = .5,
            cbar_kws = {"shrink": .7})
plt.title("Correlation of Prostate Predictors", fontsize = 24)
xl, xt = plt.xticks()
plt.xticks(ticks = xl[0:-1],
           labels = xt[0:-1],
           fontsize = 18,
           ha = 'center',
           rotation = 45);
yl, yt = plt.yticks()
plt.yticks(ticks = yl[1:],
           labels = yt[1:],
           fontsize = 18,
           rotation = 45)
fig.savefig('Unsupervised2 - Correlations.png');
Gleason1 and Gleason2 have the highest correlation, at around 0.75. logCV and logPSA, logCP and logCV, and logCP and SVI also have high correlations. No variables will be removed, as multicollinearity is not a concern for this clustering analysis.
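The pairs above can also be pulled out programmatically. This is a convenience sketch I am adding, not part of the original analysis; it reuses the corr matrix from the heatmap cell:
#Keep the strictly lower triangle, flatten to (pair, value), and sort.
lower = corr.where(np.tril(np.ones(corr.shape, dtype = bool), k = -1))
print(lower.stack().sort_values(ascending = False).head(5))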
The data will be split into training (70%) and validation (30%) sets. I am using three unsupervised learning algorithms: K-Means, Hierarchical Agglomerative Clustering, and Mean Shift.
For each of the three models, the cluster results on the training set will be represented as the color in a grid of pair plots for all nine variables.
from sklearn.model_selection import train_test_split as TTS
train, validation = TTS(prostate, test_size = 0.3, random_state = 31313)
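A quick look at the resulting split sizes, added here as a sanity check (not in the original):
#Expect a roughly 70/30 row split of the 97 observations.
print(train.shape, validation.shape)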
#Create a grid of pair plots, colored by cluster labels.
def pair_plot(df, model, name, palette, f):
    #Append cluster labels to the input data.
    temp = pd.DataFrame.copy(df)
    temp[name] = model.labels_
    #Setup.
    sns.set_style('darkgrid')
    sns.set(font_scale = 1.75)
    #Pair plot setup.
    g = sns.pairplot(temp,
                     hue = name,
                     palette = palette,
                     markers = 'd',
                     corner = True,
                     height = 2,
                     plot_kws = {'s': 40,
                                 'alpha': 0.8,
                                 'lw': 0.5,
                                 'edgecolor': 'k'})
    #Set the main title.
    g.fig.suptitle("Pairwise Scatter Plots by Cluster (" + name + ")",
                   y = .98,
                   size = 32)
    #Replace the default legend with a repositioned one.
    g._legend.remove()
    l = g.fig.legend(title = 'Cluster',
                     handles = g._legend_data.values(),
                     labels = g._legend_data.keys(),
                     fontsize = 24,
                     title_fontsize = 28,
                     ncol = 1,
                     frameon = False)
    #Legend location and background.
    l.set_bbox_to_anchor((0.88, 0.65))
    l.get_frame().set_facecolor('white')
    g.savefig("Unsupervised" + f + ' - ' + name + '.png');
from sklearn.cluster import KMeans as KM
distortions = []
inertias = []
centers = []
K = range(1, 11)
for k in K:
    #Build and fit the model.
    model_km = KM(n_clusters = k, init = 'k-means++').fit(train)
    #Distortion: mean Euclidean distance to the nearest center.
    distortions.append(sum(np.min(cdist(train, model_km.cluster_centers_, 'euclidean'), axis = 1)) / train.shape[0])
    #Inertia: sum of squared distances to the nearest center.
    inertias.append(model_km.inertia_)
    centers.append(model_km.cluster_centers_)
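Distortion here is the mean Euclidean distance to the nearest center, while inertia is the sum of squared distances. As a sanity check (not in the original notebook), inertia can be recomputed from the same cdist call; K starts at 1, so index 2 corresponds to the three-cluster fit:
#Inertia should equal the sum of squared nearest-center distances.
d = cdist(train, centers[2], 'euclidean')
print(np.isclose((d.min(axis = 1) ** 2).sum(), inertias[2]))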
#Graph helper.
def elbow_graph(axis, title, y, c):
    sns.set_theme(style = "darkgrid")
    #Lines.
    axis.plot(K, y, lw = 3, color = c)
    #Subtitle.
    axis.set_title(title, fontsize = 24)
    #Axes.
    axis.set_xlabel("# of Clusters", fontsize = 20)
    axis.set_ylabel("Distance Metric", fontsize = 20)
    for tick in axis.get_yticklabels():
        tick.set_rotation(25)
    if (title == "Inertia"):
        axis.yaxis.set_major_formatter(SMF('{x: ,.0f}'))
    axis.tick_params(axis = 'both', labelsize = 14)
    #Mark the elbow at three clusters.
    axis.annotate('Elbow Point at n = 3',
                  xy = (3, 1.02 * y[2]),
                  xytext = (6, 2 * y[2]),
                  fontsize = 16,
                  color = c,
                  arrowprops = dict(facecolor = c,
                                    connectionstyle = "arc3, rad = 0"))
#Full graph.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (16, 7))
elbow_graph(ax1, "Distortion", distortions, '#1F968B')
elbow_graph(ax2, "Inertia", inertias, '#481567')
fig.savefig('Unsupervised3 - Elbow.png');
model_k = KM(n_clusters = 3, init = 'k-means++').fit(train)
pair_plot(train, model_k, 'KMeans', 'viridis', '4')
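For reference, the cluster sizes from the k = 3 fit; this is a quick summary I am adding, not part of the original write-up:
#Number of training points assigned to each K-Means cluster.
print(pd.Series(model_k.labels_).value_counts().sort_index())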
from scipy.cluster.hierarchy import dendrogram as D
from scipy.cluster.hierarchy import set_link_color_palette as SLCP
from sklearn.cluster import AgglomerativeClustering as AC
#https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
def plot_dendrogram(model, **kwargs):
    #Create the linkage matrix and then plot the dendrogram.
    #First, count the samples under each node.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  #Leaf node.
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    linkage_matrix = np.column_stack([model.children_, model.distances_,
                                      counts]).astype(float)
    #Plot the corresponding dendrogram.
    SLCP(("#FDE725",
          "#2D708E",
          "#73D055",
          "#453781",
          "#20A387",
          "#440154"))
    D(linkage_matrix, color_threshold = 45, above_threshold_color = 'grey', **kwargs)
model_a = AC(distance_threshold = 0, n_clusters = None, linkage = 'ward').fit(train)
sns.set_style('white')
fig = plt.figure(figsize = (16, 9))
#https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
plt.title('Hierarchical Agglomerative Clustering Dendrogram', fontsize = 24)
plot_dendrogram(model_a, truncate_mode = 'level', p = 9)
plt.xlabel("Index of Point", fontsize = 20)
#Remove axes and ticks.
ax = plt.gca()
for side in ["top", "right", "bottom", "left"]:
    ax.spines[side].set_visible(False)
plt.yscale("symlog")
ax.set_yticks([])
ax.tick_params(axis = 'both', labelsize = 12)
fig.savefig('Unsupervised5 - Dendrogram.png');
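The 45-distance color threshold in the dendrogram motivates the six-cluster cut below. As a cross-check (a sketch, not in the original), the same cut can be counted with SciPy, since AgglomerativeClustering with linkage = 'ward' matches SciPy's Ward linkage on Euclidean distances:
from scipy.cluster.hierarchy import fcluster, linkage
#Cut the Ward tree at distance 45 and count the resulting flat clusters.
Z = linkage(train, method = 'ward')
print(len(np.unique(fcluster(Z, t = 45, criterion = 'distance'))))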
model_h = AC(n_clusters = 6, linkage = 'ward').fit(train)
pair_plot(train, model_h, 'HAC', 'viridis', '6')
from sklearn.cluster import MeanShift as MS
model_m = MS().fit(train)
#sns.set_palette returns None, so build the palette list directly.
palette = sns.color_palette(['#1F968B', '#481567'])
pair_plot(train, model_m, 'MeanShift', palette, '7')
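Unlike the other two models, Mean Shift infers the number of clusters from a bandwidth parameter; the fit above leaves it at the default, which scikit-learn estimates internally. A quick look at the estimate and the resulting cluster count (added as a check, not in the original):
from sklearn.cluster import estimate_bandwidth
#Default bandwidth estimate and the number of clusters Mean Shift found.
print(estimate_bandwidth(train))
print(len(np.unique(model_m.labels_)))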
Hierarchical Agglomerative Clustering was chosen as the final model.
model_h_final = model_h.fit(validation)
pair_plot(validation, model_h_final, 'HAC - Validation', 'viridis', '8')
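Finally, the cluster sizes on the validation refit; a quick summary I am adding (note that .fit() refits the same estimator object in place, so model_h_final and model_h are the same object):
#Number of validation points per HAC cluster.
print(pd.Series(model_h_final.labels_).value_counts().sort_index())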