In [1]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score, mutual_info_score, calinski_harabaz_score
In [3]:
from sklearn.datasets import make_blobs
In [4]:
plt.style.use('dark_background')
In [5]:
import warnings
In [6]:
warnings.simplefilter('ignore', FutureWarning)
Clustering¶
A nice overview of clustering which explains the basic concepts and
provides further references to several classes of clustering algorithms
is provided by
`scikit-learn
<https://scikit-learn.org/stable/modules/clustering.html>`__.
It is a useful exercise to implement some of the clustering algorithms
yourself, since they are generally quite tractable, and you can learn a
lot by comparing your solution to the implementation in
`scikit-learn <https://github.com/scikit-learn/scikit-learn/tree/master/sklearn/cluster>`__.
Toy example to illustrate concepts¶
In [7]:
np.random.seed(123)
In [8]:
npts = 1000
nc = 6
X, y = make_blobs(n_samples=npts, centers=nc)
In [9]:
plt.scatter(X[:, 0], X[:, 1], s=5, c=y,
            cmap=plt.cm.get_cmap('Accent', nc))
plt.axis('square')
pass

How to cluster¶
Choose a clustering algorithm
Choose the number of clusters
K-means¶
There are many different clustering methods, but K-means is fast, scales well, and can be interpreted as a probabilistic model. We will write 3 versions of the K-means algorithm to illustrate the main concepts. The algorithm is very simple:
Start with \(k\) centers with labels \(0, 1, \ldots, k-1\)
Find the distance of each data point to each center
Assign each data point the label of its nearest center
Use the mean of the points assigned to each center as the new center
Repeat for a fixed number of iterations or until the centers stop changing
Note: If you want a probabilistic interpretation, just treat the final solution as a mixture of (multivariate) normal distributions. K-means is often used to initialize the fitting of full statistical mixture models, which are computationally more demanding and hence slower.
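To make this probabilistic reading concrete, here is a minimal sketch that seeds a Gaussian mixture with K-means centers (this assumes sklearn.mixture.GaussianMixture and its means_init argument, which are not used elsewhere in this notebook):
In [ ]:
from sklearn.mixture import GaussianMixture

# Fit K-means, then use its centers to initialize a Gaussian mixture model.
km = KMeans(n_clusters=nc, n_init=10).fit(X)
gmm = GaussianMixture(n_components=nc, means_init=km.cluster_centers_)
gmm.fit(X)

# Soft (probabilistic) cluster memberships, one row per point.
resp = gmm.predict_proba(X)
resp.shape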
Distance between sets of vectors¶
In [10]:
from scipy.spatial.distance import cdist
In [11]:
pts = np.arange(6).reshape(-1,2)
pts
Out[11]:
array([[0, 1],
[2, 3],
[4, 5]])
In [12]:
mus = np.arange(4).reshape(-1, 2)
mus
Out[12]:
array([[0, 1],
[2, 3]])
In [13]:
cdist(pts, mus)
Out[13]:
array([[0. , 2.82842712],
[2.82842712, 0. ],
[5.65685425, 2.82842712]])
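As a sanity check, the same pairwise Euclidean distances can be computed directly with NumPy broadcasting (a minimal sketch using only the arrays defined above):
In [ ]:
# Pairwise Euclidean distances via broadcasting: (3, 1, 2) - (1, 2, 2) -> (3, 2, 2).
diff = pts[:, None, :] - mus[None, :, :]
np.sqrt((diff**2).sum(axis=-1))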
Version 1¶
In [14]:
def my_kmeans_1(X, k, iters=10):
    """K-means with fixed number of iterations."""
    r, c = X.shape
    # Initialize with k distinct data points chosen at random.
    centers = X[np.random.choice(r, k, replace=False)]
    for i in range(iters):
        m = cdist(X, centers)
        z = np.argmin(m, axis=1)
        centers = np.array([np.mean(X[z==j], axis=0) for j in range(k)])
    return (z, centers)
In [15]:
np.random.seed(2018)
z, centers = my_kmeans_1(X, nc)
Note that K-means can get stuck in a local optimum¶
In [16]:
plt.scatter(X[:, 0], X[:, 1], s=5, c=z,
            cmap=plt.cm.get_cmap('Accent', nc))
plt.scatter(centers[:, 0], centers[:, 1], marker='x',
            linewidth=3, s=100, c='red')
plt.axis('square')
pass

Version 2¶
In [17]:
def my_kmeans_2(X, k, tol=1e-6):
    """K-means iterated until the centers stop moving (within tolerance)."""
    r, c = X.shape
    centers = X[np.random.choice(r, k, replace=False)]
    delta = np.infty
    while delta > tol:
        m = cdist(X, centers)
        z = np.argmin(m, axis=1)
        new_centers = np.array([np.mean(X[z==j], axis=0) for j in range(k)])
        delta = np.sum((new_centers - centers)**2)
        centers = new_centers
    return (z, centers)
In [18]:
np.random.seed(2018)
z, centers = my_kmeans_2(X, nc)
Still stuck in a local optimum¶
In [19]:
plt.scatter(X[:, 0], X[:, 1], s=5, c=z,
            cmap=plt.cm.get_cmap('Accent', nc))
plt.scatter(centers[:, 0], centers[:, 1], marker='x',
            linewidth=3, s=100, c='red')
plt.axis('square')
pass

Version 3¶
We now use a score to evaluate the goodness of fit and keep the best of several random restarts. The simplest score is the sum of the distances from each point to its nearest center.
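For a given assignment this score can be computed directly; a minimal sketch, reusing the centers and z returned by version 2 above:
In [ ]:
# Sum of distances from each point to its assigned (nearest) center.
m = cdist(X, centers)
m[np.arange(len(X)), z].sum()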
In [20]:
def my_kmeans_3(X, k, tol=1e-6, n_starts=10):
    """K-means with tolerance and random restarts."""
    r, c = X.shape
    best_score = np.infty
    for i in range(n_starts):
        centers = X[np.random.choice(r, k, replace=False)]
        delta = np.infty
        while delta > tol:
            m = cdist(X, centers)
            z = np.argmin(m, axis=1)
            new_centers = np.array([np.mean(X[z==j], axis=0) for j in range(k)])
            delta = np.sum((new_centers - centers)**2)
            centers = new_centers
        # Score = sum of distances from each point to its assigned center.
        score = m[np.arange(r), z].sum()
        if score < best_score:
            best_score = score
            best_z = z
            best_centers = centers
    return (best_z, best_centers)
In [21]:
np.random.seed(2018)
z, centers = my_kmeans_3(X, nc)
In [22]:
plt.scatter(X[:, 0], X[:, 1], s=5, c=z,
            cmap=plt.cm.get_cmap('Accent', nc))
plt.scatter(centers[:, 0], centers[:, 1], marker='x',
            linewidth=3, s=100, c='red')
plt.axis('square')
pass

Model evaluation¶
We use AMIS for model evaluation. Note that this kind of evaluation requires knowing
the ground truth. There are also other ad hoc methods - see the
scikit-learn docs
if you are interested.
Mutual Information Score (MIS)¶
The mutual information between two discrete random variables \(X\) and \(Y\) is defined as
\[
I(X; Y) = \sum_{x} \sum_{y} p(x, y) \log \frac{p(x, y)}{p(x)\,p(y)}
\]
It measures how much knowing \(X\) reduces the uncertainty about \(Y\) (and vice versa).
If \(X\) is independent of \(Y\), then \(I(X; Y) = 0\)
If \(X\) and \(Y\) determine each other (the labelings agree up to a relabeling), then \(I(X; Y) = H(X) = H(Y)\) and the normalized MI is 1
It is equivalent to the Kullback-Leibler divergence between the joint distribution \(p(x, y)\) and the product of the marginals \(p(x)\,p(y)\)
We use AMIS (Adjusted MIS) here, which adjusts for the number of clusters used in the clustering method.
From the documentation
AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [max(H(U), H(V)) - E(MI(U, V))]
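To make the definition concrete, the MI can be computed by hand from the contingency table of two labelings and checked against mutual_info_score (a minimal sketch; the labelings a and b below are made up for illustration):
In [ ]:
a = np.array([0, 0, 1, 1, 2, 2])
b = np.array([1, 1, 0, 0, 2, 0])

# Joint distribution from the contingency table, then I = sum p_ij log(p_ij / (p_i p_j)).
p_ab = pd.crosstab(a, b).values / len(a)
p_a = p_ab.sum(axis=1, keepdims=True)
p_b = p_ab.sum(axis=0, keepdims=True)
nz = p_ab > 0
mi = np.sum(p_ab[nz] * np.log(p_ab[nz] / (p_a @ p_b)[nz]))
mi, mutual_info_score(a, b)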
In [23]:
help(adjusted_mutual_info_score)
Help on function adjusted_mutual_info_score in module sklearn.metrics.cluster.supervised:
adjusted_mutual_info_score(labels_true, labels_pred, average_method='warn')
Adjusted Mutual Information between two clusterings.
Adjusted Mutual Information (AMI) is an adjustment of the Mutual
Information (MI) score to account for chance. It accounts for the fact that
the MI is generally higher for two clusterings with a larger number of
clusters, regardless of whether there is actually more information shared.
For two clusterings :math:`U` and :math:`V`, the AMI is given as::
AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]
This metric is independent of the absolute values of the labels:
a permutation of the class or cluster label values won't change the
score value in any way.
This metric is furthermore symmetric: switching ``label_true`` with
``label_pred`` will return the same score value. This can be useful to
measure the agreement of two independent label assignments strategies
on the same dataset when the real ground truth is not known.
Be mindful that this function is an order of magnitude slower than other
metrics, such as the Adjusted Rand Index.
Read more in the :ref:`User Guide <mutual_info_score>`.
Parameters
----------
labels_true : int array, shape = [n_samples]
A clustering of the data into disjoint subsets.
labels_pred : array, shape = [n_samples]
A clustering of the data into disjoint subsets.
average_method : string, optional (default: 'warn')
How to compute the normalizer in the denominator. Possible options
are 'min', 'geometric', 'arithmetic', and 'max'.
If 'warn', 'max' will be used. The default will change to
'arithmetic' in version 0.22.
.. versionadded:: 0.20
Returns
-------
ami: float (upperlimited by 1.0)
The AMI returns a value of 1 when the two partitions are identical
(ie perfectly matched). Random partitions (independent labellings) have
an expected AMI around 0 on average hence can be negative.
See also
--------
adjusted_rand_score: Adjusted Rand Index
mutual_info_score: Mutual Information (not adjusted for chance)
Examples
--------
Perfect labelings are both homogeneous and complete, hence have
score 1.0::
>>> from sklearn.metrics.cluster import adjusted_mutual_info_score
>>> adjusted_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])
... # doctest: +SKIP
1.0
>>> adjusted_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])
... # doctest: +SKIP
1.0
If classes members are completely split across different clusters,
the assignment is totally in-complete, hence the AMI is null::
>>> adjusted_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])
... # doctest: +SKIP
0.0
References
----------
.. [1] `Vinh, Epps, and Bailey, (2010). Information Theoretic Measures for
Clusterings Comparison: Variants, Properties, Normalization and
Correction for Chance, JMLR
<http://jmlr.csail.mit.edu/papers/volume11/vinh10a/vinh10a.pdf>`_
.. [2] `Wikipedia entry for the Adjusted Mutual Information
<https://en.wikipedia.org/wiki/Adjusted_Mutual_Information>`_
In [24]:
ncols = 4
nrows = 2
plt.figure(figsize=(ncols*3, nrows*3))
for i, nc in enumerate(range(3, 11)):
    kmeans = KMeans(nc, n_init=10)
    clusters = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_
    amis = adjusted_mutual_info_score(y, clusters)
    plt.subplot(nrows, ncols, i+1)
    plt.scatter(X[:, 0], X[:, 1], s=5, c=y,
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.scatter(centers[:, 0], centers[:, 1], marker='x',
                linewidth=3, s=100, c='red')
    plt.text(0.05, 0.9, 'nc = %d AMIS = %.2f' % (nc, amis),
             fontsize=16, color='yellow',
             transform=plt.gca().transAxes)
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()

Selecting models¶
Selection uses various criteria about the “goodness” of clustering, and
does not require ground truth. Ad hoc methods available in sklearn
include silhouette_score, calinski_harabaz_score and davies_bouldin_score.
You can also use likelihood-based methods
(e.g. AIC, BIC) by interpreting the solution as (say) a mixture of
normals, but we don’t cover those approaches here.
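For example, the silhouette score can be scanned over candidate numbers of clusters without any ground truth; a minimal sketch (this assumes sklearn.metrics.silhouette_score, which is not imported above):
In [ ]:
from sklearn.metrics import silhouette_score

# Higher silhouette scores indicate more compact, better-separated clusters.
for k in range(3, 11):
    labels = KMeans(k, n_init=10).fit_predict(X)
    print(k, silhouette_score(X, labels))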
In [25]:
ncols = 4
nrows = 2
plt.figure(figsize=(ncols*3, nrows*3))
for i, nc in enumerate(range(3, 11)):
    kmeans = KMeans(nc, n_init=10)
    clusters = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_
    chs = calinski_harabaz_score(X, clusters)
    plt.subplot(nrows, ncols, i+1)
    plt.scatter(X[:, 0], X[:, 1], s=5, c=y,
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.scatter(centers[:, 0], centers[:, 1], marker='x',
                linewidth=3, s=100, c='red')
    plt.text(0.05, 0.9, 'nc = %d CH = %.2f' % (nc, chs),
             fontsize=16, color='yellow',
             transform=plt.gca().transAxes)
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()

Aligning clusters across samples¶
In [26]:
X1 = X + np.random.normal(0, 1, X.shape)
In [27]:
lu = dict(zip(range(6), range(5,-1,-1)))
y_alt = [lu[p] for p in y]
ys = [y, y_alt]
In [28]:
plt.figure(figsize=(8, 4))
for i, xs in enumerate([X, X1]):
    plt.subplot(1,2,i+1)
    plt.scatter(xs[:, 0], xs[:, 1], s=5, c=ys[i],
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()
pass

Option 1: Using reference sample¶
In [29]:
nc = 6
kmeans = KMeans(nc, n_init=10)
kmeans.fit(X)
z1 = kmeans.predict(X)
z2 = kmeans.predict(X1)
zs = [z1, z2]
In [30]:
plt.figure(figsize=(8, 4))
for i, xs in enumerate([X, X1]):
    plt.subplot(1,2,i+1)
    plt.scatter(xs[:, 0], xs[:, 1], s=5, c=zs[i],
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()
pass

Option 2: Pooling¶
In [31]:
Y = np.r_[X, X1]
kmeans = KMeans(nc, n_init=10)
kmeans.fit(Y)
zs = np.split(kmeans.predict(Y), 2)
In [32]:
plt.figure(figsize=(8, 4))
for i, xs in enumerate([X, X1]):
    plt.subplot(1,2,i+1)
    plt.scatter(xs[:, 0], xs[:, 1], s=5, c=zs[i],
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()
pass

Option 3: Matching¶
The matching problem can be solved with the Hungarian algorithm, which can be described either in terms of operations on a bipartite graph or as a series of row and column operations on the cost matrix.
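As a small concrete illustration, a minimal sketch applying scipy's linear_sum_assignment to a made-up 3x3 cost matrix:
In [ ]:
from scipy.optimize import linear_sum_assignment

# Toy cost matrix: entry (i, j) is the cost of matching row i to column j.
C = np.array([[4, 1, 3],
              [2, 0, 5],
              [3, 2, 2]])
r, c = linear_sum_assignment(C)
r, c, C[r, c].sum()  # rows [0, 1, 2] -> columns [1, 0, 2], total cost 5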
In [33]:
from scipy.optimize import linear_sum_assignment
In [34]:
np.random.seed(2018)
kmeans = KMeans(nc, n_init=10)
c1 = kmeans.fit(X).cluster_centers_
z1 = kmeans.predict(X)
c2 = kmeans.fit(X1).cluster_centers_
z2 = kmeans.predict(X1)
zs = [z1, z2]
cs = [c1, c2]
In [35]:
plt.figure(figsize=(8, 4))
for i, xs in enumerate([X, X1]):
    plt.subplot(1,2,i+1)
    z = zs[i]
    c = cs[i]
    plt.scatter(xs[:, 0], xs[:, 1], s=5, c=z,
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()
pass

Define cost function¶
We use a simple cost: the Euclidean distance between cluster centers. More complex dissimilarity measures could be used.
In [36]:
cost = cdist(c1, c2)
Run the Hungarian (Munkres) algorithm for bipartite matching¶
In [37]:
row_ind, col_ind = linear_sum_assignment(cost)
In [38]:
row_ind, col_ind
Out[38]:
(array([0, 1, 2, 3, 4, 5]), array([1, 0, 2, 3, 4, 5]))
Swap cluster indexes to align data sets¶
In [39]:
z1_aligned = col_ind[z1]
zs = [z1_aligned, z2]
In [40]:
plt.figure(figsize=(8, 4))
for i, xs in enumerate([X, X1]):
    plt.subplot(1,2,i+1)
    plt.scatter(xs[:, 0], xs[:, 1], s=5, c=zs[i],
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()
pass
