In [1]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_mutual_info_score, mutual_info_score, calinski_harabaz_score
In [3]:
from sklearn.datasets import make_blobs
In [4]:
plt.style.use('dark_background')
In [5]:
import warnings
In [6]:
warnings.simplefilter('ignore', FutureWarning)

Clustering

A nice overview of clustering which explains the basic concepts and provides further references to several classes of clustering algorithms is provided by `scikit-learn <https://scikit-learn.org/stable/modules/clustering.html>`__. It is a useful exercise to implement some of the clustering algorithms yourself, since they are generally quite tractable, and you can learn a lot by comparing your solution to the implementation in `scikit-learn <https://github.com/scikit-learn/scikit-learn/tree/master/sklearn/cluster>`__.

Toy example to illustrate concepts

In [7]:
np.random.seed(123)
In [8]:
npts = 1000
nc = 6
X, y = make_blobs(n_samples=npts, centers=nc)
In [9]:
plt.scatter(X[:, 0], X[:, 1], s=5, c=y,
            cmap=plt.cm.get_cmap('Accent', nc))
plt.axis('square')
pass
[Figure: scatter plot of the simulated blobs, colored by true cluster label]

How to cluster

  1. Choose a clustering algorithm

  2. Choose the number of clusters

K-means

There are many different clustering methods, but K-means is fast, scales well, and can be interpreted as a probabilistic model. We will write 3 versions of the K-means algorithm to illustrate the main concepts. The algorithm is very simple:

  1. Start with \(k\) centers with labels \(0, 1, \ldots, k-1\)

  2. Find the distance of each data point to each center

  3. Assign the data points nearest to a center to its label

  4. Use the mean of the points assigned to a center as the new center

  5. Repeat for a fixed number of iterations or until the centers stop changing

Note: If you want a probabilistic interpretation, just treat the final solution as a mixture of (multivariate) normal distributions. K-means is often used to initialize the fitting of full statistical mixture models, which are computationally more demanding and hence slower.
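Since the notebook does not implement the mixture-model step, here is a minimal sketch (an addition, assuming scikit-learn's GaussianMixture is available) of using a K-means fit to initialize a Gaussian mixture, which then provides soft cluster memberships:

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

km = KMeans(n_clusters=6, n_init=10).fit(X)
# start the EM fit at the K-means centers
gmm = GaussianMixture(n_components=6, means_init=km.cluster_centers_)
hard_labels = gmm.fit_predict(X)    # hard assignments (argmax of responsibilities)
soft_probs = gmm.predict_proba(X)   # per-point cluster membership probabilities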

Distance between sets of vectors

In [10]:
from scipy.spatial.distance import cdist
In [11]:
pts = np.arange(6).reshape(-1,2)
pts
Out[11]:
array([[0, 1],
       [2, 3],
       [4, 5]])
In [12]:
mus = np.arange(4).reshape(-1, 2)
mus
Out[12]:
array([[0, 1],
       [2, 3]])
In [13]:
cdist(pts, mus)
Out[13]:
array([[0.        , 2.82842712],
       [2.82842712, 0.        ],
       [5.65685425, 2.82842712]])
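The pairwise Euclidean distances returned by cdist can also be computed by hand with NumPy broadcasting; this is just a sanity-check sketch, not part of the original notebook:

diffs = pts[:, None, :] - mus[None, :, :]   # shape (3, 2, 2): every point minus every center
np.sqrt((diffs**2).sum(axis=-1))            # same array as cdist(pts, mus)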

Version 1

In [14]:
def my_kmeans_1(X, k, iters=10):
    """K-means with a fixed number of iterations."""

    r, c = X.shape
    # initialize with k distinct data points chosen at random
    centers = X[np.random.choice(r, k, replace=False)]
    for _ in range(iters):
        m = cdist(X, centers)     # distance of each point to each center
        z = np.argmin(m, axis=1)  # assign each point to its nearest center
        # recompute each center as the mean of the points assigned to it
        centers = np.array([np.mean(X[z==i], axis=0) for i in range(k)])
    return (z, centers)
In [15]:
np.random.seed(2018)
z, centers = my_kmeans_1(X, nc)

Note that K-means can get stuck in a local optimum

In [16]:
plt.scatter(X[:, 0], X[:, 1], s=5, c=z,
            cmap=plt.cm.get_cmap('Accent', nc))
plt.scatter(centers[:, 0], centers[:, 1], marker='x',
            linewidth=3, s=100, c='red')
plt.axis('square')
pass
[Figure: version 1 cluster assignments, with fitted centers marked by red crosses]

Version 2

In [17]:
def my_kmeans_2(X, k, tol=1e-6):
    """K-means with tolerance."""

    r, c = X.shape
    centers = X[np.random.choice(r, k, replace=False)]
    delta = np.infty
    while delta > tol:
        m = cdist(X, centers)
        z = np.argmin(m, axis=1)
        new_centers = np.array([np.mean(X[z==i], axis=0) for i in range(k)])
        delta = np.sum((new_centers - centers)**2)
        centers = new_centers
    return (z, centers)
In [18]:
np.random.seed(2018)
z, centers = my_kmeans_2(X, nc)

Still stuck in a local optimum

In [19]:
plt.scatter(X[:, 0], X[:, 1], s=5, c=z,
            cmap=plt.cm.get_cmap('Accent', nc))
plt.scatter(centers[:, 0], centers[:, 1], marker='x',
            linewidth=3, s=100, c='red')
plt.axis('square')
pass
[Figure: version 2 cluster assignments, with fitted centers marked by red crosses]

Version 3

This version uses a score to evaluate the goodness of fit and keeps the best of several random restarts. Here the simplest score is used: the sum of distances from each point to its nearest center.

In [20]:
def my_kmeans_3(X, k, tol=1e-6, n_starts=10):
    """K-means with tolerance and random restarts."""

    r, c = X.shape
    best_score = np.infty

    for i in range(n_starts):
        centers = X[np.random.choice(r, k, replace=False)]
        delta = np.infty
        while delta > tol:
            m = cdist(X, centers)
            z = np.argmin(m, axis=1)
            new_centers = np.array([np.mean(X[z==i], axis=0) for i in range(k)])
            delta = np.sum((new_centers - centers)**2)
            centers = new_centers
        score = m[np.arange(r), z].sum()  # sum of distances from each point to its assigned center
        if score < best_score:
            best_score = score
            best_z = z
            best_centers = centers
    return (best_z, best_centers)
In [21]:
np.random.seed(2018)
z, centers = my_kmeans_3(X, nc)
In [22]:
plt.scatter(X[:, 0], X[:, 1], s=5, c=z,
            cmap=plt.cm.get_cmap('Accent', nc))
plt.scatter(centers[:, 0], centers[:, 1], marker='x',
            linewidth=3, s=100, c='red')
plt.axis('square')
pass
[Figure: version 3 cluster assignments, with fitted centers marked by red crosses]

Model evaluation

We use the adjusted mutual information score (AMIS) for model evaluation. Note that this kind of evaluation requires knowing the ground truth labels. There are also other ad hoc methods that do not; see the scikit-learn docs if you are interested.

Mutual Information Score (MIS)

The mutual information is defined as

\[I(X; Y) = \sum_{y \in Y} \sum_{x \in X} p(x,y) \log \left( \frac{p(x, y)}{p(x)p(y)} \right)\]

It measures how much knowing \(X\) reduces the uncertainty about \(Y\) (and vice versa).

  • If \(X\) is independent of \(Y\), then \(I(X; Y) = 0\)

  • If \(X\) is a deterministic function of \(Y\), then \(I(X; Y) = H(X)\), the entropy of \(X\) (normalized variants of MI equal 1 in this case)

It is equivalent to the Kullback-Leibler divergence

\[\text{KL}\left(p(x,y) \mid\mid p(x)p(y)\right)\]

We use the AMIS (Adjusted MIS) here, which adjusts the MI for chance: the raw MI tends to be higher when more clusters are used, regardless of whether more information is actually shared.

From the documentation

AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [max(H(U), H(V)) - E(MI(U, V))]
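To make the definition of MI concrete, here is a small added sketch (using made-up label vectors a and b, not part of the original notebook) that computes \(I(X; Y)\) directly from the joint distribution of two labelings and compares it with sklearn's mutual_info_score, which also uses natural logarithms:

from sklearn.metrics import mutual_info_score

a = np.array([0, 0, 1, 1, 2, 2])
b = np.array([0, 0, 1, 2, 2, 2])

# contingency table -> joint distribution p(x, y)
joint = np.zeros((a.max() + 1, b.max() + 1))
for ai, bi in zip(a, b):
    joint[ai, bi] += 1
joint /= joint.sum()
px = joint.sum(axis=1, keepdims=True)   # marginal p(x)
py = joint.sum(axis=0, keepdims=True)   # marginal p(y)

mask = joint > 0
mi = (joint[mask] * np.log(joint[mask] / (px @ py)[mask])).sum()
mi, mutual_info_score(a, b)             # the two values agree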
In [23]:
help(adjusted_mutual_info_score)
Help on function adjusted_mutual_info_score in module sklearn.metrics.cluster.supervised:

adjusted_mutual_info_score(labels_true, labels_pred, average_method='warn')
    Adjusted Mutual Information between two clusterings.

    Adjusted Mutual Information (AMI) is an adjustment of the Mutual
    Information (MI) score to account for chance. It accounts for the fact that
    the MI is generally higher for two clusterings with a larger number of
    clusters, regardless of whether there is actually more information shared.
    For two clusterings :math:`U` and :math:`V`, the AMI is given as::

        AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]

    This metric is independent of the absolute values of the labels:
    a permutation of the class or cluster label values won't change the
    score value in any way.

    This metric is furthermore symmetric: switching ``label_true`` with
    ``label_pred`` will return the same score value. This can be useful to
    measure the agreement of two independent label assignments strategies
    on the same dataset when the real ground truth is not known.

    Be mindful that this function is an order of magnitude slower than other
    metrics, such as the Adjusted Rand Index.

    Read more in the :ref:`User Guide <mutual_info_score>`.

    Parameters
    ----------
    labels_true : int array, shape = [n_samples]
        A clustering of the data into disjoint subsets.

    labels_pred : array, shape = [n_samples]
        A clustering of the data into disjoint subsets.

    average_method : string, optional (default: 'warn')
        How to compute the normalizer in the denominator. Possible options
        are 'min', 'geometric', 'arithmetic', and 'max'.
        If 'warn', 'max' will be used. The default will change to
        'arithmetic' in version 0.22.

        .. versionadded:: 0.20

    Returns
    -------
    ami: float (upperlimited by 1.0)
       The AMI returns a value of 1 when the two partitions are identical
       (ie perfectly matched). Random partitions (independent labellings) have
       an expected AMI around 0 on average hence can be negative.

    See also
    --------
    adjusted_rand_score: Adjusted Rand Index
    mutual_info_score: Mutual Information (not adjusted for chance)

    Examples
    --------

    Perfect labelings are both homogeneous and complete, hence have
    score 1.0::

      >>> from sklearn.metrics.cluster import adjusted_mutual_info_score
      >>> adjusted_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])
      ... # doctest: +SKIP
      1.0
      >>> adjusted_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])
      ... # doctest: +SKIP
      1.0

    If classes members are completely split across different clusters,
    the assignment is totally in-complete, hence the AMI is null::

      >>> adjusted_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])
      ... # doctest: +SKIP
      0.0

    References
    ----------
    .. [1] `Vinh, Epps, and Bailey, (2010). Information Theoretic Measures for
       Clusterings Comparison: Variants, Properties, Normalization and
       Correction for Chance, JMLR
       <http://jmlr.csail.mit.edu/papers/volume11/vinh10a/vinh10a.pdf>`_

    .. [2] `Wikipedia entry for the Adjusted Mutual Information
       <https://en.wikipedia.org/wiki/Adjusted_Mutual_Information>`_

In [24]:
ncols = 4
nrows = 2
plt.figure(figsize=(ncols*3, nrows*3))
for i, nc in enumerate(range(3, 11)):
    kmeans = KMeans(nc, n_init=10)
    clusters = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_
    amis = adjusted_mutual_info_score(y, clusters)
    plt.subplot(nrows, ncols, i+1)
    plt.scatter(X[:, 0], X[:, 1], s=5, c=y,
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.scatter(centers[:, 0], centers[:, 1], marker='x',
                linewidth=3, s=100, c='red')
    plt.text(0.05, 0.9, 'nc = %d AMIS = %.2f' % (nc, amis),
             fontsize=16, color='yellow',
             transform=plt.gca().transAxes)
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()
[Figure: K-means fits for 3 to 10 clusters, each annotated with its AMIS]

Selecting models

Selection uses various criteria for the “goodness” of a clustering and does not require ground truth. Ad hoc methods available in sklearn include silhouette_score, calinski_harabaz_score and davies_bouldin_score. You can also use likelihood-based methods (e.g. AIC, BIC) by interpreting the solution as (say) a mixture of normals, but we don’t cover those approaches here.
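As a concrete instance of one of these ad hoc criteria (a small added sketch, not in the original notebook), the silhouette score can be scanned over candidate numbers of clusters; larger values indicate better-separated clusters:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

for k in range(3, 11):
    labels = KMeans(k, n_init=10).fit_predict(X)
    print(k, silhouette_score(X, labels))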

In [25]:
ncols = 4
nrows = 2
plt.figure(figsize=(ncols*3, nrows*3))
for i, nc in enumerate(range(3, 11)):
    kmeans = KMeans(nc, n_init=10)
    clusters = kmeans.fit_predict(X)
    centers = kmeans.cluster_centers_
    chs = calinski_harabaz_score(X, clusters)
    plt.subplot(nrows, ncols, i+1)
    plt.scatter(X[:, 0], X[:, 1], s=5, c=y,
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.scatter(centers[:, 0], centers[:, 1], marker='x',
                linewidth=3, s=100, c='red')
    plt.text(0.05, 0.9, 'nc = %d CH = %.2f' % (nc, chs),
             fontsize=16, color='yellow',
             transform=plt.gca().transAxes)
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()
[Figure: K-means fits for 3 to 10 clusters, each annotated with its Calinski-Harabasz score]

Aligning clusters across samples

In [26]:
X1 = X + np.random.normal(0, 1, X.shape)  # second sample: a noisy copy of X
In [27]:
lu = dict(zip(range(6), range(5,-1,-1)))  # permutation that reverses the label indices
y_alt = [lu[p] for p in y]                # same partition as y, but with permuted label names
ys = [y, y_alt]
In [28]:
plt.figure(figsize=(8, 4))
for i, xs in enumerate([X, X1]):
    plt.subplot(1,2,i+1)
    plt.scatter(xs[:, 0], xs[:, 1], s=5, c=ys[i],
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()
pass
[Figure: the two samples X and X1, colored by their (differently labeled) true clusters]

Option 1: Using reference sample

In [29]:
nc = 6
kmeans = KMeans(nc, n_init=10)
kmeans.fit(X)
z1 = kmeans.predict(X)
z2 = kmeans.predict(X1)
zs = [z1, z2]
In [30]:
plt.figure(figsize=(8, 4))
for i, xs in enumerate([X, X1]):
    plt.subplot(1,2,i+1)
    plt.scatter(xs[:, 0], xs[:, 1], s=5, c=zs[i],
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()
pass
[Figure: cluster labels for X and X1 from the model fit to the reference sample X]

Option 2: Pooling

In [31]:
Y = np.r_[X, X1]
kmeans = KMeans(nc, n_init=10)
kmeans.fit(Y)
zs = np.split(kmeans.predict(Y), 2)
In [32]:
plt.figure(figsize=(8, 4))
for i, xs in enumerate([X, X1]):
    plt.subplot(1,2,i+1)
    plt.scatter(xs[:, 0], xs[:, 1], s=5, c=zs[i],
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()
pass
[Figure: cluster labels for X and X1 from the model fit to the pooled data]

Option 3: Matching

Here we match cluster centers using the Hungarian algorithm. A numerical example of the algorithm can be worked either as operations on a bipartite graph (example) or as a series of row and column operations on a cost matrix (example).
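Before applying it to the cluster centers, here is a small standalone sketch (with a made-up 3x3 cost matrix, not from the original notebook) showing what scipy's linear_sum_assignment returns: one column index per row such that the total cost of the matching is minimal.

from scipy.optimize import linear_sum_assignment

cost = np.array([[4, 1, 3],
                 [2, 0, 5],
                 [3, 2, 2]])
rows, cols = linear_sum_assignment(cost)
rows, cols, cost[rows, cols].sum()   # optimal row-to-column matching and its total cost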

In [33]:
from scipy.optimize import linear_sum_assignment
In [34]:
np.random.seed(2018)
kmeans = KMeans(nc, n_init=10)
c1 = kmeans.fit(X).cluster_centers_
z1 = kmeans.predict(X)
c2 = kmeans.fit(X1).cluster_centers_
z2 = kmeans.predict(X1)
zs = [z1, z2]
cs = [c1, c2]
In [35]:
plt.figure(figsize=(8, 4))
for i, xs in enumerate([X, X1]):
    plt.subplot(1,2,i+1)
    z = zs[i]
    c = cs[i]

    plt.scatter(xs[:, 0], xs[:, 1], s=5, c=z,
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()
pass
[Figure: cluster labels for X and X1 from independent fits (labels not yet aligned)]

Define cost function

We use a simple cost: the Euclidean distance between cluster centers. More complex dissimilarity measures could be used.

In [36]:
cost = cdist(c1, c2)

Run the Hungarian (Munkres) algorithm for bipartite matching

In [37]:
row_ind, col_ind = linear_sum_assignment(cost)
In [38]:
row_ind, col_ind
Out[38]:
(array([0, 1, 2, 3, 4, 5]), array([1, 0, 2, 3, 4, 5]))

Swap cluster indices to align the data sets

In [39]:
z1_aligned = col_ind[z1]  # relabel the first clustering with its matched labels in the second
zs = [z1_aligned, z2]
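As a quick sanity check (an added one-liner, not in the original notebook): since X1 is just a noisy copy of X, corresponding points should now mostly receive the same label under the aligned labelings.

np.mean(z1_aligned == z2)   # fraction of points with matching labels across the two samples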
In [40]:
plt.figure(figsize=(8, 4))
for i, xs in enumerate([X, X1]):
    plt.subplot(1,2,i+1)
    plt.scatter(xs[:, 0], xs[:, 1], s=5, c=zs[i],
                cmap=plt.cm.get_cmap('Accent', nc))
    plt.xticks([])
    plt.yticks([])
    plt.axis('square')
plt.tight_layout()
pass
[Figure: cluster labels for X and X1 after aligning with the Hungarian algorithm]