In [71]:
%matplotlib inline
In [2]:
import numpy as np
import numpy.random as rng
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc3 as pm
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
import daft
In [3]:
import theano
import theano.tensor as tt
theano.config.warn.round=False
In [4]:
import warnings
warnings.simplefilter('ignore', UserWarning)
In [5]:
sns.set_context('notebook')
sns.set_style('darkgrid')

Mixture models

We can construct very flexible new distributions as mixtures of other distributions. In fact, we can build mixtures not just of distributions but of regression models, neural networks, etc., making this a very powerful framework. We consider finite and Dirichlet process (DP) mixtures, and cover the basic ideas for working with mixtures in PyMC3.
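As a warm-up, here is a minimal sketch (not from the original notebook) of the basic idea: a mixture density is just a weighted sum of component densities, with non-negative weights that sum to one.

In [ ]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# two-component Gaussian mixture; the weights must sum to 1
weights = [0.3, 0.7]
components = [stats.norm(-2, 1), stats.norm(3, 0.5)]

xs = np.linspace(-6, 6, 200)
# the mixture density is the weighted sum of the component densities
mix_pdf = sum(w * c.pdf(xs) for w, c in zip(weights, components))
plt.plot(xs, mix_pdf)
pass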

The Categorical distribution

This is just the extension of the Bernoulli distribution to more than two states. SciPy has no separate categorical distribution, so below we use a multinomial with a single trial (n=1), which returns one-hot draws.

In [10]:
cat = stats.multinomial(1, [0.1, 0.2, 0.3, 0.4])
In [11]:
cat.rvs(10)
Out[11]:
array([[0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1]])
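Each row of the output above is a one-hot vector. If integer category labels are wanted instead, one simple way (a small aside, not in the original notebook) is to take the argmax of each row:

In [ ]:
# convert one-hot multinomial draws into integer labels 0..3
labels = cat.rvs(10).argmax(axis=1)
labels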
In [28]:
x = range(4)
plt.vlines(x, 0, cat.p)
plt.xticks(range(4))
pass
[figure: PMF of the categorical distribution]

The Dirichlet distribution

The code to visualize Dirichlet distributions in barycentric coordinates is from this blog.

In [29]:
from functools import reduce
import matplotlib.tri as tri
from math import gamma
from operator import mul
In [30]:
corners = np.array([[0, 0], [1, 0], [0.5, 0.75**0.5]])
triangle = tri.Triangulation(corners[:, 0], corners[:, 1])
refiner = tri.UniformTriRefiner(triangle)
trimesh = refiner.refine_triangulation(subdiv=4)
midpoints = [(corners[(i + 1) % 3] + corners[(i + 2) % 3]) / 2.0 \
             for i in range(3)]

def xy2bc(xy, tol=1.e-3):
    '''Converts 2D Cartesian coordinates to barycentric.'''
    s = [(corners[i] - midpoints[i]).dot(xy - midpoints[i]) / 0.75 \
         for i in range(3)]
    return np.clip(s, tol, 1.0 - tol)

class Dirichlet(object):
    def __init__(self, alpha):

        self._alpha = np.array(alpha)
        self._coef = gamma(np.sum(self._alpha)) / \
                     reduce(mul, [gamma(a) for a in self._alpha])

    def pdf(self, x):
        '''Returns pdf value for `x`.'''
        return self._coef * reduce(mul, [xx ** (aa - 1)
                                         for (xx, aa)in zip(x, self._alpha)])

def draw_pdf_contours(dist, nlevels=200, subdiv=8, **kwargs):
    refiner = tri.UniformTriRefiner(triangle)
    trimesh = refiner.refine_triangulation(subdiv=subdiv)
    pvals = [dist.pdf(xy2bc(xy)) for xy in zip(trimesh.x, trimesh.y)]

    plt.tricontourf(trimesh, pvals, nlevels, **kwargs)
    plt.axis('equal')
    plt.xlim(0, 1)
    plt.ylim(0, 0.75**0.5)
    plt.axis('off')
In [31]:
ks = (
    [0.999, 0.999, 0.999],
    [1,1,1],
    [5,5,5],
    [5,2,1],
)

fig = plt.figure(figsize=(12, 4))
plt.viridis()  # use the viridis colormap for the contour plots
for i, k in enumerate(ks):
    plt.subplot(1,4,i+1)
    draw_pdf_contours(Dirichlet(k))
[figure: Dirichlet density contours for the four α settings]

Samples from the Dirichlet distribution

In [32]:
n = 20
s = stats.dirichlet.rvs(alpha=[7,7,7], size=n).T
In [33]:
plt.figure(figsize=(4, n*0.4))
with plt.style.context('seaborn-dark'):
    plt.barh(range(n), s[0])
    plt.barh(range(n), s[1], left=s[0], color='g')
    plt.barh(range(n), s[2], left=s[0]+s[1], color='r')
pass
[figure: 20 samples from Dirichlet([7, 7, 7]) shown as stacked bars]

Finite Gaussian Mixture Model

Data

In [34]:
np.random.seed(1)
data = np.r_[np.random.normal(-6, 2, 500),
             np.random.normal(0, 1, 200),
             np.random.normal(4, 1, 300)]
n = data.shape[0]
In [35]:
plt.hist(data, bins=20, density=True, alpha=0.5)
pass
[figure: histogram of the simulated data]
In [36]:
data.shape
Out[36]:
(1000,)

Explicit latent variable

In this version the component indicator \(z_i\) for each observation is represented explicitly; PyMC3 assigns a CategoricalGibbsMetropolis step to the discrete \(z\) and NUTS to the continuous parameters.

In [37]:
k = 3

with pm.Model() as gmm1:
    p = pm.Dirichlet('p', a=np.ones(k))
    z = pm.Categorical('z', p=p, shape=data.shape[0])
    μ = pm.Normal('μ', 0, 10, shape=k)
    σ = pm.InverseGamma('σ', 1, 1, shape=k)

    y = pm.Normal('y', mu=μ[z], sd=σ[z], observed=data)
In [38]:
with gmm1:
    trace = pm.sample(n_init=10000, tune=1000, random_seed=123)
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>NUTS: [σ, μ, p]
>CategoricalGibbsMetropolis: [z]
Sampling 4 chains: 100%|██████████| 6000/6000 [05:39<00:00,  4.63draws/s]
The gelman-rubin statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.
In [39]:
pm.traceplot(trace, varnames=['p', 'μ', 'σ'])
pass
[figure: trace plots of p, μ, σ for gmm1]

Note that the sampler is quite slow and convergence is poor. This is due to the need to sample over the discrete latent variable \(z\). We can improve convergence by marginalizing out \(z\).

In [40]:
with gmm1:
    ppc = pm.sample_posterior_predictive(trace, samples=10)

plt.hist(data, bins=20, density=True, alpha=0.5)
for i in range(10):
    sns.distplot(ppc['y'][i], hist=False, color='grey')
100%|██████████| 10/10 [00:00<00:00, 181.99it/s]
[figure: posterior predictive densities over the data histogram, gmm1]

Marginalization

If we sum over the latent indicator \(z\), each observation has the marginal density

\[f(x \mid p, \mu, \sigma) = \sum_{i=1}^{k} p_i \, \mathcal{N}(x \mid \mu_i, \sigma_i),\]

which is what pm.NormalMixture implements, so no discrete variables need to be sampled.
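For intuition, here is a small NumPy sketch (an illustration only, not PyMC3's internal implementation) of the marginalized log-likelihood, computed with a log-sum-exp over components:

In [ ]:
import numpy as np
from scipy.special import logsumexp
from scipy.stats import norm

def mixture_logp(y, p, mu, sigma):
    # log f(y_i) = logsumexp_j [ log p_j + log N(y_i | mu_j, sigma_j) ]
    y = np.asarray(y)
    log_terms = np.log(p) + norm.logpdf(y[:, None], loc=mu, scale=sigma)
    return logsumexp(log_terms, axis=1).sum()

# total log-likelihood of the data under one (arbitrary) parameter setting
mixture_logp(data, p=[0.5, 0.2, 0.3], mu=[-6, 0, 4], sigma=[2, 1, 1])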
In [41]:
k = 3

with pm.Model() as gmm2:
    p = pm.Dirichlet('p', a=np.ones(k))
    μ = pm.Normal('μ', 0, 10, shape=k)
    σ = pm.InverseGamma('σ', 1, 1, shape=k)

    y = pm.NormalMixture('y', p, mu=μ, sd=σ, observed=data)
In [42]:
with gmm2:
    trace = pm.sample(n_init=10000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [σ, μ, p]
Sampling 4 chains: 100%|██████████| 6000/6000 [00:08<00:00, 672.90draws/s]
The gelman-rubin statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.
In [43]:
pm.traceplot(trace, varnames=['p', 'μ', 'σ'])
pass
[figure: trace plots of p, μ, σ for gmm2]
In [44]:
with gmm2:
    ppc = pm.sample_posterior_predictive(trace, samples=10)

plt.hist(data, bins=20, density=True, alpha=0.5)
for i in range(10):
    sns.distplot(ppc['y'][i], hist=False, color='grey')
100%|██████████| 10/10 [00:00<00:00, 14.91it/s]
[figure: posterior predictive densities over the data histogram, gmm2]

The Label-Switching problem

Because the mixture likelihood is invariant to permutations of the component labels, the posterior has \(k!\) symmetric modes, and chains can swap component labels during sampling. One fix is to break the symmetry with an ordering constraint on the means, implemented here as a pm.Potential that adds \(-\infty\) to the log-probability whenever the means are out of order.

In [45]:
k = 3

with pm.Model() as gmm3:
    p = pm.Dirichlet('p', a=np.ones(k))
    μ = pm.Normal('μ', np.linspace(data.min(), data.max(), k),
                  sd=10, shape=k)
    σ = pm.InverseGamma('σ', 1, 1, shape=k)

    op = pm.Potential('op',
                      tt.switch(μ[1]-μ[0] < 0, -np.inf, 0) +
                      tt.switch(μ[2]-μ[1] < 0, -np.inf, 0))

    y = pm.NormalMixture('y', p, mu=μ, sd=σ, observed=data)
In [46]:
with gmm3:
    trace = pm.sample(n_init=10000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [σ, μ, p]
Sampling 4 chains: 100%|██████████| 6000/6000 [00:13<00:00, 440.40draws/s]
The acceptance probability does not match the target. It is 0.9245971316563495, but should be close to 0.8. Try to increase the number of tuning steps.
In [47]:
pm.traceplot(trace, varnames=['p', 'μ', 'σ'])
pass
[figure: trace plots of p, μ, σ for gmm3]
In [48]:
with gmm3:
    ppc = pm.sample_posterior_predictive(trace, samples=10)

plt.hist(data, bins=20, density=True, alpha=0.5)
for i in range(10):
    sns.distplot(ppc['y'][i], hist=False, color='grey')
100%|██████████| 10/10 [00:00<00:00, 15.21it/s]
[figure: posterior predictive densities over the data histogram, gmm3]

Alternative for label switching

Instead of a potential, we can apply PyMC3's ordered transform to μ, which enforces an increasing ordering of the component means by construction.

In [49]:
k = 3

with pm.Model() as gmm4:
    p = pm.Dirichlet('p', a=np.ones(k))
    μ = pm.Normal('μ',
                  np.linspace(data.min(), data.max(), k),
                  sd=10,
                  shape=k,
                  transform=pm.distributions.transforms.ordered)
    σ = pm.InverseGamma('σ', 1, 1, shape=k)

    y = pm.NormalMixture('y', p, mu=μ, sd=σ, observed=data)
In [50]:
with gmm4:
    trace = pm.sample(n_init=10000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [σ, μ, p]
Sampling 4 chains: 100%|██████████| 6000/6000 [00:17<00:00, 334.60draws/s]
In [51]:
pm.traceplot(trace, varnames=['p', 'μ', 'σ'])
pass
[figure: trace plots of p, μ, σ for gmm4]
In [52]:
with gmm4:
    ppc = pm.sample_posterior_predictive(trace, samples=10)

plt.hist(data, bins=20, density=True, alpha=0.5)
for i in range(10):
    sns.distplot(ppc['y'][i], hist=False, color='grey')
100%|██████████| 10/10 [00:00<00:00, 15.51it/s]
[figure: posterior predictive densities over the data histogram, gmm4]

Dirichlet process

Samples from DP

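The helper below draws one truncated sample from a DP with concentration \(\alpha\) and base measure \(G_0\) (here a standard normal) via the stick-breaking construction: with

\[\beta_i \sim \text{Beta}(1, \alpha), \qquad w_i = \beta_i \prod_{j<i} (1 - \beta_j), \qquad x_i \sim G_0,\]

the sample is the discrete distribution placing weight \(w_i\) on atom \(x_i\).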
In [53]:
def sbp0(α, k, base=stats.norm(0, 1)):
    """Truncated stick-breaking sample from a DP with concentration α and k atoms."""
    β = stats.beta.rvs(1, α, size=k)
    # stick-breaking weights: w_i = β_i * prod_{j<i}(1 - β_j)
    w = β * np.r_[[1], (1 - β[:-1]).cumprod()]
    # atom locations drawn from the base measure
    x = base.rvs(size=k)
    return w, x
In [56]:
k = 50

xp = np.linspace(-3,3,100)
np.random.seed(123)
plt.figure(figsize=(12,3))
for i, α in enumerate([0.1, 1,10,100]):
    plt.subplot(1, 4, i+1)
    w, x = sbp0(α, k)
    plt.vlines(x, 0, w)
    plt.plot(xp, w.max()*stats.norm(0,1).pdf(xp))
    plt.title('α = %.1f' % α)
plt.tight_layout()
[figure: stick-breaking samples for α = 0.1, 1, 10, 100]

DP mixture model

Generative process

In [57]:
np.random.seed(123)
α = 1
k = 20

w, x = sbp0(α, k)
dist = stats.norm(x, 0.5)
In [58]:
xp = np.transpose([np.linspace(-3,3,100)]*k)
plt.plot(xp[:,0], (w*dist.pdf(xp)).sum(1))
plt.plot(xp, w*dist.pdf(xp), c='grey', ls='--')
pass
[figure: DP mixture density and its weighted components]

Fitting to data

We put a Gamma prior on the concentration parameter α and truncate the stick-breaking prior at k = 20 components.

In [59]:
def sbp(α, k=20):
    """Truncated stick-breaking weights as a PyMC3 expression."""
    β = pm.Beta('β', 1, α, shape=k)
    # w_i = β_i * prod_{j<i}(1 - β_j)
    w = β * pm.math.concatenate([[1], (1 - β[:-1]).cumprod()])
    return w
In [60]:
k = 20  # truncation level for the stick-breaking prior

with pm.Model() as dpgmm:
    α = pm.Gamma('α', 1, 1)
    p = pm.Deterministic('p', sbp(α, k))
    μ = pm.Normal('μ',
                  np.linspace(data.min(), data.max(), k),
                  sd=10,
                  shape=k,
                  transform=pm.distributions.transforms.ordered)
    σ = pm.InverseGamma('σ', 1, 1, shape=k)

    y = pm.NormalMixture('y', p, mu=μ, sd=σ, observed=data)
In [61]:
with dpgmm:
    trace = pm.sample(n_init=10000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [σ, μ, β, α]
Sampling 4 chains: 100%|██████████| 6000/6000 [13:16<00:00,  2.71draws/s]
There were 188 divergences after tuning. Increase `target_accept` or reparameterize.
There were 218 divergences after tuning. Increase `target_accept` or reparameterize.
There were 278 divergences after tuning. Increase `target_accept` or reparameterize.
There were 238 divergences after tuning. Increase `target_accept` or reparameterize.
The gelman-rubin statistic is larger than 1.05 for some parameters. This indicates slight problems during sampling.
The estimated number of effective samples is smaller than 200 for some parameters.
In [74]:
pm.traceplot(trace, varnames=['α'])
pass
[figure: trace plot of α]

Check that a truncation level of k=20 is sufficient.

In [82]:
plt.stem(trace['p'].mean(axis=0))
plt.xticks(range(20))
pass
[figure: posterior mean mixture weights]
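As a quick numeric check (the 1% threshold is an arbitrary choice, not from the original notebook), count how many components carry non-negligible posterior-mean weight:

In [ ]:
p_mean = trace['p'].mean(axis=0)
# number of components whose average weight exceeds 1%
print((p_mean > 0.01).sum(), 'of', len(p_mean), 'components have mean weight > 0.01')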