TensorFlow

[1]:
%matplotlib inline
[2]:
import warnings
warnings.simplefilter('ignore', RuntimeWarning)
[3]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
[4]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
[5]:
data = fetch_california_housing()

We want TF2 behavior; if working with TF1, the commented-out lines below ask it to emulate TF2

[6]:
# import tensorflow.compat.v2 as tf
# tf.enable_v2_behavior()
import tensorflow as tf
[7]:
%%capture
import tensorflow_probability as tfp
tfd = tfp.distributions

Working with tensors

Almost exactly like numpy arrays.

[8]:
tf.constant([1., 2., 3.])
[8]:
<tf.Tensor: shape=(3,), dtype=float32, numpy=array([1., 2., 3.], dtype=float32)>
[9]:
x = tf.Variable([[1.,2.,3.], [4.,5.,6.]])
[10]:
x.shape
[10]:
TensorShape([2, 3])
[11]:
x.dtype
[11]:
tf.float32

Conversion to numpy

[12]:
x.numpy()
[12]:
array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)
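
The reverse direction also works: numpy arrays (and Python lists) become tensors via tf.convert_to_tensor, or implicitly when passed to TF ops. A minimal sketch (not an executed cell from the original notebook):

tf.convert_to_tensor(np.array([[1., 2., 3.], [4., 5., 6.]], dtype=np.float32))
tf.reduce_sum(np.arange(5))  # numpy input is converted automatically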

Indexing

[13]:
x[:, :2]
[13]:
<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[1., 2.],
       [4., 5.]], dtype=float32)>

Assignment

[14]:
x[0,:].assign([3.,2.,1.])
[14]:
<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[3., 2., 1.],
       [4., 5., 6.]], dtype=float32)>
[15]:
x
[15]:
<tf.Variable 'Variable:0' shape=(2, 3) dtype=float32, numpy=
array([[3., 2., 1.],
       [4., 5., 6.]], dtype=float32)>

Reductions

[16]:
tf.reduce_mean(x, axis=0)
[16]:
<tf.Tensor: shape=(3,), dtype=float32, numpy=array([3.5, 3.5, 3.5], dtype=float32)>
[17]:
tf.reduce_sum(x, axis=1)
[17]:
<tf.Tensor: shape=(2,), dtype=float32, numpy=array([ 6., 15.], dtype=float32)>

Broadcasting

[18]:
x + 10
[18]:
<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[13., 12., 11.],
       [14., 15., 16.]], dtype=float32)>
[19]:
x * 10
[19]:
<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[30., 20., 10.],
       [40., 50., 60.]], dtype=float32)>
[20]:
x - tf.reduce_mean(x, axis=1)[:, tf.newaxis]
[20]:
<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[ 1.,  0., -1.],
       [-1.,  0.,  1.]], dtype=float32)>

Matrix operations

[21]:
x @ tf.transpose(x)
[21]:
<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[14., 28.],
       [28., 77.]], dtype=float32)>

Ufuncs

[22]:
tf.exp(x)
[22]:
<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[ 20.085537 ,   7.389056 ,   2.7182817],
       [ 54.59815  , 148.41316  , 403.4288   ]], dtype=float32)>
[23]:
tf.sqrt(x)
[23]:
<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[1.7320508, 1.4142135, 1.       ],
       [2.       , 2.236068 , 2.4494898]], dtype=float32)>

Random numbers

[24]:
X = tf.random.normal(shape=(10,4))
y = tf.random.normal(shape=(10,1))

Linear algebra

[25]:
tf.linalg.lstsq(X, y)
[25]:
<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[-0.427304  ],
       [ 0.11494821],
       [ 0.23162855],
       [ 0.62853664]], dtype=float32)>
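
As a sanity check (a sketch, not part of the original output), the least-squares solution should agree with the one obtained from the normal equations (X^T X) b = X^T y:

b_ne = tf.linalg.solve(tf.transpose(X) @ X, tf.transpose(X) @ y)
np.allclose(tf.linalg.lstsq(X, y).numpy(), b_ne.numpy(), atol=1e-4)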

Vectorization

[26]:
X = tf.random.normal(shape=(1000,10,4))
y = tf.random.normal(shape=(1000,10,1))
[27]:
tf.linalg.lstsq(X, y)
[27]:
<tf.Tensor: shape=(1000, 4, 1), dtype=float32, numpy=
array([[[-7.7759638e-02],
        [-1.3257186e-01],
        [-7.6929688e-02],
        [ 2.3807605e-01]],

       [[-2.1676989e-01],
        [ 1.1578191e-03],
        [ 1.2898314e-01],
        [ 1.9959596e-01]],

       [[-5.5951335e-02],
        [-2.3162486e-01],
        [-1.6260512e-01],
        [-2.1083586e-01]],

       ...,

       [[ 8.7220746e-01],
        [-7.2760344e-01],
        [ 1.2602654e+00],
        [-7.9936546e-01]],

       [[ 5.4767132e-01],
        [ 3.9195046e-01],
        [ 4.2830217e-01],
        [-4.3413453e-02]],

       [[-3.0089268e-01],
        [-2.1683019e-01],
        [-1.4834183e-01],
        [-1.8235873e-02]]], dtype=float32)>
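
Here tf.linalg.lstsq simply solves each of the 1000 systems independently along the leading batch dimension. A quick check (a sketch) against a single un-batched solve:

betas = tf.linalg.lstsq(X, y)   # shape (1000, 4, 1)
np.allclose(betas[0].numpy(), tf.linalg.lstsq(X[0], y[0]).numpy(), atol=1e-5)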

Automatic differentiation

[28]:
def f(x,y):
    return x**2 + 2*y**2 + 3*x*y

Gradient

[29]:
x, y = tf.Variable(1.0), tf.Variable(2.0)
[30]:
with tf.GradientTape() as tape:
    z = f(x, y)
[31]:
tape.gradient(z, [x,y])
[31]:
[<tf.Tensor: shape=(), dtype=float32, numpy=8.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=11.0>]
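
These agree with the analytic gradient: for f(x, y) = x² + 2y² + 3xy, ∂f/∂x = 2x + 3y = 2·1 + 3·2 = 8 and ∂f/∂y = 4y + 3x = 4·2 + 3·1 = 11 at (x, y) = (1, 2).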

Hessian

[32]:
with tf.GradientTape(persistent=True) as H_tape:
    with tf.GradientTape() as J_tape:
        z = f(x, y)
    Js = J_tape.gradient(z, [x,y])
Hs = [H_tape.gradient(J, [x,y]) for J in Js]
del H_tape
[33]:
np.array(Hs)
[33]:
array([[2., 3.],
       [3., 4.]], dtype=float32)
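
This is the analytic Hessian [[∂²f/∂x², ∂²f/∂x∂y], [∂²f/∂y∂x, ∂²f/∂y²]] = [[2, 3], [3, 4]], which is constant in (x, y) because f is quadratic.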

Keras

[34]:
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
[35]:
y_train.min(), y_train.max()
[35]:
(0.14999, 5.00001)
[36]:
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)
[37]:
import tensorflow.keras as keras
[38]:
Dense = keras.layers.Dense

We can consider a deep learning model as just a black box with a bunch of unknown parameters. For example, when the output is a Dense layer with a single node, the entire network is just doing some form of regression. Hence we can replace a linear regression model with such a neural network model and run MCMC or VI as usual.
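
For instance, a model whose only layer is Dense(1) is exactly the linear model y = Xw + b, so fitting it with an MSE loss is ordinary least-squares regression (up to the optimizer). A minimal sketch, not fitted here:

lin_model = keras.models.Sequential([
    Dense(1, input_shape=X_train.shape[1:])  # 8 weights + 1 bias for the housing features
])
lin_model.compile(loss="mse", optimizer="sgd")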

[39]:
model = keras.models.Sequential([
    Dense(30,
          activation='elu',
          input_shape=X_train.shape[1:]),
    Dense(1)
])
[40]:
model.compile(loss="mse", optimizer="nadam", metrics=["mae"])
[41]:
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense (Dense)                (None, 30)                270
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 31
=================================================================
Total params: 301
Trainable params: 301
Non-trainable params: 0
_________________________________________________________________
[42]:
model.layers
[42]:
[<tensorflow.python.keras.layers.core.Dense at 0x7f9cec625748>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f9cec625400>]
[43]:
model.layers[0].name
[43]:
'dense'
[44]:
model.layers[0].activation
[44]:
<function tensorflow.python.keras.activations.elu>
[45]:
hist = model.fit(X_train_s,
                 y_train,
                 epochs=10,
                 validation_split=0.2)
Train on 12384 samples, validate on 3096 samples
Epoch 1/10
12384/12384 [==============================] - 2s 145us/sample - loss: 1.9358 - mae: 1.0443 - val_loss: 0.6545 - val_mae: 0.5832
Epoch 2/10
12384/12384 [==============================] - 1s 60us/sample - loss: 0.5266 - mae: 0.5304 - val_loss: 0.5282 - val_mae: 0.5214
Epoch 3/10
12384/12384 [==============================] - 1s 59us/sample - loss: 0.4554 - mae: 0.4945 - val_loss: 0.4905 - val_mae: 0.5008
Epoch 4/10
12384/12384 [==============================] - 1s 60us/sample - loss: 0.4402 - mae: 0.4864 - val_loss: 0.4770 - val_mae: 0.4971
Epoch 5/10
12384/12384 [==============================] - 1s 58us/sample - loss: 0.4334 - mae: 0.4822 - val_loss: 0.4740 - val_mae: 0.4902
Epoch 6/10
12384/12384 [==============================] - 1s 58us/sample - loss: 0.4378 - mae: 0.4809 - val_loss: 0.4668 - val_mae: 0.4854
Epoch 7/10
12384/12384 [==============================] - 1s 59us/sample - loss: 0.4278 - mae: 0.4779 - val_loss: 0.4645 - val_mae: 0.4891
Epoch 8/10
12384/12384 [==============================] - 1s 59us/sample - loss: 0.4225 - mae: 0.4756 - val_loss: 0.4551 - val_mae: 0.4840
Epoch 9/10
12384/12384 [==============================] - 1s 58us/sample - loss: 0.4202 - mae: 0.4719 - val_loss: 0.4538 - val_mae: 0.4867
Epoch 10/10
12384/12384 [==============================] - 1s 58us/sample - loss: 0.4189 - mae: 0.4716 - val_loss: 0.4519 - val_mae: 0.4774
[46]:
import pandas as pd
[47]:
df = pd.DataFrame(hist.history)
[48]:
df.head()
[48]:
       loss       mae  val_loss   val_mae
0  1.935755  1.044255  0.654487  0.583205
1  0.526578  0.530396  0.528221  0.521408
2  0.455439  0.494459  0.490467  0.500771
3  0.440165  0.486384  0.476991  0.497115
4  0.433447  0.482155  0.473955  0.490190
[49]:
df.plot()
pass
../_images/notebooks_S16F_TF_66_0.png
[50]:
model.evaluate(X_test_s, y_test)
5160/5160 [==============================] - 0s 33us/sample - loss: 0.4417 - mae: 0.4719
[50]:
[0.4417388436868209, 0.47192544]
[51]:
np.c_[model.predict(X_test_s[:3, :]), y_test[:3]]
[51]:
array([[2.66226506, 4.889     ],
       [2.38322258, 1.746     ],
       [1.085361  , 1.533     ]])
[52]:
model.save('housing.h5')
[53]:
model = keras.models.load_model('housing.h5')

TensorFlow Probability

Distributions

[54]:
[str(x).split('.')[-1][:-2] for x in tfd.distribution.Distribution.__subclasses__()]
[54]:
['Autoregressive',
 'BatchReshape',
 'Bernoulli',
 'Beta',
 'Categorical',
 'Multinomial',
 'Binomial',
 'JointDistribution',
 'JointDistribution',
 '_Cast',
 'Blockwise',
 'Cauchy',
 'Gamma',
 'Chi2',
 'TransformedDistribution',
 'Normal',
 'LKJ',
 'CholeskyLKJ',
 '_BaseDeterministic',
 '_BaseDeterministic',
 'Dirichlet',
 'DirichletMultinomial',
 'DoublesidedMaxwell',
 'Empirical',
 'FiniteDiscrete',
 'GammaGamma',
 'GaussianProcess',
 'GeneralizedPareto',
 'Geometric',
 'Uniform',
 'HalfCauchy',
 'HalfNormal',
 'HiddenMarkovModel',
 'Horseshoe',
 'Independent',
 'InverseGamma',
 'InverseGaussian',
 'Laplace',
 'LinearGaussianStateSpaceModel',
 'Logistic',
 'Mixture',
 'MixtureSameFamily',
 'MultivariateStudentTLinearOperator',
 'NegativeBinomial',
 'OneHotCategorical',
 'Pareto',
 'PERT',
 'QuantizedDistribution',
 'Poisson',
 '_TensorCoercible',
 'PixelCNN',
 'PlackettLuce',
 'PoissonLogNormalQuadratureCompound',
 'ProbitBernoulli',
 'RelaxedBernoulli',
 'ExpRelaxedOneHotCategorical',
 'Sample',
 'StudentT',
 'StudentTProcess',
 'Triangular',
 'TruncatedNormal',
 'VectorDiffeomixture',
 'VonMises',
 'VonMisesFisher',
 'WishartLinearOperator',
 'Zipf']
[55]:
dist = tfd.Normal(loc=100, scale=15)
[56]:
x = dist.sample((3,4))
x
[56]:
<tf.Tensor: shape=(3, 4), dtype=float32, numpy=
array([[112.298294,  89.107925, 117.49498 , 105.644806],
       [ 99.38011 , 121.4716  , 112.74607 , 126.83206 ],
       [112.833664, 119.85916 , 105.05743 ,  96.81615 ]], dtype=float32)>
[57]:
n = 100
xs = dist.sample(n)
plt.hist(xs, density=True)
xp = tf.linspace(50., 150., 100)
plt.plot(xp, dist.prob(xp))
pass
../_images/notebooks_S16F_TF_76_0.png
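
Distribution objects also expose log_prob, cdf and quantile methods. For example (a sketch), the probability of falling within one standard deviation of the mean should be about 0.68:

dist.cdf(115.) - dist.cdf(85.)   # ≈ 0.68 for Normal(100, 15)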

Broadcasting

[58]:
dist = tfd.Normal(loc=[3,4,5,6], scale=0.5)
[59]:
dist.sample(5)
[59]:
<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[3.0555534, 3.8904114, 5.2794166, 5.9625177],
       [3.884369 , 5.159878 , 4.394644 , 5.358372 ],
       [3.0781407, 4.152742 , 5.883699 , 5.845205 ],
       [2.5866   , 3.4164045, 5.4926476, 5.5667934],
       [3.1870022, 4.1119504, 5.0041285, 6.1963325]], dtype=float32)>
[60]:
xp = tf.linspace(0., 9., 100)[:, tf.newaxis]
plt.plot(np.tile(xp, dist.batch_shape), dist.prob(xp))
pass
../_images/notebooks_S16F_TF_80_0.png
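
The loc argument defines a batch of four independent normals, and the scalar scale=0.5 broadcasts across the batch. A sketch of how this shows up (values taken from the cell above):

dist.batch_shape                 # TensorShape([4])
dist.log_prob([3., 4., 5., 6.])  # one log-density per component, shape (4,)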

Mixtures

[ ]:
tfd.MixtureSameFamily?
[61]:
gmm = tfd.MixtureSameFamily(
    mixture_distribution=tfd.Categorical(
        probs=[0.4, 0.1, 0.2, 0.3]
    ),
    components_distribution=tfd.Normal(
      loc=[3., 4., 5., 6.],
      scale=[0.1, 0.5, 0.5, .1])
)
[62]:
n = 10000
xs = gmm.sample(n)
[63]:
sns.distplot(xs)
pass
../_images/notebooks_S16F_TF_85_0.png
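
As a quick check (a sketch), the mixture mean is the probability-weighted average of the component means, 0.4·3 + 0.1·4 + 0.2·5 + 0.3·6 = 4.4, and the sample mean of the draws should be close:

gmm.mean()          # ≈ 4.4
xs.numpy().mean()   # sample mean over the 10,000 draws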

Transformations

[64]:
[x for x in dir(tfp.bijectors) if x[0].isupper()]
[64]:
['AbsoluteValue',
 'Affine',
 'AffineLinearOperator',
 'AffineScalar',
 'AutoregressiveNetwork',
 'BatchNormalization',
 'Bijector',
 'Blockwise',
 'Chain',
 'CholeskyOuterProduct',
 'CholeskyToInvCholesky',
 'CorrelationCholesky',
 'Cumsum',
 'DiscreteCosineTransform',
 'Exp',
 'Expm1',
 'FFJORD',
 'FillScaleTriL',
 'FillTriangular',
 'GeneralizedPareto',
 'Gumbel',
 'GumbelCDF',
 'Identity',
 'Inline',
 'Invert',
 'IteratedSigmoidCentered',
 'Kumaraswamy',
 'KumaraswamyCDF',
 'Log',
 'Log1p',
 'MaskedAutoregressiveFlow',
 'MatrixInverseTriL',
 'MatvecLU',
 'NormalCDF',
 'Ordered',
 'Pad',
 'Permute',
 'PowerTransform',
 'RationalQuadraticSpline',
 'RealNVP',
 'Reciprocal',
 'Reshape',
 'Scale',
 'ScaleMatvecDiag',
 'ScaleMatvecLU',
 'ScaleMatvecLinearOperator',
 'ScaleMatvecTriL',
 'ScaleTriL',
 'Shift',
 'Sigmoid',
 'SinhArcsinh',
 'Softfloor',
 'SoftmaxCentered',
 'Softplus',
 'Softsign',
 'Square',
 'Tanh',
 'TransformDiagonal',
 'Transpose',
 'Weibull',
 'WeibullCDF']
[65]:
lognormal = tfp.bijectors.Exp()(tfd.Normal(0, 0.5))
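
Calling a bijector on a distribution is shorthand for wrapping it in a TransformedDistribution; an equivalent construction (a sketch) would be:

lognormal2 = tfd.TransformedDistribution(
    distribution=tfd.Normal(loc=0., scale=0.5),
    bijector=tfp.bijectors.Exp())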
[66]:
xs = lognormal.sample(1000)
sns.distplot(xs)
xp = np.linspace(tf.reduce_min(xs), tf.reduce_max(xs), 100)
plt.plot(xp, tfd.LogNormal(loc=0, scale=0.5).prob(xp))
pass
../_images/notebooks_S16F_TF_89_0.png

Regression

[67]:
xs = tf.Variable([0., 1., 2., 5., 6., 8.])
ys = tf.sin(xs) + tfd.Normal(loc=0, scale=0.5).sample(xs.shape[0])
[68]:
xs.shape, ys.shape
[68]:
(TensorShape([6]), TensorShape([6]))
[69]:
xs.numpy()
[69]:
array([0., 1., 2., 5., 6., 8.], dtype=float32)
[70]:
ys.numpy()
[70]:
array([ 0.01067163,  0.78099585,  0.6936413 , -0.37853056,  0.36542195,
        0.8795101 ], dtype=float32)
[71]:
xp = tf.linspace(-1., 9., 100)[:, None]
plt.scatter(xs.numpy(), ys.numpy())
plt.plot(xp, tf.sin(xp))
pass
../_images/notebooks_S16F_TF_95_0.png
[72]:
kernel = tfp.math.psd_kernels.ExponentiatedQuadratic(length_scale=1.5)
reg = tfd.GaussianProcessRegressionModel(
    kernel, xp[:, tf.newaxis], xs[:, tf.newaxis], ys
)
[74]:
ub, lb = reg.mean() + [2*reg.stddev(), -2*reg.stddev()]
plt.fill_between(np.ravel(xp), np.ravel(ub), np.ravel(lb), alpha=0.2)
plt.plot(xp, reg.mean(), c='red', linewidth=2)
plt.scatter(xs[:], ys[:], s=50, c='k')
pass
../_images/notebooks_S16F_TF_97_0.png

Modeling

Sampling from a normal distribution using HMC (prior predictive samples)

[75]:
[x for x in dir(tfp.mcmc) if x[0].isupper()]
[75]:
['CheckpointableStatesAndTrace',
 'DualAveragingStepSizeAdaptation',
 'HamiltonianMonteCarlo',
 'MetropolisAdjustedLangevinAlgorithm',
 'MetropolisHastings',
 'NoUTurnSampler',
 'RandomWalkMetropolis',
 'ReplicaExchangeMC',
 'SimpleStepSizeAdaptation',
 'SliceSampler',
 'StatesAndTrace',
 'TransformedTransitionKernel',
 'TransitionKernel',
 'UncalibratedHamiltonianMonteCarlo',
 'UncalibratedLangevin',
 'UncalibratedRandomWalk']
[76]:
dir(tfp.vi)
[76]:
['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_allowed_symbols',
 'amari_alpha',
 'arithmetic_geometric',
 'chi_square',
 'csiszar_vimco',
 'dual_csiszar_function',
 'fit_surrogate_posterior',
 'jeffreys',
 'jensen_shannon',
 'kl_forward',
 'kl_reverse',
 'log1p_abs',
 'modified_gan',
 'monte_carlo_variational_loss',
 'mutual_information',
 'pearson',
 'squared_hellinger',
 'symmetrized_csiszar_function',
 't_power',
 'total_variation',
 'triangular']
[77]:
from tensorflow_probability import edward2 as ed
[78]:
# From example in help docs

def unnormalized_log_prob(x):
    return -x - x**2.

# Initialize the HMC transition kernel.
num_results = int(1e2)
num_burnin_steps = int(1e2)
adaptive_hmc = tfp.mcmc.SimpleStepSizeAdaptation(
    tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=unnormalized_log_prob,
        num_leapfrog_steps=3,
        step_size=1.),
    num_adaptation_steps=int(num_burnin_steps * 0.8))

# Run the chain (with burn-in).
samples, is_accepted = tfp.mcmc.sample_chain(
    num_results=num_results,
    num_burnin_steps=num_burnin_steps,
    current_state=1.,
    kernel=adaptive_hmc,
    trace_fn=lambda _, pkr: pkr.inner_results.is_accepted)

sample_mean = tf.reduce_mean(samples)
sample_stddev = tf.math.reduce_std(samples)
[79]:
sample_mean
[79]:
<tf.Tensor: shape=(), dtype=float32, numpy=-0.37961948>
[80]:
sample_stddev
[80]:
<tf.Tensor: shape=(), dtype=float32, numpy=0.57268214>
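
Completing the square gives -x - x² = -(x + 1/2)² + 1/4, so the target is proportional to a Normal(-0.5, 1/√2 ≈ 0.71) density. With only 100 post-burn-in draws, the estimates above (mean ≈ -0.38, std ≈ 0.57) are in the right neighborhood but still noisy.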
[81]:
sns.distplot(samples)
plt.axvline(sample_mean.numpy(), c='red')
plt.plot([sample_mean - 2*sample_stddev, sample_mean + 2*sample_stddev],
         [0.01, 0.01], c='k', linewidth=3)
pass
../_images/notebooks_S16F_TF_105_0.png