Random Variables¶
In [1]:
%matplotlib inline
import itertools as it
import re
import matplotlib.pyplot as plt
import numpy as np
import scipy
from scipy import stats
import toolz as tz
Using numpy.random
¶
Setting seed for reproducibility¶
In [2]:
np.random.seed(123)
Standard uniform¶
In [3]:
np.random.rand(3,4)
Out[3]:
array([[0.69646919, 0.28613933, 0.22685145, 0.55131477],
[0.71946897, 0.42310646, 0.9807642 , 0.68482974],
[0.4809319 , 0.39211752, 0.34317802, 0.72904971]])
Standard normal¶
In [4]:
np.random.randn(3, 4)
Out[4]:
array([[-0.67888615, -0.09470897, 1.49138963, -0.638902 ],
[-0.44398196, -0.43435128, 2.20593008, 2.18678609],
[ 1.0040539 , 0.3861864 , 0.73736858, 1.49073203]])
Parameterized distributions¶
Parameterized distribution functions typically have one or more of location, scale, shape or other parameters that can be specified.
Continuous distributions¶
In [5]:
np.random.uniform(low=-1, high=1, size=(3, 4))
Out[5]:
array([[ 0.44488677, -0.35408217, -0.27642269, -0.54347354],
[-0.41257191, 0.26195225, -0.81579012, -0.13259765],
[-0.13827447, -0.0126298 , -0.14833942, -0.37547755]])
In [6]:
np.random.normal(loc=100, scale=15, size=(3, 4))
Out[6]:
array([[113.91193648, 97.39546476, 100.04268874, 110.32334067],
[ 86.80695485, 104.25440986, 87.91950223, 74.08495759],
[ 94.13650309, 108.60708794, 105.07883576, 99.82254258]])
In [7]:
np.random.standard_t(df=3, size=(3,4))
Out[7]:
array([[ 2.26603603, 0.26443366, -2.62014171, 0.73989909],
[ 0.52766961, 0.84688526, -0.63048839, -0.92233841],
[ 1.15114019, 0.67780629, 0.82852178, 0.30139753]])
In [8]:
np.random.beta(a=0.5, b=0.5, size=(10,))
Out[8]:
array([0.9853416 , 0.36941327, 0.17888099, 0.42376794, 0.12553194,
0.32966061, 0.37205691, 0.39564619, 0.19150945, 0.83135736])
Discrete distributions¶
In [9]:
np.random.poisson(lam=10, size=(10,))
Out[9]:
array([ 8, 8, 11, 15, 8, 7, 13, 12, 9, 9])
In [10]:
np.random.binomial(n=10, p=0.6, size=(10,))
Out[10]:
array([8, 7, 4, 6, 6, 5, 5, 8, 5, 4])
In [11]:
np.random.negative_binomial(n=10, p=0.6, size=(10,))
Out[11]:
array([10, 8, 3, 0, 6, 6, 4, 6, 3, 5])
In [12]:
np.random.geometric(p=0.6, size=(10,))
Out[12]:
array([2, 1, 1, 5, 1, 1, 1, 4, 1, 2])
Multivariate distributions¶
In [13]:
np.random.multinomial(4, [0.1, 0.2, 0.3, 0.4], size=5)
Out[13]:
array([[1, 0, 1, 2],
[1, 0, 1, 2],
[2, 1, 1, 0],
[0, 0, 2, 2],
[0, 3, 0, 1]])
In [14]:
np.random.multivariate_normal([10, 10], np.array([[3, 0.5], [0.5, 2]]), 5)
Out[14]:
array([[12.36034662, 10.17775889],
[10.59100147, 9.72067176],
[ 9.14425098, 7.58936076],
[12.13627781, 8.89252357],
[11.98365842, 11.00145391]])
Sampling with and without replacement¶
In [15]:
# Sampling is done with replacement by default
np.random.choice(4, 12)
Out[15]:
array([1, 0, 1, 1, 0, 3, 0, 0, 0, 0, 2, 0])
In [16]:
# Probability weights can be given
np.random.choice(4, 12, p=[.4, .1, .1, .4])
Out[16]:
array([0, 3, 0, 3, 3, 3, 3, 3, 1, 3, 0, 3])
In [17]:
x = np.random.randint(0, 10, (8, 12))
x
Out[17]:
array([[7, 3, 1, 3, 9, 3, 6, 2, 3, 1, 9, 8],
[0, 2, 3, 7, 9, 2, 7, 9, 7, 1, 4, 7],
[4, 7, 8, 0, 2, 7, 4, 3, 7, 3, 7, 0],
[7, 6, 8, 6, 8, 0, 5, 3, 2, 4, 3, 3],
[9, 7, 1, 8, 0, 1, 4, 7, 2, 4, 8, 3],
[4, 9, 6, 2, 7, 7, 7, 4, 7, 8, 4, 0],
[7, 6, 3, 4, 1, 1, 7, 2, 7, 8, 9, 4],
[4, 4, 7, 7, 9, 9, 4, 1, 5, 5, 8, 9]])
In [18]:
# sampling individual elements
np.random.choice(x.ravel(), 12)
Out[18]:
array([1, 2, 7, 4, 7, 3, 8, 7, 7, 2, 4, 8])
In [19]:
# sampling rows
idx = np.random.choice(x.shape[0], 4)
x[idx, :]
Out[19]:
array([[9, 7, 1, 8, 0, 1, 4, 7, 2, 4, 8, 3],
[7, 6, 3, 4, 1, 1, 7, 2, 7, 8, 9, 4],
[4, 9, 6, 2, 7, 7, 7, 4, 7, 8, 4, 0],
[7, 6, 8, 6, 8, 0, 5, 3, 2, 4, 3, 3]])
In [20]:
# sampling columns
idx = np.random.choice(x.shape[1], 4)
x[:, idx]
Out[20]:
array([[9, 6, 8, 8],
[9, 7, 7, 7],
[2, 4, 0, 0],
[8, 5, 3, 3],
[0, 4, 3, 3],
[7, 7, 0, 0],
[1, 7, 4, 4],
[9, 4, 9, 9]])
In [21]:
# Give the argument replace=False
try:
np.random.choice(4, 12, replace=False)
except ValueError as e:
print(e)
Cannot take a larger sample than population when 'replace=False'
Shuffles, permutations and combinations¶
Shuffle¶
Shuffle is an in-place permutation
In [22]:
xs = np.arange(10)
xs
Out[22]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
In [23]:
np.random.shuffle(xs)
xs
Out[23]:
array([1, 0, 8, 7, 3, 6, 4, 9, 5, 2])
Shuffle permutes rows of a matrix
In [24]:
xs = np.arange(12).reshape(3,4)
xs
Out[24]:
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
In [25]:
np.random.shuffle(xs)
xs
Out[25]:
array([[ 4, 5, 6, 7],
[ 8, 9, 10, 11],
[ 0, 1, 2, 3]])
In [26]:
# To shuffle columns instead, transpose before shuffling
np.random.shuffle(x.T)
x
Out[26]:
array([[1, 3, 2, 3, 9, 7, 1, 8, 3, 9, 3, 6],
[3, 2, 9, 7, 9, 0, 1, 7, 2, 4, 7, 7],
[8, 7, 3, 0, 2, 4, 3, 0, 7, 7, 7, 4],
[8, 6, 3, 6, 8, 7, 4, 3, 0, 3, 2, 5],
[1, 7, 7, 8, 0, 9, 4, 3, 1, 8, 2, 4],
[6, 9, 4, 2, 7, 4, 8, 0, 7, 4, 7, 7],
[3, 6, 2, 4, 1, 7, 8, 4, 1, 9, 7, 7],
[7, 4, 1, 7, 9, 4, 5, 9, 9, 8, 5, 4]])
Permutation¶
In [27]:
np.random.permutation(10)
Out[27]:
array([1, 2, 3, 4, 7, 6, 5, 9, 8, 0])
In [28]:
# When given an integre n, permutation treats is as the array arange(n)
xs = np.arange(10)
np.random.permutation(xs)
Out[28]:
array([6, 5, 2, 7, 4, 3, 9, 8, 1, 0])
In [29]:
xs = np.arange(12).reshape(3,4)
np.random.permutation(xs)
Out[29]:
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
In [30]:
# Use indices if you needed to shuffle collections of arrays in synchrony
x = np.arange(12).reshape(4,3)
y = x + 10
idx = np.random.permutation(x.shape[0])
list(zip(x[idx, :], y[idx, :]))
Out[30]:
[(array([0, 1, 2]), array([10, 11, 12])),
(array([ 9, 10, 11]), array([19, 20, 21])),
(array([3, 4, 5]), array([13, 14, 15])),
(array([6, 7, 8]), array([16, 17, 18]))]
Using itertools
¶
In [31]:
list(map(lambda x: ''.join(x), it.permutations('abc')))
Out[31]:
['abc', 'acb', 'bac', 'bca', 'cab', 'cba']
In [32]:
list(map(lambda x: ''.join(x), it.combinations('abcd', 3)))
Out[32]:
['abc', 'abd', 'acd', 'bcd']
In [33]:
list(map(lambda x: ''.join(x), it.combinations_with_replacement('abcd', 2)))
Out[33]:
['aa', 'ab', 'ac', 'ad', 'bb', 'bc', 'bd', 'cc', 'cd', 'dd']
Leave one out¶
Unlike R, Python does not use negative indexing to delete items. So we need to create a Boolean index to create leave-one-out sequences.
In [34]:
x = np.arange(10, 15)
for i in range(len(x)):
idx = np.arange(len(x)) != i
print(x[idx])
[11 12 13 14]
[10 12 13 14]
[10 11 13 14]
[10 11 12 14]
[10 11 12 13]
Using scipy.stats
¶
Example: modeling IQ¶
Suppose IQ is normally distributed with a mean of 0 and a standard deviation of 15.
In [35]:
dist = stats.norm(loc=100, scale=15)
Random variates¶
In [36]:
dist.rvs(10)
Out[36]:
array([102.13638378, 108.11846946, 120.10148055, 76.46115809,
92.34485689, 93.28342862, 114.06775446, 94.65005408,
71.5723661 , 101.31595696])
In [37]:
xs = np.linspace(50, 150, 100)
Percentiles¶
In [40]:
cdf = np.linspace(0, 1, 100)
plt.plot(cdf, dist.ppf(cdf))
pass

In [41]:
data = np.random.normal(110, 15, 100)
Exercises¶
1. If your IQ is 138, what percentage of the population has a higher IQ?
In [42]:
dist = stats.norm(loc=100, scale=15)
In [43]:
100 * (1 - dist.cdf(138))
Out[43]:
0.564917275556065
Via simulation¶
In [44]:
n = int(1e6)
samples = dist.rvs(n)
In [45]:
np.sum(samples > 138)/n
Out[45]:
0.005694
2. If your IQ is at the 88th percentile, what is your IQ?
In [46]:
dist.ppf(0.88)
Out[46]:
117.62480188099136
Via simulation¶
In [47]:
samples = np.sort(samples)
samples[int(0.88*n)]
Out[47]:
117.6441801355916
3. What proportion of the population has IQ between 70 and 90?
In [48]:
dist.cdf(90) - dist.cdf(70)
Out[48]:
0.2297424055987437
MLE fit and confidence intervals¶
In [50]:
loc, scale = stats.norm.fit(data)
loc, scale
Out[50]:
(108.25727900498485, 14.374033188248367)
In [51]:
dist = stats.norm(loc, scale)
In [52]:
xs = np.linspace(data.min(), data.max(), 100)
plt.hist(data, 12, histtype='stepfilled', normed=True, alpha=0.5)
plt.plot(xs, dist.pdf(xs))
plt.plot(dist.interval(0.95), [0.001, 0.001], c='r', linewidth=3)
pass
/usr/local/lib/python3.7/site-packages/matplotlib/axes/_axes.py:6571: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
warnings.warn("The 'normed' kwarg is deprecated, and has been "

Sampling¶
Without replication¶
In [53]:
np.random.choice(range(10), 5, replace=False)
Out[53]:
array([7, 2, 3, 1, 6])
With replication¶
In [54]:
np.random.choice(range(10), 15)
Out[54]:
array([9, 4, 9, 9, 1, 0, 4, 1, 6, 8, 3, 0, 5, 4, 7])
Example¶
How often do we get a run of 5 or more consecutive heads in 100 coin tosses if we repeat the experiment 1000 times?
What if the coin is biased to generate heads only 40% of the time?
In [55]:
expts = 1000
tosses = 100
We assume that 0 maps to T and 1 to H¶
In [56]:
xs = np.random.choice([0,1], (expts, tosses))
For biased coin¶
In [57]:
ys = np.random.choice([0,1], (expts, tosses), p=[0.6, 0.4])
Using a finite state machine¶
In [58]:
runs = 0
for x in xs:
m = 0
for i in x:
if i == 1:
m += 1
if m >=5:
runs += 1
break
else:
m = 0
runs
Out[58]:
796
Using partitionby
¶
In [59]:
runs = 0
for x in xs:
parts = tz.partitionby(lambda i: i==1, x)
for part in parts:
if part[0] == 1 and len(part) >= 5:
runs += 1
break
runs
Out[59]:
796
Using sliding windows¶
In [60]:
runs = 0
for x in xs:
for w in tz.sliding_window(5, x):
if np.sum(w) == 5:
runs += 1
break
runs
Out[60]:
796
Using a regular expression¶
In [61]:
xs = xs.astype('str')
In [62]:
runs = 0
for x in xs:
if (re.search(r'1{5,}', ''.join(x))):
runs += 1
runs
Out[62]:
796