In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [3]:
np.random.seed(123)

Series

  • types

  • index

  • values

  • string methods

  • cat methods

  • datetime methods

Numeric

In [4]:
s1 = pd.Series(np.random.randint(0, 10, 5), name='n')
In [5]:
s1
Out[5]:
0    2
1    2
2    6
3    1
4    3
Name: n, dtype: int64
In [6]:
s1.shape
Out[6]:
(5,)
In [7]:
s1.index
Out[7]:
RangeIndex(start=0, stop=5, step=1)
In [8]:
s1.values
Out[8]:
array([2, 2, 6, 1, 3])
In [9]:
s1[1:4]
Out[9]:
1    2
2    6
3    1
Name: n, dtype: int64
In [10]:
s1 *= 2
In [11]:
s1
Out[11]:
0     4
1     4
2    12
3     2
4     6
Name: n, dtype: int64

Strings

In [12]:
s2 = pd.Series(['duke-bugsy-math', 'duke-flopsy-stats', 'unc-scooby-lit', 'unc-scooby-stats'], name='rhyme')
In [13]:
s2
Out[13]:
0      duke-bugsy-math
1    duke-flopsy-stats
2       unc-scooby-lit
3     unc-scooby-stats
Name: rhyme, dtype: object
In [14]:
s2.str.split('-', expand=True)
Out[14]:
0 1 2
0 duke bugsy math
1 duke flopsy stats
2 unc scooby lit
3 unc scooby stats
In [15]:
s2.str.split('-').str[1]
Out[15]:
0     bugsy
1    flopsy
2    scooby
3    scooby
Name: rhyme, dtype: object

Categories (aka “factors”)

In [16]:
s3 = pd.Series(['first', 'second', 'third', 'fourth', 'first', 'third'])
In [17]:
s3 = s3.astype('category')
In [18]:
s3
Out[18]:
0     first
1    second
2     third
3    fourth
4     first
5     third
dtype: category
Categories (4, object): [first, fourth, second, third]
In [19]:
s3 = s3.cat.set_categories(['first', 'second', 'third', 'fourth', 'fifth'], ordered=True)
In [20]:
s3
Out[20]:
0     first
1    second
2     third
3    fourth
4     first
5     third
dtype: category
Categories (5, object): [first < second < third < fourth < fifth]
In [21]:
s3.sort_values()
Out[21]:
0     first
4     first
1    second
2     third
5     third
3    fourth
dtype: category
Categories (5, object): [first < second < third < fourth < fifth]
In [22]:
s3.cat.codes
Out[22]:
0    0
1    1
2    2
3    3
4    0
5    2
dtype: int8
In [23]:
s3.cat.categories
Out[23]:
Index(['first', 'second', 'third', 'fourth', 'fifth'], dtype='object')
In [24]:
s3 = s3.cat.remove_unused_categories()
In [25]:
s3
Out[25]:
0     first
1    second
2     third
3    fourth
4     first
5     third
dtype: category
Categories (4, object): [first < second < third < fourth]

With datetime index

In [26]:
n = 10
ts = pd.date_range('today', periods=n, freq='D')
In [27]:
ts
Out[27]:
DatetimeIndex(['2019-04-22 13:30:42.299071', '2019-04-23 13:30:42.299071',
               '2019-04-24 13:30:42.299071', '2019-04-25 13:30:42.299071',
               '2019-04-26 13:30:42.299071', '2019-04-27 13:30:42.299071',
               '2019-04-28 13:30:42.299071', '2019-04-29 13:30:42.299071',
               '2019-04-30 13:30:42.299071', '2019-05-01 13:30:42.299071'],
              dtype='datetime64[ns]', freq='D')
In [28]:
s4 = pd.Series(np.random.randn(n), index=ts)
In [29]:
s4
Out[29]:
2019-04-22 13:30:42.299071    1.521361
2019-04-23 13:30:42.299071   -0.048250
2019-04-24 13:30:42.299071    1.595301
2019-04-25 13:30:42.299071   -1.783094
2019-04-26 13:30:42.299071   -0.286451
2019-04-27 13:30:42.299071   -1.041969
2019-04-28 13:30:42.299071   -1.479645
2019-04-29 13:30:42.299071    1.071282
2019-04-30 13:30:42.299071    0.807485
2019-05-01 13:30:42.299071   -1.058267
Freq: D, dtype: float64
In [30]:
s4.index.weekday_name
Out[30]:
Index(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'Sunday', 'Monday', 'Tuesday', 'Wednesday'],
      dtype='object')
In [31]:
s4.plot(style='-o')
pass
notebook/../../build/doctrees/nbsphinx/notebook_S04A_Using_Pandas_Simple_Annotated_35_0.png

With datetime values

In [32]:
s5 = pd.Series(pd.date_range('today', periods=n, freq='23H'), name='visits')
In [33]:
s5
Out[33]:
0   2019-04-22 13:30:42.667354
1   2019-04-23 12:30:42.667354
2   2019-04-24 11:30:42.667354
3   2019-04-25 10:30:42.667354
4   2019-04-26 09:30:42.667354
5   2019-04-27 08:30:42.667354
6   2019-04-28 07:30:42.667354
7   2019-04-29 06:30:42.667354
8   2019-04-30 05:30:42.667354
9   2019-05-01 04:30:42.667354
Name: visits, dtype: datetime64[ns]
In [34]:
s5.dt.hour
Out[34]:
0    13
1    12
2    11
3    10
4     9
5     8
6     7
7     6
8     5
9     4
Name: visits, dtype: int64
In [35]:
s5.dt.weekday_name
Out[35]:
0       Monday
1      Tuesday
2    Wednesday
3     Thursday
4       Friday
5     Saturday
6       Sunday
7       Monday
8      Tuesday
9    Wednesday
Name: visits, dtype: object

Data Frames

In [36]:
df = pd.read_csv('https://bit.ly/2RIw7Ig', index_col=0)

Basic operations

In [37]:
df.columns
Out[37]:
Index(['sex', 'weight', 'height', 'repwt', 'repht'], dtype='object')
In [38]:
df.index
Out[38]:
Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            191, 192, 193, 194, 195, 196, 197, 198, 199, 200],
           dtype='int64', length=200)
In [39]:
df.head(3)
Out[39]:
sex weight height repwt repht
1 M 77 182 77.0 180.0
2 F 58 161 51.0 159.0
3 F 53 161 54.0 158.0
In [40]:
df.tail(3)
Out[40]:
sex weight height repwt repht
198 M 81 175 NaN NaN
199 M 90 181 91.0 178.0
200 M 79 177 81.0 178.0
In [41]:
df.sample(3)
Out[41]:
sex weight height repwt repht
189 M 76 183 75.0 180.0
58 M 73 183 74.0 180.0
72 M 66 173 66.0 175.0
In [42]:
df.shape
Out[42]:
(200, 5)
In [43]:
df.dtypes
Out[43]:
sex        object
weight      int64
height      int64
repwt     float64
repht     float64
dtype: object
In [44]:
df.describe()
Out[44]:
weight height repwt repht
count 200.000000 200.000000 183.000000 183.000000
mean 65.800000 170.020000 65.622951 168.497268
std 15.095009 12.007937 13.776669 9.467048
min 39.000000 57.000000 41.000000 148.000000
25% 55.000000 164.000000 55.000000 160.500000
50% 63.000000 169.500000 63.000000 168.000000
75% 74.000000 177.250000 73.500000 175.000000
max 166.000000 197.000000 124.000000 200.000000
In [45]:
df.weight.head(3)
Out[45]:
1    77
2    58
3    53
Name: weight, dtype: int64
In [46]:
df['weight'].head(3)
Out[46]:
1    77
2    58
3    53
Name: weight, dtype: int64
In [47]:
df[['weight', 'height']].head(3)
Out[47]:
weight height
1 77 182
2 58 161
3 53 161
In [48]:
df[1:3]
Out[48]:
sex weight height repwt repht
2 F 58 161 51.0 159.0
3 F 53 161 54.0 158.0
In [49]:
df.iloc[1:3, 1:4]
Out[49]:
weight height repwt
2 58 161 51.0
3 53 161 54.0
In [50]:
df.loc[[1,4,7], ['weight', 'height']]
Out[50]:
weight height
1 77 182
4 68 177
7 76 167
In [51]:
df.iloc[1:3]
Out[51]:
sex weight height repwt repht
2 F 58 161 51.0 159.0
3 F 53 161 54.0 158.0
In [52]:
df.loc[1:3]
Out[52]:
sex weight height repwt repht
1 M 77 182 77.0 180.0
2 F 58 161 51.0 159.0
3 F 53 161 54.0 158.0

dplyr type operations

In [53]:
df.filter(regex=r'^r', axis=1).head(3)
Out[53]:
repwt repht
1 77.0 180.0
2 51.0 159.0
3 54.0 158.0
In [54]:
df.loc[1:3, df.columns.str.startswith('r')]
Out[54]:
repwt repht
1 77.0 180.0
2 51.0 159.0
3 54.0 158.0
In [55]:
cols = [c for c in df.columns if c.startswith('r')]
df[cols].head(3)
Out[55]:
repwt repht
1 77.0 180.0
2 51.0 159.0
3 54.0 158.0
In [56]:
df.loc[df.repwt > df.weight].head(3)
Out[56]:
sex weight height repwt repht
3 F 53 161 54.0 158.0
4 M 68 177 70.0 175.0
7 M 76 167 77.0 165.0
In [57]:
df.sort_values(['weight', 'height'], ascending=[False, False]).head(3)
Out[57]:
sex weight height repwt repht
12 F 166 57 56.0 163.0
21 M 119 180 124.0 178.0
97 M 103 185 101.0 182.0
In [58]:
df.nlargest(3, ['weight', 'height'])
Out[58]:
sex weight height repwt repht
12 F 166 57 56.0 163.0
21 M 119 180 124.0 178.0
97 M 103 185 101.0 182.0
In [59]:
df.assign(bmi = lambda x: x.weight/((x.height/100)**2)).head(3)
Out[59]:
sex weight height repwt repht bmi
1 M 77 182 77.0 180.0 23.245985
2 F 58 161 51.0 159.0 22.375680
3 F 53 161 54.0 158.0 20.446742
In [60]:
df['bmi'] = df['weight']/((df['height']/100)**2)
In [61]:
df.head(3)
Out[61]:
sex weight height repwt repht bmi
1 M 77 182 77.0 180.0 23.245985
2 F 58 161 51.0 159.0 22.375680
3 F 53 161 54.0 158.0 20.446742
In [62]:
df.mean()
Out[62]:
weight     65.800000
height    170.020000
repwt      65.622951
repht     168.497268
bmi        24.700956
dtype: float64
In [63]:
df.median()
Out[63]:
weight     63.000000
height    169.500000
repwt      63.000000
repht     168.000000
bmi        21.837154
dtype: float64
In [64]:
df.groupby('sex').mean()
Out[64]:
weight height repwt repht bmi
sex
F 57.866071 163.741071 56.742574 162.198020 25.331053
M 75.897727 178.011364 76.560976 176.256098 23.899014
In [65]:
df.groupby('sex')[['weight', 'height']].mean()
Out[65]:
weight height
sex
F 57.866071 163.741071
M 75.897727 178.011364
In [66]:
df.groupby('sex')[['weight', 'height']].agg(['mean', 'std'])
Out[66]:
weight height
mean std mean std
sex
F 57.866071 12.383144 163.741071 11.643925
M 75.897727 11.890342 178.011364 6.440701
In [67]:
df.groupby('sex').agg({'weight': ['min', 'max'], 'height': ['mean', 'std']})
Out[67]:
weight height
min max mean std
sex
F 39 166 163.741071 11.643925
M 54 119 178.011364 6.440701

tidyr type operations

In [68]:
df = pd.DataFrame(dict(name = ['clark kent', 'bruce wayne', 'diana prince'],
                       visit1 = [23,34,45],
                       visit2 = [25, 40, 54])
                 )
In [69]:
df
Out[69]:
name visit1 visit2
0 clark kent 23 25
1 bruce wayne 34 40
2 diana prince 45 54
In [70]:
df_tall = pd.melt(df, id_vars='name')
In [71]:
df_tall
Out[71]:
name variable value
0 clark kent visit1 23
1 bruce wayne visit1 34
2 diana prince visit1 45
3 clark kent visit2 25
4 bruce wayne visit2 40
5 diana prince visit2 54
In [72]:
df_tall.pivot(index='name', columns='variable', values='value')
Out[72]:
variable visit1 visit2
name
bruce wayne 34 40
clark kent 23 25
diana prince 45 54
In [73]:
df[['first', 'last']] = df.name.str.split(expand=True)
In [74]:
df.head()
Out[74]:
name visit1 visit2 first last
0 clark kent 23 25 clark kent
1 bruce wayne 34 40 bruce wayne
2 diana prince 45 54 diana prince
In [75]:
df = df.drop('name', axis=1)
df
Out[75]:
visit1 visit2 first last
0 23 25 clark kent
1 34 40 bruce wayne
2 45 54 diana prince
In [76]:
df['name'] = df[['first', 'last']].apply(lambda x: ' '.join(x), axis=1)
In [77]:
df.head()
Out[77]:
visit1 visit2 first last name
0 23 25 clark kent clark kent
1 34 40 bruce wayne bruce wayne
2 45 54 diana prince diana prince
In [78]:
df = df.drop(['first', 'last'], axis=1)
In [79]:
df
Out[79]:
visit1 visit2 name
0 23 25 clark kent
1 34 40 bruce wayne
2 45 54 diana prince
In [80]:
df[['first', 'last']] = df.name.str.split(expand=True)
In [81]:
df1 = df.drop('name', axis=1)
In [82]:
df1
Out[82]:
visit1 visit2 first last
0 23 25 clark kent
1 34 40 bruce wayne
2 45 54 diana prince
In [83]:
df2 = df1.copy()
df2 = df2.rename({'visit1': 'visit3', 'visit2': 'visit4'}, axis=1)
df2 = df2.drop(0, axis=0)
df2.iloc[:, :2] *= 2
df2.loc[3] = [11, 23, 'arthur',  'curry']
df2
Out[83]:
visit3 visit4 first last
1 68 80 bruce wayne
2 90 108 diana prince
3 11 23 arthur curry
In [84]:
pd.merge(df1, df2)
Out[84]:
visit1 visit2 first last visit3 visit4
0 34 40 bruce wayne 68 80
1 45 54 diana prince 90 108
In [85]:
pd.merge(df1, df2, on=['first', 'last'])
Out[85]:
visit1 visit2 first last visit3 visit4
0 34 40 bruce wayne 68 80
1 45 54 diana prince 90 108
In [86]:
pd.merge(df1, df2, left_on=['first', 'last'], right_on=['first', 'last'])
Out[86]:
visit1 visit2 first last visit3 visit4
0 34 40 bruce wayne 68 80
1 45 54 diana prince 90 108
In [87]:
pd.merge(df1, df2, on=['first', 'last'], how='left')
Out[87]:
visit1 visit2 first last visit3 visit4
0 23 25 clark kent NaN NaN
1 34 40 bruce wayne 68.0 80.0
2 45 54 diana prince 90.0 108.0
In [88]:
pd.merge(df1, df2, on=['first', 'last'], how='right')
Out[88]:
visit1 visit2 first last visit3 visit4
0 34.0 40.0 bruce wayne 68 80
1 45.0 54.0 diana prince 90 108
2 NaN NaN arthur curry 11 23
In [89]:
pd.merge(df1, df2, on=['first', 'last'], how='outer')
Out[89]:
visit1 visit2 first last visit3 visit4
0 23.0 25.0 clark kent NaN NaN
1 34.0 40.0 bruce wayne 68.0 80.0
2 45.0 54.0 diana prince 90.0 108.0
3 NaN NaN arthur curry 11.0 23.0
In [90]:
df1.append(df1)
Out[90]:
visit1 visit2 first last
0 23 25 clark kent
1 34 40 bruce wayne
2 45 54 diana prince
0 23 25 clark kent
1 34 40 bruce wayne
2 45 54 diana prince
In [91]:
df1.append(df2, sort=False)
Out[91]:
visit1 visit2 first last visit3 visit4
0 23.0 25.0 clark kent NaN NaN
1 34.0 40.0 bruce wayne NaN NaN
2 45.0 54.0 diana prince NaN NaN
1 NaN NaN bruce wayne 68.0 80.0
2 NaN NaN diana prince 90.0 108.0
3 NaN NaN arthur curry 11.0 23.0
In [92]:
df3 = df1.append(df2, sort=False)
In [93]:
df3.reset_index(drop=True)
Out[93]:
visit1 visit2 first last visit3 visit4
0 23.0 25.0 clark kent NaN NaN
1 34.0 40.0 bruce wayne NaN NaN
2 45.0 54.0 diana prince NaN NaN
3 NaN NaN bruce wayne 68.0 80.0
4 NaN NaN diana prince 90.0 108.0
5 NaN NaN arthur curry 11.0 23.0
In [94]:
df4 = df1.set_index(['first', 'last'])
df4
Out[94]:
visit1 visit2
first last
clark kent 23 25
bruce wayne 34 40
diana prince 45 54
In [95]:
df5 = df2.set_index(['first', 'last'])
df5
Out[95]:
visit3 visit4
first last
bruce wayne 68 80
diana prince 90 108
arthur curry 11 23
In [96]:
df6 = df5.copy()
df6 = df6.rename({'visit3': 'visit5', 'visit4': 'visit6'}, axis=1)
df6 = df6 - 10
df6
Out[96]:
visit5 visit6
first last
bruce wayne 58 70
diana prince 80 98
arthur curry 1 13
In [97]:
df4.join([df5, df6])
Out[97]:
visit1 visit2 visit3 visit4 visit5 visit6
first last
clark kent 23 25 NaN NaN NaN NaN
bruce wayne 34 40 68.0 80.0 58.0 70.0
diana prince 45 54 90.0 108.0 80.0 98.0
In [98]:
df4.join([df5, df6], how='outer')
Out[98]:
visit1 visit2 visit3 visit4 visit5 visit6
first last
arthur curry NaN NaN 11.0 23.0 1.0 13.0
bruce wayne 34.0 40.0 68.0 80.0 58.0 70.0
clark kent 23.0 25.0 NaN NaN NaN NaN
diana prince 45.0 54.0 90.0 108.0 80.0 98.0
In [ ]: