In [1]:

%matplotlib inline

In [2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:

np.random.seed(123)

Series¶

types
index
values
string methods
cat methods
datetime methods

Numeric¶

In [4]:

s1 = pd.Series(np.random.randint(0, 10, 5), name='n')

In [5]:

s1

Out[5]:

  2
  2
  6
  1
  3
Name: n, dtype: int64

In [6]:

s1.shape

Out[6]:

(5,)

In [7]:

s1.index

Out[7]:

RangeIndex(start=0, stop=5, step=1)

In [8]:

s1.values

Out[8]:

array([2, 2, 6, 1, 3])

In [9]:

s1[1:4]

Out[9]:

  2
  6
  1
Name: n, dtype: int64

In [10]:

s1 *= 2

In [11]:

s1

Out[11]:

   4
   4
  12
   2
   6
Name: n, dtype: int64

Strings¶

In [12]:

s2 = pd.Series(['duke-bugsy-math', 'duke-flopsy-stats', 'unc-scooby-lit', 'unc-scooby-stats'], name='rhyme')

In [13]:

s2

Out[13]:

    duke-bugsy-math
  duke-flopsy-stats
     unc-scooby-lit
   unc-scooby-stats
Name: rhyme, dtype: object

In [14]:

s2.str.split('-', expand=True)

Out[14]:

	0	1	2
0	duke	bugsy	math
1	duke	flopsy	stats
2	unc	scooby	lit
3	unc	scooby	stats

In [15]:

s2.str.split('-').str[1]

Out[15]:

   bugsy
  flopsy
  scooby
  scooby
Name: rhyme, dtype: object

Categories (aka “factors”)¶

In [16]:

s3 = pd.Series(['first', 'second', 'third', 'fourth', 'first', 'third'])

In [17]:

s3 = s3.astype('category')

In [18]:

s3

Out[18]:

   first
  second
   third
  fourth
   first
   third
dtype: category
Categories (4, object): [first, fourth, second, third]

In [19]:

s3 = s3.cat.set_categories(['first', 'second', 'third', 'fourth', 'fifth'], ordered=True)

In [20]:

s3

Out[20]:

   first
  second
   third
  fourth
   first
   third
dtype: category
Categories (5, object): [first < second < third < fourth < fifth]

In [21]:

s3.sort_values()

Out[21]:

   first
   first
  second
   third
   third
  fourth
dtype: category
Categories (5, object): [first < second < third < fourth < fifth]

In [22]:

s3.cat.codes

Out[22]:

  0
  1
  2
  3
  0
  2
dtype: int8

In [23]:

s3.cat.categories

Out[23]:

Index(['first', 'second', 'third', 'fourth', 'fifth'], dtype='object')

In [24]:

s3 = s3.cat.remove_unused_categories()

In [25]:

s3

Out[25]:

   first
  second
   third
  fourth
   first
   third
dtype: category
Categories (4, object): [first < second < third < fourth]

With datetime index¶

In [26]:

n = 10
ts = pd.date_range('today', periods=n, freq='D')

In [27]:

ts

Out[27]:

DatetimeIndex(['2019-04-22 13:30:42.299071', '2019-04-23 13:30:42.299071',
               '2019-04-24 13:30:42.299071', '2019-04-25 13:30:42.299071',
               '2019-04-26 13:30:42.299071', '2019-04-27 13:30:42.299071',
               '2019-04-28 13:30:42.299071', '2019-04-29 13:30:42.299071',
               '2019-04-30 13:30:42.299071', '2019-05-01 13:30:42.299071'],
              dtype='datetime64[ns]', freq='D')

In [28]:

s4 = pd.Series(np.random.randn(n), index=ts)

In [29]:

s4

Out[29]:

2019-04-22 13:30:42.299071    1.521361
2019-04-23 13:30:42.299071   -0.048250
2019-04-24 13:30:42.299071    1.595301
2019-04-25 13:30:42.299071   -1.783094
2019-04-26 13:30:42.299071   -0.286451
2019-04-27 13:30:42.299071   -1.041969
2019-04-28 13:30:42.299071   -1.479645
2019-04-29 13:30:42.299071    1.071282
2019-04-30 13:30:42.299071    0.807485
2019-05-01 13:30:42.299071   -1.058267
Freq: D, dtype: float64

In [30]:

s4.index.weekday_name

Out[30]:

Index(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'Sunday', 'Monday', 'Tuesday', 'Wednesday'],
      dtype='object')

In [31]:

s4.plot(style='-o')
pass

notebook/../../build/doctrees/nbsphinx/notebook_S04A_Using_Pandas_Simple_Annotated_35_0.png

With datetime values¶

In [32]:

s5 = pd.Series(pd.date_range('today', periods=n, freq='23H'), name='visits')

In [33]:

s5

Out[33]:

 2019-04-22 13:30:42.667354
 2019-04-23 12:30:42.667354
 2019-04-24 11:30:42.667354
 2019-04-25 10:30:42.667354
 2019-04-26 09:30:42.667354
 2019-04-27 08:30:42.667354
 2019-04-28 07:30:42.667354
 2019-04-29 06:30:42.667354
 2019-04-30 05:30:42.667354
 2019-05-01 04:30:42.667354
Name: visits, dtype: datetime64[ns]

In [34]:

s5.dt.hour

Out[34]:

  13
  12
  11
  10
   9
   8
   7
   6
   5
   4
Name: visits, dtype: int64

In [35]:

s5.dt.weekday_name

Out[35]:

     Monday
    Tuesday
  Wednesday
   Thursday
     Friday
   Saturday
     Sunday
     Monday
    Tuesday
  Wednesday
Name: visits, dtype: object

Data Frames¶

In [36]:

df = pd.read_csv('https://bit.ly/2RIw7Ig', index_col=0)

Basic operations¶

In [37]:

df.columns

Out[37]:

Index(['sex', 'weight', 'height', 'repwt', 'repht'], dtype='object')

In [38]:

df.index

Out[38]:

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            191, 192, 193, 194, 195, 196, 197, 198, 199, 200],
           dtype='int64', length=200)

In [39]:

df.head(3)

Out[39]:

	sex	weight	height	repwt	repht
1	M	77	182	77.0	180.0
2	F	58	161	51.0	159.0
3	F	53	161	54.0	158.0

In [40]:

df.tail(3)

Out[40]:

	sex	weight	height	repwt	repht
198	M	81	175	NaN	NaN
199	M	90	181	91.0	178.0
200	M	79	177	81.0	178.0

In [41]:

df.sample(3)

Out[41]:

	sex	weight	height	repwt	repht
189	M	76	183	75.0	180.0
58	M	73	183	74.0	180.0
72	M	66	173	66.0	175.0

In [42]:

df.shape

Out[42]:

(200, 5)

In [43]:

df.dtypes

Out[43]:

sex        object
weight      int64
height      int64
repwt     float64
repht     float64
dtype: object

In [44]:

df.describe()

Out[44]:

	weight	height	repwt	repht
count	200.000000	200.000000	183.000000	183.000000
mean	65.800000	170.020000	65.622951	168.497268
std	15.095009	12.007937	13.776669	9.467048
min	39.000000	57.000000	41.000000	148.000000
25%	55.000000	164.000000	55.000000	160.500000
50%	63.000000	169.500000	63.000000	168.000000
75%	74.000000	177.250000	73.500000	175.000000
max	166.000000	197.000000	124.000000	200.000000

In [45]:

df.weight.head(3)

Out[45]:

  77
  58
  53
Name: weight, dtype: int64

In [46]:

df['weight'].head(3)

Out[46]:

  77
  58
  53
Name: weight, dtype: int64

In [47]:

df[['weight', 'height']].head(3)

Out[47]:

	weight	height
1	77	182
2	58	161
3	53	161

In [48]:

df[1:3]

Out[48]:

	sex	weight	height	repwt	repht
2	F	58	161	51.0	159.0
3	F	53	161	54.0	158.0

In [49]:

df.iloc[1:3, 1:4]

Out[49]:

	weight	height	repwt
2	58	161	51.0
3	53	161	54.0

In [50]:

df.loc[[1,4,7], ['weight', 'height']]

Out[50]:

	weight	height
1	77	182
4	68	177
7	76	167

In [51]:

df.iloc[1:3]

Out[51]:

	sex	weight	height	repwt	repht
2	F	58	161	51.0	159.0
3	F	53	161	54.0	158.0

In [52]:

df.loc[1:3]

Out[52]:

	sex	weight	height	repwt	repht
1	M	77	182	77.0	180.0
2	F	58	161	51.0	159.0
3	F	53	161	54.0	158.0

`dplyr` type operations¶

In [53]:

df.filter(regex=r'^r', axis=1).head(3)

Out[53]:

	repwt	repht
1	77.0	180.0
2	51.0	159.0
3	54.0	158.0

In [54]:

df.loc[1:3, df.columns.str.startswith('r')]

Out[54]:

	repwt	repht
1	77.0	180.0
2	51.0	159.0
3	54.0	158.0

In [55]:

cols = [c for c in df.columns if c.startswith('r')]
df[cols].head(3)

Out[55]:

	repwt	repht
1	77.0	180.0
2	51.0	159.0
3	54.0	158.0

In [56]:

df.loc[df.repwt > df.weight].head(3)

Out[56]:

	sex	weight	height	repwt	repht
3	F	53	161	54.0	158.0
4	M	68	177	70.0	175.0
7	M	76	167	77.0	165.0

In [57]:

df.sort_values(['weight', 'height'], ascending=[False, False]).head(3)

Out[57]:

	sex	weight	height	repwt	repht
12	F	166	57	56.0	163.0
21	M	119	180	124.0	178.0
97	M	103	185	101.0	182.0

In [58]:

df.nlargest(3, ['weight', 'height'])

Out[58]:

	sex	weight	height	repwt	repht
12	F	166	57	56.0	163.0
21	M	119	180	124.0	178.0
97	M	103	185	101.0	182.0

In [59]:

df.assign(bmi = lambda x: x.weight/((x.height/100)**2)).head(3)

Out[59]:

	sex	weight	height	repwt	repht	bmi
1	M	77	182	77.0	180.0	23.245985
2	F	58	161	51.0	159.0	22.375680
3	F	53	161	54.0	158.0	20.446742

In [60]:

df['bmi'] = df['weight']/((df['height']/100)**2)

In [61]:

df.head(3)

Out[61]:

	sex	weight	height	repwt	repht	bmi
1	M	77	182	77.0	180.0	23.245985
2	F	58	161	51.0	159.0	22.375680
3	F	53	161	54.0	158.0	20.446742

In [62]:

df.mean()

Out[62]:

weight     65.800000
height    170.020000
repwt      65.622951
repht     168.497268
bmi        24.700956
dtype: float64

In [63]:

df.median()

Out[63]:

weight     63.000000
height    169.500000
repwt      63.000000
repht     168.000000
bmi        21.837154
dtype: float64

In [64]:

df.groupby('sex').mean()

Out[64]:

	weight	height	repwt	repht	bmi
sex
F	57.866071	163.741071	56.742574	162.198020	25.331053
M	75.897727	178.011364	76.560976	176.256098	23.899014

In [65]:

df.groupby('sex')[['weight', 'height']].mean()

Out[65]:

	weight	height
sex
F	57.866071	163.741071
M	75.897727	178.011364

In [66]:

df.groupby('sex')[['weight', 'height']].agg(['mean', 'std'])

Out[66]:

	weight		height
	mean	std	mean	std
sex
F	57.866071	12.383144	163.741071	11.643925
M	75.897727	11.890342	178.011364	6.440701

In [67]:

df.groupby('sex').agg({'weight': ['min', 'max'], 'height': ['mean', 'std']})

Out[67]:

	weight		height
	min	max	mean	std
sex
F	39	166	163.741071	11.643925
M	54	119	178.011364	6.440701

`tidyr` type operations¶

In [68]:

df = pd.DataFrame(dict(name = ['clark kent', 'bruce wayne', 'diana prince'],
                       visit1 = [23,34,45],
                       visit2 = [25, 40, 54])
                 )

In [69]:

df

Out[69]:

	name	visit1	visit2
0	clark kent	23	25
1	bruce wayne	34	40
2	diana prince	45	54

In [70]:

df_tall = pd.melt(df, id_vars='name')

In [71]:

df_tall

Out[71]:

	name	variable	value
0	clark kent	visit1	23
1	bruce wayne	visit1	34
2	diana prince	visit1	45
3	clark kent	visit2	25
4	bruce wayne	visit2	40
5	diana prince	visit2	54

In [72]:

df_tall.pivot(index='name', columns='variable', values='value')

Out[72]:

variable	visit1	visit2
name
bruce wayne	34	40
clark kent	23	25
diana prince	45	54

In [73]:

df[['first', 'last']] = df.name.str.split(expand=True)

In [74]:

df.head()

Out[74]:

	name	visit1	visit2	first	last
0	clark kent	23	25	clark	kent
1	bruce wayne	34	40	bruce	wayne
2	diana prince	45	54	diana	prince

In [75]:

df = df.drop('name', axis=1)
df

Out[75]:

	visit1	visit2	first	last
0	23	25	clark	kent
1	34	40	bruce	wayne
2	45	54	diana	prince

In [76]:

df['name'] = df[['first', 'last']].apply(lambda x: ' '.join(x), axis=1)

In [77]:

df.head()

Out[77]:

	visit1	visit2	first	last	name
0	23	25	clark	kent	clark kent
1	34	40	bruce	wayne	bruce wayne
2	45	54	diana	prince	diana prince

In [78]:

df = df.drop(['first', 'last'], axis=1)

In [79]:

df

Out[79]:

	visit1	visit2	name
0	23	25	clark kent
1	34	40	bruce wayne
2	45	54	diana prince

In [80]:

df[['first', 'last']] = df.name.str.split(expand=True)

In [81]:

df1 = df.drop('name', axis=1)

In [82]:

df1

Out[82]:

	visit1	visit2	first	last
0	23	25	clark	kent
1	34	40	bruce	wayne
2	45	54	diana	prince

In [83]:

df2 = df1.copy()
df2 = df2.rename({'visit1': 'visit3', 'visit2': 'visit4'}, axis=1)
df2 = df2.drop(0, axis=0)
df2.iloc[:, :2] *= 2
df2.loc[3] = [11, 23, 'arthur',  'curry']
df2

Out[83]:

	visit3	visit4	first	last
1	68	80	bruce	wayne
2	90	108	diana	prince
3	11	23	arthur	curry

In [84]:

pd.merge(df1, df2)

Out[84]:

	visit1	visit2	first	last	visit3	visit4
0	34	40	bruce	wayne	68	80
1	45	54	diana	prince	90	108

In [85]:

pd.merge(df1, df2, on=['first', 'last'])

Out[85]:

	visit1	visit2	first	last	visit3	visit4
0	34	40	bruce	wayne	68	80
1	45	54	diana	prince	90	108

In [86]:

pd.merge(df1, df2, left_on=['first', 'last'], right_on=['first', 'last'])

Out[86]:

	visit1	visit2	first	last	visit3	visit4
0	34	40	bruce	wayne	68	80
1	45	54	diana	prince	90	108

In [87]:

pd.merge(df1, df2, on=['first', 'last'], how='left')

Out[87]:

	visit1	visit2	first	last	visit3	visit4
0	23	25	clark	kent	NaN	NaN
1	34	40	bruce	wayne	68.0	80.0
2	45	54	diana	prince	90.0	108.0

In [88]:

pd.merge(df1, df2, on=['first', 'last'], how='right')

Out[88]:

	visit1	visit2	first	last	visit3	visit4
0	34.0	40.0	bruce	wayne	68	80
1	45.0	54.0	diana	prince	90	108
2	NaN	NaN	arthur	curry	11	23

In [89]:

pd.merge(df1, df2, on=['first', 'last'], how='outer')

Out[89]:

	visit1	visit2	first	last	visit3	visit4
0	23.0	25.0	clark	kent	NaN	NaN
1	34.0	40.0	bruce	wayne	68.0	80.0
2	45.0	54.0	diana	prince	90.0	108.0
3	NaN	NaN	arthur	curry	11.0	23.0

In [90]:

df1.append(df1)

Out[90]:

	visit1	visit2	first	last
0	23	25	clark	kent
1	34	40	bruce	wayne
2	45	54	diana	prince
0	23	25	clark	kent
1	34	40	bruce	wayne
2	45	54	diana	prince

In [91]:

df1.append(df2, sort=False)

Out[91]:

	visit1	visit2	first	last	visit3	visit4
0	23.0	25.0	clark	kent	NaN	NaN
1	34.0	40.0	bruce	wayne	NaN	NaN
2	45.0	54.0	diana	prince	NaN	NaN
1	NaN	NaN	bruce	wayne	68.0	80.0
2	NaN	NaN	diana	prince	90.0	108.0
3	NaN	NaN	arthur	curry	11.0	23.0

In [92]:

df3 = df1.append(df2, sort=False)

In [93]:

df3.reset_index(drop=True)

Out[93]:

	visit1	visit2	first	last	visit3	visit4
0	23.0	25.0	clark	kent	NaN	NaN
1	34.0	40.0	bruce	wayne	NaN	NaN
2	45.0	54.0	diana	prince	NaN	NaN
3	NaN	NaN	bruce	wayne	68.0	80.0
4	NaN	NaN	diana	prince	90.0	108.0
5	NaN	NaN	arthur	curry	11.0	23.0

In [94]:

df4 = df1.set_index(['first', 'last'])
df4

Out[94]:

		visit1	visit2
first	last
clark	kent	23	25
bruce	wayne	34	40
diana	prince	45	54

In [95]:

df5 = df2.set_index(['first', 'last'])
df5

Out[95]:

		visit3	visit4
first	last
bruce	wayne	68	80
diana	prince	90	108
arthur	curry	11	23

In [96]:

df6 = df5.copy()
df6 = df6.rename({'visit3': 'visit5', 'visit4': 'visit6'}, axis=1)
df6 = df6 - 10
df6

Out[96]:

		visit5	visit6
first	last
bruce	wayne	58	70
diana	prince	80	98
arthur	curry	1	13

In [97]:

df4.join([df5, df6])

Out[97]:

		visit1	visit2	visit3	visit4	visit5	visit6
first	last
clark	kent	23	25	NaN	NaN	NaN	NaN
bruce	wayne	34	40	68.0	80.0	58.0	70.0
diana	prince	45	54	90.0	108.0	80.0	98.0

In [98]:

df4.join([df5, df6], how='outer')

Out[98]:

		visit1	visit2	visit3	visit4	visit5	visit6
first	last
arthur	curry	NaN	NaN	11.0	23.0	1.0	13.0
bruce	wayne	34.0	40.0	68.0	80.0	58.0	70.0
clark	kent	23.0	25.0	NaN	NaN	NaN	NaN
diana	prince	45.0	54.0	90.0	108.0	80.0	98.0

In [ ]:

Series¶

Numeric¶

Strings¶

Categories (aka “factors”)¶

With datetime index¶

With datetime values¶

Data Frames¶

Basic operations¶

`dplyr` type operations¶

`tidyr` type operations¶

Page contents

Previous page

Next page

This Page

Series¶

Numeric¶

Strings¶

Categories (aka “factors”)¶

With datetime index¶

With datetime values¶

Data Frames¶

Basic operations¶

dplyr type operations¶

tidyr type operations¶

`dplyr` type operations¶

`tidyr` type operations¶