In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%load_ext version_information
%load_ext rpy2.ipython
Combining and reshaping data¶
In this notebook, we introduce the pandas
DataFrame
, show how to
combine multiple data sets into a single DataFeame
. We also show how
to change the layout of a DataFrame
for more convenient analysis
without actually changing the content. Modifying the content will be
covered in the next session.
In [2]:
np.random.seed(123)
Series¶
Numeric data¶
In [3]:
age = pd.Series([23, 17, 22, 37, 42], name='years')
age
Out[3]:
0 23
1 17
2 22
3 37
4 42
Name: years, dtype: int64
In [4]:
type(age)
Out[4]:
pandas.core.series.Series
In [5]:
age.index
Out[5]:
RangeIndex(start=0, stop=5, step=1)
In [6]:
age.sort_values()
Out[6]:
1 17
2 22
0 23
3 37
4 42
Name: years, dtype: int64
In [7]:
age.nsmallest(3)
Out[7]:
1 17
2 22
0 23
Name: years, dtype: int64
In [8]:
age.values
Out[8]:
array([23, 17, 22, 37, 42])
String data¶
In [9]:
species = pd.Series(['mouse', 'mouse', 'human', 'human', 'mouse', 'mouse'])
In [10]:
species.sort_values()
Out[10]:
2 human
3 human
0 mouse
1 mouse
4 mouse
5 mouse
dtype: object
In [11]:
species.unique()
Out[11]:
array(['mouse', 'human'], dtype=object)
In [12]:
species.str.title()
Out[12]:
0 Mouse
1 Mouse
2 Human
3 Human
4 Mouse
5 Mouse
dtype: object
In [13]:
species.str[2:4]
Out[13]:
0 us
1 us
2 ma
3 ma
4 us
5 us
dtype: object
In [14]:
species.replace({'mosue': 'mus musculus', 'human': 'homo sapiens'})
Out[14]:
0 mouse
1 mouse
2 homo sapiens
3 homo sapiens
4 mouse
5 mouse
dtype: object
Cateogrical data¶
In [15]:
species = species.astype('category')
In [16]:
species.cat.codes
Out[16]:
0 1
1 1
2 0
3 0
4 1
5 1
dtype: int8
In [17]:
species.cat.categories
Out[17]:
Index(['human', 'mouse'], dtype='object')
DataFrame¶
Read CSV¶
In [18]:
iris_1 = pd.read_csv('data/iris.csv')
In [19]:
iris_1.head()
Out[19]:
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
Write CSV¶
In [20]:
iris_1.to_csv('data/iris_1.csv', index=False)
Read Excel¶
In [21]:
iris_2 = pd.read_excel('data/iris.xlsx')
In [22]:
iris_2.head()
Out[22]:
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
Write Excel¶
In [23]:
iris_2.to_excel('data/iris_2.xlsx', index=False)
Check files using Unix shell commands¶
In [24]:
! ls
Basic_Plots.ipynb _static
Customizing_Plots.ipynb _templates
Data_Manipulation.ipynb conf.py
Getting_Started_With_Graphics.ipynb data
Getting_Started_with_Python.ipynb figs
Introduction_to_Pandas.ipynb index.rst
Makefile make.bat
Scratch.ipynb schhedule.md
_build
In [25]:
! head -n 5 iris_1.csv | cat
head: iris_1.csv: No such file or directory
Combining data sets¶
Combining rows¶
In [26]:
df_versiocolor = pd.read_csv('data/versicolor.csv')
df_virginica = pd.read_csv('data/virginica.csv')
df_sertosa = pd.read_csv('data/setosa.csv')
dfs = [df_versiocolor, df_virginica, df_sertosa]
In [27]:
[df.shape for df in dfs]
Out[27]:
[(50, 5), (50, 5), (50, 5)]
Each DataFrame only contains data about one species¶
In [28]:
for df in dfs:
print(df.Species.unique())
['versicolor']
['virginica']
['setosa']
Combine with concat
¶
We first add a column containing the plant_id for each DataFrame so
that each row has a unique identifier (combination of Species and
plant_id) and then combine the data frames using concat
.
In [29]:
df = pd.concat(dfs)
df.shape
Out[29]:
(150, 5)
Combined DataFrame contains all 3 species¶
In [30]:
df.Species.unique()
Out[30]:
array(['versicolor', 'virginica', 'setosa'], dtype=object)
Combining columns¶
In [31]:
df_sepal = pd.read_csv('data/sepal.csv')
df_petal = pd.read_csv('data/petal.csv')
In [32]:
df_sepal.head(3)
Out[32]:
Species | Sepal.Length | Sepal.Width | |
---|---|---|---|
0 | setosa | 5.1 | 3.5 |
1 | setosa | 4.9 | 3.0 |
2 | setosa | 4.7 | 3.2 |
In [33]:
df_petal.head(3)
Out[33]:
Species | Petal.Length | Petal.Width | |
---|---|---|---|
0 | setosa | 1.4 | 0.2 |
1 | setosa | 1.4 | 0.2 |
2 | setosa | 1.3 | 0.2 |
In [34]:
df_sepal.shape, df_petal.shape
Out[34]:
((150, 3), (150, 3))
In [35]:
df_cols = pd.merge(df_sepal, df_petal, on = 'Species', left_index=True, right_index=True)
df_cols.shape
Out[35]:
(150, 5)
In [36]:
df_cols.head(3)
Out[36]:
Species | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | |
---|---|---|---|---|---|
0 | setosa | 5.1 | 3.5 | 1.4 | 0.2 |
1 | setosa | 4.9 | 3.0 | 1.4 | 0.2 |
2 | setosa | 4.7 | 3.2 | 1.3 | 0.2 |
Joininig on a single unique column¶
Combining values for the same subject across different messurements.
In [37]:
pid1 = np.random.choice(100, 6, replace=False)
pid1
Out[37]:
array([ 8, 70, 82, 28, 63, 0])
In [38]:
val1 = np.random.normal(10, 1, 6)
val1
Out[38]:
array([ 10.46843912, 9.16884502, 11.16220405, 8.90279695,
7.87689965, 11.03972709])
In [39]:
df1 = pd.DataFrame({'pid': pid1, 'val': val1})
df1
Out[39]:
pid | val | |
---|---|---|
0 | 8 | 10.468439 |
1 | 70 | 9.168845 |
2 | 82 | 11.162204 |
3 | 28 | 8.902797 |
4 | 63 | 7.876900 |
5 | 0 | 11.039727 |
In [40]:
pid2 = np.random.permutation(pid1)
pid2
Out[40]:
array([28, 82, 8, 70, 63, 0])
In [41]:
val2 = np.random.normal(15, 1, 6)
val2
Out[41]:
array([ 14.16248328, 13.39403724, 16.25523737, 14.31113102,
16.66095249, 15.80730819])
In [42]:
df2 = pd.DataFrame({'pid': pid2, 'val': val2})
df2
Out[42]:
pid | val | |
---|---|---|
0 | 28 | 14.162483 |
1 | 82 | 13.394037 |
2 | 8 | 16.255237 |
3 | 70 | 14.311131 |
4 | 63 | 16.660952 |
5 | 0 | 15.807308 |
In [43]:
pd.merge(df1, df2, on='pid', suffixes=['_visit_1', '_visit_2'])
Out[43]:
pid | val_visit_1 | val_visit_2 | |
---|---|---|---|
0 | 8 | 10.468439 | 16.255237 |
1 | 70 | 9.168845 | 14.311131 |
2 | 82 | 11.162204 | 13.394037 |
3 | 28 | 8.902797 | 14.162483 |
4 | 63 | 7.876900 | 16.660952 |
5 | 0 | 11.039727 | 15.807308 |
Joining on two unique columns¶
In [44]:
df1['stim'] = np.random.choice(['cmv', 'flu'], 6, replace=True)
df1 = df1[['pid', 'stim', 'val']]
df1
Out[44]:
pid | stim | val | |
---|---|---|---|
0 | 8 | cmv | 10.468439 |
1 | 70 | flu | 9.168845 |
2 | 82 | cmv | 11.162204 |
3 | 28 | cmv | 8.902797 |
4 | 63 | cmv | 7.876900 |
5 | 0 | cmv | 11.039727 |
In [45]:
df2['stim'] = np.random.choice(['cmv', 'flu'], 6, replace=True)
df2 = df2[['pid', 'stim', 'val']]
df2
Out[45]:
pid | stim | val | |
---|---|---|---|
0 | 28 | flu | 14.162483 |
1 | 82 | cmv | 13.394037 |
2 | 8 | flu | 16.255237 |
3 | 70 | flu | 14.311131 |
4 | 63 | cmv | 16.660952 |
5 | 0 | cmv | 15.807308 |
In [46]:
pd.merge(df1, df2, on = ['pid', 'stim'], suffixes = ['_visit_1', '_visit_2'])
Out[46]:
pid | stim | val_visit_1 | val_visit_2 | |
---|---|---|---|---|
0 | 70 | flu | 9.168845 | 14.311131 |
1 | 82 | cmv | 11.162204 | 13.394037 |
2 | 63 | cmv | 7.876900 | 16.660952 |
3 | 0 | cmv | 11.039727 | 15.807308 |
In [47]:
pd.merge(df1, df2, on = ['pid', 'stim'], how = 'left', suffixes = ['_visit_1', '_visit_2'])
Out[47]:
pid | stim | val_visit_1 | val_visit_2 | |
---|---|---|---|---|
0 | 8 | cmv | 10.468439 | NaN |
1 | 70 | flu | 9.168845 | 14.311131 |
2 | 82 | cmv | 11.162204 | 13.394037 |
3 | 28 | cmv | 8.902797 | NaN |
4 | 63 | cmv | 7.876900 | 16.660952 |
5 | 0 | cmv | 11.039727 | 15.807308 |
In [48]:
pd.merge(df1, df2, on = ['pid', 'stim'], how = 'right', suffixes = ['_visit_1', '_visit_2'])
Out[48]:
pid | stim | val_visit_1 | val_visit_2 | |
---|---|---|---|---|
0 | 70.0 | flu | 9.168845 | 14.311131 |
1 | 82.0 | cmv | 11.162204 | 13.394037 |
2 | 63.0 | cmv | 7.876900 | 16.660952 |
3 | 0.0 | cmv | 11.039727 | 15.807308 |
4 | 28.0 | flu | NaN | 14.162483 |
5 | 8.0 | flu | NaN | 16.255237 |
In [49]:
pd.merge(df1, df2, on = ['pid', 'stim'], how = 'outer', suffixes = ['_visit_1', '_visit_2'])
Out[49]:
pid | stim | val_visit_1 | val_visit_2 | |
---|---|---|---|---|
0 | 8.0 | cmv | 10.468439 | NaN |
1 | 70.0 | flu | 9.168845 | 14.311131 |
2 | 82.0 | cmv | 11.162204 | 13.394037 |
3 | 28.0 | cmv | 8.902797 | NaN |
4 | 63.0 | cmv | 7.876900 | 16.660952 |
5 | 0.0 | cmv | 11.039727 | 15.807308 |
6 | 28.0 | flu | NaN | 14.162483 |
7 | 8.0 | flu | NaN | 16.255237 |
Separate multiple values in a single column¶
In [50]:
from collections import OrderedDict
In [51]:
d = OrderedDict()
d['pid-visit-stim'] = ['1-1-cmv', '1-1-hiv', '1-2-cmv', '1-2-hiv', '1-3-cmv', '1-3-hiv', '2-1-cmv', '2-1-hiv', '2-2-cmv', '2-2-hiv']
d['tnf'] = [1.0, 2.0, 1.1, 2.1, 1.2, 2.2, 3, 4, 3.1, 4.1]
d['ifn'] = [11.0, 12.0, 11.1, 12.1, 11.2, 12.2, 13, 14, 13.1, 14.1]
d['il2'] = [0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1]
df = pd.DataFrame(d)
In [52]:
df
Out[52]:
pid-visit-stim | tnf | ifn | il2 | |
---|---|---|---|---|
0 | 1-1-cmv | 1.0 | 11.0 | 0.0 |
1 | 1-1-hiv | 2.0 | 12.0 | 0.0 |
2 | 1-2-cmv | 1.1 | 11.1 | 0.1 |
3 | 1-2-hiv | 2.1 | 12.1 | 0.1 |
4 | 1-3-cmv | 1.2 | 11.2 | 0.2 |
5 | 1-3-hiv | 2.2 | 12.2 | 0.2 |
6 | 2-1-cmv | 3.0 | 13.0 | 0.1 |
7 | 2-1-hiv | 4.0 | 14.0 | 0.3 |
8 | 2-2-cmv | 3.1 | 13.1 | 0.1 |
9 | 2-2-hiv | 4.1 | 14.1 | 0.1 |
In [53]:
df1 = pd.DataFrame(df['pid-visit-stim'].str.split('-').tolist(),
columns = ['pid', 'visit', 'stim'])
In [54]:
df1 = pd.concat([df1, df], axis=1)
In [55]:
df1.drop('pid-visit-stim', axis=1)
Out[55]:
pid | visit | stim | tnf | ifn | il2 | |
---|---|---|---|---|---|---|
0 | 1 | 1 | cmv | 1.0 | 11.0 | 0.0 |
1 | 1 | 1 | hiv | 2.0 | 12.0 | 0.0 |
2 | 1 | 2 | cmv | 1.1 | 11.1 | 0.1 |
3 | 1 | 2 | hiv | 2.1 | 12.1 | 0.1 |
4 | 1 | 3 | cmv | 1.2 | 11.2 | 0.2 |
5 | 1 | 3 | hiv | 2.2 | 12.2 | 0.2 |
6 | 2 | 1 | cmv | 3.0 | 13.0 | 0.1 |
7 | 2 | 1 | hiv | 4.0 | 14.0 | 0.3 |
8 | 2 | 2 | cmv | 3.1 | 13.1 | 0.1 |
9 | 2 | 2 | hiv | 4.1 | 14.1 | 0.1 |
In [56]:
def separate(df, column, sep):
df1 = pd.DataFrame(df[column].str.split(sep).tolist(), columns = column.split(sep))
df1 = pd.concat([df1, df], axis=1)
return df1.drop(column, axis = 1)
In [57]:
separate(df, 'pid-visit-stim', '-')
Out[57]:
pid | visit | stim | tnf | ifn | il2 | |
---|---|---|---|---|---|---|
0 | 1 | 1 | cmv | 1.0 | 11.0 | 0.0 |
1 | 1 | 1 | hiv | 2.0 | 12.0 | 0.0 |
2 | 1 | 2 | cmv | 1.1 | 11.1 | 0.1 |
3 | 1 | 2 | hiv | 2.1 | 12.1 | 0.1 |
4 | 1 | 3 | cmv | 1.2 | 11.2 | 0.2 |
5 | 1 | 3 | hiv | 2.2 | 12.2 | 0.2 |
6 | 2 | 1 | cmv | 3.0 | 13.0 | 0.1 |
7 | 2 | 1 | hiv | 4.0 | 14.0 | 0.3 |
8 | 2 | 2 | cmv | 3.1 | 13.1 | 0.1 |
9 | 2 | 2 | hiv | 4.1 | 14.1 | 0.1 |
Reshaping DataFrame
¶
In [58]:
d = OrderedDict()
d['pid'] = ['1', '1', '1', '1', '1', '1', '2', '2', '2', '2']
d['visit'] = ['1', '1', '2', '2', '3', '3', '1', '1', '2', '2']
d['stim'] = ['cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv']
d['tnf'] = [1.0, 2.0, 1.1, 2.1, 1.2, 2.2, 3, 4, 3.1, 4.1]
d['ifn'] = [11.0, 12.0, 11.1, 12.1, 11.2, 12.2, 13, 14, 13.1, 14.1]
d['il2'] = [0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1]
df = pd.DataFrame(d)
In [59]:
df
Out[59]:
pid | visit | stim | tnf | ifn | il2 | |
---|---|---|---|---|---|---|
0 | 1 | 1 | cmv | 1.0 | 11.0 | 0.0 |
1 | 1 | 1 | hiv | 2.0 | 12.0 | 0.0 |
2 | 1 | 2 | cmv | 1.1 | 11.1 | 0.1 |
3 | 1 | 2 | hiv | 2.1 | 12.1 | 0.1 |
4 | 1 | 3 | cmv | 1.2 | 11.2 | 0.2 |
5 | 1 | 3 | hiv | 2.2 | 12.2 | 0.2 |
6 | 2 | 1 | cmv | 3.0 | 13.0 | 0.1 |
7 | 2 | 1 | hiv | 4.0 | 14.0 | 0.3 |
8 | 2 | 2 | cmv | 3.1 | 13.1 | 0.1 |
9 | 2 | 2 | hiv | 4.1 | 14.1 | 0.1 |
Wide to Long¶
In [60]:
long1 = pd.melt(df, id_vars =['pid', 'stim', 'visit'])
long1
Out[60]:
pid | stim | visit | variable | value | |
---|---|---|---|---|---|
0 | 1 | cmv | 1 | tnf | 1.0 |
1 | 1 | hiv | 1 | tnf | 2.0 |
2 | 1 | cmv | 2 | tnf | 1.1 |
3 | 1 | hiv | 2 | tnf | 2.1 |
4 | 1 | cmv | 3 | tnf | 1.2 |
5 | 1 | hiv | 3 | tnf | 2.2 |
6 | 2 | cmv | 1 | tnf | 3.0 |
7 | 2 | hiv | 1 | tnf | 4.0 |
8 | 2 | cmv | 2 | tnf | 3.1 |
9 | 2 | hiv | 2 | tnf | 4.1 |
10 | 1 | cmv | 1 | ifn | 11.0 |
11 | 1 | hiv | 1 | ifn | 12.0 |
12 | 1 | cmv | 2 | ifn | 11.1 |
13 | 1 | hiv | 2 | ifn | 12.1 |
14 | 1 | cmv | 3 | ifn | 11.2 |
15 | 1 | hiv | 3 | ifn | 12.2 |
16 | 2 | cmv | 1 | ifn | 13.0 |
17 | 2 | hiv | 1 | ifn | 14.0 |
18 | 2 | cmv | 2 | ifn | 13.1 |
19 | 2 | hiv | 2 | ifn | 14.1 |
20 | 1 | cmv | 1 | il2 | 0.0 |
21 | 1 | hiv | 1 | il2 | 0.0 |
22 | 1 | cmv | 2 | il2 | 0.1 |
23 | 1 | hiv | 2 | il2 | 0.1 |
24 | 1 | cmv | 3 | il2 | 0.2 |
25 | 1 | hiv | 3 | il2 | 0.2 |
26 | 2 | cmv | 1 | il2 | 0.1 |
27 | 2 | hiv | 1 | il2 | 0.3 |
28 | 2 | cmv | 2 | il2 | 0.1 |
29 | 2 | hiv | 2 | il2 | 0.1 |
In [61]:
long2 = pd.melt(df, id_vars = ['pid', 'stim', 'visit'],
value_vars = ['tnf', 'ifn', 'il2'])
long2.sample(6)
Out[61]:
pid | stim | visit | variable | value | |
---|---|---|---|---|---|
10 | 1 | cmv | 1 | ifn | 11.0 |
16 | 2 | cmv | 1 | ifn | 13.0 |
23 | 1 | hiv | 2 | il2 | 0.1 |
22 | 1 | cmv | 2 | il2 | 0.1 |
9 | 2 | hiv | 2 | tnf | 4.1 |
13 | 1 | hiv | 2 | ifn | 12.1 |
In [62]:
long3 = pd.melt(df, id_vars = ['pid', 'stim', 'visit'],
value_vars = ['tnf', 'il2'])
long3.sample(6)
Out[62]:
pid | stim | visit | variable | value | |
---|---|---|---|---|---|
1 | 1 | hiv | 1 | tnf | 2.0 |
8 | 2 | cmv | 2 | tnf | 3.1 |
12 | 1 | cmv | 2 | il2 | 0.1 |
18 | 2 | cmv | 2 | il2 | 0.1 |
14 | 1 | cmv | 3 | il2 | 0.2 |
7 | 2 | hiv | 1 | tnf | 4.0 |
Long to Wide¶
There is no function in pandas
to undo the wide_to_long
operation, and the details are tricky, so I have written a small
function to do this.
In [65]:
def long_to_wide(df, index):
df = df.set_index(index).unstack().reset_index()
cols = [t[1] if t[1] else t[0] for t in df.columns]
df.columns = cols
return df
In [64]:
wide1 = long_to_wide(long1, ['pid', 'stim', 'visit', 'variable'])
wide1.head(6)
Out[64]:
pid | stim | visit | ifn | il2 | tnf | |
---|---|---|---|---|---|---|
0 | 1 | cmv | 1 | 11.0 | 0.0 | 1.0 |
1 | 1 | cmv | 2 | 11.1 | 0.1 | 1.1 |
2 | 1 | cmv | 3 | 11.2 | 0.2 | 1.2 |
3 | 1 | hiv | 1 | 12.0 | 0.0 | 2.0 |
4 | 1 | hiv | 2 | 12.1 | 0.1 | 2.1 |
5 | 1 | hiv | 3 | 12.2 | 0.2 | 2.2 |
In [65]:
wide2 = long_to_wide(long2, ['pid', 'stim', 'visit', 'variable'])
wide2.head(6)
Out[65]:
pid | stim | visit | ifn | il2 | tnf | |
---|---|---|---|---|---|---|
0 | 1 | cmv | 1 | 11.0 | 0.0 | 1.0 |
1 | 1 | cmv | 2 | 11.1 | 0.1 | 1.1 |
2 | 1 | cmv | 3 | 11.2 | 0.2 | 1.2 |
3 | 1 | hiv | 1 | 12.0 | 0.0 | 2.0 |
4 | 1 | hiv | 2 | 12.1 | 0.1 | 2.1 |
5 | 1 | hiv | 3 | 12.2 | 0.2 | 2.2 |
In [66]:
wide3 = long_to_wide(long3, ['pid', 'stim', 'visit', 'variable'])
wide3.head(6)
Out[66]:
pid | stim | visit | il2 | tnf | |
---|---|---|---|---|---|
0 | 1 | cmv | 1 | 0.0 | 1.0 |
1 | 1 | cmv | 2 | 0.1 | 1.1 |
2 | 1 | cmv | 3 | 0.2 | 1.2 |
3 | 1 | hiv | 1 | 0.0 | 2.0 |
4 | 1 | hiv | 2 | 0.1 | 2.1 |
5 | 1 | hiv | 3 | 0.2 | 2.2 |
Hierarchical Indexes¶
In [67]:
df.head()
Out[67]:
pid | visit | stim | tnf | ifn | il2 | |
---|---|---|---|---|---|---|
0 | 1 | 1 | cmv | 1.0 | 11.0 | 0.0 |
1 | 1 | 1 | hiv | 2.0 | 12.0 | 0.0 |
2 | 1 | 2 | cmv | 1.1 | 11.1 | 0.1 |
3 | 1 | 2 | hiv | 2.1 | 12.1 | 0.1 |
4 | 1 | 3 | cmv | 1.2 | 11.2 | 0.2 |
In [68]:
df1 = df.set_index(['pid', 'stim', 'visit'])
df1
Out[68]:
tnf | ifn | il2 | |||
---|---|---|---|---|---|
pid | stim | visit | |||
1 | cmv | 1 | 1.0 | 11.0 | 0.0 |
hiv | 1 | 2.0 | 12.0 | 0.0 | |
cmv | 2 | 1.1 | 11.1 | 0.1 | |
hiv | 2 | 2.1 | 12.1 | 0.1 | |
cmv | 3 | 1.2 | 11.2 | 0.2 | |
hiv | 3 | 2.2 | 12.2 | 0.2 | |
2 | cmv | 1 | 3.0 | 13.0 | 0.1 |
hiv | 1 | 4.0 | 14.0 | 0.3 | |
cmv | 2 | 3.1 | 13.1 | 0.1 | |
hiv | 2 | 4.1 | 14.1 | 0.1 |
Indexing for mult-index¶
With the multi-index, each “cell” is now a block of values for the combinations (pid, stim, visit).
Find TNF values¶
In [69]:
df1[['tnf']]
Out[69]:
tnf | |||
---|---|---|---|
pid | stim | visit | |
1 | cmv | 1 | 1.0 |
hiv | 1 | 2.0 | |
cmv | 2 | 1.1 | |
hiv | 2 | 2.1 | |
cmv | 3 | 1.2 | |
hiv | 3 | 2.2 | |
2 | cmv | 1 | 3.0 |
hiv | 1 | 4.0 | |
cmv | 2 | 3.1 | |
hiv | 2 | 4.1 |
Find all values for Subject 2¶
In [70]:
df1.ix['2']
Out[70]:
tnf | ifn | il2 | ||
---|---|---|---|---|
stim | visit | |||
cmv | 1 | 3.0 | 13.0 | 0.1 |
hiv | 1 | 4.0 | 14.0 | 0.3 |
cmv | 2 | 3.1 | 13.1 | 0.1 |
hiv | 2 | 4.1 | 14.1 | 0.1 |
Find TNF values for subject 2¶
In [71]:
df1.ix['2', ['tnf']]
Out[71]:
tnf | ||
---|---|---|
stim | visit | |
cmv | 1 | 3.0 |
hiv | 1 | 4.0 |
cmv | 2 | 3.1 |
hiv | 2 | 4.1 |
Undo¶
In [72]:
df1.reset_index()
Out[72]:
pid | stim | visit | tnf | ifn | il2 | |
---|---|---|---|---|---|---|
0 | 1 | cmv | 1 | 1.0 | 11.0 | 0.0 |
1 | 1 | hiv | 1 | 2.0 | 12.0 | 0.0 |
2 | 1 | cmv | 2 | 1.1 | 11.1 | 0.1 |
3 | 1 | hiv | 2 | 2.1 | 12.1 | 0.1 |
4 | 1 | cmv | 3 | 1.2 | 11.2 | 0.2 |
5 | 1 | hiv | 3 | 2.2 | 12.2 | 0.2 |
6 | 2 | cmv | 1 | 3.0 | 13.0 | 0.1 |
7 | 2 | hiv | 1 | 4.0 | 14.0 | 0.3 |
8 | 2 | cmv | 2 | 3.1 | 13.1 | 0.1 |
9 | 2 | hiv | 2 | 4.1 | 14.1 | 0.1 |
Move pid from column to row¶
In [73]:
df1.unstack('pid')
Out[73]:
tnf | ifn | il2 | |||||
---|---|---|---|---|---|---|---|
pid | 1 | 2 | 1 | 2 | 1 | 2 | |
stim | visit | ||||||
cmv | 1 | 1.0 | 3.0 | 11.0 | 13.0 | 0.0 | 0.1 |
2 | 1.1 | 3.1 | 11.1 | 13.1 | 0.1 | 0.1 | |
3 | 1.2 | NaN | 11.2 | NaN | 0.2 | NaN | |
hiv | 1 | 2.0 | 4.0 | 12.0 | 14.0 | 0.0 | 0.3 |
2 | 2.1 | 4.1 | 12.1 | 14.1 | 0.1 | 0.1 | |
3 | 2.2 | NaN | 12.2 | NaN | 0.2 | NaN |
In [74]:
df1.unstack(0)
Out[74]:
tnf | ifn | il2 | |||||
---|---|---|---|---|---|---|---|
pid | 1 | 2 | 1 | 2 | 1 | 2 | |
stim | visit | ||||||
cmv | 1 | 1.0 | 3.0 | 11.0 | 13.0 | 0.0 | 0.1 |
2 | 1.1 | 3.1 | 11.1 | 13.1 | 0.1 | 0.1 | |
3 | 1.2 | NaN | 11.2 | NaN | 0.2 | NaN | |
hiv | 1 | 2.0 | 4.0 | 12.0 | 14.0 | 0.0 | 0.3 |
2 | 2.1 | 4.1 | 12.1 | 14.1 | 0.1 | 0.1 | |
3 | 2.2 | NaN | 12.2 | NaN | 0.2 | NaN |
Move stim from column to row¶
In [75]:
df1.unstack('stim')
Out[75]:
tnf | ifn | il2 | |||||
---|---|---|---|---|---|---|---|
stim | cmv | hiv | cmv | hiv | cmv | hiv | |
pid | visit | ||||||
1 | 1 | 1.0 | 2.0 | 11.0 | 12.0 | 0.0 | 0.0 |
2 | 1.1 | 2.1 | 11.1 | 12.1 | 0.1 | 0.1 | |
3 | 1.2 | 2.2 | 11.2 | 12.2 | 0.2 | 0.2 | |
2 | 1 | 3.0 | 4.0 | 13.0 | 14.0 | 0.1 | 0.3 |
2 | 3.1 | 4.1 | 13.1 | 14.1 | 0.1 | 0.1 |
In [76]:
df1.unstack(1)
Out[76]:
tnf | ifn | il2 | |||||
---|---|---|---|---|---|---|---|
stim | cmv | hiv | cmv | hiv | cmv | hiv | |
pid | visit | ||||||
1 | 1 | 1.0 | 2.0 | 11.0 | 12.0 | 0.0 | 0.0 |
2 | 1.1 | 2.1 | 11.1 | 12.1 | 0.1 | 0.1 | |
3 | 1.2 | 2.2 | 11.2 | 12.2 | 0.2 | 0.2 | |
2 | 1 | 3.0 | 4.0 | 13.0 | 14.0 | 0.1 | 0.3 |
2 | 3.1 | 4.1 | 13.1 | 14.1 | 0.1 | 0.1 |
Move pid and stim from column to row¶
In [77]:
df1.unstack(['pid', 'stim'])
Out[77]:
tnf | ifn | il2 | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
pid | 1 | 2 | 1 | 2 | 1 | 2 | ||||||
stim | cmv | hiv | cmv | hiv | cmv | hiv | cmv | hiv | cmv | hiv | cmv | hiv |
visit | ||||||||||||
1 | 1.0 | 2.0 | 3.0 | 4.0 | 11.0 | 12.0 | 13.0 | 14.0 | 0.0 | 0.0 | 0.1 | 0.3 |
2 | 1.1 | 2.1 | 3.1 | 4.1 | 11.1 | 12.1 | 13.1 | 14.1 | 0.1 | 0.1 | 0.1 | 0.1 |
3 | 1.2 | 2.2 | NaN | NaN | 11.2 | 12.2 | NaN | NaN | 0.2 | 0.2 | NaN | NaN |
Exercises¶
You are given three DataFrames d1
, d2
, and d3
. Answer the
questions below.
In [52]:
d1 = pd.DataFrame({'subject': ['a', 'b', 'c', 'd'], 'group': ['A', 'B', 'C', 'D'], 'value': [1, 2, 3, 4]})
d2 = pd.DataFrame({'subject': ['w', 'x', 'y', 'z'], 'group': ['A', 'B', 'C', 'D'], 'value': [2, 1, 4, 3]})
d3 = pd.DataFrame({'subject': ['a', 'x', 'w', 'd', 'b', 'c', 'y', 'z'], 'age': [10, 20, 30, 40, 50, 60, 70, 80]})
In [53]:
d1
Out[53]:
group | subject | value | |
---|---|---|---|
0 | A | a | 1 |
1 | B | b | 2 |
2 | C | c | 3 |
3 | D | d | 4 |
In [54]:
d2
Out[54]:
group | subject | value | |
---|---|---|---|
0 | A | w | 2 |
1 | B | x | 1 |
2 | C | y | 4 |
3 | D | z | 3 |
In [55]:
d3
Out[55]:
age | subject | |
---|---|---|
0 | 10 | a |
1 | 20 | x |
2 | 30 | w |
3 | 40 | d |
4 | 50 | b |
5 | 60 | c |
6 | 70 | y |
7 | 80 | z |
1. Combine d1
and d2
to create a DataFrame called df1
with 8 rows and 3 columns.
In [57]:
df1 = pd.concat([d1, d2])
df1
Out[57]:
group | subject | value | |
---|---|---|---|
0 | A | a | 1 |
1 | B | b | 2 |
2 | C | c | 3 |
3 | D | d | 4 |
0 | A | w | 2 |
1 | B | x | 1 |
2 | C | y | 4 |
3 | D | z | 3 |
2. Combine df1
with d3
to create a DataFrame called df2
with 8 rows and 4 columns.
In [59]:
df2 = pd.merge(d3, df1, on = 'subject')
df2
Out[59]:
age | subject | group | value | |
---|---|---|---|---|
0 | 10 | a | A | 1 |
1 | 20 | x | B | 1 |
2 | 30 | w | A | 2 |
3 | 40 | d | D | 4 |
4 | 50 | b | B | 2 |
5 | 60 | c | C | 3 |
6 | 70 | y | C | 4 |
7 | 80 | z | D | 3 |
3.. Convert the given DataFrame fd
from wide to tall format
and save in df
.
In [82]:
name = ['ann', 'bob']
vals1 = [1, 2]
vals2 = [3, 4]
vals3 = [5, 6]
d = pd.DataFrame(OrderedDict([('name', name),
('group_1', vals1),
('group_2', vals2),
('group_3', vals3)]))
d
Out[82]:
name | group_1 | group_2 | group_3 | |
---|---|---|---|---|
0 | ann | 1 | 3 | 5 |
1 | bob | 2 | 4 | 6 |
In [85]:
Out[85]:
name | variable | value | |
---|---|---|---|
0 | ann | group_1 | 1 |
1 | bob | group_1 | 2 |
2 | ann | group_2 | 3 |
3 | bob | group_2 | 4 |
4 | ann | group_3 | 5 |
5 | bob | group_3 | 6 |
4. Name the column in df
with group values group
and remove
the group_
part from the group name values in each cell, leaving
only the number.
In [102]:
Out[102]:
name | group | value | |
---|---|---|---|
0 | ann | 1 | 1 |
1 | bob | 1 | 2 |
2 | ann | 2 | 3 |
3 | bob | 2 | 4 |
4 | ann | 3 | 5 |
5 | bob | 3 | 6 |
Version information¶
In [78]:
%load_ext version_information
%version_information
The version_information extension is already loaded. To reload it, use:
%reload_ext version_information
Out[78]:
Software | Version |
---|---|
Python | 3.5.2 64bit [GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)] |
IPython | 5.0.0 |
OS | Darwin 15.6.0 x86_64 i386 64bit |
Tue Aug 16 09:06:17 2016 EDT |
In [ ]: