In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

%load_ext version_information
%load_ext rpy2.ipython

Combining and reshaping data

In this notebook, we introduce the pandas DataFrame, show how to combine multiple data sets into a single DataFeame. We also show how to change the layout of a DataFrame for more convenient analysis without actually changing the content. Modifying the content will be covered in the next session.

In [2]:
np.random.seed(123)

Series

Numeric data

In [3]:
age = pd.Series([23, 17, 22, 37, 42], name='years')
age
Out[3]:
0    23
1    17
2    22
3    37
4    42
Name: years, dtype: int64
In [4]:
type(age)
Out[4]:
pandas.core.series.Series
In [5]:
age.index
Out[5]:
RangeIndex(start=0, stop=5, step=1)
In [6]:
age.sort_values()
Out[6]:
1    17
2    22
0    23
3    37
4    42
Name: years, dtype: int64
In [7]:
age.nsmallest(3)
Out[7]:
1    17
2    22
0    23
Name: years, dtype: int64
In [8]:
age.values
Out[8]:
array([23, 17, 22, 37, 42])

String data

In [9]:
species = pd.Series(['mouse', 'mouse', 'human', 'human', 'mouse', 'mouse'])
In [10]:
species.sort_values()
Out[10]:
2    human
3    human
0    mouse
1    mouse
4    mouse
5    mouse
dtype: object
In [11]:
species.unique()
Out[11]:
array(['mouse', 'human'], dtype=object)
In [12]:
species.str.title()
Out[12]:
0    Mouse
1    Mouse
2    Human
3    Human
4    Mouse
5    Mouse
dtype: object
In [13]:
species.str[2:4]
Out[13]:
0    us
1    us
2    ma
3    ma
4    us
5    us
dtype: object
In [14]:
species.replace({'mosue': 'mus musculus', 'human': 'homo sapiens'})
Out[14]:
0           mouse
1           mouse
2    homo sapiens
3    homo sapiens
4           mouse
5           mouse
dtype: object

Cateogrical data

In [15]:
species = species.astype('category')
In [16]:
species.cat.codes
Out[16]:
0    1
1    1
2    0
3    0
4    1
5    1
dtype: int8
In [17]:
species.cat.categories
Out[17]:
Index(['human', 'mouse'], dtype='object')

DataFrame

Read CSV

In [18]:
iris_1 = pd.read_csv('data/iris.csv')
In [19]:
iris_1.head()
Out[19]:
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa

Write CSV

In [20]:
iris_1.to_csv('data/iris_1.csv', index=False)

Read Excel

In [21]:
iris_2 = pd.read_excel('data/iris.xlsx')
In [22]:
iris_2.head()
Out[22]:
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa

Write Excel

In [23]:
iris_2.to_excel('data/iris_2.xlsx', index=False)

Check files using Unix shell commands

In [24]:
! ls
Basic_Plots.ipynb                   _static
Customizing_Plots.ipynb             _templates
Data_Manipulation.ipynb             conf.py
Getting_Started_With_Graphics.ipynb data
Getting_Started_with_Python.ipynb   figs
Introduction_to_Pandas.ipynb        index.rst
Makefile                            make.bat
Scratch.ipynb                       schhedule.md
_build
In [25]:
! head -n 5 iris_1.csv | cat
head: iris_1.csv: No such file or directory

Combining data sets

Combining rows

In [26]:
df_versiocolor = pd.read_csv('data/versicolor.csv')
df_virginica = pd.read_csv('data/virginica.csv')
df_sertosa = pd.read_csv('data/setosa.csv')
dfs = [df_versiocolor, df_virginica, df_sertosa]
In [27]:
[df.shape for df in dfs]
Out[27]:
[(50, 5), (50, 5), (50, 5)]

Each DataFrame only contains data about one species

In [28]:
for df in dfs:
    print(df.Species.unique())
['versicolor']
['virginica']
['setosa']

Combine with concat

We first add a column containing the plant_id for each DataFrame so that each row has a unique identifier (combination of Species and plant_id) and then combine the data frames using concat.

In [29]:
df = pd.concat(dfs)
df.shape
Out[29]:
(150, 5)

Combined DataFrame contains all 3 species

In [30]:
df.Species.unique()
Out[30]:
array(['versicolor', 'virginica', 'setosa'], dtype=object)

Combining columns

In [31]:
df_sepal = pd.read_csv('data/sepal.csv')
df_petal = pd.read_csv('data/petal.csv')
In [32]:
df_sepal.head(3)
Out[32]:
Species Sepal.Length Sepal.Width
0 setosa 5.1 3.5
1 setosa 4.9 3.0
2 setosa 4.7 3.2
In [33]:
df_petal.head(3)
Out[33]:
Species Petal.Length Petal.Width
0 setosa 1.4 0.2
1 setosa 1.4 0.2
2 setosa 1.3 0.2
In [34]:
df_sepal.shape, df_petal.shape
Out[34]:
((150, 3), (150, 3))
In [35]:
df_cols = pd.merge(df_sepal, df_petal, on = 'Species', left_index=True, right_index=True)
df_cols.shape
Out[35]:
(150, 5)
In [36]:
df_cols.head(3)
Out[36]:
Species Sepal.Length Sepal.Width Petal.Length Petal.Width
0 setosa 5.1 3.5 1.4 0.2
1 setosa 4.9 3.0 1.4 0.2
2 setosa 4.7 3.2 1.3 0.2

Joininig on a single unique column

Combining values for the same subject across different messurements.

In [37]:
pid1 = np.random.choice(100, 6, replace=False)
pid1
Out[37]:
array([ 8, 70, 82, 28, 63,  0])
In [38]:
val1 = np.random.normal(10, 1, 6)
val1
Out[38]:
array([ 10.46843912,   9.16884502,  11.16220405,   8.90279695,
         7.87689965,  11.03972709])
In [39]:
df1 = pd.DataFrame({'pid': pid1, 'val': val1})
df1
Out[39]:
pid val
0 8 10.468439
1 70 9.168845
2 82 11.162204
3 28 8.902797
4 63 7.876900
5 0 11.039727
In [40]:
pid2 = np.random.permutation(pid1)
pid2
Out[40]:
array([28, 82,  8, 70, 63,  0])
In [41]:
val2 = np.random.normal(15, 1, 6)
val2
Out[41]:
array([ 14.16248328,  13.39403724,  16.25523737,  14.31113102,
        16.66095249,  15.80730819])
In [42]:
df2 = pd.DataFrame({'pid': pid2, 'val': val2})
df2
Out[42]:
pid val
0 28 14.162483
1 82 13.394037
2 8 16.255237
3 70 14.311131
4 63 16.660952
5 0 15.807308
In [43]:
pd.merge(df1, df2, on='pid', suffixes=['_visit_1', '_visit_2'])
Out[43]:
pid val_visit_1 val_visit_2
0 8 10.468439 16.255237
1 70 9.168845 14.311131
2 82 11.162204 13.394037
3 28 8.902797 14.162483
4 63 7.876900 16.660952
5 0 11.039727 15.807308

Joining on two unique columns

In [44]:
df1['stim'] = np.random.choice(['cmv', 'flu'], 6, replace=True)
df1 = df1[['pid', 'stim', 'val']]
df1
Out[44]:
pid stim val
0 8 cmv 10.468439
1 70 flu 9.168845
2 82 cmv 11.162204
3 28 cmv 8.902797
4 63 cmv 7.876900
5 0 cmv 11.039727
In [45]:
df2['stim'] = np.random.choice(['cmv', 'flu'], 6, replace=True)
df2 = df2[['pid', 'stim', 'val']]
df2
Out[45]:
pid stim val
0 28 flu 14.162483
1 82 cmv 13.394037
2 8 flu 16.255237
3 70 flu 14.311131
4 63 cmv 16.660952
5 0 cmv 15.807308
In [46]:
pd.merge(df1, df2, on = ['pid', 'stim'], suffixes = ['_visit_1', '_visit_2'])
Out[46]:
pid stim val_visit_1 val_visit_2
0 70 flu 9.168845 14.311131
1 82 cmv 11.162204 13.394037
2 63 cmv 7.876900 16.660952
3 0 cmv 11.039727 15.807308
In [47]:
pd.merge(df1, df2, on = ['pid', 'stim'], how = 'left', suffixes = ['_visit_1', '_visit_2'])
Out[47]:
pid stim val_visit_1 val_visit_2
0 8 cmv 10.468439 NaN
1 70 flu 9.168845 14.311131
2 82 cmv 11.162204 13.394037
3 28 cmv 8.902797 NaN
4 63 cmv 7.876900 16.660952
5 0 cmv 11.039727 15.807308
In [48]:
pd.merge(df1, df2, on = ['pid', 'stim'], how = 'right', suffixes = ['_visit_1', '_visit_2'])
Out[48]:
pid stim val_visit_1 val_visit_2
0 70.0 flu 9.168845 14.311131
1 82.0 cmv 11.162204 13.394037
2 63.0 cmv 7.876900 16.660952
3 0.0 cmv 11.039727 15.807308
4 28.0 flu NaN 14.162483
5 8.0 flu NaN 16.255237
In [49]:
pd.merge(df1, df2, on = ['pid', 'stim'], how = 'outer', suffixes = ['_visit_1', '_visit_2'])
Out[49]:
pid stim val_visit_1 val_visit_2
0 8.0 cmv 10.468439 NaN
1 70.0 flu 9.168845 14.311131
2 82.0 cmv 11.162204 13.394037
3 28.0 cmv 8.902797 NaN
4 63.0 cmv 7.876900 16.660952
5 0.0 cmv 11.039727 15.807308
6 28.0 flu NaN 14.162483
7 8.0 flu NaN 16.255237

Separate multiple values in a single column

In [50]:
from collections import OrderedDict
In [51]:
d = OrderedDict()
d['pid-visit-stim'] = ['1-1-cmv', '1-1-hiv', '1-2-cmv', '1-2-hiv', '1-3-cmv', '1-3-hiv', '2-1-cmv', '2-1-hiv', '2-2-cmv', '2-2-hiv']
d['tnf'] = [1.0, 2.0, 1.1, 2.1, 1.2, 2.2, 3, 4, 3.1, 4.1]
d['ifn'] = [11.0, 12.0, 11.1, 12.1, 11.2, 12.2, 13, 14, 13.1, 14.1]
d['il2'] = [0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1]
df = pd.DataFrame(d)
In [52]:
df
Out[52]:
pid-visit-stim tnf ifn il2
0 1-1-cmv 1.0 11.0 0.0
1 1-1-hiv 2.0 12.0 0.0
2 1-2-cmv 1.1 11.1 0.1
3 1-2-hiv 2.1 12.1 0.1
4 1-3-cmv 1.2 11.2 0.2
5 1-3-hiv 2.2 12.2 0.2
6 2-1-cmv 3.0 13.0 0.1
7 2-1-hiv 4.0 14.0 0.3
8 2-2-cmv 3.1 13.1 0.1
9 2-2-hiv 4.1 14.1 0.1
In [53]:
df1 = pd.DataFrame(df['pid-visit-stim'].str.split('-').tolist(),
                   columns = ['pid', 'visit', 'stim'])
In [54]:
df1 = pd.concat([df1, df], axis=1)
In [55]:
df1.drop('pid-visit-stim', axis=1)
Out[55]:
pid visit stim tnf ifn il2
0 1 1 cmv 1.0 11.0 0.0
1 1 1 hiv 2.0 12.0 0.0
2 1 2 cmv 1.1 11.1 0.1
3 1 2 hiv 2.1 12.1 0.1
4 1 3 cmv 1.2 11.2 0.2
5 1 3 hiv 2.2 12.2 0.2
6 2 1 cmv 3.0 13.0 0.1
7 2 1 hiv 4.0 14.0 0.3
8 2 2 cmv 3.1 13.1 0.1
9 2 2 hiv 4.1 14.1 0.1
In [56]:
def separate(df, column, sep):
    df1 = pd.DataFrame(df[column].str.split(sep).tolist(), columns = column.split(sep))
    df1 = pd.concat([df1, df], axis=1)
    return df1.drop(column, axis = 1)
In [57]:
separate(df, 'pid-visit-stim', '-')
Out[57]:
pid visit stim tnf ifn il2
0 1 1 cmv 1.0 11.0 0.0
1 1 1 hiv 2.0 12.0 0.0
2 1 2 cmv 1.1 11.1 0.1
3 1 2 hiv 2.1 12.1 0.1
4 1 3 cmv 1.2 11.2 0.2
5 1 3 hiv 2.2 12.2 0.2
6 2 1 cmv 3.0 13.0 0.1
7 2 1 hiv 4.0 14.0 0.3
8 2 2 cmv 3.1 13.1 0.1
9 2 2 hiv 4.1 14.1 0.1

Reshaping DataFrame

In [58]:
d = OrderedDict()
d['pid'] = ['1', '1', '1', '1', '1', '1', '2', '2', '2', '2']
d['visit'] = ['1', '1', '2', '2', '3', '3', '1', '1', '2', '2']
d['stim'] = ['cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv']
d['tnf'] = [1.0, 2.0, 1.1, 2.1, 1.2, 2.2, 3, 4, 3.1, 4.1]
d['ifn'] = [11.0, 12.0, 11.1, 12.1, 11.2, 12.2, 13, 14, 13.1, 14.1]
d['il2'] = [0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1]
df = pd.DataFrame(d)
In [59]:
df
Out[59]:
pid visit stim tnf ifn il2
0 1 1 cmv 1.0 11.0 0.0
1 1 1 hiv 2.0 12.0 0.0
2 1 2 cmv 1.1 11.1 0.1
3 1 2 hiv 2.1 12.1 0.1
4 1 3 cmv 1.2 11.2 0.2
5 1 3 hiv 2.2 12.2 0.2
6 2 1 cmv 3.0 13.0 0.1
7 2 1 hiv 4.0 14.0 0.3
8 2 2 cmv 3.1 13.1 0.1
9 2 2 hiv 4.1 14.1 0.1

Wide to Long

In [60]:
long1 = pd.melt(df, id_vars =['pid', 'stim', 'visit'])
long1
Out[60]:
pid stim visit variable value
0 1 cmv 1 tnf 1.0
1 1 hiv 1 tnf 2.0
2 1 cmv 2 tnf 1.1
3 1 hiv 2 tnf 2.1
4 1 cmv 3 tnf 1.2
5 1 hiv 3 tnf 2.2
6 2 cmv 1 tnf 3.0
7 2 hiv 1 tnf 4.0
8 2 cmv 2 tnf 3.1
9 2 hiv 2 tnf 4.1
10 1 cmv 1 ifn 11.0
11 1 hiv 1 ifn 12.0
12 1 cmv 2 ifn 11.1
13 1 hiv 2 ifn 12.1
14 1 cmv 3 ifn 11.2
15 1 hiv 3 ifn 12.2
16 2 cmv 1 ifn 13.0
17 2 hiv 1 ifn 14.0
18 2 cmv 2 ifn 13.1
19 2 hiv 2 ifn 14.1
20 1 cmv 1 il2 0.0
21 1 hiv 1 il2 0.0
22 1 cmv 2 il2 0.1
23 1 hiv 2 il2 0.1
24 1 cmv 3 il2 0.2
25 1 hiv 3 il2 0.2
26 2 cmv 1 il2 0.1
27 2 hiv 1 il2 0.3
28 2 cmv 2 il2 0.1
29 2 hiv 2 il2 0.1
In [61]:
long2 = pd.melt(df, id_vars = ['pid', 'stim', 'visit'],
               value_vars = ['tnf', 'ifn', 'il2'])
long2.sample(6)
Out[61]:
pid stim visit variable value
10 1 cmv 1 ifn 11.0
16 2 cmv 1 ifn 13.0
23 1 hiv 2 il2 0.1
22 1 cmv 2 il2 0.1
9 2 hiv 2 tnf 4.1
13 1 hiv 2 ifn 12.1
In [62]:
long3 = pd.melt(df, id_vars = ['pid', 'stim', 'visit'],
                value_vars = ['tnf', 'il2'])
long3.sample(6)
Out[62]:
pid stim visit variable value
1 1 hiv 1 tnf 2.0
8 2 cmv 2 tnf 3.1
12 1 cmv 2 il2 0.1
18 2 cmv 2 il2 0.1
14 1 cmv 3 il2 0.2
7 2 hiv 1 tnf 4.0

Long to Wide

There is no function in pandas to undo the wide_to_long operation, and the details are tricky, so I have written a small function to do this.

In [65]:
def long_to_wide(df, index):
    df = df.set_index(index).unstack().reset_index()
    cols = [t[1] if t[1] else t[0] for t in df.columns]
    df.columns = cols
    return df
In [64]:
wide1 = long_to_wide(long1, ['pid', 'stim', 'visit', 'variable'])
wide1.head(6)
Out[64]:
pid stim visit ifn il2 tnf
0 1 cmv 1 11.0 0.0 1.0
1 1 cmv 2 11.1 0.1 1.1
2 1 cmv 3 11.2 0.2 1.2
3 1 hiv 1 12.0 0.0 2.0
4 1 hiv 2 12.1 0.1 2.1
5 1 hiv 3 12.2 0.2 2.2
In [65]:
wide2 = long_to_wide(long2, ['pid', 'stim', 'visit', 'variable'])
wide2.head(6)
Out[65]:
pid stim visit ifn il2 tnf
0 1 cmv 1 11.0 0.0 1.0
1 1 cmv 2 11.1 0.1 1.1
2 1 cmv 3 11.2 0.2 1.2
3 1 hiv 1 12.0 0.0 2.0
4 1 hiv 2 12.1 0.1 2.1
5 1 hiv 3 12.2 0.2 2.2
In [66]:
wide3 = long_to_wide(long3, ['pid', 'stim', 'visit', 'variable'])
wide3.head(6)
Out[66]:
pid stim visit il2 tnf
0 1 cmv 1 0.0 1.0
1 1 cmv 2 0.1 1.1
2 1 cmv 3 0.2 1.2
3 1 hiv 1 0.0 2.0
4 1 hiv 2 0.1 2.1
5 1 hiv 3 0.2 2.2

Hierarchical Indexes

In [67]:
df.head()
Out[67]:
pid visit stim tnf ifn il2
0 1 1 cmv 1.0 11.0 0.0
1 1 1 hiv 2.0 12.0 0.0
2 1 2 cmv 1.1 11.1 0.1
3 1 2 hiv 2.1 12.1 0.1
4 1 3 cmv 1.2 11.2 0.2
In [68]:
df1 = df.set_index(['pid', 'stim', 'visit'])
df1
Out[68]:
tnf ifn il2
pid stim visit
1 cmv 1 1.0 11.0 0.0
hiv 1 2.0 12.0 0.0
cmv 2 1.1 11.1 0.1
hiv 2 2.1 12.1 0.1
cmv 3 1.2 11.2 0.2
hiv 3 2.2 12.2 0.2
2 cmv 1 3.0 13.0 0.1
hiv 1 4.0 14.0 0.3
cmv 2 3.1 13.1 0.1
hiv 2 4.1 14.1 0.1

Indexing for mult-index

With the multi-index, each “cell” is now a block of values for the combinations (pid, stim, visit).

Find TNF values

In [69]:
df1[['tnf']]
Out[69]:
tnf
pid stim visit
1 cmv 1 1.0
hiv 1 2.0
cmv 2 1.1
hiv 2 2.1
cmv 3 1.2
hiv 3 2.2
2 cmv 1 3.0
hiv 1 4.0
cmv 2 3.1
hiv 2 4.1

Find all values for Subject 2

In [70]:
df1.ix['2']
Out[70]:
tnf ifn il2
stim visit
cmv 1 3.0 13.0 0.1
hiv 1 4.0 14.0 0.3
cmv 2 3.1 13.1 0.1
hiv 2 4.1 14.1 0.1

Find TNF values for subject 2

In [71]:
df1.ix['2', ['tnf']]
Out[71]:
tnf
stim visit
cmv 1 3.0
hiv 1 4.0
cmv 2 3.1
hiv 2 4.1

Undo

In [72]:
df1.reset_index()
Out[72]:
pid stim visit tnf ifn il2
0 1 cmv 1 1.0 11.0 0.0
1 1 hiv 1 2.0 12.0 0.0
2 1 cmv 2 1.1 11.1 0.1
3 1 hiv 2 2.1 12.1 0.1
4 1 cmv 3 1.2 11.2 0.2
5 1 hiv 3 2.2 12.2 0.2
6 2 cmv 1 3.0 13.0 0.1
7 2 hiv 1 4.0 14.0 0.3
8 2 cmv 2 3.1 13.1 0.1
9 2 hiv 2 4.1 14.1 0.1

Move pid from column to row

In [73]:
df1.unstack('pid')
Out[73]:
tnf ifn il2
pid 1 2 1 2 1 2
stim visit
cmv 1 1.0 3.0 11.0 13.0 0.0 0.1
2 1.1 3.1 11.1 13.1 0.1 0.1
3 1.2 NaN 11.2 NaN 0.2 NaN
hiv 1 2.0 4.0 12.0 14.0 0.0 0.3
2 2.1 4.1 12.1 14.1 0.1 0.1
3 2.2 NaN 12.2 NaN 0.2 NaN
In [74]:
df1.unstack(0)
Out[74]:
tnf ifn il2
pid 1 2 1 2 1 2
stim visit
cmv 1 1.0 3.0 11.0 13.0 0.0 0.1
2 1.1 3.1 11.1 13.1 0.1 0.1
3 1.2 NaN 11.2 NaN 0.2 NaN
hiv 1 2.0 4.0 12.0 14.0 0.0 0.3
2 2.1 4.1 12.1 14.1 0.1 0.1
3 2.2 NaN 12.2 NaN 0.2 NaN

Move stim from column to row

In [75]:
df1.unstack('stim')
Out[75]:
tnf ifn il2
stim cmv hiv cmv hiv cmv hiv
pid visit
1 1 1.0 2.0 11.0 12.0 0.0 0.0
2 1.1 2.1 11.1 12.1 0.1 0.1
3 1.2 2.2 11.2 12.2 0.2 0.2
2 1 3.0 4.0 13.0 14.0 0.1 0.3
2 3.1 4.1 13.1 14.1 0.1 0.1
In [76]:
df1.unstack(1)
Out[76]:
tnf ifn il2
stim cmv hiv cmv hiv cmv hiv
pid visit
1 1 1.0 2.0 11.0 12.0 0.0 0.0
2 1.1 2.1 11.1 12.1 0.1 0.1
3 1.2 2.2 11.2 12.2 0.2 0.2
2 1 3.0 4.0 13.0 14.0 0.1 0.3
2 3.1 4.1 13.1 14.1 0.1 0.1

Move pid and stim from column to row

In [77]:
df1.unstack(['pid', 'stim'])
Out[77]:
tnf ifn il2
pid 1 2 1 2 1 2
stim cmv hiv cmv hiv cmv hiv cmv hiv cmv hiv cmv hiv
visit
1 1.0 2.0 3.0 4.0 11.0 12.0 13.0 14.0 0.0 0.0 0.1 0.3
2 1.1 2.1 3.1 4.1 11.1 12.1 13.1 14.1 0.1 0.1 0.1 0.1
3 1.2 2.2 NaN NaN 11.2 12.2 NaN NaN 0.2 0.2 NaN NaN

Exercises

You are given three DataFrames d1, d2, and d3. Answer the questions below.

In [52]:
d1 = pd.DataFrame({'subject': ['a', 'b', 'c', 'd'], 'group': ['A', 'B', 'C', 'D'], 'value': [1, 2, 3, 4]})
d2 = pd.DataFrame({'subject': ['w', 'x', 'y', 'z'], 'group': ['A', 'B', 'C', 'D'], 'value': [2, 1, 4, 3]})
d3 = pd.DataFrame({'subject': ['a', 'x', 'w', 'd', 'b', 'c', 'y', 'z'], 'age': [10, 20, 30, 40, 50, 60, 70, 80]})
In [53]:
d1
Out[53]:
group subject value
0 A a 1
1 B b 2
2 C c 3
3 D d 4
In [54]:
d2
Out[54]:
group subject value
0 A w 2
1 B x 1
2 C y 4
3 D z 3
In [55]:
d3
Out[55]:
age subject
0 10 a
1 20 x
2 30 w
3 40 d
4 50 b
5 60 c
6 70 y
7 80 z

1. Combine d1 and d2 to create a DataFrame called df1 with 8 rows and 3 columns.

In [57]:
df1 = pd.concat([d1, d2])
df1
Out[57]:
group subject value
0 A a 1
1 B b 2
2 C c 3
3 D d 4
0 A w 2
1 B x 1
2 C y 4
3 D z 3

2. Combine df1 with d3 to create a DataFrame called df2 with 8 rows and 4 columns.

In [59]:
df2 = pd.merge(d3, df1, on = 'subject')
df2
Out[59]:
age subject group value
0 10 a A 1
1 20 x B 1
2 30 w A 2
3 40 d D 4
4 50 b B 2
5 60 c C 3
6 70 y C 4
7 80 z D 3

3.. Convert the given DataFrame fd from wide to tall format and save in df.

In [82]:
name = ['ann', 'bob']
vals1 = [1, 2]
vals2 = [3, 4]
vals3 = [5, 6]

d = pd.DataFrame(OrderedDict([('name', name),
                              ('group_1', vals1),
                              ('group_2', vals2),
                              ('group_3', vals3)]))
d
Out[82]:
name group_1 group_2 group_3
0 ann 1 3 5
1 bob 2 4 6
In [85]:




Out[85]:
name variable value
0 ann group_1 1
1 bob group_1 2
2 ann group_2 3
3 bob group_2 4
4 ann group_3 5
5 bob group_3 6

4. Name the column in df with group values group and remove the group_ part from the group name values in each cell, leaving only the number.

In [102]:




Out[102]:
name group value
0 ann 1 1
1 bob 1 2
2 ann 2 3
3 bob 2 4
4 ann 3 5
5 bob 3 6

Version information

In [78]:
%load_ext version_information
%version_information
The version_information extension is already loaded. To reload it, use:
  %reload_ext version_information
Out[78]:
SoftwareVersion
Python3.5.2 64bit [GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]
IPython5.0.0
OSDarwin 15.6.0 x86_64 i386 64bit
Tue Aug 16 09:06:17 2016 EDT
In [ ]: