In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%load_ext version_information
%load_ext rpy2.ipython
Manipulating and querying data¶
Constructing a DataFrame¶
In [2]:
subject = ['ann', 'bob', 'charles', 'david', 'ella']
sex = ['F', 'M', 'M', 'M', 'F']
group = ['A', 'A', 'A', 'B', 'B']
age = [23, 34, 27, 30, 27]
score = [89, 90, 78, 98, 90]
df = pd.DataFrame(data = {'name': subject, 'sex': sex, 'age': age, 'group': group, 'score': score})
In [3]:
df
Out[3]:
age | group | name | score | sex | |
---|---|---|---|---|---|
0 | 23 | A | ann | 89 | F |
1 | 34 | A | bob | 90 | M |
2 | 27 | A | charles | 78 | M |
3 | 30 | B | david | 98 | M |
4 | 27 | B | ella | 90 | F |
Getting basic information¶
In [4]:
df.dtypes
Out[4]:
age int64
group object
name object
score int64
sex object
dtype: object
In [5]:
df.shape
Out[5]:
(5, 5)
In [6]:
df.index
Out[6]:
RangeIndex(start=0, stop=5, step=1)
In [7]:
df.columns
Out[7]:
Index(['age', 'group', 'name', 'score', 'sex'], dtype='object')
In [8]:
df = df[['name', 'sex', 'age', 'group', 'score']]
df
Out[8]:
name | sex | age | group | score | |
---|---|---|---|---|---|
0 | ann | F | 23 | A | 89 |
1 | bob | M | 34 | A | 90 |
2 | charles | M | 27 | A | 78 |
3 | david | M | 30 | B | 98 |
4 | ella | F | 27 | B | 90 |
Row subsets¶
In [9]:
df.head(2)
Out[9]:
name | sex | age | group | score | |
---|---|---|---|---|---|
0 | ann | F | 23 | A | 89 |
1 | bob | M | 34 | A | 90 |
In [10]:
df.tail(2)
Out[10]:
name | sex | age | group | score | |
---|---|---|---|---|---|
3 | david | M | 30 | B | 98 |
4 | ella | F | 27 | B | 90 |
In [11]:
df.sample(2)
Out[11]:
name | sex | age | group | score | |
---|---|---|---|---|---|
0 | ann | F | 23 | A | 89 |
2 | charles | M | 27 | A | 78 |
Column subsets¶
In [12]:
df['name']
Out[12]:
0 ann
1 bob
2 charles
3 david
4 ella
Name: name, dtype: object
In [13]:
df.name
Out[13]:
0 ann
1 bob
2 charles
3 david
4 ella
Name: name, dtype: object
In [14]:
df[[0]]
Out[14]:
name | |
---|---|
0 | ann |
1 | bob |
2 | charles |
3 | david |
4 | ella |
In [15]:
df[['age', 'sex']]
Out[15]:
age | sex | |
---|---|---|
0 | 23 | F |
1 | 34 | M |
2 | 27 | M |
3 | 30 | M |
4 | 27 | F |
In [16]:
df[[1,2]]
Out[16]:
sex | age | |
---|---|---|
0 | F | 23 |
1 | M | 34 |
2 | M | 27 |
3 | M | 30 |
4 | F | 27 |
Indexing¶
In [17]:
df.ix[0,0]
Out[17]:
'ann'
In [18]:
df.ix[1,1]
Out[18]:
'M'
In [19]:
df.ix[1:4, :2]
Out[19]:
name | sex | |
---|---|---|
1 | bob | M |
2 | charles | M |
3 | david | M |
4 | ella | F |
In [20]:
df.ix[[1,3,4], ['age', 'name']]
Out[20]:
age | name | |
---|---|---|
1 | 34 | bob |
3 | 30 | david |
4 | 27 | ella |
Logical indexing¶
In [21]:
df[df.age > 25]
Out[21]:
name | sex | age | group | score | |
---|---|---|---|---|---|
1 | bob | M | 34 | A | 90 |
2 | charles | M | 27 | A | 78 |
3 | david | M | 30 | B | 98 |
4 | ella | F | 27 | B | 90 |
In [22]:
df[(df.age > 25) & (df.sex == 'M')]
Out[22]:
name | sex | age | group | score | |
---|---|---|---|---|---|
1 | bob | M | 34 | A | 90 |
2 | charles | M | 27 | A | 78 |
3 | david | M | 30 | B | 98 |
Sorting¶
In [23]:
df.sort_values('age')
Out[23]:
name | sex | age | group | score | |
---|---|---|---|---|---|
0 | ann | F | 23 | A | 89 |
2 | charles | M | 27 | A | 78 |
4 | ella | F | 27 | B | 90 |
3 | david | M | 30 | B | 98 |
1 | bob | M | 34 | A | 90 |
In [24]:
df.sort_values('age', ascending = False)
Out[24]:
name | sex | age | group | score | |
---|---|---|---|---|---|
1 | bob | M | 34 | A | 90 |
3 | david | M | 30 | B | 98 |
2 | charles | M | 27 | A | 78 |
4 | ella | F | 27 | B | 90 |
0 | ann | F | 23 | A | 89 |
In [25]:
df.sort_values(['age', 'score'], ascending = ['True', 'True'])
Out[25]:
name | sex | age | group | score | |
---|---|---|---|---|---|
0 | ann | F | 23 | A | 89 |
2 | charles | M | 27 | A | 78 |
4 | ella | F | 27 | B | 90 |
3 | david | M | 30 | B | 98 |
1 | bob | M | 34 | A | 90 |
In [26]:
df.sort_values(['age', 'score'], ascending = ['True', 'False'], inplace = True)
df
Out[26]:
name | sex | age | group | score | |
---|---|---|---|---|---|
0 | ann | F | 23 | A | 89 |
2 | charles | M | 27 | A | 78 |
4 | ella | F | 27 | B | 90 |
3 | david | M | 30 | B | 98 |
1 | bob | M | 34 | A | 90 |
In [27]:
df.sort_index()
Out[27]:
name | sex | age | group | score | |
---|---|---|---|---|---|
0 | ann | F | 23 | A | 89 |
1 | bob | M | 34 | A | 90 |
2 | charles | M | 27 | A | 78 |
3 | david | M | 30 | B | 98 |
4 | ella | F | 27 | B | 90 |
Transformation¶
In [28]:
df['log_score'] = np.log(df['score'])
df
Out[28]:
name | sex | age | group | score | log_score | |
---|---|---|---|---|---|---|
0 | ann | F | 23 | A | 89 | 4.488636 |
2 | charles | M | 27 | A | 78 | 4.356709 |
4 | ella | F | 27 | B | 90 | 4.499810 |
3 | david | M | 30 | B | 98 | 4.584967 |
1 | bob | M | 34 | A | 90 | 4.499810 |
In [29]:
df['adjusted_score'] = np.where(df.sex == 'M', df.score - 10, df.score)
df
Out[29]:
name | sex | age | group | score | log_score | adjusted_score | |
---|---|---|---|---|---|---|---|
0 | ann | F | 23 | A | 89 | 4.488636 | 89 |
2 | charles | M | 27 | A | 78 | 4.356709 | 68 |
4 | ella | F | 27 | B | 90 | 4.499810 | 90 |
3 | david | M | 30 | B | 98 | 4.584967 | 88 |
1 | bob | M | 34 | A | 90 | 4.499810 | 80 |
String Operations¶
In [30]:
df.name.str[:3]
Out[30]:
0 ann
2 cha
4 ell
3 dav
1 bob
Name: name, dtype: object
In [31]:
df.name.str.upper()
Out[31]:
0 ANN
2 CHARLES
4 ELLA
3 DAVID
1 BOB
Name: name, dtype: object
In [32]:
df.name.str.replace('a', 'A')
Out[32]:
0 Ann
2 chArles
4 ellA
3 dAvid
1 bob
Name: name, dtype: object
Summary Statistics¶
In [33]:
df.mean()
Out[33]:
age 28.200000
score 89.000000
log_score 4.485986
adjusted_score 83.000000
dtype: float64
In [34]:
df.score.var()
Out[34]:
51.0
In [35]:
df.count()
Out[35]:
name 5
sex 5
age 5
group 5
score 5
log_score 5
adjusted_score 5
dtype: int64
In [36]:
df.describe()
Out[36]:
age | score | log_score | adjusted_score | |
---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 28.200000 | 89.000000 | 4.485986 | 83.000000 |
std | 4.086563 | 7.141428 | 0.082005 | 9.273618 |
min | 23.000000 | 78.000000 | 4.356709 | 68.000000 |
25% | 27.000000 | 89.000000 | 4.488636 | 80.000000 |
50% | 27.000000 | 90.000000 | 4.499810 | 88.000000 |
75% | 30.000000 | 90.000000 | 4.499810 | 89.000000 |
max | 34.000000 | 98.000000 | 4.584967 | 90.000000 |
Split-Apply-Combine¶
In [37]:
grouped = df.groupby('sex')
grouped.groups
Out[37]:
{'F': [0, 4], 'M': [2, 3, 1]}
In [38]:
df.groupby('sex').count()
Out[38]:
name | age | group | score | log_score | adjusted_score | |
---|---|---|---|---|---|---|
sex | ||||||
F | 2 | 2 | 2 | 2 | 2 | 2 |
M | 3 | 3 | 3 | 3 | 3 | 3 |
In [39]:
df.groupby('sex').mean()
Out[39]:
age | score | log_score | adjusted_score | |
---|---|---|---|---|
sex | ||||
F | 25.000000 | 89.500000 | 4.494223 | 89.500000 |
M | 30.333333 | 88.666667 | 4.480495 | 78.666667 |
In [40]:
df.groupby(['sex', 'group']).mean()
Out[40]:
age | score | log_score | adjusted_score | ||
---|---|---|---|---|---|
sex | group | ||||
F | A | 23.0 | 89.0 | 4.488636 | 89.0 |
B | 27.0 | 90.0 | 4.499810 | 90.0 | |
M | A | 30.5 | 84.0 | 4.428259 | 74.0 |
B | 30.0 | 98.0 | 4.584967 | 88.0 |
In [41]:
df.groupby(['sex', 'group']).agg(['count', 'mean', 'sum'])
Out[41]:
age | score | log_score | adjusted_score | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | sum | count | mean | sum | count | mean | sum | count | mean | sum | ||
sex | group | ||||||||||||
F | A | 1 | 23.0 | 23 | 1 | 89 | 89 | 1 | 4.488636 | 4.488636 | 1 | 89 | 89 |
B | 1 | 27.0 | 27 | 1 | 90 | 90 | 1 | 4.499810 | 4.499810 | 1 | 90 | 90 | |
M | A | 2 | 30.5 | 61 | 2 | 84 | 168 | 2 | 4.428259 | 8.856518 | 2 | 74 | 148 |
B | 1 | 30.0 | 30 | 1 | 98 | 98 | 1 | 4.584967 | 4.584967 | 1 | 88 | 88 |
In [42]:
grouped[['age', 'score']].agg('mean')
Out[42]:
age | score | |
---|---|---|
sex | ||
F | 25.000000 | 89.500000 |
M | 30.333333 | 88.666667 |
In [43]:
grouped.get_group('F')
Out[43]:
name | sex | age | group | score | log_score | adjusted_score | |
---|---|---|---|---|---|---|---|
0 | ann | F | 23 | A | 89 | 4.488636 | 89 |
4 | ella | F | 27 | B | 90 | 4.499810 | 90 |
In [44]:
for name, group in df.groupby('sex'):
print(name)
print(group.sum())
print()
F
name annella
sex FF
age 50
group AB
score 179
log_score 8.98845
adjusted_score 179
dtype: object
M
name charlesdavidbob
sex MMM
age 91
group ABA
score 266
log_score 13.4415
adjusted_score 236
dtype: object
Pivot tables¶
In [45]:
from collections import OrderedDict
In [46]:
d = OrderedDict()
d['pid'] = ['1', '1', '1', '1', '1', '1', '2', '2', '2', '2']
d['visit'] = ['1', '1', '2', '2', '3', '3', '1', '1', '2', '2']
d['stim'] = ['cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv']
d['tnf'] = [1.0, 2.0, 1.1, 2.1, 1.2, 2.2, 3, 4, 3.1, 4.1]
d['ifn'] = [11.0, 12.0, 11.1, 12.1, 11.2, 12.2, 13, 14, 13.1, 14.1]
d['il2'] = [0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1]
df = pd.DataFrame(d)
In [47]:
df.head()
Out[47]:
pid | visit | stim | tnf | ifn | il2 | |
---|---|---|---|---|---|---|
0 | 1 | 1 | cmv | 1.0 | 11.0 | 0.0 |
1 | 1 | 1 | hiv | 2.0 | 12.0 | 0.0 |
2 | 1 | 2 | cmv | 1.1 | 11.1 | 0.1 |
3 | 1 | 2 | hiv | 2.1 | 12.1 | 0.1 |
4 | 1 | 3 | cmv | 1.2 | 11.2 | 0.2 |
In [48]:
df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['stim'], aggfunc = 'mean')
Out[48]:
ifn | il2 | tnf | |
---|---|---|---|
stim | |||
cmv | 11.88 | 0.10 | 1.88 |
hiv | 12.88 | 0.14 | 2.88 |
In [49]:
df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['pid'], aggfunc = 'mean')
Out[49]:
ifn | il2 | tnf | |
---|---|---|---|
pid | |||
1 | 11.60 | 0.10 | 1.60 |
2 | 13.55 | 0.15 | 3.55 |
In [50]:
df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['pid', 'stim'])
Out[50]:
ifn | il2 | tnf | ||
---|---|---|---|---|
pid | stim | |||
1 | cmv | 11.10 | 0.1 | 1.10 |
hiv | 12.10 | 0.1 | 2.10 | |
2 | cmv | 13.05 | 0.1 | 3.05 |
hiv | 14.05 | 0.2 | 4.05 |
In [51]:
df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['pid', 'stim'], aggfunc = 'count')
Out[51]:
ifn | il2 | tnf | ||
---|---|---|---|---|
pid | stim | |||
1 | cmv | 3 | 3 | 3 |
hiv | 3 | 3 | 3 | |
2 | cmv | 2 | 2 | 2 |
hiv | 2 | 2 | 2 |
Exercises¶
1a. Create two DataFrames from the files data/set1.csv
and
data/set2.csv
and save as df1
and df2
respectively.
In [ ]:
1b. Show the top and bottom 6 rows from df1
and df2
.
In [ ]:
1c. Create a new DataFrame df1_wide
from df1
so that there
are two new columns in place of (gene1, gene2 and gene3). The first new
column should be named variable
and contain the names of the gene,
and the second should be named activity
and contain the value. This
should have 60 rows.
In [ ]:
1d. Merge df1
and df2
into a single DataFrame named df
.
This should have 7 columns.
In [ ]:
1e. Find the average values of age
, gene1
, gene2
and
gene3
grouping by time
.
In [ ]:
1f. Sort df1
by gene1
in from largest to smallest.
In [ ]:
1g. Sort df1
by the average value across gene1
, gene2
and gene3
from smallest to largest.
In [ ]:
1h. Insert a new column lr_2_1
inot df1
that is the log
ratio of gene2
to gene1
.
In [ ]:
Version information¶
In [52]:
%load_ext version_information
%version_information
The version_information extension is already loaded. To reload it, use:
%reload_ext version_information
Out[52]:
Software | Version |
---|---|
Python | 3.5.2 64bit [GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)] |
IPython | 5.0.0 |
OS | Darwin 15.6.0 x86_64 i386 64bit |
Tue Aug 16 09:05:57 2016 EDT |