In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

%load_ext version_information
%load_ext rpy2.ipython

Manipulating and querying data

Constructing a DataFrame

In [2]:
subject = ['ann', 'bob', 'charles', 'david', 'ella']
sex = ['F', 'M', 'M', 'M', 'F']
group = ['A', 'A', 'A', 'B', 'B']
age = [23, 34, 27, 30, 27]
score = [89, 90, 78, 98, 90]
df = pd.DataFrame(data = {'name': subject, 'sex': sex, 'age': age, 'group': group, 'score': score})
In [3]:
df
Out[3]:
age group name score sex
0 23 A ann 89 F
1 34 A bob 90 M
2 27 A charles 78 M
3 30 B david 98 M
4 27 B ella 90 F

Getting basic information

In [4]:
df.dtypes
Out[4]:
age       int64
group    object
name     object
score     int64
sex      object
dtype: object
In [5]:
df.shape
Out[5]:
(5, 5)
In [6]:
df.index
Out[6]:
RangeIndex(start=0, stop=5, step=1)
In [7]:
df.columns
Out[7]:
Index(['age', 'group', 'name', 'score', 'sex'], dtype='object')
In [8]:
df = df[['name', 'sex', 'age', 'group', 'score']]
df
Out[8]:
name sex age group score
0 ann F 23 A 89
1 bob M 34 A 90
2 charles M 27 A 78
3 david M 30 B 98
4 ella F 27 B 90

Row subsets

In [9]:
df.head(2)
Out[9]:
name sex age group score
0 ann F 23 A 89
1 bob M 34 A 90
In [10]:
df.tail(2)
Out[10]:
name sex age group score
3 david M 30 B 98
4 ella F 27 B 90
In [11]:
df.sample(2)
Out[11]:
name sex age group score
0 ann F 23 A 89
2 charles M 27 A 78

Column subsets

In [12]:
df['name']
Out[12]:
0        ann
1        bob
2    charles
3      david
4       ella
Name: name, dtype: object
In [13]:
df.name
Out[13]:
0        ann
1        bob
2    charles
3      david
4       ella
Name: name, dtype: object
In [14]:
df[[0]]
Out[14]:
name
0 ann
1 bob
2 charles
3 david
4 ella
In [15]:
df[['age', 'sex']]
Out[15]:
age sex
0 23 F
1 34 M
2 27 M
3 30 M
4 27 F
In [16]:
df[[1,2]]
Out[16]:
sex age
0 F 23
1 M 34
2 M 27
3 M 30
4 F 27

Indexing

In [17]:
df.ix[0,0]
Out[17]:
'ann'
In [18]:
df.ix[1,1]
Out[18]:
'M'
In [19]:
df.ix[1:4, :2]
Out[19]:
name sex
1 bob M
2 charles M
3 david M
4 ella F
In [20]:
df.ix[[1,3,4], ['age', 'name']]
Out[20]:
age name
1 34 bob
3 30 david
4 27 ella

Logical indexing

In [21]:
df[df.age > 25]
Out[21]:
name sex age group score
1 bob M 34 A 90
2 charles M 27 A 78
3 david M 30 B 98
4 ella F 27 B 90
In [22]:
df[(df.age > 25) & (df.sex == 'M')]
Out[22]:
name sex age group score
1 bob M 34 A 90
2 charles M 27 A 78
3 david M 30 B 98

Sorting

In [23]:
df.sort_values('age')
Out[23]:
name sex age group score
0 ann F 23 A 89
2 charles M 27 A 78
4 ella F 27 B 90
3 david M 30 B 98
1 bob M 34 A 90
In [24]:
df.sort_values('age', ascending = False)
Out[24]:
name sex age group score
1 bob M 34 A 90
3 david M 30 B 98
2 charles M 27 A 78
4 ella F 27 B 90
0 ann F 23 A 89
In [25]:
df.sort_values(['age', 'score'], ascending = ['True', 'True'])
Out[25]:
name sex age group score
0 ann F 23 A 89
2 charles M 27 A 78
4 ella F 27 B 90
3 david M 30 B 98
1 bob M 34 A 90
In [26]:
df.sort_values(['age', 'score'], ascending = ['True', 'False'], inplace = True)
df
Out[26]:
name sex age group score
0 ann F 23 A 89
2 charles M 27 A 78
4 ella F 27 B 90
3 david M 30 B 98
1 bob M 34 A 90
In [27]:
df.sort_index()
Out[27]:
name sex age group score
0 ann F 23 A 89
1 bob M 34 A 90
2 charles M 27 A 78
3 david M 30 B 98
4 ella F 27 B 90

Transformation

In [28]:
df['log_score'] = np.log(df['score'])
df
Out[28]:
name sex age group score log_score
0 ann F 23 A 89 4.488636
2 charles M 27 A 78 4.356709
4 ella F 27 B 90 4.499810
3 david M 30 B 98 4.584967
1 bob M 34 A 90 4.499810
In [29]:
df['adjusted_score'] = np.where(df.sex == 'M', df.score - 10, df.score)
df
Out[29]:
name sex age group score log_score adjusted_score
0 ann F 23 A 89 4.488636 89
2 charles M 27 A 78 4.356709 68
4 ella F 27 B 90 4.499810 90
3 david M 30 B 98 4.584967 88
1 bob M 34 A 90 4.499810 80

String Operations

In [30]:
df.name.str[:3]
Out[30]:
0    ann
2    cha
4    ell
3    dav
1    bob
Name: name, dtype: object
In [31]:
df.name.str.upper()
Out[31]:
0        ANN
2    CHARLES
4       ELLA
3      DAVID
1        BOB
Name: name, dtype: object
In [32]:
df.name.str.replace('a', 'A')
Out[32]:
0        Ann
2    chArles
4       ellA
3      dAvid
1        bob
Name: name, dtype: object

Summary Statistics

In [33]:
df.mean()
Out[33]:
age               28.200000
score             89.000000
log_score          4.485986
adjusted_score    83.000000
dtype: float64
In [34]:
df.score.var()
Out[34]:
51.0
In [35]:
df.count()
Out[35]:
name              5
sex               5
age               5
group             5
score             5
log_score         5
adjusted_score    5
dtype: int64
In [36]:
df.describe()
Out[36]:
age score log_score adjusted_score
count 5.000000 5.000000 5.000000 5.000000
mean 28.200000 89.000000 4.485986 83.000000
std 4.086563 7.141428 0.082005 9.273618
min 23.000000 78.000000 4.356709 68.000000
25% 27.000000 89.000000 4.488636 80.000000
50% 27.000000 90.000000 4.499810 88.000000
75% 30.000000 90.000000 4.499810 89.000000
max 34.000000 98.000000 4.584967 90.000000

Split-Apply-Combine

In [37]:
grouped = df.groupby('sex')
grouped.groups
Out[37]:
{'F': [0, 4], 'M': [2, 3, 1]}
In [38]:
df.groupby('sex').count()
Out[38]:
name age group score log_score adjusted_score
sex
F 2 2 2 2 2 2
M 3 3 3 3 3 3
In [39]:
df.groupby('sex').mean()
Out[39]:
age score log_score adjusted_score
sex
F 25.000000 89.500000 4.494223 89.500000
M 30.333333 88.666667 4.480495 78.666667
In [40]:
df.groupby(['sex', 'group']).mean()
Out[40]:
age score log_score adjusted_score
sex group
F A 23.0 89.0 4.488636 89.0
B 27.0 90.0 4.499810 90.0
M A 30.5 84.0 4.428259 74.0
B 30.0 98.0 4.584967 88.0
In [41]:
df.groupby(['sex', 'group']).agg(['count', 'mean', 'sum'])
Out[41]:
age score log_score adjusted_score
count mean sum count mean sum count mean sum count mean sum
sex group
F A 1 23.0 23 1 89 89 1 4.488636 4.488636 1 89 89
B 1 27.0 27 1 90 90 1 4.499810 4.499810 1 90 90
M A 2 30.5 61 2 84 168 2 4.428259 8.856518 2 74 148
B 1 30.0 30 1 98 98 1 4.584967 4.584967 1 88 88
In [42]:
grouped[['age', 'score']].agg('mean')
Out[42]:
age score
sex
F 25.000000 89.500000
M 30.333333 88.666667
In [43]:
grouped.get_group('F')
Out[43]:
name sex age group score log_score adjusted_score
0 ann F 23 A 89 4.488636 89
4 ella F 27 B 90 4.499810 90
In [44]:
for name, group in df.groupby('sex'):
    print(name)
    print(group.sum())
    print()
F
name              annella
sex                    FF
age                    50
group                  AB
score                 179
log_score         8.98845
adjusted_score        179
dtype: object

M
name              charlesdavidbob
sex                           MMM
age                            91
group                         ABA
score                         266
log_score                 13.4415
adjusted_score                236
dtype: object

Pivot tables

In [45]:
from collections import OrderedDict
In [46]:
d = OrderedDict()
d['pid'] = ['1', '1', '1', '1', '1', '1', '2', '2', '2', '2']
d['visit'] = ['1', '1', '2', '2', '3', '3', '1', '1', '2', '2']
d['stim'] = ['cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv']
d['tnf'] = [1.0, 2.0, 1.1, 2.1, 1.2, 2.2, 3, 4, 3.1, 4.1]
d['ifn'] = [11.0, 12.0, 11.1, 12.1, 11.2, 12.2, 13, 14, 13.1, 14.1]
d['il2'] = [0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1]
df = pd.DataFrame(d)
In [47]:
df.head()
Out[47]:
pid visit stim tnf ifn il2
0 1 1 cmv 1.0 11.0 0.0
1 1 1 hiv 2.0 12.0 0.0
2 1 2 cmv 1.1 11.1 0.1
3 1 2 hiv 2.1 12.1 0.1
4 1 3 cmv 1.2 11.2 0.2
In [48]:
df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['stim'], aggfunc = 'mean')
Out[48]:
ifn il2 tnf
stim
cmv 11.88 0.10 1.88
hiv 12.88 0.14 2.88
In [49]:
df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['pid'], aggfunc = 'mean')
Out[49]:
ifn il2 tnf
pid
1 11.60 0.10 1.60
2 13.55 0.15 3.55
In [50]:
df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['pid', 'stim'])
Out[50]:
ifn il2 tnf
pid stim
1 cmv 11.10 0.1 1.10
hiv 12.10 0.1 2.10
2 cmv 13.05 0.1 3.05
hiv 14.05 0.2 4.05
In [51]:
df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['pid', 'stim'], aggfunc = 'count')
Out[51]:
ifn il2 tnf
pid stim
1 cmv 3 3 3
hiv 3 3 3
2 cmv 2 2 2
hiv 2 2 2

Exercises

1a. Create two DataFrames from the files data/set1.csv and data/set2.csv and save as df1 and df2 respectively.

In [ ]:




1b. Show the top and bottom 6 rows from df1 and df2.

In [ ]:




1c. Create a new DataFrame df1_wide from df1 so that there are two new columns in place of (gene1, gene2 and gene3). The first new column should be named variable and contain the names of the gene, and the second should be named activity and contain the value. This should have 60 rows.

In [ ]:




1d. Merge df1 and df2 into a single DataFrame named df. This should have 7 columns.

In [ ]:




1e. Find the average values of age, gene1, gene2 and gene3grouping by time.

In [ ]:




1f. Sort df1 by gene1 in from largest to smallest.

In [ ]:




1g. Sort df1 by the average value across gene1, gene2 and gene3 from smallest to largest.

In [ ]:




1h. Insert a new column lr_2_1 inot df1 that is the log ratio of gene2 to gene1.

In [ ]:




Version information

In [52]:
%load_ext version_information
%version_information
The version_information extension is already loaded. To reload it, use:
  %reload_ext version_information
Out[52]:
SoftwareVersion
Python3.5.2 64bit [GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]
IPython5.0.0
OSDarwin 15.6.0 x86_64 i386 64bit
Tue Aug 16 09:05:57 2016 EDT