In [1]:

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

%load_ext version_information
%load_ext rpy2.ipython

Manipulating and querying data¶

Constructing a DataFrame¶

In [2]:

subject = ['ann', 'bob', 'charles', 'david', 'ella']
sex = ['F', 'M', 'M', 'M', 'F']
group = ['A', 'A', 'A', 'B', 'B']
age = [23, 34, 27, 30, 27]
score = [89, 90, 78, 98, 90]
df = pd.DataFrame(data = {'name': subject, 'sex': sex, 'age': age, 'group': group, 'score': score})

In [3]:

df

Out[3]:

	age	group	name	score	sex
0	23	A	ann	89	F
1	34	A	bob	90	M
2	27	A	charles	78	M
3	30	B	david	98	M
4	27	B	ella	90	F

Getting basic information¶

In [4]:

df.dtypes

Out[4]:

age       int64
group    object
name     object
score     int64
sex      object
dtype: object

In [5]:

df.shape

Out[5]:

(5, 5)

In [6]:

df.index

Out[6]:

RangeIndex(start=0, stop=5, step=1)

In [7]:

df.columns

Out[7]:

Index(['age', 'group', 'name', 'score', 'sex'], dtype='object')

In [8]:

df = df[['name', 'sex', 'age', 'group', 'score']]
df

Out[8]:

	name	sex	age	group	score
0	ann	F	23	A	89
1	bob	M	34	A	90
2	charles	M	27	A	78
3	david	M	30	B	98
4	ella	F	27	B	90

Row subsets¶

In [9]:

df.head(2)

Out[9]:

	name	sex	age	group	score
0	ann	F	23	A	89
1	bob	M	34	A	90

In [10]:

df.tail(2)

Out[10]:

	name	sex	age	group	score
3	david	M	30	B	98
4	ella	F	27	B	90

In [11]:

df.sample(2)

Out[11]:

	name	sex	age	group	score
0	ann	F	23	A	89
2	charles	M	27	A	78

Column subsets¶

In [12]:

df['name']

Out[12]:

      ann
      bob
  charles
    david
     ella
Name: name, dtype: object

In [13]:

df.name

Out[13]:

      ann
      bob
  charles
    david
     ella
Name: name, dtype: object

In [14]:

df[[0]]

Out[14]:

	name
0	ann
1	bob
2	charles
3	david
4	ella

In [15]:

df[['age', 'sex']]

Out[15]:

	age	sex
0	23	F
1	34	M
2	27	M
3	30	M
4	27	F

In [16]:

df[[1,2]]

Out[16]:

	sex	age
0	F	23
1	M	34
2	M	27
3	M	30
4	F	27

Indexing¶

In [17]:

df.ix[0,0]

Out[17]:

'ann'

In [18]:

df.ix[1,1]

Out[18]:

'M'

In [19]:

df.ix[1:4, :2]

Out[19]:

	name	sex
1	bob	M
2	charles	M
3	david	M
4	ella	F

In [20]:

df.ix[[1,3,4], ['age', 'name']]

Out[20]:

	age	name
1	34	bob
3	30	david
4	27	ella

Logical indexing¶

In [21]:

df[df.age > 25]

Out[21]:

	name	sex	age	group	score
1	bob	M	34	A	90
2	charles	M	27	A	78
3	david	M	30	B	98
4	ella	F	27	B	90

In [22]:

df[(df.age > 25) & (df.sex == 'M')]

Out[22]:

	name	sex	age	group	score
1	bob	M	34	A	90
2	charles	M	27	A	78
3	david	M	30	B	98

Sorting¶

In [23]:

df.sort_values('age')

Out[23]:

	name	sex	age	group	score
0	ann	F	23	A	89
2	charles	M	27	A	78
4	ella	F	27	B	90
3	david	M	30	B	98
1	bob	M	34	A	90

In [24]:

df.sort_values('age', ascending = False)

Out[24]:

	name	sex	age	group	score
1	bob	M	34	A	90
3	david	M	30	B	98
2	charles	M	27	A	78
4	ella	F	27	B	90
0	ann	F	23	A	89

In [25]:

df.sort_values(['age', 'score'], ascending = ['True', 'True'])

Out[25]:

	name	sex	age	group	score
0	ann	F	23	A	89
2	charles	M	27	A	78
4	ella	F	27	B	90
3	david	M	30	B	98
1	bob	M	34	A	90

In [26]:

df.sort_values(['age', 'score'], ascending = ['True', 'False'], inplace = True)
df

Out[26]:

	name	sex	age	group	score
0	ann	F	23	A	89
2	charles	M	27	A	78
4	ella	F	27	B	90
3	david	M	30	B	98
1	bob	M	34	A	90

In [27]:

df.sort_index()

Out[27]:

	name	sex	age	group	score
0	ann	F	23	A	89
1	bob	M	34	A	90
2	charles	M	27	A	78
3	david	M	30	B	98
4	ella	F	27	B	90

Transformation¶

In [28]:

df['log_score'] = np.log(df['score'])
df

Out[28]:

	name	sex	age	group	score	log_score
0	ann	F	23	A	89	4.488636
2	charles	M	27	A	78	4.356709
4	ella	F	27	B	90	4.499810
3	david	M	30	B	98	4.584967
1	bob	M	34	A	90	4.499810

In [29]:

df['adjusted_score'] = np.where(df.sex == 'M', df.score - 10, df.score)
df

Out[29]:

	name	sex	age	group	score	log_score	adjusted_score
0	ann	F	23	A	89	4.488636	89
2	charles	M	27	A	78	4.356709	68
4	ella	F	27	B	90	4.499810	90
3	david	M	30	B	98	4.584967	88
1	bob	M	34	A	90	4.499810	80

String Operations¶

In [30]:

df.name.str[:3]

Out[30]:

  ann
  cha
  ell
  dav
  bob
Name: name, dtype: object

In [31]:

df.name.str.upper()

Out[31]:

      ANN
  CHARLES
     ELLA
    DAVID
      BOB
Name: name, dtype: object

In [32]:

df.name.str.replace('a', 'A')

Out[32]:

      Ann
  chArles
     ellA
    dAvid
      bob
Name: name, dtype: object

Summary Statistics¶

In [33]:

df.mean()

Out[33]:

age               28.200000
score             89.000000
log_score          4.485986
adjusted_score    83.000000
dtype: float64

In [34]:

df.score.var()

Out[34]:

51.0

In [35]:

df.count()

Out[35]:

name              5
sex               5
age               5
group             5
score             5
log_score         5
adjusted_score    5
dtype: int64

In [36]:

df.describe()

Out[36]:

	age	score	log_score	adjusted_score
count	5.000000	5.000000	5.000000	5.000000
mean	28.200000	89.000000	4.485986	83.000000
std	4.086563	7.141428	0.082005	9.273618
min	23.000000	78.000000	4.356709	68.000000
25%	27.000000	89.000000	4.488636	80.000000
50%	27.000000	90.000000	4.499810	88.000000
75%	30.000000	90.000000	4.499810	89.000000
max	34.000000	98.000000	4.584967	90.000000

Split-Apply-Combine¶

In [37]:

grouped = df.groupby('sex')
grouped.groups

Out[37]:

{'F': [0, 4], 'M': [2, 3, 1]}

In [38]:

df.groupby('sex').count()

Out[38]:

	name	age	group	score	log_score	adjusted_score
sex
F	2	2	2	2	2	2
M	3	3	3	3	3	3

In [39]:

df.groupby('sex').mean()

Out[39]:

	age	score	log_score	adjusted_score
sex
F	25.000000	89.500000	4.494223	89.500000
M	30.333333	88.666667	4.480495	78.666667

In [40]:

df.groupby(['sex', 'group']).mean()

Out[40]:

		age	score	log_score	adjusted_score
sex	group
F	A	23.0	89.0	4.488636	89.0
F	B	27.0	90.0	4.499810	90.0
M	A	30.5	84.0	4.428259	74.0
M	B	30.0	98.0	4.584967	88.0

In [41]:

df.groupby(['sex', 'group']).agg(['count', 'mean', 'sum'])

Out[41]:

		age			score			log_score			adjusted_score
		count	mean	sum	count	mean	sum	count	mean	sum	count	mean	sum
sex	group
F	A	1	23.0	23	1	89	89	1	4.488636	4.488636	1	89	89
F	B	1	27.0	27	1	90	90	1	4.499810	4.499810	1	90	90
M	A	2	30.5	61	2	84	168	2	4.428259	8.856518	2	74	148
M	B	1	30.0	30	1	98	98	1	4.584967	4.584967	1	88	88

In [42]:

grouped[['age', 'score']].agg('mean')

Out[42]:

	age	score
sex
F	25.000000	89.500000
M	30.333333	88.666667

In [43]:

grouped.get_group('F')

Out[43]:

	name	sex	age	group	score	log_score	adjusted_score
0	ann	F	23	A	89	4.488636	89
4	ella	F	27	B	90	4.499810	90

In [44]:

for name, group in df.groupby('sex'):
    print(name)
    print(group.sum())
    print()

F
name              annella
sex                    FF
age                    50
group                  AB
score                 179
log_score         8.98845
adjusted_score        179
dtype: object

M
name              charlesdavidbob
sex                           MMM
age                            91
group                         ABA
score                         266
log_score                 13.4415
adjusted_score                236
dtype: object

Pivot tables¶

In [45]:

from collections import OrderedDict

In [46]:

d = OrderedDict()
d['pid'] = ['1', '1', '1', '1', '1', '1', '2', '2', '2', '2']
d['visit'] = ['1', '1', '2', '2', '3', '3', '1', '1', '2', '2']
d['stim'] = ['cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv']
d['tnf'] = [1.0, 2.0, 1.1, 2.1, 1.2, 2.2, 3, 4, 3.1, 4.1]
d['ifn'] = [11.0, 12.0, 11.1, 12.1, 11.2, 12.2, 13, 14, 13.1, 14.1]
d['il2'] = [0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1]
df = pd.DataFrame(d)

In [47]:

df.head()

Out[47]:

	pid	visit	stim	tnf	ifn	il2
0	1	1	cmv	1.0	11.0	0.0
1	1	1	hiv	2.0	12.0	0.0
2	1	2	cmv	1.1	11.1	0.1
3	1	2	hiv	2.1	12.1	0.1
4	1	3	cmv	1.2	11.2	0.2

In [48]:

df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['stim'], aggfunc = 'mean')

Out[48]:

	ifn	il2	tnf
stim
cmv	11.88	0.10	1.88
hiv	12.88	0.14	2.88

In [49]:

df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['pid'], aggfunc = 'mean')

Out[49]:

	ifn	il2	tnf
pid
1	11.60	0.10	1.60
2	13.55	0.15	3.55

In [50]:

df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['pid', 'stim'])

Out[50]:

		ifn	il2	tnf
pid	stim
1	cmv	11.10	0.1	1.10
1	hiv	12.10	0.1	2.10
2	cmv	13.05	0.1	3.05
2	hiv	14.05	0.2	4.05

In [51]:

df.pivot_table(values = ['tnf', 'ifn', 'il2'], index = ['pid', 'stim'], aggfunc = 'count')

Out[51]:

		ifn	il2	tnf
pid	stim
1	cmv	3	3	3
1	hiv	3	3	3
2	cmv	2	2	2
2	hiv	2	2	2

Exercises¶

1a. Create two DataFrames from the files data/set1.csv and data/set2.csv and save as df1 and df2 respectively.

In [ ]:

1b. Show the top and bottom 6 rows from df1 and df2.

In [ ]:

1c. Create a new DataFrame df1_wide from df1 so that there are two new columns in place of (gene1, gene2 and gene3). The first new column should be named variable and contain the names of the gene, and the second should be named activity and contain the value. This should have 60 rows.

In [ ]:

1d. Merge df1 and df2 into a single DataFrame named df. This should have 7 columns.

In [ ]:

1e. Find the average values of age, gene1, gene2 and gene3grouping by time.

In [ ]:

1f. Sort df1 by gene1 in from largest to smallest.

In [ ]:

1g. Sort df1 by the average value across gene1, gene2 and gene3 from smallest to largest.

In [ ]:

1h. Insert a new column lr_2_1 inot df1 that is the log ratio of gene2 to gene1.

In [ ]:

Version information¶

In [52]:

%load_ext version_information
%version_information

The version_information extension is already loaded. To reload it, use:
  %reload_ext version_information

Out[52]:

Software	Version
Python	3.5.2 64bit [GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]
IPython	5.0.0
OS	Darwin 15.6.0 x86_64 i386 64bit
Tue Aug 16 09:05:57 2016 EDT