{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Manipulation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## (1) Combining data sets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Combining rows" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df_versiocolor = pd.read_csv('data/versicolor.csv')\n", "df_virginica = pd.read_csv('data/virginica.csv')\n", "df_sertosa = pd.read_csv('data/setosa.csv')\n", "dfs = [df_versiocolor, df_virginica, df_sertosa]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
07.03.24.71.4versicolor
16.43.24.51.5versicolor
26.93.14.91.5versicolor
\n", "
" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "0 7.0 3.2 4.7 1.4 versicolor\n", "1 6.4 3.2 4.5 1.5 versicolor\n", "2 6.9 3.1 4.9 1.5 versicolor" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_versiocolor.head(3)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
06.33.36.02.5virginica
15.82.75.11.9virginica
27.13.05.92.1virginica
\n", "
" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "0 6.3 3.3 6.0 2.5 virginica\n", "1 5.8 2.7 5.1 1.9 virginica\n", "2 7.1 3.0 5.9 2.1 virginica" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_virginica.head(3)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
\n", "
" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "0 5.1 3.5 1.4 0.2 setosa\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_sertosa.head(3)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(50, 5)\n", "(50, 5)\n", "(50, 5)\n" ] } ], "source": [ "for df in dfs:\n", " print(df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Each DataFrame only contains data about one species of iris" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['versicolor']\n", "['virginica']\n", "['setosa']\n" ] } ], "source": [ "for df in dfs:\n", " print(df.Species.unique())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Combine with `concat`\n", "\n", "We can append rows of one data frame to another using `concat`. Here `axis = 0` means combining by rows, in contrast to `axis = 1` which means combining by columns." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(150, 5)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.concat(dfs, axis=0)\n", "df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Note that the indices are maintained\n", "\n", "Here we have to use `iloc` because `ix` and `loc` work with the integer index values, not the positions, and there are no values equal to 50 or 51." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
485.12.53.01.1versicolor
495.72.84.11.3versicolor
06.33.36.02.5virginica
15.82.75.11.9virginica
\n", "
" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "48 5.1 2.5 3.0 1.1 versicolor\n", "49 5.7 2.8 4.1 1.3 versicolor\n", "0 6.3 3.3 6.0 2.5 virginica\n", "1 5.8 2.7 5.1 1.9 virginica" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[48:52]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### We can ask for a new consecutive index" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df.reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
485.12.53.01.1versicolor
495.72.84.11.3versicolor
506.33.36.02.5virginica
515.82.75.11.9virginica
\n", "
" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "48 5.1 2.5 3.0 1.1 versicolor\n", "49 5.7 2.8 4.1 1.3 versicolor\n", "50 6.3 3.3 6.0 2.5 virginica\n", "51 5.8 2.7 5.1 1.9 virginica" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[48:52]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Combined DataFrame contains all 3 species" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['versicolor', 'virginica', 'setosa'], dtype=object)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.Species.unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Combining columns\n", "\n", "When combining rows, we are usually just adding new observations with the same variables and there is little ambiguity about what we are trying to do.\n", "\n", "When combining columns, we are usually trying to merge information based on some unique identifier, for example, to combine clinical and laboratory information for specific patients with unique PIDs. Often the ordering of the PIDs in the two data frames are not exactly lined up so \"horizontal stacking\" will not work. Usually we need to `merge` the data on a unique identifier from one or more columns." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Simplest case - rows line up perfectly across data frames" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df_sepal = pd.read_csv('data/sepal.csv')\n", "df_petal = pd.read_csv('data/petal.csv')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SpeciesSepal.LengthSepal.Width
0setosa5.13.5
1setosa4.93.0
2setosa4.73.2
\n", "
" ], "text/plain": [ " Species Sepal.Length Sepal.Width\n", "0 setosa 5.1 3.5\n", "1 setosa 4.9 3.0\n", "2 setosa 4.7 3.2" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_sepal.head(3)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SpeciesPetal.LengthPetal.Width
0setosa1.40.2
1setosa1.40.2
2setosa1.30.2
\n", "
" ], "text/plain": [ " Species Petal.Length Petal.Width\n", "0 setosa 1.4 0.2\n", "1 setosa 1.4 0.2\n", "2 setosa 1.3 0.2" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_petal.head(3)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "((150, 3), (150, 3))" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_sepal.shape, df_petal.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using `concat`" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SpeciesSepal.LengthSepal.WidthSpeciesPetal.LengthPetal.Width
0setosa5.13.5setosa1.40.2
1setosa4.93.0setosa1.40.2
2setosa4.73.2setosa1.30.2
3setosa4.63.1setosa1.50.2
4setosa5.03.6setosa1.40.2
\n", "
" ], "text/plain": [ " Species Sepal.Length Sepal.Width Species Petal.Length Petal.Width\n", "0 setosa 5.1 3.5 setosa 1.4 0.2\n", "1 setosa 4.9 3.0 setosa 1.4 0.2\n", "2 setosa 4.7 3.2 setosa 1.3 0.2\n", "3 setosa 4.6 3.1 setosa 1.5 0.2\n", "4 setosa 5.0 3.6 setosa 1.4 0.2" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([df_sepal, df_petal], axis=1).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Joining on a single unique column\n", "\n", "Combining values for the same subject across different variables." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Make up dummy data sets for illustration" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidval
048.115657
1229.385071
28312.240448
3909.173178
49810.119245
58510.476193
\n", "
" ], "text/plain": [ " pid val\n", "0 4 8.115657\n", "1 22 9.385071\n", "2 83 12.240448\n", "3 90 9.173178\n", "4 98 10.119245\n", "5 85 10.476193" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pid1 = np.random.choice(100, 6, replace=False)\n", "val1 = np.random.normal(10, 1, 6)\n", "df1 = pd.DataFrame({'pid': pid1, 'val': val1})\n", "df1" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidval
09815.945742
12214.873965
29016.505458
38513.775417
48314.690199
5414.749047
\n", "
" ], "text/plain": [ " pid val\n", "0 98 15.945742\n", "1 22 14.873965\n", "2 90 16.505458\n", "3 85 13.775417\n", "4 83 14.690199\n", "5 4 14.749047" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pid2 = np.random.permutation(pid1)\n", "val2 = np.random.normal(15, 1, 6)\n", "df2 = pd.DataFrame({'pid': pid2, 'val': val2})\n", "df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using merge to join on columns containing unique information\n", "\n", "Note that the `pid` order is not the same, so we cannot simply stack horizontally." ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidval_visit_1val_visit_2
048.11565714.749047
1229.38507114.873965
28312.24044814.690199
3909.17317816.505458
49810.11924515.945742
58510.47619313.775417
\n", "
" ], "text/plain": [ " pid val_visit_1 val_visit_2\n", "0 4 8.115657 14.749047\n", "1 22 9.385071 14.873965\n", "2 83 12.240448 14.690199\n", "3 90 9.173178 16.505458\n", "4 98 10.119245 15.945742\n", "5 85 10.476193 13.775417" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(df1, df2, on='pid', suffixes=['_visit_1', '_visit_2'])" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df1a = df1.set_index('pid')\n", "df2a = df2.set_index('pid')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using merge on multiple columns\n", "\n", "Sometimes we need multiple columns to define a unique identifier. \n", "\n", "**Note**: In the data frames being merged, not all the unique identifiers need to be shared in both data frames. For example, when merging laboratory and clinical data, a patient may have laboratory results but no clinical results (perhaps due to a typo) or have clinical data but no laboratory data (because results are still being processed by the lab). We illustrate different ways to resolve the merge in this scenario in the example below." ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimval
04cmv8.115657
122flu9.385071
283cmv12.240448
390flu9.173178
498flu10.119245
585cmv10.476193
\n", "
" ], "text/plain": [ " pid stim val\n", "0 4 cmv 8.115657\n", "1 22 flu 9.385071\n", "2 83 cmv 12.240448\n", "3 90 flu 9.173178\n", "4 98 flu 10.119245\n", "5 85 cmv 10.476193" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1['stim'] = np.random.choice(['cmv', 'flu'], 6, replace=True)\n", "df1 = df1[['pid', 'stim', 'val']]\n", "df1" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimval
098cmv15.945742
122flu14.873965
290flu16.505458
385cmv13.775417
483cmv14.690199
54cmv14.749047
\n", "
" ], "text/plain": [ " pid stim val\n", "0 98 cmv 15.945742\n", "1 22 flu 14.873965\n", "2 90 flu 16.505458\n", "3 85 cmv 13.775417\n", "4 83 cmv 14.690199\n", "5 4 cmv 14.749047" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2['stim'] = np.random.choice(['cmv', 'flu'], 6, replace=True)\n", "df2 = df2[['pid', 'stim', 'val']]\n", "df2" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimval_visit_1val_visit_2
04cmv8.11565714.749047
122flu9.38507114.873965
283cmv12.24044814.690199
390flu9.17317816.505458
485cmv10.47619313.775417
\n", "
" ], "text/plain": [ " pid stim val_visit_1 val_visit_2\n", "0 4 cmv 8.115657 14.749047\n", "1 22 flu 9.385071 14.873965\n", "2 83 cmv 12.240448 14.690199\n", "3 90 flu 9.173178 16.505458\n", "4 85 cmv 10.476193 13.775417" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(df1, df2, on = ['pid', 'stim'], suffixes = ['_visit_1', '_visit_2'])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimval_visit_1val_visit_2
04cmv8.11565714.749047
122flu9.38507114.873965
283cmv12.24044814.690199
390flu9.17317816.505458
498flu10.119245NaN
585cmv10.47619313.775417
\n", "
" ], "text/plain": [ " pid stim val_visit_1 val_visit_2\n", "0 4 cmv 8.115657 14.749047\n", "1 22 flu 9.385071 14.873965\n", "2 83 cmv 12.240448 14.690199\n", "3 90 flu 9.173178 16.505458\n", "4 98 flu 10.119245 NaN\n", "5 85 cmv 10.476193 13.775417" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(df1, df2, on = ['pid', 'stim'], how = 'left', suffixes = ['_visit_1', '_visit_2'])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimval_visit_1val_visit_2
04.0cmv8.11565714.749047
122.0flu9.38507114.873965
283.0cmv12.24044814.690199
390.0flu9.17317816.505458
485.0cmv10.47619313.775417
598.0cmvNaN15.945742
\n", "
" ], "text/plain": [ " pid stim val_visit_1 val_visit_2\n", "0 4.0 cmv 8.115657 14.749047\n", "1 22.0 flu 9.385071 14.873965\n", "2 83.0 cmv 12.240448 14.690199\n", "3 90.0 flu 9.173178 16.505458\n", "4 85.0 cmv 10.476193 13.775417\n", "5 98.0 cmv NaN 15.945742" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(df1, df2, on = ['pid', 'stim'], how = 'right', suffixes = ['_visit_1', '_visit_2'])" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimval_visit_1val_visit_2
04.0cmv8.11565714.749047
122.0flu9.38507114.873965
283.0cmv12.24044814.690199
390.0flu9.17317816.505458
498.0flu10.119245NaN
585.0cmv10.47619313.775417
698.0cmvNaN15.945742
\n", "
" ], "text/plain": [ " pid stim val_visit_1 val_visit_2\n", "0 4.0 cmv 8.115657 14.749047\n", "1 22.0 flu 9.385071 14.873965\n", "2 83.0 cmv 12.240448 14.690199\n", "3 90.0 flu 9.173178 16.505458\n", "4 98.0 flu 10.119245 NaN\n", "5 85.0 cmv 10.476193 13.775417\n", "6 98.0 cmv NaN 15.945742" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(df1, df2, on = ['pid', 'stim'], how = 'outer', suffixes = ['_visit_1', '_visit_2'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using `join` when indexes are the keys" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
val
pid
48.115657
229.385071
8312.240448
909.173178
9810.119245
8510.476193
\n", "
" ], "text/plain": [ " val\n", "pid \n", "4 8.115657\n", "22 9.385071\n", "83 12.240448\n", "90 9.173178\n", "98 10.119245\n", "85 10.476193" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1a" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
val
pid
9815.945742
2214.873965
9016.505458
8513.775417
8314.690199
414.749047
\n", "
" ], "text/plain": [ " val\n", "pid \n", "98 15.945742\n", "22 14.873965\n", "90 16.505458\n", "85 13.775417\n", "83 14.690199\n", "4 14.749047" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2a" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
val_1val_2
pid
48.11565714.749047
229.38507114.873965
8312.24044814.690199
909.17317816.505458
9810.11924515.945742
8510.47619313.775417
\n", "
" ], "text/plain": [ " val_1 val_2\n", "pid \n", "4 8.115657 14.749047\n", "22 9.385071 14.873965\n", "83 12.240448 14.690199\n", "90 9.173178 16.505458\n", "98 10.119245 15.945742\n", "85 10.476193 13.775417" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1a.join(df2a, lsuffix='_1', rsuffix='_2')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## (2) Separate multiple values in a single column\n", "\n", "Remember that for a tidy data frame, each column must contain a single variable. " ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from collections import OrderedDict" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": true }, "outputs": [], "source": [ "d = OrderedDict()\n", "d['pid-visit-stim'] = ['1-1-cmv', '1-1-hiv', '1-2-cmv', '1-2-hiv', '1-3-cmv', '1-3-hiv', '2-1-cmv', '2-1-hiv', '2-2-cmv', '2-2-hiv']\n", "d['tnf'] = [1.0, 2.0, 1.1, 2.1, 1.2, 2.2, 3, 4, 3.1, 4.1]\n", "d['ifn'] = [11.0, 12.0, 11.1, 12.1, 11.2, 12.2, 13, 14, 13.1, 14.1]\n", "d['il2'] = [0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1]\n", "df = pd.DataFrame(d)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pid-visit-stimtnfifnil2
01-1-cmv1.011.00.0
11-1-hiv2.012.00.0
21-2-cmv1.111.10.1
31-2-hiv2.112.10.1
41-3-cmv1.211.20.2
\n", "
" ], "text/plain": [ " pid-visit-stim tnf ifn il2\n", "0 1-1-cmv 1.0 11.0 0.0\n", "1 1-1-hiv 2.0 12.0 0.0\n", "2 1-2-cmv 1.1 11.1 0.1\n", "3 1-2-hiv 2.1 12.1 0.1\n", "4 1-3-cmv 1.2 11.2 0.2" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidvisitstim
011cmv
111hiv
212cmv
312hiv
413cmv
\n", "
" ], "text/plain": [ " pid visit stim\n", "0 1 1 cmv\n", "1 1 1 hiv\n", "2 1 2 cmv\n", "3 1 2 hiv\n", "4 1 3 cmv" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = pd.DataFrame(df['pid-visit-stim'].str.split('-').tolist(), \n", " columns = ['pid', 'visit', 'stim'])\n", "df1.head()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidvisitstimpid-visit-stimtnfifnil2
011cmv1-1-cmv1.011.00.0
111hiv1-1-hiv2.012.00.0
212cmv1-2-cmv1.111.10.1
312hiv1-2-hiv2.112.10.1
413cmv1-3-cmv1.211.20.2
\n", "
" ], "text/plain": [ " pid visit stim pid-visit-stim tnf ifn il2\n", "0 1 1 cmv 1-1-cmv 1.0 11.0 0.0\n", "1 1 1 hiv 1-1-hiv 2.0 12.0 0.0\n", "2 1 2 cmv 1-2-cmv 1.1 11.1 0.1\n", "3 1 2 hiv 1-2-hiv 2.1 12.1 0.1\n", "4 1 3 cmv 1-3-cmv 1.2 11.2 0.2" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = pd.concat([df1, df], axis=1)\n", "df1.head()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidvisitstimtnfifnil2
011cmv1.011.00.0
111hiv2.012.00.0
212cmv1.111.10.1
312hiv2.112.10.1
413cmv1.211.20.2
\n", "
" ], "text/plain": [ " pid visit stim tnf ifn il2\n", "0 1 1 cmv 1.0 11.0 0.0\n", "1 1 1 hiv 2.0 12.0 0.0\n", "2 1 2 cmv 1.1 11.1 0.1\n", "3 1 2 hiv 2.1 12.1 0.1\n", "4 1 3 cmv 1.2 11.2 0.2" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.drop('pid-visit-stim', axis=1, inplace=True)\n", "df1.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Wrap into a convenient function" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def separate(df, column, sep):\n", " df1 = pd.DataFrame(df[column].str.split(sep).tolist(), columns = column.split(sep))\n", " df1 = pd.concat([df1, df], axis=1)\n", " return df1.drop(column, axis = 1)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidvisitstimtnfifnil2
011cmv1.011.00.0
111hiv2.012.00.0
212cmv1.111.10.1
312hiv2.112.10.1
413cmv1.211.20.2
\n", "
" ], "text/plain": [ " pid visit stim tnf ifn il2\n", "0 1 1 cmv 1.0 11.0 0.0\n", "1 1 1 hiv 2.0 12.0 0.0\n", "2 1 2 cmv 1.1 11.1 0.1\n", "3 1 2 hiv 2.1 12.1 0.1\n", "4 1 3 cmv 1.2 11.2 0.2" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "separate(df, 'pid-visit-stim', '-').head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## (3) Reshaping DataFrames" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": true }, "outputs": [], "source": [ "d = OrderedDict()\n", "d['pid'] = ['1', '1', '1', '1', '1', '1', '2', '2', '2', '2']\n", "d['visit'] = ['1', '1', '2', '2', '3', '3', '1', '1', '2', '2']\n", "d['stim'] = ['cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv', 'cmv', 'hiv']\n", "d['tnf'] = [1.0, 2.0, 1.1, 2.1, 1.2, 2.2, 3, 4, 3.1, 4.1]\n", "d['ifn'] = [11.0, 12.0, 11.1, 12.1, 11.2, 12.2, 13, 14, 13.1, 14.1]\n", "d['il2'] = [0.0, 0.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1]\n", "df = pd.DataFrame(d)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidvisitstimtnfifnil2
011cmv1.011.00.0
111hiv2.012.00.0
212cmv1.111.10.1
312hiv2.112.10.1
413cmv1.211.20.2
\n", "
" ], "text/plain": [ " pid visit stim tnf ifn il2\n", "0 1 1 cmv 1.0 11.0 0.0\n", "1 1 1 hiv 2.0 12.0 0.0\n", "2 1 2 cmv 1.1 11.1 0.1\n", "3 1 2 hiv 2.1 12.1 0.1\n", "4 1 3 cmv 1.2 11.2 0.2" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Wide to long\n", "\n", "Suppose it is more convenient to consider the type of cytokine as new variable. We need to put the name of the cytokine in one column and its value in another - going from 3 to 2 columns (wide to long)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### We use teh `melt` method and specify the identifier (columns that stay the same) and value variables." ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimvisitvariablevalue
51hiv3tnf2.2
82cmv2tnf3.1
101cmv1ifn11.0
151hiv3ifn12.2
172hiv1ifn14.0
231hiv2il20.1
\n", "
" ], "text/plain": [ " pid stim visit variable value\n", "5 1 hiv 3 tnf 2.2\n", "8 2 cmv 2 tnf 3.1\n", "10 1 cmv 1 ifn 11.0\n", "15 1 hiv 3 ifn 12.2\n", "17 2 hiv 1 ifn 14.0\n", "23 1 hiv 2 il2 0.1" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "long1 = pd.melt(df, id_vars = ['pid', 'stim', 'visit'], \n", " value_vars = ['tnf', 'ifn', 'il2'])\n", "long1.sample(6)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### By default, all non-id variabels are treated as values" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimvisitvariablevalue
172hiv1ifn14.0
201cmv1il20.0
21cmv2tnf1.1
182cmv2ifn13.1
82cmv2tnf3.1
41cmv3tnf1.2
\n", "
" ], "text/plain": [ " pid stim visit variable value\n", "17 2 hiv 1 ifn 14.0\n", "20 1 cmv 1 il2 0.0\n", "2 1 cmv 2 tnf 1.1\n", "18 2 cmv 2 ifn 13.1\n", "8 2 cmv 2 tnf 3.1\n", "4 1 cmv 3 tnf 1.2" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "long2 = pd.melt(df, id_vars =['pid', 'stim', 'visit'])\n", "long2.sample(6)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### We can exclude columns we are not intersted in" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimvisitvariablevalue
182cmv2il20.1
41cmv3tnf1.2
101cmv1il20.0
01cmv1tnf1.0
162cmv1il20.1
62cmv1tnf3.0
\n", "
" ], "text/plain": [ " pid stim visit variable value\n", "18 2 cmv 2 il2 0.1\n", "4 1 cmv 3 tnf 1.2\n", "10 1 cmv 1 il2 0.0\n", "0 1 cmv 1 tnf 1.0\n", "16 2 cmv 1 il2 0.1\n", "6 2 cmv 1 tnf 3.0" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "long3 = pd.melt(df, id_vars = ['pid', 'stim', 'visit'], \n", " value_vars = ['tnf', 'il2'])\n", "long3.sample(6)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Long to Wide\n", "\n", "There is no function in `pandas` to undo the `wide_to_long` operation, and the details are tricky, so I have written a small function to do this." ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def long_to_wide(df, index):\n", " df = df.set_index(index).unstack().reset_index()\n", " cols = [t[1] if t[1] else t[0] for t in df.columns]\n", " df.columns = cols\n", " return df" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimvisitifnil2tnf
11cmv211.10.11.1
41hiv212.10.12.1
21cmv311.20.21.2
31hiv112.00.02.0
92hiv214.10.14.1
51hiv312.20.22.2
\n", "
" ], "text/plain": [ " pid stim visit ifn il2 tnf\n", "1 1 cmv 2 11.1 0.1 1.1\n", "4 1 hiv 2 12.1 0.1 2.1\n", "2 1 cmv 3 11.2 0.2 1.2\n", "3 1 hiv 1 12.0 0.0 2.0\n", "9 2 hiv 2 14.1 0.1 4.1\n", "5 1 hiv 3 12.2 0.2 2.2" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wide1 = long_to_wide(long1, ['pid', 'stim', 'visit', 'variable'])\n", "wide1.sample(6)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimvisitifnil2tnf
01cmv111.00.01.0
41hiv212.10.12.1
51hiv312.20.22.2
21cmv311.20.21.2
62cmv113.00.13.0
82hiv114.00.34.0
\n", "
" ], "text/plain": [ " pid stim visit ifn il2 tnf\n", "0 1 cmv 1 11.0 0.0 1.0\n", "4 1 hiv 2 12.1 0.1 2.1\n", "5 1 hiv 3 12.2 0.2 2.2\n", "2 1 cmv 3 11.2 0.2 1.2\n", "6 2 cmv 1 13.0 0.1 3.0\n", "8 2 hiv 1 14.0 0.3 4.0" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wide2 = long_to_wide(long2, ['pid', 'stim', 'visit', 'variable'])\n", "wide2.sample(6)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimvisitil2tnf
51hiv30.22.2
41hiv20.12.1
82hiv10.34.0
62cmv10.13.0
31hiv10.02.0
92hiv20.14.1
\n", "
" ], "text/plain": [ " pid stim visit il2 tnf\n", "5 1 hiv 3 0.2 2.2\n", "4 1 hiv 2 0.1 2.1\n", "8 2 hiv 1 0.3 4.0\n", "6 2 cmv 1 0.1 3.0\n", "3 1 hiv 1 0.0 2.0\n", "9 2 hiv 2 0.1 4.1" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wide3 = long_to_wide(long3, ['pid', 'stim', 'visit', 'variable'])\n", "wide3.sample(6)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## (4) Hierarchical Indexes (More advanced material)\n", "\n", "Sometimes your data is best considered as stacks of 2D DataFrames, for example, when each patient has laboratory data for multiple visits. The handling of such \"3D\" or even higher-dimensional data is handled in `pandas` by the sue of hierarchical indexes." ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidvisitstimtnfifnil2
011cmv1.011.00.0
111hiv2.012.00.0
212cmv1.111.10.1
312hiv2.112.10.1
413cmv1.211.20.2
\n", "
" ], "text/plain": [ " pid visit stim tnf ifn il2\n", "0 1 1 cmv 1.0 11.0 0.0\n", "1 1 1 hiv 2.0 12.0 0.0\n", "2 1 2 cmv 1.1 11.1 0.1\n", "3 1 2 hiv 2.1 12.1 0.1\n", "4 1 3 cmv 1.2 11.2 0.2" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Add a multi-index consisting of 3 levels - pid, stim and visit" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tnfifnil2
pidstimvisit
1cmv11.011.00.0
hiv12.012.00.0
cmv21.111.10.1
hiv22.112.10.1
cmv31.211.20.2
hiv32.212.20.2
2cmv13.013.00.1
hiv14.014.00.3
cmv23.113.10.1
hiv24.114.10.1
\n", "
" ], "text/plain": [ " tnf ifn il2\n", "pid stim visit \n", "1 cmv 1 1.0 11.0 0.0\n", " hiv 1 2.0 12.0 0.0\n", " cmv 2 1.1 11.1 0.1\n", " hiv 2 2.1 12.1 0.1\n", " cmv 3 1.2 11.2 0.2\n", " hiv 3 2.2 12.2 0.2\n", "2 cmv 1 3.0 13.0 0.1\n", " hiv 1 4.0 14.0 0.3\n", " cmv 2 3.1 13.1 0.1\n", " hiv 2 4.1 14.1 0.1" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = df.set_index(['pid', 'stim', 'visit'])\n", "df1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Indexing for multi-index\n", "\n", "With the multi-index, each \"cell\" is now a block of values for the combinations (pid, stim, visit)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Find TNF values" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tnf
pidstimvisit
1cmv11.0
hiv12.0
cmv21.1
hiv22.1
cmv31.2
hiv32.2
2cmv13.0
hiv14.0
cmv23.1
hiv24.1
\n", "
" ], "text/plain": [ " tnf\n", "pid stim visit \n", "1 cmv 1 1.0\n", " hiv 1 2.0\n", " cmv 2 1.1\n", " hiv 2 2.1\n", " cmv 3 1.2\n", " hiv 3 2.2\n", "2 cmv 1 3.0\n", " hiv 1 4.0\n", " cmv 2 3.1\n", " hiv 2 4.1" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1[['tnf']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Find all values for Subject 2" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tnfifnil2
stimvisit
cmv13.013.00.1
hiv14.014.00.3
cmv23.113.10.1
hiv24.114.10.1
\n", "
" ], "text/plain": [ " tnf ifn il2\n", "stim visit \n", "cmv 1 3.0 13.0 0.1\n", "hiv 1 4.0 14.0 0.3\n", "cmv 2 3.1 13.1 0.1\n", "hiv 2 4.1 14.1 0.1" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.ix['2']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Find TNF values for subject 2" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tnf
stimvisit
cmv13.0
hiv14.0
cmv23.1
hiv24.1
\n", "
" ], "text/plain": [ " tnf\n", "stim visit \n", "cmv 1 3.0\n", "hiv 1 4.0\n", "cmv 2 3.1\n", "hiv 2 4.1" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.ix['2', ['tnf']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Flattening hierarchicla indexes into column variables\n", "\n", "To actually undo, you would have to save the result to a variable." ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pidstimvisittnfifnil2
01cmv11.011.00.0
11hiv12.012.00.0
21cmv21.111.10.1
31hiv22.112.10.1
41cmv31.211.20.2
51hiv32.212.20.2
62cmv13.013.00.1
72hiv14.014.00.3
82cmv23.113.10.1
92hiv24.114.10.1
\n", "
" ], "text/plain": [ " pid stim visit tnf ifn il2\n", "0 1 cmv 1 1.0 11.0 0.0\n", "1 1 hiv 1 2.0 12.0 0.0\n", "2 1 cmv 2 1.1 11.1 0.1\n", "3 1 hiv 2 2.1 12.1 0.1\n", "4 1 cmv 3 1.2 11.2 0.2\n", "5 1 hiv 3 2.2 12.2 0.2\n", "6 2 cmv 1 3.0 13.0 0.1\n", "7 2 hiv 1 4.0 14.0 0.3\n", "8 2 cmv 2 3.1 13.1 0.1\n", "9 2 hiv 2 4.1 14.1 0.1" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.reset_index()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Moving parts of an index into columns with `unstack`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Move pid from column to row" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tnfifnil2
pid121212
stimvisit
cmv11.03.011.013.00.00.1
21.13.111.113.10.10.1
31.2NaN11.2NaN0.2NaN
hiv12.04.012.014.00.00.3
22.14.112.114.10.10.1
32.2NaN12.2NaN0.2NaN
\n", "
" ], "text/plain": [ " tnf ifn il2 \n", "pid 1 2 1 2 1 2\n", "stim visit \n", "cmv 1 1.0 3.0 11.0 13.0 0.0 0.1\n", " 2 1.1 3.1 11.1 13.1 0.1 0.1\n", " 3 1.2 NaN 11.2 NaN 0.2 NaN\n", "hiv 1 2.0 4.0 12.0 14.0 0.0 0.3\n", " 2 2.1 4.1 12.1 14.1 0.1 0.1\n", " 3 2.2 NaN 12.2 NaN 0.2 NaN" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.unstack('pid')" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tnfifnil2
pid121212
stimvisit
cmv11.03.011.013.00.00.1
21.13.111.113.10.10.1
31.2NaN11.2NaN0.2NaN
hiv12.04.012.014.00.00.3
22.14.112.114.10.10.1
32.2NaN12.2NaN0.2NaN
\n", "
" ], "text/plain": [ " tnf ifn il2 \n", "pid 1 2 1 2 1 2\n", "stim visit \n", "cmv 1 1.0 3.0 11.0 13.0 0.0 0.1\n", " 2 1.1 3.1 11.1 13.1 0.1 0.1\n", " 3 1.2 NaN 11.2 NaN 0.2 NaN\n", "hiv 1 2.0 4.0 12.0 14.0 0.0 0.3\n", " 2 2.1 4.1 12.1 14.1 0.1 0.1\n", " 3 2.2 NaN 12.2 NaN 0.2 NaN" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.unstack(0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Move stim from column to row" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tnfifnil2
stimcmvhivcmvhivcmvhiv
pidvisit
111.02.011.012.00.00.0
21.12.111.112.10.10.1
31.22.211.212.20.20.2
213.04.013.014.00.10.3
23.14.113.114.10.10.1
\n", "
" ], "text/plain": [ " tnf ifn il2 \n", "stim cmv hiv cmv hiv cmv hiv\n", "pid visit \n", "1 1 1.0 2.0 11.0 12.0 0.0 0.0\n", " 2 1.1 2.1 11.1 12.1 0.1 0.1\n", " 3 1.2 2.2 11.2 12.2 0.2 0.2\n", "2 1 3.0 4.0 13.0 14.0 0.1 0.3\n", " 2 3.1 4.1 13.1 14.1 0.1 0.1" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.unstack('stim')" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tnfifnil2
stimcmvhivcmvhivcmvhiv
pidvisit
111.02.011.012.00.00.0
21.12.111.112.10.10.1
31.22.211.212.20.20.2
213.04.013.014.00.10.3
23.14.113.114.10.10.1
\n", "
" ], "text/plain": [ " tnf ifn il2 \n", "stim cmv hiv cmv hiv cmv hiv\n", "pid visit \n", "1 1 1.0 2.0 11.0 12.0 0.0 0.0\n", " 2 1.1 2.1 11.1 12.1 0.1 0.1\n", " 3 1.2 2.2 11.2 12.2 0.2 0.2\n", "2 1 3.0 4.0 13.0 14.0 0.1 0.3\n", " 2 3.1 4.1 13.1 14.1 0.1 0.1" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.unstack(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Move pid and stim from column to row" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tnfifnil2
pid121212
stimcmvhivcmvhivcmvhivcmvhivcmvhivcmvhiv
visit
11.02.03.04.011.012.013.014.00.00.00.10.3
21.12.13.14.111.112.113.114.10.10.10.10.1
31.22.2NaNNaN11.212.2NaNNaN0.20.2NaNNaN
\n", "
" ], "text/plain": [ " tnf ifn il2 \n", "pid 1 2 1 2 1 2 \n", "stim cmv hiv cmv hiv cmv hiv cmv hiv cmv hiv cmv hiv\n", "visit \n", "1 1.0 2.0 3.0 4.0 11.0 12.0 13.0 14.0 0.0 0.0 0.1 0.3\n", "2 1.1 2.1 3.1 4.1 11.1 12.1 13.1 14.1 0.1 0.1 0.1 0.1\n", "3 1.2 2.2 NaN NaN 11.2 12.2 NaN NaN 0.2 0.2 NaN NaN" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.unstack(['pid', 'stim'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Moving columns to a multi-index" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tnfifnil2
pidstimvisit
1cmv11.011.00.0
hiv12.012.00.0
cmv21.111.10.1
hiv22.112.10.1
cmv31.211.20.2
\n", "
" ], "text/plain": [ " tnf ifn il2\n", "pid stim visit \n", "1 cmv 1 1.0 11.0 0.0\n", " hiv 1 2.0 12.0 0.0\n", " cmv 2 1.1 11.1 0.1\n", " hiv 2 2.1 12.1 0.1\n", " cmv 3 1.2 11.2 0.2" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.head()" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tnfifnil2
pid121212
stimcmvhivcmvhivcmvhivcmvhivcmvhivcmvhiv
visit
11.02.03.04.011.012.013.014.00.00.00.10.3
21.12.13.14.111.112.113.114.10.10.10.10.1
31.22.2NaNNaN11.212.2NaNNaN0.20.2NaNNaN
\n", "
" ], "text/plain": [ " tnf ifn il2 \n", "pid 1 2 1 2 1 2 \n", "stim cmv hiv cmv hiv cmv hiv cmv hiv cmv hiv cmv hiv\n", "visit \n", "1 1.0 2.0 3.0 4.0 11.0 12.0 13.0 14.0 0.0 0.0 0.1 0.3\n", "2 1.1 2.1 3.1 4.1 11.1 12.1 13.1 14.1 0.1 0.1 0.1 0.1\n", "3 1.2 2.2 NaN NaN 11.2 12.2 NaN NaN 0.2 0.2 NaN NaN" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = df1.unstack(['pid', 'stim'])\n", "df2.head()" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ifnil2tnf
visitpidstim
11cmv11.00.01.0
hiv12.00.02.0
2cmv13.00.13.0
hiv14.00.34.0
21cmv11.10.11.1
\n", "
" ], "text/plain": [ " ifn il2 tnf\n", "visit pid stim \n", "1 1 cmv 11.0 0.0 1.0\n", " hiv 12.0 0.0 2.0\n", " 2 cmv 13.0 0.1 3.0\n", " hiv 14.0 0.3 4.0\n", "2 1 cmv 11.1 0.1 1.1" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3 = df2.stack(['pid', 'stim'])\n", "df3.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Exercises" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You are given three DataFrames `d1`, `d2`, and `d3`. Answer the questions below." ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": true }, "outputs": [], "source": [ "d1 = pd.DataFrame({'subject': ['a', 'b', 'c', 'd'], 'group': ['A', 'B', 'C', 'D'], 'value': [1, 2, 3, 4]})\n", "d2 = pd.DataFrame({'subject': ['w', 'x', 'y', 'z'], 'group': ['A', 'B', 'C', 'D'], 'value': [2, 1, 4, 3]})\n", "d3 = pd.DataFrame({'subject': ['a', 'x', 'w', 'd', 'b', 'c', 'y', 'z'], 'age': [10, 20, 30, 40, 50, 60, 70, 80]})" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
groupsubjectvalue
0Aa1
1Bb2
2Cc3
3Dd4
\n", "
" ], "text/plain": [ " group subject value\n", "0 A a 1\n", "1 B b 2\n", "2 C c 3\n", "3 D d 4" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d1" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
groupsubjectvalue
0Aw2
1Bx1
2Cy4
3Dz3
\n", "
" ], "text/plain": [ " group subject value\n", "0 A w 2\n", "1 B x 1\n", "2 C y 4\n", "3 D z 3" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d2" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesubject
010a
120x
230w
340d
450b
560c
670y
780z
\n", "
" ], "text/plain": [ " age subject\n", "0 10 a\n", "1 20 x\n", "2 30 w\n", "3 40 d\n", "4 50 b\n", "5 60 c\n", "6 70 y\n", "7 80 z" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**1**. Combine `d1` and `d2` to create a DataFrame called `df1` with 8 rows and 3 columns." ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
groupsubjectvalue
0Aa1
1Bb2
2Cc3
3Dd4
0Aw2
1Bx1
2Cy4
3Dz3
\n", "
" ], "text/plain": [ " group subject value\n", "0 A a 1\n", "1 B b 2\n", "2 C c 3\n", "3 D d 4\n", "0 A w 2\n", "1 B x 1\n", "2 C y 4\n", "3 D z 3" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = pd.concat([d1, d2])\n", "df1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**2**. Combine `df1` with `d3` to create a DataFrame called `df2` with 8 rows and 4 columns." ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
groupsubjectvalueage
0Aa110
1Bb250
2Cc360
3Dd440
4Aw230
5Bx120
6Cy470
7Dz380
\n", "
" ], "text/plain": [ " group subject value age\n", "0 A a 1 10\n", "1 B b 2 50\n", "2 C c 3 60\n", "3 D d 4 40\n", "4 A w 2 30\n", "5 B x 1 20\n", "6 C y 4 70\n", "7 D z 3 80" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = pd.merge(df1, d3)\n", "df2" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" }, "latex_envs": { "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 0 } }, "nbformat": 4, "nbformat_minor": 1 }