{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Grouping and Aggregation\n",
"===="
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sorting data\n",
"----"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
" | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
\n",
"\n",
"\t1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
\n",
"\t2 | 4.9 | 3 | 1.4 | 0.2 | setosa |
\n",
"\t3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
\n",
"\t4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
\n",
"\t5 | 5 | 3.6 | 1.4 | 0.2 | setosa |
\n",
"\t6 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n",
"\\hline\n",
"\t1 & 5.1 & 3.5 & 1.4 & 0.2 & setosa\\\\\n",
"\t2 & 4.9 & 3 & 1.4 & 0.2 & setosa\\\\\n",
"\t3 & 4.7 & 3.2 & 1.3 & 0.2 & setosa\\\\\n",
"\t4 & 4.6 & 3.1 & 1.5 & 0.2 & setosa\\\\\n",
"\t5 & 5 & 3.6 & 1.4 & 0.2 & setosa\\\\\n",
"\t6 & 5.4 & 3.9 & 1.7 & 0.4 & setosa\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n",
"1 5.1 3.5 1.4 0.2 setosa\n",
"2 4.9 3.0 1.4 0.2 setosa\n",
"3 4.7 3.2 1.3 0.2 setosa\n",
"4 4.6 3.1 1.5 0.2 setosa\n",
"5 5.0 3.6 1.4 0.2 setosa\n",
"6 5.4 3.9 1.7 0.4 setosa"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head(iris, 6)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
\n",
"\n",
"\t14 | 4.3 | 3 | 1.1 | 0.1 | setosa |
\n",
"\t9 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
\n",
"\t39 | 4.4 | 3 | 1.3 | 0.2 | setosa |
\n",
"\t43 | 4.4 | 3.2 | 1.3 | 0.2 | setosa |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n",
"\\hline\n",
"\t14 & 4.3 & 3 & 1.1 & 0.1 & setosa\\\\\n",
"\t9 & 4.4 & 2.9 & 1.4 & 0.2 & setosa\\\\\n",
"\t39 & 4.4 & 3 & 1.3 & 0.2 & setosa\\\\\n",
"\t43 & 4.4 & 3.2 & 1.3 & 0.2 & setosa\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n",
"14 4.3 3.0 1.1 0.1 setosa\n",
"9 4.4 2.9 1.4 0.2 setosa\n",
"39 4.4 3.0 1.3 0.2 setosa\n",
"43 4.4 3.2 1.3 0.2 setosa"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ridx <- order(iris$Sepal.Length)\n",
"head(iris[ridx,], 4)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
\n",
"\n",
"\t132 | 7.9 | 3.8 | 6.4 | 2 | virginica |
\n",
"\t118 | 7.7 | 3.8 | 6.7 | 2.2 | virginica |
\n",
"\t119 | 7.7 | 2.6 | 6.9 | 2.3 | virginica |
\n",
"\t123 | 7.7 | 2.8 | 6.7 | 2 | virginica |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n",
"\\hline\n",
"\t132 & 7.9 & 3.8 & 6.4 & 2 & virginica\\\\\n",
"\t118 & 7.7 & 3.8 & 6.7 & 2.2 & virginica\\\\\n",
"\t119 & 7.7 & 2.6 & 6.9 & 2.3 & virginica\\\\\n",
"\t123 & 7.7 & 2.8 & 6.7 & 2 & virginica\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n",
"132 7.9 3.8 6.4 2.0 virginica\n",
"118 7.7 3.8 6.7 2.2 virginica\n",
"119 7.7 2.6 6.9 2.3 virginica\n",
"123 7.7 2.8 6.7 2.0 virginica"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ridx <- order(iris$Sepal.Length, decreasing = TRUE)\n",
"head(iris[ridx,], 4)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
\n",
"\n",
"\t14 | 4.3 | 3 | 1.1 | 0.1 | setosa |
\n",
"\t39 | 4.4 | 3 | 1.3 | 0.2 | setosa |
\n",
"\t43 | 4.4 | 3.2 | 1.3 | 0.2 | setosa |
\n",
"\t9 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n",
"\\hline\n",
"\t14 & 4.3 & 3 & 1.1 & 0.1 & setosa\\\\\n",
"\t39 & 4.4 & 3 & 1.3 & 0.2 & setosa\\\\\n",
"\t43 & 4.4 & 3.2 & 1.3 & 0.2 & setosa\\\\\n",
"\t9 & 4.4 & 2.9 & 1.4 & 0.2 & setosa\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n",
"14 4.3 3.0 1.1 0.1 setosa\n",
"39 4.4 3.0 1.3 0.2 setosa\n",
"43 4.4 3.2 1.3 0.2 setosa\n",
"9 4.4 2.9 1.4 0.2 setosa"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ridx <- order(iris$Sepal.Length, iris$Petal.Length)\n",
"head(iris[ridx,], 4)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
\n",
"\n",
"\t14 | 4.3 | 3 | 1.1 | 0.1 | setosa |
\n",
"\t9 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
\n",
"\t39 | 4.4 | 3 | 1.3 | 0.2 | setosa |
\n",
"\t43 | 4.4 | 3.2 | 1.3 | 0.2 | setosa |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n",
"\\hline\n",
"\t14 & 4.3 & 3 & 1.1 & 0.1 & setosa\\\\\n",
"\t9 & 4.4 & 2.9 & 1.4 & 0.2 & setosa\\\\\n",
"\t39 & 4.4 & 3 & 1.3 & 0.2 & setosa\\\\\n",
"\t43 & 4.4 & 3.2 & 1.3 & 0.2 & setosa\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n",
"14 4.3 3.0 1.1 0.1 setosa\n",
"9 4.4 2.9 1.4 0.2 setosa\n",
"39 4.4 3.0 1.3 0.2 setosa\n",
"43 4.4 3.2 1.3 0.2 setosa"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ridx <- order(iris$Sepal.Length, -iris$Petal.Length)\n",
"head(iris[ridx,], 4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Trnasposing data\n",
"----"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
\n",
"\n",
"\t112 | 6.4 | 2.7 | 5.3 | 1.9 | virginica |
\n",
"\t146 | 6.7 | 3 | 5.2 | 2.3 | virginica |
\n",
"\t56 | 5.7 | 2.8 | 4.5 | 1.3 | versicolor |
\n",
"\t32 | 5.4 | 3.4 | 1.5 | 0.4 | setosa |
\n",
"\t98 | 6.2 | 2.9 | 4.3 | 1.3 | versicolor |
\n",
"\t129 | 6.4 | 2.8 | 5.6 | 2.1 | virginica |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n",
"\\hline\n",
"\t112 & 6.4 & 2.7 & 5.3 & 1.9 & virginica\\\\\n",
"\t146 & 6.7 & 3 & 5.2 & 2.3 & virginica\\\\\n",
"\t56 & 5.7 & 2.8 & 4.5 & 1.3 & versicolor\\\\\n",
"\t32 & 5.4 & 3.4 & 1.5 & 0.4 & setosa\\\\\n",
"\t98 & 6.2 & 2.9 & 4.3 & 1.3 & versicolor\\\\\n",
"\t129 & 6.4 & 2.8 & 5.6 & 2.1 & virginica\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n",
"112 6.4 2.7 5.3 1.9 virginica\n",
"146 6.7 3.0 5.2 2.3 virginica\n",
"56 5.7 2.8 4.5 1.3 versicolor\n",
"32 5.4 3.4 1.5 0.4 setosa\n",
"98 6.2 2.9 4.3 1.3 versicolor\n",
"129 6.4 2.8 5.6 2.1 virginica"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ir6 <- iris[sample(1:nrow(iris), 6, replace=FALSE),]\n",
"ir6"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | 112 | 146 | 56 | 32 | 98 | 129 |
\n",
"\n",
"\tSepal.Length | 6.4 | 6.7 | 5.7 | 5.4 | 6.2 | 6.4 |
\n",
"\tSepal.Width | 2.7 | 3.0 | 2.8 | 3.4 | 2.9 | 2.8 |
\n",
"\tPetal.Length | 5.3 | 5.2 | 4.5 | 1.5 | 4.3 | 5.6 |
\n",
"\tPetal.Width | 1.9 | 2.3 | 1.3 | 0.4 | 1.3 | 2.1 |
\n",
"\tSpecies | virginica | virginica | versicolor | setosa | versicolor | virginica |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|llllll}\n",
" & 112 & 146 & 56 & 32 & 98 & 129\\\\\n",
"\\hline\n",
"\tSepal.Length & 6.4 & 6.7 & 5.7 & 5.4 & 6.2 & 6.4\\\\\n",
"\tSepal.Width & 2.7 & 3.0 & 2.8 & 3.4 & 2.9 & 2.8\\\\\n",
"\tPetal.Length & 5.3 & 5.2 & 4.5 & 1.5 & 4.3 & 5.6\\\\\n",
"\tPetal.Width & 1.9 & 2.3 & 1.3 & 0.4 & 1.3 & 2.1\\\\\n",
"\tSpecies & virginica & virginica & versicolor & setosa & versicolor & virginica \\\\\n",
"\\end{tabular}\n"
],
"text/markdown": [
"1. '6.4'\n",
"2. '2.7'\n",
"3. '5.3'\n",
"4. '1.9'\n",
"5. 'virginica'\n",
"6. '6.7'\n",
"7. '3.0'\n",
"8. '5.2'\n",
"9. '2.3'\n",
"10. 'virginica'\n",
"11. '5.7'\n",
"12. '2.8'\n",
"13. '4.5'\n",
"14. '1.3'\n",
"15. 'versicolor'\n",
"16. '5.4'\n",
"17. '3.4'\n",
"18. '1.5'\n",
"19. '0.4'\n",
"20. 'setosa'\n",
"21. '6.2'\n",
"22. '2.9'\n",
"23. '4.3'\n",
"24. '1.3'\n",
"25. 'versicolor'\n",
"26. '6.4'\n",
"27. '2.8'\n",
"28. '5.6'\n",
"29. '2.1'\n",
"30. 'virginica'\n",
"\n",
"\n"
],
"text/plain": [
" 112 146 56 32 98 \n",
"Sepal.Length \"6.4\" \"6.7\" \"5.7\" \"5.4\" \"6.2\" \n",
"Sepal.Width \"2.7\" \"3.0\" \"2.8\" \"3.4\" \"2.9\" \n",
"Petal.Length \"5.3\" \"5.2\" \"4.5\" \"1.5\" \"4.3\" \n",
"Petal.Width \"1.9\" \"2.3\" \"1.3\" \"0.4\" \"1.3\" \n",
"Species \"virginica\" \"virginica\" \"versicolor\" \"setosa\" \"versicolor\"\n",
" 129 \n",
"Sepal.Length \"6.4\" \n",
"Sepal.Width \"2.8\" \n",
"Petal.Length \"5.6\" \n",
"Petal.Width \"2.1\" \n",
"Species \"virginica\""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"t(ir6)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Aggregation (Subgrouping)\n",
"----"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
\n",
"\n",
"\tDodge Challenger | 15.5 | 8 | 318 | 150 | 2.76 | 3.52 | 16.87 | 0 | 0 | 3 | 2 |
\n",
"\tHornet Sportabout | 18.7 | 8 | 360 | 175 | 3.15 | 3.44 | 17.02 | 0 | 0 | 3 | 2 |
\n",
"\tFiat 128 | 32.4 | 4 | 78.7 | 66 | 4.08 | 2.2 | 19.47 | 1 | 1 | 4 | 1 |
\n",
"\tMerc 450SLC | 15.2 | 8 | 275.8 | 180 | 3.07 | 3.78 | 18 | 0 | 0 | 3 | 3 |
\n",
"\tChrysler Imperial | 14.7 | 8 | 440 | 230 | 3.23 | 5.345 | 17.42 | 0 | 0 | 3 | 4 |
\n",
"\tVolvo 142E | 21.4 | 4 | 121 | 109 | 4.11 | 2.78 | 18.6 | 1 | 1 | 4 | 2 |
\n",
"\tToyota Corolla | 33.9 | 4 | 71.1 | 65 | 4.22 | 1.835 | 19.9 | 1 | 1 | 4 | 1 |
\n",
"\tMerc 450SE | 16.4 | 8 | 275.8 | 180 | 3.07 | 4.07 | 17.4 | 0 | 0 | 3 | 3 |
\n",
"\tMazda RX4 | 21 | 6 | 160 | 110 | 3.9 | 2.62 | 16.46 | 0 | 1 | 4 | 4 |
\n",
"\tPorsche 914-2 | 26 | 4 | 120.3 | 91 | 4.43 | 2.14 | 16.7 | 0 | 1 | 5 | 2 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllllllllll}\n",
" & mpg & cyl & disp & hp & drat & wt & qsec & vs & am & gear & carb\\\\\n",
"\\hline\n",
"\tDodge Challenger & 15.5 & 8 & 318 & 150 & 2.76 & 3.52 & 16.87 & 0 & 0 & 3 & 2\\\\\n",
"\tHornet Sportabout & 18.7 & 8 & 360 & 175 & 3.15 & 3.44 & 17.02 & 0 & 0 & 3 & 2\\\\\n",
"\tFiat 128 & 32.4 & 4 & 78.7 & 66 & 4.08 & 2.2 & 19.47 & 1 & 1 & 4 & 1\\\\\n",
"\tMerc 450SLC & 15.2 & 8 & 275.8 & 180 & 3.07 & 3.78 & 18 & 0 & 0 & 3 & 3\\\\\n",
"\tChrysler Imperial & 14.7 & 8 & 440 & 230 & 3.23 & 5.345 & 17.42 & 0 & 0 & 3 & 4\\\\\n",
"\tVolvo 142E & 21.4 & 4 & 121 & 109 & 4.11 & 2.78 & 18.6 & 1 & 1 & 4 & 2\\\\\n",
"\tToyota Corolla & 33.9 & 4 & 71.1 & 65 & 4.22 & 1.835 & 19.9 & 1 & 1 & 4 & 1\\\\\n",
"\tMerc 450SE & 16.4 & 8 & 275.8 & 180 & 3.07 & 4.07 & 17.4 & 0 & 0 & 3 & 3\\\\\n",
"\tMazda RX4 & 21 & 6 & 160 & 110 & 3.9 & 2.62 & 16.46 & 0 & 1 & 4 & 4\\\\\n",
"\tPorsche 914-2 & 26 & 4 & 120.3 & 91 & 4.43 & 2.14 & 16.7 & 0 & 1 & 5 & 2\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" mpg cyl disp hp drat wt qsec vs am gear carb\n",
"Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2\n",
"Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2\n",
"Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1\n",
"Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3\n",
"Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n",
"Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2\n",
"Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1\n",
"Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3\n",
"Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4\n",
"Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mt10 <- mtcars[sample(1:nrow(mtcars), 10, replace=FALSE),]\n",
"mt10"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | cyl | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
\n",
"\n",
"\t1 | 4 | 26.66364 | 4 | 105.1364 | 82.63636 | 4.070909 | 2.285727 | 19.13727 | 0.9090909 | 0.7272727 | 4.090909 | 1.545455 |
\n",
"\t2 | 6 | 19.74286 | 6 | 183.3143 | 122.2857 | 3.585714 | 3.117143 | 17.97714 | 0.5714286 | 0.4285714 | 3.857143 | 3.428571 |
\n",
"\t3 | 8 | 15.1 | 8 | 353.1 | 209.2143 | 3.229286 | 3.999214 | 16.77214 | 0 | 0.1428571 | 3.285714 | 3.5 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|llllllllllll}\n",
" & cyl & mpg & cyl & disp & hp & drat & wt & qsec & vs & am & gear & carb\\\\\n",
"\\hline\n",
"\t1 & 4 & 26.66364 & 4 & 105.1364 & 82.63636 & 4.070909 & 2.285727 & 19.13727 & 0.9090909 & 0.7272727 & 4.090909 & 1.545455\\\\\n",
"\t2 & 6 & 19.74286 & 6 & 183.3143 & 122.2857 & 3.585714 & 3.117143 & 17.97714 & 0.5714286 & 0.4285714 & 3.857143 & 3.428571\\\\\n",
"\t3 & 8 & 15.1 & 8 & 353.1 & 209.2143 & 3.229286 & 3.999214 & 16.77214 & 0 & 0.1428571 & 3.285714 & 3.5\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" cyl mpg cyl disp hp drat wt qsec vs\n",
"1 4 26.66364 4 105.1364 82.63636 4.070909 2.285727 19.13727 0.9090909\n",
"2 6 19.74286 6 183.3143 122.28571 3.585714 3.117143 17.97714 0.5714286\n",
"3 8 15.10000 8 353.1000 209.21429 3.229286 3.999214 16.77214 0.0000000\n",
" am gear carb\n",
"1 0.7272727 4.090909 1.545455\n",
"2 0.4285714 3.857143 3.428571\n",
"3 0.1428571 3.285714 3.500000"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with(mtcars, aggregate(mtcars, by=list(cyl=cyl), FUN=mean))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | cyl | gear | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
\n",
"\n",
"\t1 | 4 | 3 | 21.5 | 4 | 120.1 | 97 | 3.7 | 2.465 | 20.01 | 1 | 0 | 3 | 1 |
\n",
"\t2 | 6 | 3 | 19.75 | 6 | 241.5 | 107.5 | 2.92 | 3.3375 | 19.83 | 1 | 0 | 3 | 1 |
\n",
"\t3 | 8 | 3 | 15.2 | 8 | 355 | 180 | 3.075 | 3.81 | 17.35 | 0 | 0 | 3 | 3 |
\n",
"\t4 | 4 | 4 | 25.85 | 4 | 93.5 | 66 | 4.08 | 2.26 | 19.185 | 1 | 1 | 4 | 1.5 |
\n",
"\t5 | 6 | 4 | 20.1 | 6 | 163.8 | 116.5 | 3.91 | 3.1575 | 17.66 | 0.5 | 0.5 | 4 | 4 |
\n",
"\t6 | 4 | 5 | 28.2 | 4 | 107.7 | 102 | 4.1 | 1.8265 | 16.8 | 0.5 | 1 | 5 | 2 |
\n",
"\t7 | 6 | 5 | 19.7 | 6 | 145 | 175 | 3.62 | 2.77 | 15.5 | 0 | 1 | 5 | 6 |
\n",
"\t8 | 8 | 5 | 15.4 | 8 | 326 | 299.5 | 3.88 | 3.37 | 14.55 | 0 | 1 | 5 | 6 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllllllllllll}\n",
" & cyl & gear & mpg & cyl & disp & hp & drat & wt & qsec & vs & am & gear & carb\\\\\n",
"\\hline\n",
"\t1 & 4 & 3 & 21.5 & 4 & 120.1 & 97 & 3.7 & 2.465 & 20.01 & 1 & 0 & 3 & 1\\\\\n",
"\t2 & 6 & 3 & 19.75 & 6 & 241.5 & 107.5 & 2.92 & 3.3375 & 19.83 & 1 & 0 & 3 & 1\\\\\n",
"\t3 & 8 & 3 & 15.2 & 8 & 355 & 180 & 3.075 & 3.81 & 17.35 & 0 & 0 & 3 & 3\\\\\n",
"\t4 & 4 & 4 & 25.85 & 4 & 93.5 & 66 & 4.08 & 2.26 & 19.185 & 1 & 1 & 4 & 1.5\\\\\n",
"\t5 & 6 & 4 & 20.1 & 6 & 163.8 & 116.5 & 3.91 & 3.1575 & 17.66 & 0.5 & 0.5 & 4 & 4\\\\\n",
"\t6 & 4 & 5 & 28.2 & 4 & 107.7 & 102 & 4.1 & 1.8265 & 16.8 & 0.5 & 1 & 5 & 2\\\\\n",
"\t7 & 6 & 5 & 19.7 & 6 & 145 & 175 & 3.62 & 2.77 & 15.5 & 0 & 1 & 5 & 6\\\\\n",
"\t8 & 8 & 5 & 15.4 & 8 & 326 & 299.5 & 3.88 & 3.37 & 14.55 & 0 & 1 & 5 & 6\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" cyl gear mpg cyl disp hp drat wt qsec vs am gear carb\n",
"1 4 3 21.50 4 120.1 97.0 3.700 2.4650 20.010 1.0 0.0 3 1.0\n",
"2 6 3 19.75 6 241.5 107.5 2.920 3.3375 19.830 1.0 0.0 3 1.0\n",
"3 8 3 15.20 8 355.0 180.0 3.075 3.8100 17.350 0.0 0.0 3 3.0\n",
"4 4 4 25.85 4 93.5 66.0 4.080 2.2600 19.185 1.0 1.0 4 1.5\n",
"5 6 4 20.10 6 163.8 116.5 3.910 3.1575 17.660 0.5 0.5 4 4.0\n",
"6 4 5 28.20 4 107.7 102.0 4.100 1.8265 16.800 0.5 1.0 5 2.0\n",
"7 6 5 19.70 6 145.0 175.0 3.620 2.7700 15.500 0.0 1.0 5 6.0\n",
"8 8 5 15.40 8 326.0 299.5 3.880 3.3700 14.550 0.0 1.0 5 6.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with(mtcars, aggregate(mtcars, by=list(cyl=cyl, gear=gear), FUN=median))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Reshaping data\n",
"----"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Warning message:\n",
": package ‘plyr’ was built under R version 3.1.3"
]
}
],
"source": [
"library(reshape2)\n",
"library(plyr) # needed for the . function"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Starting data frame"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | PID | Time | Gene1 | Gene2 |
\n",
"\n",
"\t1 | 1 | 1 | -1.658798 | -1.509927 |
\n",
"\t2 | 1 | 2 | 6.422999 | 2.617535 |
\n",
"\t3 | 1 | 3 | 8.703917 | 11.20532 |
\n",
"\t4 | 2 | 1 | 11.80803 | -0.9115341 |
\n",
"\t5 | 2 | 2 | 4.670361 | 4.869456 |
\n",
"\t6 | 2 | 3 | -0.5982849 | 8.871238 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|llll}\n",
" & PID & Time & Gene1 & Gene2\\\\\n",
"\\hline\n",
"\t1 & 1 & 1 & -1.658798 & -1.509927\\\\\n",
"\t2 & 1 & 2 & 6.422999 & 2.617535\\\\\n",
"\t3 & 1 & 3 & 8.703917 & 11.20532\\\\\n",
"\t4 & 2 & 1 & 11.80803 & -0.9115341\\\\\n",
"\t5 & 2 & 2 & 4.670361 & 4.869456\\\\\n",
"\t6 & 2 & 3 & -0.5982849 & 8.871238\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" PID Time Gene1 Gene2\n",
"1 1 1 -1.6587983 -1.5099270\n",
"2 1 2 6.4229986 2.6175354\n",
"3 1 3 8.7039172 11.2053234\n",
"4 2 1 11.8080301 -0.9115341\n",
"5 2 2 4.6703610 4.8694561\n",
"6 2 3 -0.5982849 8.8712385"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ID <- factor(rep(1:2, each=3))\n",
"Time <- rep(1:3, 2)\n",
"Gene1 <- c(0,5,10,10,5,0) + rnorm(6)\n",
"Gene2 <- c(0,5,10,0,5,10) + rnorm(6)\n",
"expt <- data.frame(PID=ID, Time=Time, Gene1=Gene1, Gene2=Gene2)\n",
"expt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\"Melt\" into a \"tall\" format with all values in a single column. This reuqires identifying all the columns that are needed to uniquely define a row value. In this csse, the \"id\" columns are \"PID\" and \"Time\"."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | PID | Time | Gene | value |
\n",
"\n",
"\t1 | 1 | 1 | Gene1 | -1.658798 |
\n",
"\t2 | 1 | 2 | Gene1 | 6.422999 |
\n",
"\t3 | 1 | 3 | Gene1 | 8.703917 |
\n",
"\t4 | 2 | 1 | Gene1 | 11.80803 |
\n",
"\t5 | 2 | 2 | Gene1 | 4.670361 |
\n",
"\t6 | 2 | 3 | Gene1 | -0.5982849 |
\n",
"\t7 | 1 | 1 | Gene2 | -1.509927 |
\n",
"\t8 | 1 | 2 | Gene2 | 2.617535 |
\n",
"\t9 | 1 | 3 | Gene2 | 11.20532 |
\n",
"\t10 | 2 | 1 | Gene2 | -0.9115341 |
\n",
"\t11 | 2 | 2 | Gene2 | 4.869456 |
\n",
"\t12 | 2 | 3 | Gene2 | 8.871238 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|llll}\n",
" & PID & Time & Gene & value\\\\\n",
"\\hline\n",
"\t1 & 1 & 1 & Gene1 & -1.658798\\\\\n",
"\t2 & 1 & 2 & Gene1 & 6.422999\\\\\n",
"\t3 & 1 & 3 & Gene1 & 8.703917\\\\\n",
"\t4 & 2 & 1 & Gene1 & 11.80803\\\\\n",
"\t5 & 2 & 2 & Gene1 & 4.670361\\\\\n",
"\t6 & 2 & 3 & Gene1 & -0.5982849\\\\\n",
"\t7 & 1 & 1 & Gene2 & -1.509927\\\\\n",
"\t8 & 1 & 2 & Gene2 & 2.617535\\\\\n",
"\t9 & 1 & 3 & Gene2 & 11.20532\\\\\n",
"\t10 & 2 & 1 & Gene2 & -0.9115341\\\\\n",
"\t11 & 2 & 2 & Gene2 & 4.869456\\\\\n",
"\t12 & 2 & 3 & Gene2 & 8.871238\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" PID Time Gene value\n",
"1 1 1 Gene1 -1.6587983\n",
"2 1 2 Gene1 6.4229986\n",
"3 1 3 Gene1 8.7039172\n",
"4 2 1 Gene1 11.8080301\n",
"5 2 2 Gene1 4.6703610\n",
"6 2 3 Gene1 -0.5982849\n",
"7 1 1 Gene2 -1.5099270\n",
"8 1 2 Gene2 2.6175354\n",
"9 1 3 Gene2 11.2053234\n",
"10 2 1 Gene2 -0.9115341\n",
"11 2 2 Gene2 4.8694561\n",
"12 2 3 Gene2 8.8712385"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m.expt <- melt(expt, id=c(\"PID\", \"Time\"), variable.name=\"Gene\")\n",
"m.expt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Use dcast to reshape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Show the time series for each (PID, Gene) combination."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | PID | Gene | 1 | 2 | 3 |
\n",
"\n",
"\t1 | 1 | Gene1 | -1.658798 | 6.422999 | 8.703917 |
\n",
"\t2 | 1 | Gene2 | -1.509927 | 2.617535 | 11.20532 |
\n",
"\t3 | 2 | Gene1 | 11.80803 | 4.670361 | -0.5982849 |
\n",
"\t4 | 2 | Gene2 | -0.9115341 | 4.869456 | 8.871238 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & PID & Gene & 1 & 2 & 3\\\\\n",
"\\hline\n",
"\t1 & 1 & Gene1 & -1.658798 & 6.422999 & 8.703917\\\\\n",
"\t2 & 1 & Gene2 & -1.509927 & 2.617535 & 11.20532\\\\\n",
"\t3 & 2 & Gene1 & 11.80803 & 4.670361 & -0.5982849\\\\\n",
"\t4 & 2 & Gene2 & -0.9115341 & 4.869456 & 8.871238\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" PID Gene 1 2 3\n",
"1 1 Gene1 -1.6587983 6.422999 8.7039172\n",
"2 1 Gene2 -1.5099270 2.617535 11.2053234\n",
"3 2 Gene1 11.8080301 4.670361 -0.5982849\n",
"4 2 Gene2 -0.9115341 4.869456 8.8712385"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dcast(m.expt, PID + Gene ~ Time)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Show the time series for each (Gene, PID) combination."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | Gene | PID | 1 | 2 | 3 |
\n",
"\n",
"\t1 | Gene1 | 1 | -1.658798 | 6.422999 | 8.703917 |
\n",
"\t2 | Gene1 | 2 | 11.80803 | 4.670361 | -0.5982849 |
\n",
"\t3 | Gene2 | 1 | -1.509927 | 2.617535 | 11.20532 |
\n",
"\t4 | Gene2 | 2 | -0.9115341 | 4.869456 | 8.871238 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & Gene & PID & 1 & 2 & 3\\\\\n",
"\\hline\n",
"\t1 & Gene1 & 1 & -1.658798 & 6.422999 & 8.703917\\\\\n",
"\t2 & Gene1 & 2 & 11.80803 & 4.670361 & -0.5982849\\\\\n",
"\t3 & Gene2 & 1 & -1.509927 & 2.617535 & 11.20532\\\\\n",
"\t4 & Gene2 & 2 & -0.9115341 & 4.869456 & 8.871238\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" Gene PID 1 2 3\n",
"1 Gene1 1 -1.6587983 6.422999 8.7039172\n",
"2 Gene1 2 11.8080301 4.670361 -0.5982849\n",
"3 Gene2 1 -1.5099270 2.617535 11.2053234\n",
"4 Gene2 2 -0.9115341 4.869456 8.8712385"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dcast(m.expt, Gene + PID ~ Time)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Recreate the original data set"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | PID | Time | Gene1 | Gene2 |
\n",
"\n",
"\t1 | 1 | 1 | -1.658798 | -1.509927 |
\n",
"\t2 | 1 | 2 | 6.422999 | 2.617535 |
\n",
"\t3 | 1 | 3 | 8.703917 | 11.20532 |
\n",
"\t4 | 2 | 1 | 11.80803 | -0.9115341 |
\n",
"\t5 | 2 | 2 | 4.670361 | 4.869456 |
\n",
"\t6 | 2 | 3 | -0.5982849 | 8.871238 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|llll}\n",
" & PID & Time & Gene1 & Gene2\\\\\n",
"\\hline\n",
"\t1 & 1 & 1 & -1.658798 & -1.509927\\\\\n",
"\t2 & 1 & 2 & 6.422999 & 2.617535\\\\\n",
"\t3 & 1 & 3 & 8.703917 & 11.20532\\\\\n",
"\t4 & 2 & 1 & 11.80803 & -0.9115341\\\\\n",
"\t5 & 2 & 2 & 4.670361 & 4.869456\\\\\n",
"\t6 & 2 & 3 & -0.5982849 & 8.871238\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" PID Time Gene1 Gene2\n",
"1 1 1 -1.6587983 -1.5099270\n",
"2 1 2 6.4229986 2.6175354\n",
"3 1 3 8.7039172 11.2053234\n",
"4 2 1 11.8080301 -0.9115341\n",
"5 2 2 4.6703610 4.8694561\n",
"6 2 3 -0.5982849 8.8712385"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dcast(m.expt, PID + Time ~ Gene)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Show all data for each subject in a single row"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | PID | Gene1_1 | Gene1_2 | Gene1_3 | Gene2_1 | Gene2_2 | Gene2_3 |
\n",
"\n",
"\t1 | 1 | -1.658798 | 6.422999 | 8.703917 | -1.509927 | 2.617535 | 11.20532 |
\n",
"\t2 | 2 | 11.80803 | 4.670361 | -0.5982849 | -0.9115341 | 4.869456 | 8.871238 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllllll}\n",
" & PID & Gene1_1 & Gene1_2 & Gene1_3 & Gene2_1 & Gene2_2 & Gene2_3\\\\\n",
"\\hline\n",
"\t1 & 1 & -1.658798 & 6.422999 & 8.703917 & -1.509927 & 2.617535 & 11.20532\\\\\n",
"\t2 & 2 & 11.80803 & 4.670361 & -0.5982849 & -0.9115341 & 4.869456 & 8.871238\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" PID Gene1_1 Gene1_2 Gene1_3 Gene2_1 Gene2_2 Gene2_3\n",
"1 1 -1.658798 6.422999 8.7039172 -1.5099270 2.617535 11.205323\n",
"2 2 11.808030 4.670361 -0.5982849 -0.9115341 4.869456 8.871238"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dcast(m.expt, PID ~ Gene + Time)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### We can also aggregate while reshaping"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### What is the average expression value for each gene for each subject over all time points?"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | PID | Gene1 | Gene2 |
\n",
"\n",
"\t1 | 1 | 4.489372 | 4.104311 |
\n",
"\t2 | 2 | 5.293369 | 4.276387 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lll}\n",
" & PID & Gene1 & Gene2\\\\\n",
"\\hline\n",
"\t1 & 1 & 4.489372 & 4.104311\\\\\n",
"\t2 & 2 & 5.293369 & 4.276387\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" PID Gene1 Gene2\n",
"1 1 4.489372 4.104311\n",
"2 2 5.293369 4.276387"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dcast(m.expt, PID ~ Gene, mean)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### What is the average expression value for each gene for each time point over all subjects?"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | Time | Gene1 | Gene2 |
\n",
"\n",
"\t1 | 1 | 5.074616 | -1.210731 |
\n",
"\t2 | 2 | 5.54668 | 3.743496 |
\n",
"\t3 | 3 | 4.052816 | 10.03828 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lll}\n",
" & Time & Gene1 & Gene2\\\\\n",
"\\hline\n",
"\t1 & 1 & 5.074616 & -1.210731\\\\\n",
"\t2 & 2 & 5.54668 & 3.743496\\\\\n",
"\t3 & 3 & 4.052816 & 10.03828\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" Time Gene1 Gene2\n",
"1 1 5.074616 -1.210731\n",
"2 2 5.546680 3.743496\n",
"3 3 4.052816 10.038281"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dcast(m.expt, Time ~ Gene, mean)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Finally, we can perform subssetting on the named vairables.\n",
"\n",
"For example, restrict the previous query to subject with PID= 1."
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | Time | Gene1 | Gene2 |
\n",
"\n",
"\t1 | 1 | -1.658798 | -1.509927 |
\n",
"\t2 | 2 | 6.422999 | 2.617535 |
\n",
"\t3 | 3 | 8.703917 | 11.20532 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lll}\n",
" & Time & Gene1 & Gene2\\\\\n",
"\\hline\n",
"\t1 & 1 & -1.658798 & -1.509927\\\\\n",
"\t2 & 2 & 6.422999 & 2.617535\\\\\n",
"\t3 & 3 & 8.703917 & 11.20532\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" Time Gene1 Gene2\n",
"1 1 -1.658798 -1.509927\n",
"2 2 6.422999 2.617535\n",
"3 3 8.703917 11.205323"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dcast(m.expt, Time ~ Gene, mean, subset = .(PID == 1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Merging data\n",
"----\n",
"\n",
"A common task in data analysis is to link data from two or more datasts, for example, to relate assay data to clinical phenoytpe. \n",
"\n",
"Here we will work thought a typical example where the genotype and phenoytpe information come from two different data sets, and the ID information needed to link the two is from a third data set."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"phenodat <- read.csv(\"phenodat.csv\") \n",
"gdat1 <- read.csv(\"gdat1.csv\") \n",
"gdat2 <- read.csv(\"gdat2.csv\")\n",
"iddat <- read.csv(\"iddat.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Eyeball data sett\n",
"----\n",
"\n",
"A quick sanity check to see what the data look like."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\t- 10
\n",
"\t- 2
\n",
"
\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item 10\n",
"\\item 2\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. 10\n",
"2. 2\n",
"\n",
"\n"
],
"text/plain": [
"[1] 10 2"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/html": [
"\n",
"\t- 11
\n",
"\t- 3
\n",
"
\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item 11\n",
"\\item 3\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. 11\n",
"2. 3\n",
"\n",
"\n"
],
"text/plain": [
"[1] 11 3"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/html": [
"\n",
"\t- 11
\n",
"\t- 3
\n",
"
\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item 11\n",
"\\item 3\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. 11\n",
"2. 3\n",
"\n",
"\n"
],
"text/plain": [
"[1] 11 3"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/html": [
"\n",
"\t- 20
\n",
"\t- 2
\n",
"
\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item 20\n",
"\\item 2\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. 20\n",
"2. 2\n",
"\n",
"\n"
],
"text/plain": [
"[1] 20 2"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(dim(phenodat))\n",
"(dim(gdat1))\n",
"(dim(gdat2))\n",
"(dim(iddat))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | pid | trt |
\n",
"\n",
"\t1 | pid6 | 0 |
\n",
"\t2 | pid15 | 1 |
\n",
"\t3 | pid8 | 0 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|ll}\n",
" & pid & trt\\\\\n",
"\\hline\n",
"\t1 & pid6 & 0\\\\\n",
"\t2 & pid15 & 1\\\\\n",
"\t3 & pid8 & 0\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" pid trt\n",
"1 pid6 0\n",
"2 pid15 1\n",
"3 pid8 0"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head(phenodat, 3)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 |
\n",
"\n",
"\t1 | 100020 | -0.4321298 | -0.2288958 |
\n",
"\t2 | 100018 | -1.318938 | 0.7935853 |
\n",
"\t3 | 100013 | 1.242919 | -1.334354 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lll}\n",
" & expid & gene1 & gene2\\\\\n",
"\\hline\n",
"\t1 & 100020 & -0.4321298 & -0.2288958\\\\\n",
"\t2 & 100018 & -1.318938 & 0.7935853\\\\\n",
"\t3 & 100013 & 1.242919 & -1.334354\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" expid gene1 gene2\n",
"1 100020 -0.4321298 -0.2288958\n",
"2 100018 -1.3189376 0.7935853\n",
"3 100013 1.2429188 -1.3343536"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head(gdat1, 3)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 |
\n",
"\n",
"\t1 | 100009 | -1.220512 | -0.2416898 |
\n",
"\t2 | 100008 | 0.2865486 | 1.685887 |
\n",
"\t3 | 100007 | -0.7717918 | -1.070068 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lll}\n",
" & expid & gene1 & gene2\\\\\n",
"\\hline\n",
"\t1 & 100009 & -1.220512 & -0.2416898\\\\\n",
"\t2 & 100008 & 0.2865486 & 1.685887\\\\\n",
"\t3 & 100007 & -0.7717918 & -1.070068\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" expid gene1 gene2\n",
"1 100009 -1.2205120 -0.2416898\n",
"2 100008 0.2865486 1.6858872\n",
"3 100007 -0.7717918 -1.0700682"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head(gdat2, 3)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | pid | expid |
\n",
"\n",
"\t1 | pid20 | 100020 |
\n",
"\t2 | pid9 | 100009 |
\n",
"\t3 | pid13 | 100013 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|ll}\n",
" & pid & expid\\\\\n",
"\\hline\n",
"\t1 & pid20 & 100020\\\\\n",
"\t2 & pid9 & 100009\\\\\n",
"\t3 & pid13 & 100013\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" pid expid\n",
"1 pid20 100020\n",
"2 pid9 100009\n",
"3 pid13 100013"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head(iddat, 3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Combine gene data from two data sets\n",
"----\n",
"\n",
"Often, we have the same type of data stroed in mulitple data sets, for example, one per batch. In this case, we want to combine **rows**."
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"gdat <- rbind(gdat1, gdat2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Checking for duplicates\n",
"---- "
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"show.dups <- function(df) {\n",
" return(df[duplicated(df), ])\n",
" }"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | pid | trt |
\n",
"\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|ll}\n",
" & pid & trt\\\\\n",
"\\hline\n",
"\\end{tabular}\n"
],
"text/plain": [
"[1] pid trt\n",
"<0 rows> (or 0-length row.names)"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"show.dups(phenodat)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | pid | expid |
\n",
"\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|ll}\n",
" & pid & expid\\\\\n",
"\\hline\n",
"\\end{tabular}\n"
],
"text/plain": [
"[1] pid expid\n",
"<0 rows> (or 0-length row.names)"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"show.dups(iddat)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 |
\n",
"\n",
"\t13 | 100008 | 0.2865486 | 1.685887 |
\n",
"\t15 | 100003 | 0.8867361 | 0.2760235 |
\n",
"\t17 | 100004 | -0.151396 | -1.048976 |
\n",
"\t18 | 100018 | -1.318938 | 0.7935853 |
\n",
"\t20 | 100011 | 0.8001769 | -0.7729782 |
\n",
"\t21 | 100001 | -0.5996083 | 1.689873 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lll}\n",
" & expid & gene1 & gene2\\\\\n",
"\\hline\n",
"\t13 & 100008 & 0.2865486 & 1.685887\\\\\n",
"\t15 & 100003 & 0.8867361 & 0.2760235\\\\\n",
"\t17 & 100004 & -0.151396 & -1.048976\\\\\n",
"\t18 & 100018 & -1.318938 & 0.7935853\\\\\n",
"\t20 & 100011 & 0.8001769 & -0.7729782\\\\\n",
"\t21 & 100001 & -0.5996083 & 1.689873\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" expid gene1 gene2\n",
"13 100008 0.2865486 1.6858872\n",
"15 100003 0.8867361 0.2760235\n",
"17 100004 -0.1513960 -1.0489755\n",
"18 100018 -1.3189376 0.7935853\n",
"20 100011 0.8001769 -0.7729782\n",
"21 100001 -0.5996083 1.6898725"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"show.dups(gdat)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remove duplicates\n",
"----"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"gdat <- unique(gdat)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\t- 16
\n",
"\t- 3
\n",
"
\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item 16\n",
"\\item 3\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. 16\n",
"2. 3\n",
"\n",
"\n"
],
"text/plain": [
"[1] 16 3"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dim(gdat)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 |
\n",
"\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lll}\n",
" & expid & gene1 & gene2\\\\\n",
"\\hline\n",
"\\end{tabular}\n"
],
"text/plain": [
"[1] expid gene1 gene2\n",
"<0 rows> (or 0-length row.names)"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"show.dups(gdat)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Merging\n",
"----\n",
"\n",
"To combine columns from different data sets, we can perform a `merge` operation. Rows in the different data set need some common identifier to be merged, typcialy information from one or more \"ID\" columns."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Merge all rows with information for both phenotype and gene"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### First we merge phnenoytpe data with the ID data"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | pid | trt | expid |
\n",
"\n",
"\t1 | pid1 | 0 | 100001 |
\n",
"\t2 | pid12 | 1 | 100012 |
\n",
"\t3 | pid15 | 1 | 100015 |
\n",
"\t4 | pid16 | 0 | 100016 |
\n",
"\t5 | pid17 | 0 | 100017 |
\n",
"\t6 | pid18 | 1 | 100018 |
\n",
"\t7 | pid20 | 0 | 100020 |
\n",
"\t8 | pid6 | 0 | 100006 |
\n",
"\t9 | pid7 | 0 | 100007 |
\n",
"\t10 | pid8 | 0 | 100008 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lll}\n",
" & pid & trt & expid\\\\\n",
"\\hline\n",
"\t1 & pid1 & 0 & 100001\\\\\n",
"\t2 & pid12 & 1 & 100012\\\\\n",
"\t3 & pid15 & 1 & 100015\\\\\n",
"\t4 & pid16 & 0 & 100016\\\\\n",
"\t5 & pid17 & 0 & 100017\\\\\n",
"\t6 & pid18 & 1 & 100018\\\\\n",
"\t7 & pid20 & 0 & 100020\\\\\n",
"\t8 & pid6 & 0 & 100006\\\\\n",
"\t9 & pid7 & 0 & 100007\\\\\n",
"\t10 & pid8 & 0 & 100008\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" pid trt expid\n",
"1 pid1 0 100001\n",
"2 pid12 1 100012\n",
"3 pid15 1 100015\n",
"4 pid16 0 100016\n",
"5 pid17 0 100017\n",
"6 pid18 1 100018\n",
"7 pid20 0 100020\n",
"8 pid6 0 100006\n",
"9 pid7 0 100007\n",
"10 pid8 0 100008"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(df1 <- merge(phenodat, iddat, by=\"pid\", all.x=TRUE))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Then we merge with gene data"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 | pid | trt |
\n",
"\n",
"\t1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
\n",
"\t2 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
\n",
"\t3 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
\n",
"\t4 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
\n",
"\t5 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
\n",
"\t6 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
\n",
"\t7 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & expid & gene1 & gene2 & pid & trt\\\\\n",
"\\hline\n",
"\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n",
"\t2 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n",
"\t3 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n",
"\t4 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n",
"\t5 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n",
"\t6 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n",
"\t7 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" expid gene1 gene2 pid trt\n",
"1 100001 -0.5996083 1.6898725 pid1 0\n",
"2 100007 -0.7717918 -1.0700682 pid7 0\n",
"3 100008 0.2865486 1.6858872 pid8 0\n",
"4 100015 0.3937087 1.2339762 pid15 1\n",
"5 100017 -0.8864367 0.4120223 pid17 0\n",
"6 100018 -1.3189376 0.7935853 pid18 1\n",
"7 100020 -0.4321298 -0.2288958 pid20 0"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(df2 <- merge(gdat, df1, by=\"expid\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that there are now only 7 rows becasue 3 phenotypes did not have matching gene data."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### What if we want to show all genes even if there is no matching phenotype data?"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 | pid | trt |
\n",
"\n",
"\t1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
\n",
"\t2 | 100002 | -0.1294107 | 1.228393 | NA | NA |
\n",
"\t3 | 100003 | 0.8867361 | 0.2760235 | NA | NA |
\n",
"\t4 | 100004 | -0.151396 | -1.048976 | NA | NA |
\n",
"\t5 | 100005 | 0.3297912 | -0.5208693 | NA | NA |
\n",
"\t6 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
\n",
"\t7 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
\n",
"\t8 | 100009 | -1.220512 | -0.2416898 | NA | NA |
\n",
"\t9 | 100011 | 0.8001769 | -0.7729782 | NA | NA |
\n",
"\t10 | 100013 | 1.242919 | -1.334354 | NA | NA |
\n",
"\t11 | 100014 | -0.9343851 | 0.4958705 | NA | NA |
\n",
"\t12 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
\n",
"\t13 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
\n",
"\t14 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
\n",
"\t15 | 100019 | 0.02884391 | -0.1524106 | NA | NA |
\n",
"\t16 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & expid & gene1 & gene2 & pid & trt\\\\\n",
"\\hline\n",
"\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n",
"\t2 & 100002 & -0.1294107 & 1.228393 & NA & NA\\\\\n",
"\t3 & 100003 & 0.8867361 & 0.2760235 & NA & NA\\\\\n",
"\t4 & 100004 & -0.151396 & -1.048976 & NA & NA\\\\\n",
"\t5 & 100005 & 0.3297912 & -0.5208693 & NA & NA\\\\\n",
"\t6 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n",
"\t7 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n",
"\t8 & 100009 & -1.220512 & -0.2416898 & NA & NA\\\\\n",
"\t9 & 100011 & 0.8001769 & -0.7729782 & NA & NA\\\\\n",
"\t10 & 100013 & 1.242919 & -1.334354 & NA & NA\\\\\n",
"\t11 & 100014 & -0.9343851 & 0.4958705 & NA & NA\\\\\n",
"\t12 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n",
"\t13 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n",
"\t14 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n",
"\t15 & 100019 & 0.02884391 & -0.1524106 & NA & NA\\\\\n",
"\t16 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" expid gene1 gene2 pid trt\n",
"1 100001 -0.59960833 1.6898725 pid1 0\n",
"2 100002 -0.12941069 1.2283928 NA\n",
"3 100003 0.88673615 0.2760235 NA\n",
"4 100004 -0.15139596 -1.0489755 NA\n",
"5 100005 0.32979120 -0.5208693 NA\n",
"6 100007 -0.77179177 -1.0700682 pid7 0\n",
"7 100008 0.28654857 1.6858872 pid8 0\n",
"8 100009 -1.22051198 -0.2416898 NA\n",
"9 100011 0.80017687 -0.7729782 NA\n",
"10 100013 1.24291877 -1.3343536 NA\n",
"11 100014 -0.93438506 0.4958705 NA\n",
"12 100015 0.39370865 1.2339762 pid15 1\n",
"13 100017 -0.88643672 0.4120223 pid17 0\n",
"14 100018 -1.31893760 0.7935853 pid18 1\n",
"15 100019 0.02884391 -0.1524106 NA\n",
"16 100020 -0.43212979 -0.2288958 pid20 0"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(df3 <- merge(gdat, df1, by=\"expid\", all.x=TRUE))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### What if we want to show all phenotypes even if there is no matching gene data?"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 | pid | trt |
\n",
"\n",
"\t1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
\n",
"\t2 | 100006 | NA | NA | pid6 | 0 |
\n",
"\t3 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
\n",
"\t4 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
\n",
"\t5 | 100012 | NA | NA | pid12 | 1 |
\n",
"\t6 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
\n",
"\t7 | 100016 | NA | NA | pid16 | 0 |
\n",
"\t8 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
\n",
"\t9 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
\n",
"\t10 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & expid & gene1 & gene2 & pid & trt\\\\\n",
"\\hline\n",
"\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n",
"\t2 & 100006 & NA & NA & pid6 & 0\\\\\n",
"\t3 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n",
"\t4 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n",
"\t5 & 100012 & NA & NA & pid12 & 1\\\\\n",
"\t6 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n",
"\t7 & 100016 & NA & NA & pid16 & 0\\\\\n",
"\t8 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n",
"\t9 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n",
"\t10 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" expid gene1 gene2 pid trt\n",
"1 100001 -0.5996083 1.6898725 pid1 0\n",
"2 100006 NA NA pid6 0\n",
"3 100007 -0.7717918 -1.0700682 pid7 0\n",
"4 100008 0.2865486 1.6858872 pid8 0\n",
"5 100012 NA NA pid12 1\n",
"6 100015 0.3937087 1.2339762 pid15 1\n",
"7 100016 NA NA pid16 0\n",
"8 100017 -0.8864367 0.4120223 pid17 0\n",
"9 100018 -1.3189376 0.7935853 pid18 1\n",
"10 100020 -0.4321298 -0.2288958 pid20 0"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(df4 <- merge(gdat, df1, by=\"expid\", all.y=TRUE))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### What if we want to show everything?"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 | pid | trt |
\n",
"\n",
"\t1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
\n",
"\t2 | 100002 | -0.1294107 | 1.228393 | NA | NA |
\n",
"\t3 | 100003 | 0.8867361 | 0.2760235 | NA | NA |
\n",
"\t4 | 100004 | -0.151396 | -1.048976 | NA | NA |
\n",
"\t5 | 100005 | 0.3297912 | -0.5208693 | NA | NA |
\n",
"\t6 | 100006 | NA | NA | pid6 | 0 |
\n",
"\t7 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
\n",
"\t8 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
\n",
"\t9 | 100009 | -1.220512 | -0.2416898 | NA | NA |
\n",
"\t10 | 100011 | 0.8001769 | -0.7729782 | NA | NA |
\n",
"\t11 | 100012 | NA | NA | pid12 | 1 |
\n",
"\t12 | 100013 | 1.242919 | -1.334354 | NA | NA |
\n",
"\t13 | 100014 | -0.9343851 | 0.4958705 | NA | NA |
\n",
"\t14 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
\n",
"\t15 | 100016 | NA | NA | pid16 | 0 |
\n",
"\t16 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
\n",
"\t17 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
\n",
"\t18 | 100019 | 0.02884391 | -0.1524106 | NA | NA |
\n",
"\t19 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & expid & gene1 & gene2 & pid & trt\\\\\n",
"\\hline\n",
"\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n",
"\t2 & 100002 & -0.1294107 & 1.228393 & NA & NA\\\\\n",
"\t3 & 100003 & 0.8867361 & 0.2760235 & NA & NA\\\\\n",
"\t4 & 100004 & -0.151396 & -1.048976 & NA & NA\\\\\n",
"\t5 & 100005 & 0.3297912 & -0.5208693 & NA & NA\\\\\n",
"\t6 & 100006 & NA & NA & pid6 & 0\\\\\n",
"\t7 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n",
"\t8 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n",
"\t9 & 100009 & -1.220512 & -0.2416898 & NA & NA\\\\\n",
"\t10 & 100011 & 0.8001769 & -0.7729782 & NA & NA\\\\\n",
"\t11 & 100012 & NA & NA & pid12 & 1\\\\\n",
"\t12 & 100013 & 1.242919 & -1.334354 & NA & NA\\\\\n",
"\t13 & 100014 & -0.9343851 & 0.4958705 & NA & NA\\\\\n",
"\t14 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n",
"\t15 & 100016 & NA & NA & pid16 & 0\\\\\n",
"\t16 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n",
"\t17 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n",
"\t18 & 100019 & 0.02884391 & -0.1524106 & NA & NA\\\\\n",
"\t19 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" expid gene1 gene2 pid trt\n",
"1 100001 -0.59960833 1.6898725 pid1 0\n",
"2 100002 -0.12941069 1.2283928 NA\n",
"3 100003 0.88673615 0.2760235 NA\n",
"4 100004 -0.15139596 -1.0489755 NA\n",
"5 100005 0.32979120 -0.5208693 NA\n",
"6 100006 NA NA pid6 0\n",
"7 100007 -0.77179177 -1.0700682 pid7 0\n",
"8 100008 0.28654857 1.6858872 pid8 0\n",
"9 100009 -1.22051198 -0.2416898 NA\n",
"10 100011 0.80017687 -0.7729782 NA\n",
"11 100012 NA NA pid12 1\n",
"12 100013 1.24291877 -1.3343536 NA\n",
"13 100014 -0.93438506 0.4958705 NA\n",
"14 100015 0.39370865 1.2339762 pid15 1\n",
"15 100016 NA NA pid16 0\n",
"16 100017 -0.88643672 0.4120223 pid17 0\n",
"17 100018 -1.31893760 0.7935853 pid18 1\n",
"18 100019 0.02884391 -0.1524106 NA\n",
"19 100020 -0.43212979 -0.2288958 pid20 0"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(df5 <- merge(gdat, df1, by=\"expid\", all.x=TRUE, all.y=TRUE))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Rearrange column order\n",
"-----"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | pid | expid | gene1 | gene2 | trt |
\n",
"\n",
"\t1 | pid1 | 100001 | -0.5996083 | 1.689873 | 0 |
\n",
"\t2 | pid7 | 100007 | -0.7717918 | -1.070068 | 0 |
\n",
"\t3 | pid8 | 100008 | 0.2865486 | 1.685887 | 0 |
\n",
"\t4 | pid15 | 100015 | 0.3937087 | 1.233976 | 1 |
\n",
"\t5 | pid17 | 100017 | -0.8864367 | 0.4120223 | 0 |
\n",
"\t6 | pid18 | 100018 | -1.318938 | 0.7935853 | 1 |
\n",
"\t7 | pid20 | 100020 | -0.4321298 | -0.2288958 | 0 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & pid & expid & gene1 & gene2 & trt\\\\\n",
"\\hline\n",
"\t1 & pid1 & 100001 & -0.5996083 & 1.689873 & 0\\\\\n",
"\t2 & pid7 & 100007 & -0.7717918 & -1.070068 & 0\\\\\n",
"\t3 & pid8 & 100008 & 0.2865486 & 1.685887 & 0\\\\\n",
"\t4 & pid15 & 100015 & 0.3937087 & 1.233976 & 1\\\\\n",
"\t5 & pid17 & 100017 & -0.8864367 & 0.4120223 & 0\\\\\n",
"\t6 & pid18 & 100018 & -1.318938 & 0.7935853 & 1\\\\\n",
"\t7 & pid20 & 100020 & -0.4321298 & -0.2288958 & 0\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" pid expid gene1 gene2 trt\n",
"1 pid1 100001 -0.5996083 1.6898725 0\n",
"2 pid7 100007 -0.7717918 -1.0700682 0\n",
"3 pid8 100008 0.2865486 1.6858872 0\n",
"4 pid15 100015 0.3937087 1.2339762 1\n",
"5 pid17 100017 -0.8864367 0.4120223 0\n",
"6 pid18 100018 -1.3189376 0.7935853 1\n",
"7 pid20 100020 -0.4321298 -0.2288958 0"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2[, c(4,1,2,3,5)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sorting data\n",
"---"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 | pid | trt |
\n",
"\n",
"\t1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
\n",
"\t2 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
\n",
"\t3 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
\n",
"\t4 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
\n",
"\t5 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
\n",
"\t6 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
\n",
"\t7 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & expid & gene1 & gene2 & pid & trt\\\\\n",
"\\hline\n",
"\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n",
"\t2 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n",
"\t3 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n",
"\t4 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n",
"\t5 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n",
"\t6 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n",
"\t7 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" expid gene1 gene2 pid trt\n",
"1 100001 -0.5996083 1.6898725 pid1 0\n",
"2 100007 -0.7717918 -1.0700682 pid7 0\n",
"3 100008 0.2865486 1.6858872 pid8 0\n",
"4 100015 0.3937087 1.2339762 pid15 1\n",
"5 100017 -0.8864367 0.4120223 pid17 0\n",
"6 100018 -1.3189376 0.7935853 pid18 1\n",
"7 100020 -0.4321298 -0.2288958 pid20 0"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Sort by expid"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 | pid | trt |
\n",
"\n",
"\t1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
\n",
"\t2 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
\n",
"\t3 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
\n",
"\t4 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
\n",
"\t5 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
\n",
"\t6 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
\n",
"\t7 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & expid & gene1 & gene2 & pid & trt\\\\\n",
"\\hline\n",
"\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n",
"\t2 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n",
"\t3 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n",
"\t4 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n",
"\t5 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n",
"\t6 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n",
"\t7 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" expid gene1 gene2 pid trt\n",
"1 100001 -0.5996083 1.6898725 pid1 0\n",
"2 100007 -0.7717918 -1.0700682 pid7 0\n",
"3 100008 0.2865486 1.6858872 pid8 0\n",
"4 100015 0.3937087 1.2339762 pid15 1\n",
"5 100017 -0.8864367 0.4120223 pid17 0\n",
"6 100018 -1.3189376 0.7935853 pid18 1\n",
"7 100020 -0.4321298 -0.2288958 pid20 0"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2[order(df2$expid),]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Sort by pid"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 | pid | trt |
\n",
"\n",
"\t1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
\n",
"\t4 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
\n",
"\t5 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
\n",
"\t6 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
\n",
"\t7 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
\n",
"\t2 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
\n",
"\t3 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & expid & gene1 & gene2 & pid & trt\\\\\n",
"\\hline\n",
"\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n",
"\t4 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n",
"\t5 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n",
"\t6 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n",
"\t7 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n",
"\t2 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n",
"\t3 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" expid gene1 gene2 pid trt\n",
"1 100001 -0.5996083 1.6898725 pid1 0\n",
"4 100015 0.3937087 1.2339762 pid15 1\n",
"5 100017 -0.8864367 0.4120223 pid17 0\n",
"6 100018 -1.3189376 0.7935853 pid18 1\n",
"7 100020 -0.4321298 -0.2288958 pid20 0\n",
"2 100007 -0.7717918 -1.0700682 pid7 0\n",
"3 100008 0.2865486 1.6858872 pid8 0"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2[order(df2$pid),]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Sort by pid, then by expid"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 | pid | trt |
\n",
"\n",
"\t1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
\n",
"\t4 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
\n",
"\t5 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
\n",
"\t6 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
\n",
"\t7 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
\n",
"\t2 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
\n",
"\t3 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & expid & gene1 & gene2 & pid & trt\\\\\n",
"\\hline\n",
"\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n",
"\t4 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n",
"\t5 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n",
"\t6 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n",
"\t7 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n",
"\t2 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n",
"\t3 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" expid gene1 gene2 pid trt\n",
"1 100001 -0.5996083 1.6898725 pid1 0\n",
"4 100015 0.3937087 1.2339762 pid15 1\n",
"5 100017 -0.8864367 0.4120223 pid17 0\n",
"6 100018 -1.3189376 0.7935853 pid18 1\n",
"7 100020 -0.4321298 -0.2288958 pid20 0\n",
"2 100007 -0.7717918 -1.0700682 pid7 0\n",
"3 100008 0.2865486 1.6858872 pid8 0"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2[order(df2$pid, df2$expid),]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Sort by gene1 in decreasing order"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | expid | gene1 | gene2 | pid | trt |
\n",
"\n",
"\t4 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
\n",
"\t3 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
\n",
"\t7 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
\n",
"\t1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
\n",
"\t2 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
\n",
"\t5 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
\n",
"\t6 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & expid & gene1 & gene2 & pid & trt\\\\\n",
"\\hline\n",
"\t4 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n",
"\t3 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n",
"\t7 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n",
"\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n",
"\t2 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n",
"\t5 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n",
"\t6 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" expid gene1 gene2 pid trt\n",
"4 100015 0.3937087 1.2339762 pid15 1\n",
"3 100008 0.2865486 1.6858872 pid8 0\n",
"7 100020 -0.4321298 -0.2288958 pid20 0\n",
"1 100001 -0.5996083 1.6898725 pid1 0\n",
"2 100007 -0.7717918 -1.0700682 pid7 0\n",
"5 100017 -0.8864367 0.4120223 pid17 0\n",
"6 100018 -1.3189376 0.7935853 pid18 1"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2[order(df2$gene1, decreasing = TRUE),]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.1.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}