{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Grouping and Aggregation\n", "====" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sorting data\n", "----" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
15.13.51.40.2setosa
24.931.40.2setosa
34.73.21.30.2setosa
44.63.11.50.2setosa
553.61.40.2setosa
65.43.91.70.4setosa
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n", "\\hline\n", "\t1 & 5.1 & 3.5 & 1.4 & 0.2 & setosa\\\\\n", "\t2 & 4.9 & 3 & 1.4 & 0.2 & setosa\\\\\n", "\t3 & 4.7 & 3.2 & 1.3 & 0.2 & setosa\\\\\n", "\t4 & 4.6 & 3.1 & 1.5 & 0.2 & setosa\\\\\n", "\t5 & 5 & 3.6 & 1.4 & 0.2 & setosa\\\\\n", "\t6 & 5.4 & 3.9 & 1.7 & 0.4 & setosa\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "1 5.1 3.5 1.4 0.2 setosa\n", "2 4.9 3.0 1.4 0.2 setosa\n", "3 4.7 3.2 1.3 0.2 setosa\n", "4 4.6 3.1 1.5 0.2 setosa\n", "5 5.0 3.6 1.4 0.2 setosa\n", "6 5.4 3.9 1.7 0.4 setosa" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head(iris, 6)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
144.331.10.1setosa
94.42.91.40.2setosa
394.431.30.2setosa
434.43.21.30.2setosa
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n", "\\hline\n", "\t14 & 4.3 & 3 & 1.1 & 0.1 & setosa\\\\\n", "\t9 & 4.4 & 2.9 & 1.4 & 0.2 & setosa\\\\\n", "\t39 & 4.4 & 3 & 1.3 & 0.2 & setosa\\\\\n", "\t43 & 4.4 & 3.2 & 1.3 & 0.2 & setosa\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "14 4.3 3.0 1.1 0.1 setosa\n", "9 4.4 2.9 1.4 0.2 setosa\n", "39 4.4 3.0 1.3 0.2 setosa\n", "43 4.4 3.2 1.3 0.2 setosa" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ridx <- order(iris$Sepal.Length)\n", "head(iris[ridx,], 4)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
1327.93.86.42virginica
1187.73.86.72.2virginica
1197.72.66.92.3virginica
1237.72.86.72virginica
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n", "\\hline\n", "\t132 & 7.9 & 3.8 & 6.4 & 2 & virginica\\\\\n", "\t118 & 7.7 & 3.8 & 6.7 & 2.2 & virginica\\\\\n", "\t119 & 7.7 & 2.6 & 6.9 & 2.3 & virginica\\\\\n", "\t123 & 7.7 & 2.8 & 6.7 & 2 & virginica\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "132 7.9 3.8 6.4 2.0 virginica\n", "118 7.7 3.8 6.7 2.2 virginica\n", "119 7.7 2.6 6.9 2.3 virginica\n", "123 7.7 2.8 6.7 2.0 virginica" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ridx <- order(iris$Sepal.Length, decreasing = TRUE)\n", "head(iris[ridx,], 4)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
144.331.10.1setosa
394.431.30.2setosa
434.43.21.30.2setosa
94.42.91.40.2setosa
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n", "\\hline\n", "\t14 & 4.3 & 3 & 1.1 & 0.1 & setosa\\\\\n", "\t39 & 4.4 & 3 & 1.3 & 0.2 & setosa\\\\\n", "\t43 & 4.4 & 3.2 & 1.3 & 0.2 & setosa\\\\\n", "\t9 & 4.4 & 2.9 & 1.4 & 0.2 & setosa\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "14 4.3 3.0 1.1 0.1 setosa\n", "39 4.4 3.0 1.3 0.2 setosa\n", "43 4.4 3.2 1.3 0.2 setosa\n", "9 4.4 2.9 1.4 0.2 setosa" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ridx <- order(iris$Sepal.Length, iris$Petal.Length)\n", "head(iris[ridx,], 4)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
144.331.10.1setosa
94.42.91.40.2setosa
394.431.30.2setosa
434.43.21.30.2setosa
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n", "\\hline\n", "\t14 & 4.3 & 3 & 1.1 & 0.1 & setosa\\\\\n", "\t9 & 4.4 & 2.9 & 1.4 & 0.2 & setosa\\\\\n", "\t39 & 4.4 & 3 & 1.3 & 0.2 & setosa\\\\\n", "\t43 & 4.4 & 3.2 & 1.3 & 0.2 & setosa\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "14 4.3 3.0 1.1 0.1 setosa\n", "9 4.4 2.9 1.4 0.2 setosa\n", "39 4.4 3.0 1.3 0.2 setosa\n", "43 4.4 3.2 1.3 0.2 setosa" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ridx <- order(iris$Sepal.Length, -iris$Petal.Length)\n", "head(iris[ridx,], 4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Trnasposing data\n", "----" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
1126.42.75.31.9virginica
1466.735.22.3virginica
565.72.84.51.3versicolor
325.43.41.50.4setosa
986.22.94.31.3versicolor
1296.42.85.62.1virginica
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n", "\\hline\n", "\t112 & 6.4 & 2.7 & 5.3 & 1.9 & virginica\\\\\n", "\t146 & 6.7 & 3 & 5.2 & 2.3 & virginica\\\\\n", "\t56 & 5.7 & 2.8 & 4.5 & 1.3 & versicolor\\\\\n", "\t32 & 5.4 & 3.4 & 1.5 & 0.4 & setosa\\\\\n", "\t98 & 6.2 & 2.9 & 4.3 & 1.3 & versicolor\\\\\n", "\t129 & 6.4 & 2.8 & 5.6 & 2.1 & virginica\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "112 6.4 2.7 5.3 1.9 virginica\n", "146 6.7 3.0 5.2 2.3 virginica\n", "56 5.7 2.8 4.5 1.3 versicolor\n", "32 5.4 3.4 1.5 0.4 setosa\n", "98 6.2 2.9 4.3 1.3 versicolor\n", "129 6.4 2.8 5.6 2.1 virginica" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ir6 <- iris[sample(1:nrow(iris), 6, replace=FALSE),]\n", "ir6" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
112146563298129
Sepal.Length6.46.75.75.46.26.4
Sepal.Width2.73.02.83.42.92.8
Petal.Length5.35.24.51.54.35.6
Petal.Width1.92.31.30.41.32.1
Speciesvirginica virginica versicolorsetosa versicolorvirginica
\n" ], "text/latex": [ "\\begin{tabular}{r|llllll}\n", " & 112 & 146 & 56 & 32 & 98 & 129\\\\\n", "\\hline\n", "\tSepal.Length & 6.4 & 6.7 & 5.7 & 5.4 & 6.2 & 6.4\\\\\n", "\tSepal.Width & 2.7 & 3.0 & 2.8 & 3.4 & 2.9 & 2.8\\\\\n", "\tPetal.Length & 5.3 & 5.2 & 4.5 & 1.5 & 4.3 & 5.6\\\\\n", "\tPetal.Width & 1.9 & 2.3 & 1.3 & 0.4 & 1.3 & 2.1\\\\\n", "\tSpecies & virginica & virginica & versicolor & setosa & versicolor & virginica \\\\\n", "\\end{tabular}\n" ], "text/markdown": [ "1. '6.4'\n", "2. '2.7'\n", "3. '5.3'\n", "4. '1.9'\n", "5. 'virginica'\n", "6. '6.7'\n", "7. '3.0'\n", "8. '5.2'\n", "9. '2.3'\n", "10. 'virginica'\n", "11. '5.7'\n", "12. '2.8'\n", "13. '4.5'\n", "14. '1.3'\n", "15. 'versicolor'\n", "16. '5.4'\n", "17. '3.4'\n", "18. '1.5'\n", "19. '0.4'\n", "20. 'setosa'\n", "21. '6.2'\n", "22. '2.9'\n", "23. '4.3'\n", "24. '1.3'\n", "25. 'versicolor'\n", "26. '6.4'\n", "27. '2.8'\n", "28. '5.6'\n", "29. '2.1'\n", "30. 'virginica'\n", "\n", "\n" ], "text/plain": [ " 112 146 56 32 98 \n", "Sepal.Length \"6.4\" \"6.7\" \"5.7\" \"5.4\" \"6.2\" \n", "Sepal.Width \"2.7\" \"3.0\" \"2.8\" \"3.4\" \"2.9\" \n", "Petal.Length \"5.3\" \"5.2\" \"4.5\" \"1.5\" \"4.3\" \n", "Petal.Width \"1.9\" \"2.3\" \"1.3\" \"0.4\" \"1.3\" \n", "Species \"virginica\" \"virginica\" \"versicolor\" \"setosa\" \"versicolor\"\n", " 129 \n", "Sepal.Length \"6.4\" \n", "Sepal.Width \"2.8\" \n", "Petal.Length \"5.6\" \n", "Petal.Width \"2.1\" \n", "Species \"virginica\"" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "t(ir6)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Aggregation (Subgrouping)\n", "----" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
mpgcyldisphpdratwtqsecvsamgearcarb
Dodge Challenger15.583181502.763.5216.870032
Hornet Sportabout18.783601753.153.4417.020032
Fiat 12832.4478.7664.082.219.471141
Merc 450SLC15.28275.81803.073.78180033
Chrysler Imperial14.784402303.235.34517.420034
Volvo 142E21.441211094.112.7818.61142
Toyota Corolla33.9471.1654.221.83519.91141
Merc 450SE16.48275.81803.074.0717.40033
Mazda RX42161601103.92.6216.460144
Porsche 914-2264120.3914.432.1416.70152
\n" ], "text/latex": [ "\\begin{tabular}{r|lllllllllll}\n", " & mpg & cyl & disp & hp & drat & wt & qsec & vs & am & gear & carb\\\\\n", "\\hline\n", "\tDodge Challenger & 15.5 & 8 & 318 & 150 & 2.76 & 3.52 & 16.87 & 0 & 0 & 3 & 2\\\\\n", "\tHornet Sportabout & 18.7 & 8 & 360 & 175 & 3.15 & 3.44 & 17.02 & 0 & 0 & 3 & 2\\\\\n", "\tFiat 128 & 32.4 & 4 & 78.7 & 66 & 4.08 & 2.2 & 19.47 & 1 & 1 & 4 & 1\\\\\n", "\tMerc 450SLC & 15.2 & 8 & 275.8 & 180 & 3.07 & 3.78 & 18 & 0 & 0 & 3 & 3\\\\\n", "\tChrysler Imperial & 14.7 & 8 & 440 & 230 & 3.23 & 5.345 & 17.42 & 0 & 0 & 3 & 4\\\\\n", "\tVolvo 142E & 21.4 & 4 & 121 & 109 & 4.11 & 2.78 & 18.6 & 1 & 1 & 4 & 2\\\\\n", "\tToyota Corolla & 33.9 & 4 & 71.1 & 65 & 4.22 & 1.835 & 19.9 & 1 & 1 & 4 & 1\\\\\n", "\tMerc 450SE & 16.4 & 8 & 275.8 & 180 & 3.07 & 4.07 & 17.4 & 0 & 0 & 3 & 3\\\\\n", "\tMazda RX4 & 21 & 6 & 160 & 110 & 3.9 & 2.62 & 16.46 & 0 & 1 & 4 & 4\\\\\n", "\tPorsche 914-2 & 26 & 4 & 120.3 & 91 & 4.43 & 2.14 & 16.7 & 0 & 1 & 5 & 2\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " mpg cyl disp hp drat wt qsec vs am gear carb\n", "Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2\n", "Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2\n", "Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1\n", "Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3\n", "Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n", "Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2\n", "Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1\n", "Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3\n", "Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4\n", "Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mt10 <- mtcars[sample(1:nrow(mtcars), 10, replace=FALSE),]\n", "mt10" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\n", "
cylmpgcyldisphpdratwtqsecvsamgearcarb
1426.663644105.136482.636364.0709092.28572719.137270.90909090.72727274.0909091.545455
2619.742866183.3143122.28573.5857143.11714317.977140.57142860.42857143.8571433.428571
3815.18353.1209.21433.2292863.99921416.7721400.14285713.2857143.5
\n" ], "text/latex": [ "\\begin{tabular}{r|llllllllllll}\n", " & cyl & mpg & cyl & disp & hp & drat & wt & qsec & vs & am & gear & carb\\\\\n", "\\hline\n", "\t1 & 4 & 26.66364 & 4 & 105.1364 & 82.63636 & 4.070909 & 2.285727 & 19.13727 & 0.9090909 & 0.7272727 & 4.090909 & 1.545455\\\\\n", "\t2 & 6 & 19.74286 & 6 & 183.3143 & 122.2857 & 3.585714 & 3.117143 & 17.97714 & 0.5714286 & 0.4285714 & 3.857143 & 3.428571\\\\\n", "\t3 & 8 & 15.1 & 8 & 353.1 & 209.2143 & 3.229286 & 3.999214 & 16.77214 & 0 & 0.1428571 & 3.285714 & 3.5\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " cyl mpg cyl disp hp drat wt qsec vs\n", "1 4 26.66364 4 105.1364 82.63636 4.070909 2.285727 19.13727 0.9090909\n", "2 6 19.74286 6 183.3143 122.28571 3.585714 3.117143 17.97714 0.5714286\n", "3 8 15.10000 8 353.1000 209.21429 3.229286 3.999214 16.77214 0.0000000\n", " am gear carb\n", "1 0.7272727 4.090909 1.545455\n", "2 0.4285714 3.857143 3.428571\n", "3 0.1428571 3.285714 3.500000" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with(mtcars, aggregate(mtcars, by=list(cyl=cyl), FUN=mean))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
cylgearmpgcyldisphpdratwtqsecvsamgearcarb
14321.54120.1973.72.46520.011031
26319.756241.5107.52.923.337519.831031
38315.283551803.0753.8117.350033
44425.85493.5664.082.2619.1851141.5
56420.16163.8116.53.913.157517.660.50.544
64528.24107.71024.11.826516.80.5152
76519.761451753.622.7715.50156
88515.48326299.53.883.3714.550156
\n" ], "text/latex": [ "\\begin{tabular}{r|lllllllllllll}\n", " & cyl & gear & mpg & cyl & disp & hp & drat & wt & qsec & vs & am & gear & carb\\\\\n", "\\hline\n", "\t1 & 4 & 3 & 21.5 & 4 & 120.1 & 97 & 3.7 & 2.465 & 20.01 & 1 & 0 & 3 & 1\\\\\n", "\t2 & 6 & 3 & 19.75 & 6 & 241.5 & 107.5 & 2.92 & 3.3375 & 19.83 & 1 & 0 & 3 & 1\\\\\n", "\t3 & 8 & 3 & 15.2 & 8 & 355 & 180 & 3.075 & 3.81 & 17.35 & 0 & 0 & 3 & 3\\\\\n", "\t4 & 4 & 4 & 25.85 & 4 & 93.5 & 66 & 4.08 & 2.26 & 19.185 & 1 & 1 & 4 & 1.5\\\\\n", "\t5 & 6 & 4 & 20.1 & 6 & 163.8 & 116.5 & 3.91 & 3.1575 & 17.66 & 0.5 & 0.5 & 4 & 4\\\\\n", "\t6 & 4 & 5 & 28.2 & 4 & 107.7 & 102 & 4.1 & 1.8265 & 16.8 & 0.5 & 1 & 5 & 2\\\\\n", "\t7 & 6 & 5 & 19.7 & 6 & 145 & 175 & 3.62 & 2.77 & 15.5 & 0 & 1 & 5 & 6\\\\\n", "\t8 & 8 & 5 & 15.4 & 8 & 326 & 299.5 & 3.88 & 3.37 & 14.55 & 0 & 1 & 5 & 6\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " cyl gear mpg cyl disp hp drat wt qsec vs am gear carb\n", "1 4 3 21.50 4 120.1 97.0 3.700 2.4650 20.010 1.0 0.0 3 1.0\n", "2 6 3 19.75 6 241.5 107.5 2.920 3.3375 19.830 1.0 0.0 3 1.0\n", "3 8 3 15.20 8 355.0 180.0 3.075 3.8100 17.350 0.0 0.0 3 3.0\n", "4 4 4 25.85 4 93.5 66.0 4.080 2.2600 19.185 1.0 1.0 4 1.5\n", "5 6 4 20.10 6 163.8 116.5 3.910 3.1575 17.660 0.5 0.5 4 4.0\n", "6 4 5 28.20 4 107.7 102.0 4.100 1.8265 16.800 0.5 1.0 5 2.0\n", "7 6 5 19.70 6 145.0 175.0 3.620 2.7700 15.500 0.0 1.0 5 6.0\n", "8 8 5 15.40 8 326.0 299.5 3.880 3.3700 14.550 0.0 1.0 5 6.0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with(mtcars, aggregate(mtcars, by=list(cyl=cyl, gear=gear), FUN=median))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Reshaping data\n", "----" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Warning message:\n", ": package ‘plyr’ was built under R version 3.1.3" ] } ], "source": [ "library(reshape2)\n", "library(plyr) # needed for the . function" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Starting data frame" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
PIDTimeGene1Gene2
111-1.658798-1.509927
2126.4229992.617535
3138.70391711.20532
42111.80803-0.9115341
5224.6703614.869456
623-0.59828498.871238
\n" ], "text/latex": [ "\\begin{tabular}{r|llll}\n", " & PID & Time & Gene1 & Gene2\\\\\n", "\\hline\n", "\t1 & 1 & 1 & -1.658798 & -1.509927\\\\\n", "\t2 & 1 & 2 & 6.422999 & 2.617535\\\\\n", "\t3 & 1 & 3 & 8.703917 & 11.20532\\\\\n", "\t4 & 2 & 1 & 11.80803 & -0.9115341\\\\\n", "\t5 & 2 & 2 & 4.670361 & 4.869456\\\\\n", "\t6 & 2 & 3 & -0.5982849 & 8.871238\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " PID Time Gene1 Gene2\n", "1 1 1 -1.6587983 -1.5099270\n", "2 1 2 6.4229986 2.6175354\n", "3 1 3 8.7039172 11.2053234\n", "4 2 1 11.8080301 -0.9115341\n", "5 2 2 4.6703610 4.8694561\n", "6 2 3 -0.5982849 8.8712385" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ID <- factor(rep(1:2, each=3))\n", "Time <- rep(1:3, 2)\n", "Gene1 <- c(0,5,10,10,5,0) + rnorm(6)\n", "Gene2 <- c(0,5,10,0,5,10) + rnorm(6)\n", "expt <- data.frame(PID=ID, Time=Time, Gene1=Gene1, Gene2=Gene2)\n", "expt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"Melt\" into a \"tall\" format with all values in a single column. This reuqires identifying all the columns that are needed to uniquely define a row value. In this csse, the \"id\" columns are \"PID\" and \"Time\"." ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
PIDTimeGenevalue
111Gene1-1.658798
212Gene16.422999
313Gene18.703917
421Gene111.80803
522Gene14.670361
623Gene1-0.5982849
711Gene2-1.509927
812Gene22.617535
913Gene211.20532
1021Gene2-0.9115341
1122Gene24.869456
1223Gene28.871238
\n" ], "text/latex": [ "\\begin{tabular}{r|llll}\n", " & PID & Time & Gene & value\\\\\n", "\\hline\n", "\t1 & 1 & 1 & Gene1 & -1.658798\\\\\n", "\t2 & 1 & 2 & Gene1 & 6.422999\\\\\n", "\t3 & 1 & 3 & Gene1 & 8.703917\\\\\n", "\t4 & 2 & 1 & Gene1 & 11.80803\\\\\n", "\t5 & 2 & 2 & Gene1 & 4.670361\\\\\n", "\t6 & 2 & 3 & Gene1 & -0.5982849\\\\\n", "\t7 & 1 & 1 & Gene2 & -1.509927\\\\\n", "\t8 & 1 & 2 & Gene2 & 2.617535\\\\\n", "\t9 & 1 & 3 & Gene2 & 11.20532\\\\\n", "\t10 & 2 & 1 & Gene2 & -0.9115341\\\\\n", "\t11 & 2 & 2 & Gene2 & 4.869456\\\\\n", "\t12 & 2 & 3 & Gene2 & 8.871238\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " PID Time Gene value\n", "1 1 1 Gene1 -1.6587983\n", "2 1 2 Gene1 6.4229986\n", "3 1 3 Gene1 8.7039172\n", "4 2 1 Gene1 11.8080301\n", "5 2 2 Gene1 4.6703610\n", "6 2 3 Gene1 -0.5982849\n", "7 1 1 Gene2 -1.5099270\n", "8 1 2 Gene2 2.6175354\n", "9 1 3 Gene2 11.2053234\n", "10 2 1 Gene2 -0.9115341\n", "11 2 2 Gene2 4.8694561\n", "12 2 3 Gene2 8.8712385" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "m.expt <- melt(expt, id=c(\"PID\", \"Time\"), variable.name=\"Gene\")\n", "m.expt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Use dcast to reshape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Show the time series for each (PID, Gene) combination." ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
PIDGene123
11Gene1-1.6587986.4229998.703917
21Gene2-1.5099272.61753511.20532
32Gene111.808034.670361-0.5982849
42Gene2-0.91153414.8694568.871238
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & PID & Gene & 1 & 2 & 3\\\\\n", "\\hline\n", "\t1 & 1 & Gene1 & -1.658798 & 6.422999 & 8.703917\\\\\n", "\t2 & 1 & Gene2 & -1.509927 & 2.617535 & 11.20532\\\\\n", "\t3 & 2 & Gene1 & 11.80803 & 4.670361 & -0.5982849\\\\\n", "\t4 & 2 & Gene2 & -0.9115341 & 4.869456 & 8.871238\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " PID Gene 1 2 3\n", "1 1 Gene1 -1.6587983 6.422999 8.7039172\n", "2 1 Gene2 -1.5099270 2.617535 11.2053234\n", "3 2 Gene1 11.8080301 4.670361 -0.5982849\n", "4 2 Gene2 -0.9115341 4.869456 8.8712385" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dcast(m.expt, PID + Gene ~ Time)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Show the time series for each (Gene, PID) combination." ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
GenePID123
1Gene11-1.6587986.4229998.703917
2Gene1211.808034.670361-0.5982849
3Gene21-1.5099272.61753511.20532
4Gene22-0.91153414.8694568.871238
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & Gene & PID & 1 & 2 & 3\\\\\n", "\\hline\n", "\t1 & Gene1 & 1 & -1.658798 & 6.422999 & 8.703917\\\\\n", "\t2 & Gene1 & 2 & 11.80803 & 4.670361 & -0.5982849\\\\\n", "\t3 & Gene2 & 1 & -1.509927 & 2.617535 & 11.20532\\\\\n", "\t4 & Gene2 & 2 & -0.9115341 & 4.869456 & 8.871238\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " Gene PID 1 2 3\n", "1 Gene1 1 -1.6587983 6.422999 8.7039172\n", "2 Gene1 2 11.8080301 4.670361 -0.5982849\n", "3 Gene2 1 -1.5099270 2.617535 11.2053234\n", "4 Gene2 2 -0.9115341 4.869456 8.8712385" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dcast(m.expt, Gene + PID ~ Time)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Recreate the original data set" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
PIDTimeGene1Gene2
111-1.658798-1.509927
2126.4229992.617535
3138.70391711.20532
42111.80803-0.9115341
5224.6703614.869456
623-0.59828498.871238
\n" ], "text/latex": [ "\\begin{tabular}{r|llll}\n", " & PID & Time & Gene1 & Gene2\\\\\n", "\\hline\n", "\t1 & 1 & 1 & -1.658798 & -1.509927\\\\\n", "\t2 & 1 & 2 & 6.422999 & 2.617535\\\\\n", "\t3 & 1 & 3 & 8.703917 & 11.20532\\\\\n", "\t4 & 2 & 1 & 11.80803 & -0.9115341\\\\\n", "\t5 & 2 & 2 & 4.670361 & 4.869456\\\\\n", "\t6 & 2 & 3 & -0.5982849 & 8.871238\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " PID Time Gene1 Gene2\n", "1 1 1 -1.6587983 -1.5099270\n", "2 1 2 6.4229986 2.6175354\n", "3 1 3 8.7039172 11.2053234\n", "4 2 1 11.8080301 -0.9115341\n", "5 2 2 4.6703610 4.8694561\n", "6 2 3 -0.5982849 8.8712385" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dcast(m.expt, PID + Time ~ Gene)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Show all data for each subject in a single row" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\n", "
PIDGene1_1Gene1_2Gene1_3Gene2_1Gene2_2Gene2_3
11-1.6587986.4229998.703917-1.5099272.61753511.20532
2211.808034.670361-0.5982849-0.91153414.8694568.871238
\n" ], "text/latex": [ "\\begin{tabular}{r|lllllll}\n", " & PID & Gene1_1 & Gene1_2 & Gene1_3 & Gene2_1 & Gene2_2 & Gene2_3\\\\\n", "\\hline\n", "\t1 & 1 & -1.658798 & 6.422999 & 8.703917 & -1.509927 & 2.617535 & 11.20532\\\\\n", "\t2 & 2 & 11.80803 & 4.670361 & -0.5982849 & -0.9115341 & 4.869456 & 8.871238\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " PID Gene1_1 Gene1_2 Gene1_3 Gene2_1 Gene2_2 Gene2_3\n", "1 1 -1.658798 6.422999 8.7039172 -1.5099270 2.617535 11.205323\n", "2 2 11.808030 4.670361 -0.5982849 -0.9115341 4.869456 8.871238" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dcast(m.expt, PID ~ Gene + Time)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### We can also aggregate while reshaping" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### What is the average expression value for each gene for each subject over all time points?" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\n", "
PIDGene1Gene2
114.4893724.104311
225.2933694.276387
\n" ], "text/latex": [ "\\begin{tabular}{r|lll}\n", " & PID & Gene1 & Gene2\\\\\n", "\\hline\n", "\t1 & 1 & 4.489372 & 4.104311\\\\\n", "\t2 & 2 & 5.293369 & 4.276387\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " PID Gene1 Gene2\n", "1 1 4.489372 4.104311\n", "2 2 5.293369 4.276387" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dcast(m.expt, PID ~ Gene, mean)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### What is the average expression value for each gene for each time point over all subjects?" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\n", "
TimeGene1Gene2
115.074616-1.210731
225.546683.743496
334.05281610.03828
\n" ], "text/latex": [ "\\begin{tabular}{r|lll}\n", " & Time & Gene1 & Gene2\\\\\n", "\\hline\n", "\t1 & 1 & 5.074616 & -1.210731\\\\\n", "\t2 & 2 & 5.54668 & 3.743496\\\\\n", "\t3 & 3 & 4.052816 & 10.03828\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " Time Gene1 Gene2\n", "1 1 5.074616 -1.210731\n", "2 2 5.546680 3.743496\n", "3 3 4.052816 10.038281" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dcast(m.expt, Time ~ Gene, mean)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Finally, we can perform subssetting on the named vairables.\n", "\n", "For example, restrict the previous query to subject with PID= 1." ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\n", "
TimeGene1Gene2
11-1.658798-1.509927
226.4229992.617535
338.70391711.20532
\n" ], "text/latex": [ "\\begin{tabular}{r|lll}\n", " & Time & Gene1 & Gene2\\\\\n", "\\hline\n", "\t1 & 1 & -1.658798 & -1.509927\\\\\n", "\t2 & 2 & 6.422999 & 2.617535\\\\\n", "\t3 & 3 & 8.703917 & 11.20532\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " Time Gene1 Gene2\n", "1 1 -1.658798 -1.509927\n", "2 2 6.422999 2.617535\n", "3 3 8.703917 11.205323" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dcast(m.expt, Time ~ Gene, mean, subset = .(PID == 1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Merging data\n", "----\n", "\n", "A common task in data analysis is to link data from two or more datasts, for example, to relate assay data to clinical phenoytpe. \n", "\n", "Here we will work thought a typical example where the genotype and phenoytpe information come from two different data sets, and the ID information needed to link the two is from a third data set." ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "phenodat <- read.csv(\"phenodat.csv\") \n", "gdat1 <- read.csv(\"gdat1.csv\") \n", "gdat2 <- read.csv(\"gdat2.csv\")\n", "iddat <- read.csv(\"iddat.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Eyeball data sett\n", "----\n", "\n", "A quick sanity check to see what the data look like." ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
  1. 10
  2. \n", "\t
  3. 2
  4. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 10\n", "\\item 2\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 10\n", "2. 2\n", "\n", "\n" ], "text/plain": [ "[1] 10 2" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ "
    \n", "\t
  1. 11
  2. \n", "\t
  3. 3
  4. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 11\n", "\\item 3\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 11\n", "2. 3\n", "\n", "\n" ], "text/plain": [ "[1] 11 3" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ "
    \n", "\t
  1. 11
  2. \n", "\t
  3. 3
  4. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 11\n", "\\item 3\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 11\n", "2. 3\n", "\n", "\n" ], "text/plain": [ "[1] 11 3" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ "
    \n", "\t
  1. 20
  2. \n", "\t
  3. 2
  4. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 20\n", "\\item 2\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 20\n", "2. 2\n", "\n", "\n" ], "text/plain": [ "[1] 20 2" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(dim(phenodat))\n", "(dim(gdat1))\n", "(dim(gdat2))\n", "(dim(iddat))" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\n", "
pidtrt
1pid60
2pid151
3pid80
\n" ], "text/latex": [ "\\begin{tabular}{r|ll}\n", " & pid & trt\\\\\n", "\\hline\n", "\t1 & pid6 & 0\\\\\n", "\t2 & pid15 & 1\\\\\n", "\t3 & pid8 & 0\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " pid trt\n", "1 pid6 0\n", "2 pid15 1\n", "3 pid8 0" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head(phenodat, 3)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\n", "
expidgene1gene2
1100020-0.4321298-0.2288958
2100018-1.3189380.7935853
31000131.242919-1.334354
\n" ], "text/latex": [ "\\begin{tabular}{r|lll}\n", " & expid & gene1 & gene2\\\\\n", "\\hline\n", "\t1 & 100020 & -0.4321298 & -0.2288958\\\\\n", "\t2 & 100018 & -1.318938 & 0.7935853\\\\\n", "\t3 & 100013 & 1.242919 & -1.334354\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " expid gene1 gene2\n", "1 100020 -0.4321298 -0.2288958\n", "2 100018 -1.3189376 0.7935853\n", "3 100013 1.2429188 -1.3343536" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head(gdat1, 3)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\n", "
expidgene1gene2
1100009-1.220512-0.2416898
21000080.28654861.685887
3100007-0.7717918-1.070068
\n" ], "text/latex": [ "\\begin{tabular}{r|lll}\n", " & expid & gene1 & gene2\\\\\n", "\\hline\n", "\t1 & 100009 & -1.220512 & -0.2416898\\\\\n", "\t2 & 100008 & 0.2865486 & 1.685887\\\\\n", "\t3 & 100007 & -0.7717918 & -1.070068\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " expid gene1 gene2\n", "1 100009 -1.2205120 -0.2416898\n", "2 100008 0.2865486 1.6858872\n", "3 100007 -0.7717918 -1.0700682" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head(gdat2, 3)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\n", "
pidexpid
1pid20100020
2pid9100009
3pid13100013
\n" ], "text/latex": [ "\\begin{tabular}{r|ll}\n", " & pid & expid\\\\\n", "\\hline\n", "\t1 & pid20 & 100020\\\\\n", "\t2 & pid9 & 100009\\\\\n", "\t3 & pid13 & 100013\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " pid expid\n", "1 pid20 100020\n", "2 pid9 100009\n", "3 pid13 100013" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head(iddat, 3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Combine gene data from two data sets\n", "----\n", "\n", "Often, we have the same type of data stroed in mulitple data sets, for example, one per batch. In this case, we want to combine **rows**." ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [], "source": [ "gdat <- rbind(gdat1, gdat2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Checking for duplicates\n", "---- " ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [], "source": [ "show.dups <- function(df) {\n", " return(df[duplicated(df), ])\n", " }" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
pidtrt
\n" ], "text/latex": [ "\\begin{tabular}{r|ll}\n", " & pid & trt\\\\\n", "\\hline\n", "\\end{tabular}\n" ], "text/plain": [ "[1] pid trt\n", "<0 rows> (or 0-length row.names)" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "show.dups(phenodat)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
pidexpid
\n" ], "text/latex": [ "\\begin{tabular}{r|ll}\n", " & pid & expid\\\\\n", "\\hline\n", "\\end{tabular}\n" ], "text/plain": [ "[1] pid expid\n", "<0 rows> (or 0-length row.names)" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "show.dups(iddat)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidgene1gene2
131000080.28654861.685887
151000030.88673610.2760235
17100004-0.151396-1.048976
18100018-1.3189380.7935853
201000110.8001769-0.7729782
21100001-0.59960831.689873
\n" ], "text/latex": [ "\\begin{tabular}{r|lll}\n", " & expid & gene1 & gene2\\\\\n", "\\hline\n", "\t13 & 100008 & 0.2865486 & 1.685887\\\\\n", "\t15 & 100003 & 0.8867361 & 0.2760235\\\\\n", "\t17 & 100004 & -0.151396 & -1.048976\\\\\n", "\t18 & 100018 & -1.318938 & 0.7935853\\\\\n", "\t20 & 100011 & 0.8001769 & -0.7729782\\\\\n", "\t21 & 100001 & -0.5996083 & 1.689873\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " expid gene1 gene2\n", "13 100008 0.2865486 1.6858872\n", "15 100003 0.8867361 0.2760235\n", "17 100004 -0.1513960 -1.0489755\n", "18 100018 -1.3189376 0.7935853\n", "20 100011 0.8001769 -0.7729782\n", "21 100001 -0.5996083 1.6898725" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "show.dups(gdat)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Remove duplicates\n", "----" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [], "source": [ "gdat <- unique(gdat)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
  1. 16
  2. \n", "\t
  3. 3
  4. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 16\n", "\\item 3\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 16\n", "2. 3\n", "\n", "\n" ], "text/plain": [ "[1] 16 3" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dim(gdat)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "
expidgene1gene2
\n" ], "text/latex": [ "\\begin{tabular}{r|lll}\n", " & expid & gene1 & gene2\\\\\n", "\\hline\n", "\\end{tabular}\n" ], "text/plain": [ "[1] expid gene1 gene2\n", "<0 rows> (or 0-length row.names)" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "show.dups(gdat)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Merging\n", "----\n", "\n", "To combine columns from different data sets, we can perform a `merge` operation. Rows in the different data set need some common identifier to be merged, typcialy information from one or more \"ID\" columns." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Merge all rows with information for both phenotype and gene" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### First we merge phnenoytpe data with the ID data" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
pidtrtexpid
1pid10100001
2pid121100012
3pid151100015
4pid160100016
5pid170100017
6pid181100018
7pid200100020
8pid60100006
9pid70100007
10pid80100008
\n" ], "text/latex": [ "\\begin{tabular}{r|lll}\n", " & pid & trt & expid\\\\\n", "\\hline\n", "\t1 & pid1 & 0 & 100001\\\\\n", "\t2 & pid12 & 1 & 100012\\\\\n", "\t3 & pid15 & 1 & 100015\\\\\n", "\t4 & pid16 & 0 & 100016\\\\\n", "\t5 & pid17 & 0 & 100017\\\\\n", "\t6 & pid18 & 1 & 100018\\\\\n", "\t7 & pid20 & 0 & 100020\\\\\n", "\t8 & pid6 & 0 & 100006\\\\\n", "\t9 & pid7 & 0 & 100007\\\\\n", "\t10 & pid8 & 0 & 100008\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " pid trt expid\n", "1 pid1 0 100001\n", "2 pid12 1 100012\n", "3 pid15 1 100015\n", "4 pid16 0 100016\n", "5 pid17 0 100017\n", "6 pid18 1 100018\n", "7 pid20 0 100020\n", "8 pid6 0 100006\n", "9 pid7 0 100007\n", "10 pid8 0 100008" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(df1 <- merge(phenodat, iddat, by=\"pid\", all.x=TRUE))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Then we merge with gene data" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
2100007-0.7717918-1.070068pid70
31000080.28654861.685887pid80
41000150.39370871.233976pid151
5100017-0.88643670.4120223pid170
6100018-1.3189380.7935853pid181
7100020-0.4321298-0.2288958pid200
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & expid & gene1 & gene2 & pid & trt\\\\\n", "\\hline\n", "\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n", "\t2 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n", "\t3 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n", "\t4 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n", "\t5 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n", "\t6 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n", "\t7 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " expid gene1 gene2 pid trt\n", "1 100001 -0.5996083 1.6898725 pid1 0\n", "2 100007 -0.7717918 -1.0700682 pid7 0\n", "3 100008 0.2865486 1.6858872 pid8 0\n", "4 100015 0.3937087 1.2339762 pid15 1\n", "5 100017 -0.8864367 0.4120223 pid17 0\n", "6 100018 -1.3189376 0.7935853 pid18 1\n", "7 100020 -0.4321298 -0.2288958 pid20 0" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(df2 <- merge(gdat, df1, by=\"expid\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that there are now only 7 rows becasue 3 phenotypes did not have matching gene data." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### What if we want to show all genes even if there is no matching phenotype data?" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
2100002-0.12941071.228393NANA
31000030.88673610.2760235NANA
4100004-0.151396-1.048976NANA
51000050.3297912-0.5208693NANA
6100007-0.7717918-1.070068pid70
71000080.28654861.685887pid80
8100009-1.220512-0.2416898NANA
91000110.8001769-0.7729782NANA
101000131.242919-1.334354NANA
11100014-0.93438510.4958705NANA
121000150.39370871.233976pid151
13100017-0.88643670.4120223pid170
14100018-1.3189380.7935853pid181
151000190.02884391-0.1524106NANA
16100020-0.4321298-0.2288958pid200
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & expid & gene1 & gene2 & pid & trt\\\\\n", "\\hline\n", "\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n", "\t2 & 100002 & -0.1294107 & 1.228393 & NA & NA\\\\\n", "\t3 & 100003 & 0.8867361 & 0.2760235 & NA & NA\\\\\n", "\t4 & 100004 & -0.151396 & -1.048976 & NA & NA\\\\\n", "\t5 & 100005 & 0.3297912 & -0.5208693 & NA & NA\\\\\n", "\t6 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n", "\t7 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n", "\t8 & 100009 & -1.220512 & -0.2416898 & NA & NA\\\\\n", "\t9 & 100011 & 0.8001769 & -0.7729782 & NA & NA\\\\\n", "\t10 & 100013 & 1.242919 & -1.334354 & NA & NA\\\\\n", "\t11 & 100014 & -0.9343851 & 0.4958705 & NA & NA\\\\\n", "\t12 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n", "\t13 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n", "\t14 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n", "\t15 & 100019 & 0.02884391 & -0.1524106 & NA & NA\\\\\n", "\t16 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " expid gene1 gene2 pid trt\n", "1 100001 -0.59960833 1.6898725 pid1 0\n", "2 100002 -0.12941069 1.2283928 NA\n", "3 100003 0.88673615 0.2760235 NA\n", "4 100004 -0.15139596 -1.0489755 NA\n", "5 100005 0.32979120 -0.5208693 NA\n", "6 100007 -0.77179177 -1.0700682 pid7 0\n", "7 100008 0.28654857 1.6858872 pid8 0\n", "8 100009 -1.22051198 -0.2416898 NA\n", "9 100011 0.80017687 -0.7729782 NA\n", "10 100013 1.24291877 -1.3343536 NA\n", "11 100014 -0.93438506 0.4958705 NA\n", "12 100015 0.39370865 1.2339762 pid15 1\n", "13 100017 -0.88643672 0.4120223 pid17 0\n", "14 100018 -1.31893760 0.7935853 pid18 1\n", "15 100019 0.02884391 -0.1524106 NA\n", "16 100020 -0.43212979 -0.2288958 pid20 0" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(df3 <- merge(gdat, df1, by=\"expid\", all.x=TRUE))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### What if we want to show all phenotypes even if there is no matching gene data?" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
2100006NANApid60
3100007-0.7717918-1.070068pid70
41000080.28654861.685887pid80
5100012NANApid121
61000150.39370871.233976pid151
7100016NANApid160
8100017-0.88643670.4120223pid170
9100018-1.3189380.7935853pid181
10100020-0.4321298-0.2288958pid200
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & expid & gene1 & gene2 & pid & trt\\\\\n", "\\hline\n", "\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n", "\t2 & 100006 & NA & NA & pid6 & 0\\\\\n", "\t3 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n", "\t4 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n", "\t5 & 100012 & NA & NA & pid12 & 1\\\\\n", "\t6 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n", "\t7 & 100016 & NA & NA & pid16 & 0\\\\\n", "\t8 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n", "\t9 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n", "\t10 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " expid gene1 gene2 pid trt\n", "1 100001 -0.5996083 1.6898725 pid1 0\n", "2 100006 NA NA pid6 0\n", "3 100007 -0.7717918 -1.0700682 pid7 0\n", "4 100008 0.2865486 1.6858872 pid8 0\n", "5 100012 NA NA pid12 1\n", "6 100015 0.3937087 1.2339762 pid15 1\n", "7 100016 NA NA pid16 0\n", "8 100017 -0.8864367 0.4120223 pid17 0\n", "9 100018 -1.3189376 0.7935853 pid18 1\n", "10 100020 -0.4321298 -0.2288958 pid20 0" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(df4 <- merge(gdat, df1, by=\"expid\", all.y=TRUE))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### What if we want to show everything?" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
2100002-0.12941071.228393NANA
31000030.88673610.2760235NANA
4100004-0.151396-1.048976NANA
51000050.3297912-0.5208693NANA
6100006NANApid60
7100007-0.7717918-1.070068pid70
81000080.28654861.685887pid80
9100009-1.220512-0.2416898NANA
101000110.8001769-0.7729782NANA
11100012NANApid121
121000131.242919-1.334354NANA
13100014-0.93438510.4958705NANA
141000150.39370871.233976pid151
15100016NANApid160
16100017-0.88643670.4120223pid170
17100018-1.3189380.7935853pid181
181000190.02884391-0.1524106NANA
19100020-0.4321298-0.2288958pid200
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & expid & gene1 & gene2 & pid & trt\\\\\n", "\\hline\n", "\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n", "\t2 & 100002 & -0.1294107 & 1.228393 & NA & NA\\\\\n", "\t3 & 100003 & 0.8867361 & 0.2760235 & NA & NA\\\\\n", "\t4 & 100004 & -0.151396 & -1.048976 & NA & NA\\\\\n", "\t5 & 100005 & 0.3297912 & -0.5208693 & NA & NA\\\\\n", "\t6 & 100006 & NA & NA & pid6 & 0\\\\\n", "\t7 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n", "\t8 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n", "\t9 & 100009 & -1.220512 & -0.2416898 & NA & NA\\\\\n", "\t10 & 100011 & 0.8001769 & -0.7729782 & NA & NA\\\\\n", "\t11 & 100012 & NA & NA & pid12 & 1\\\\\n", "\t12 & 100013 & 1.242919 & -1.334354 & NA & NA\\\\\n", "\t13 & 100014 & -0.9343851 & 0.4958705 & NA & NA\\\\\n", "\t14 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n", "\t15 & 100016 & NA & NA & pid16 & 0\\\\\n", "\t16 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n", "\t17 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n", "\t18 & 100019 & 0.02884391 & -0.1524106 & NA & NA\\\\\n", "\t19 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " expid gene1 gene2 pid trt\n", "1 100001 -0.59960833 1.6898725 pid1 0\n", "2 100002 -0.12941069 1.2283928 NA\n", "3 100003 0.88673615 0.2760235 NA\n", "4 100004 -0.15139596 -1.0489755 NA\n", "5 100005 0.32979120 -0.5208693 NA\n", "6 100006 NA NA pid6 0\n", "7 100007 -0.77179177 -1.0700682 pid7 0\n", "8 100008 0.28654857 1.6858872 pid8 0\n", "9 100009 -1.22051198 -0.2416898 NA\n", "10 100011 0.80017687 -0.7729782 NA\n", "11 100012 NA NA pid12 1\n", "12 100013 1.24291877 -1.3343536 NA\n", "13 100014 -0.93438506 0.4958705 NA\n", "14 100015 0.39370865 1.2339762 pid15 1\n", "15 100016 NA NA pid16 0\n", "16 100017 -0.88643672 0.4120223 pid17 0\n", "17 100018 -1.31893760 0.7935853 pid18 1\n", "18 100019 0.02884391 -0.1524106 NA\n", "19 100020 -0.43212979 -0.2288958 pid20 0" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(df5 <- merge(gdat, df1, by=\"expid\", all.x=TRUE, all.y=TRUE))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Rearrange column order\n", "-----" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
pidexpidgene1gene2trt
1pid1100001-0.59960831.6898730
2pid7100007-0.7717918-1.0700680
3pid81000080.28654861.6858870
4pid151000150.39370871.2339761
5pid17100017-0.88643670.41202230
6pid18100018-1.3189380.79358531
7pid20100020-0.4321298-0.22889580
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & pid & expid & gene1 & gene2 & trt\\\\\n", "\\hline\n", "\t1 & pid1 & 100001 & -0.5996083 & 1.689873 & 0\\\\\n", "\t2 & pid7 & 100007 & -0.7717918 & -1.070068 & 0\\\\\n", "\t3 & pid8 & 100008 & 0.2865486 & 1.685887 & 0\\\\\n", "\t4 & pid15 & 100015 & 0.3937087 & 1.233976 & 1\\\\\n", "\t5 & pid17 & 100017 & -0.8864367 & 0.4120223 & 0\\\\\n", "\t6 & pid18 & 100018 & -1.318938 & 0.7935853 & 1\\\\\n", "\t7 & pid20 & 100020 & -0.4321298 & -0.2288958 & 0\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " pid expid gene1 gene2 trt\n", "1 pid1 100001 -0.5996083 1.6898725 0\n", "2 pid7 100007 -0.7717918 -1.0700682 0\n", "3 pid8 100008 0.2865486 1.6858872 0\n", "4 pid15 100015 0.3937087 1.2339762 1\n", "5 pid17 100017 -0.8864367 0.4120223 0\n", "6 pid18 100018 -1.3189376 0.7935853 1\n", "7 pid20 100020 -0.4321298 -0.2288958 0" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2[, c(4,1,2,3,5)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sorting data\n", "---" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
2100007-0.7717918-1.070068pid70
31000080.28654861.685887pid80
41000150.39370871.233976pid151
5100017-0.88643670.4120223pid170
6100018-1.3189380.7935853pid181
7100020-0.4321298-0.2288958pid200
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & expid & gene1 & gene2 & pid & trt\\\\\n", "\\hline\n", "\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n", "\t2 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n", "\t3 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n", "\t4 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n", "\t5 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n", "\t6 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n", "\t7 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " expid gene1 gene2 pid trt\n", "1 100001 -0.5996083 1.6898725 pid1 0\n", "2 100007 -0.7717918 -1.0700682 pid7 0\n", "3 100008 0.2865486 1.6858872 pid8 0\n", "4 100015 0.3937087 1.2339762 pid15 1\n", "5 100017 -0.8864367 0.4120223 pid17 0\n", "6 100018 -1.3189376 0.7935853 pid18 1\n", "7 100020 -0.4321298 -0.2288958 pid20 0" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Sort by expid" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
2100007-0.7717918-1.070068pid70
31000080.28654861.685887pid80
41000150.39370871.233976pid151
5100017-0.88643670.4120223pid170
6100018-1.3189380.7935853pid181
7100020-0.4321298-0.2288958pid200
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & expid & gene1 & gene2 & pid & trt\\\\\n", "\\hline\n", "\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n", "\t2 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n", "\t3 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n", "\t4 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n", "\t5 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n", "\t6 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n", "\t7 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " expid gene1 gene2 pid trt\n", "1 100001 -0.5996083 1.6898725 pid1 0\n", "2 100007 -0.7717918 -1.0700682 pid7 0\n", "3 100008 0.2865486 1.6858872 pid8 0\n", "4 100015 0.3937087 1.2339762 pid15 1\n", "5 100017 -0.8864367 0.4120223 pid17 0\n", "6 100018 -1.3189376 0.7935853 pid18 1\n", "7 100020 -0.4321298 -0.2288958 pid20 0" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2[order(df2$expid),]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Sort by pid" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
41000150.39370871.233976pid151
5100017-0.88643670.4120223pid170
6100018-1.3189380.7935853pid181
7100020-0.4321298-0.2288958pid200
2100007-0.7717918-1.070068pid70
31000080.28654861.685887pid80
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & expid & gene1 & gene2 & pid & trt\\\\\n", "\\hline\n", "\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n", "\t4 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n", "\t5 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n", "\t6 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n", "\t7 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n", "\t2 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n", "\t3 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " expid gene1 gene2 pid trt\n", "1 100001 -0.5996083 1.6898725 pid1 0\n", "4 100015 0.3937087 1.2339762 pid15 1\n", "5 100017 -0.8864367 0.4120223 pid17 0\n", "6 100018 -1.3189376 0.7935853 pid18 1\n", "7 100020 -0.4321298 -0.2288958 pid20 0\n", "2 100007 -0.7717918 -1.0700682 pid7 0\n", "3 100008 0.2865486 1.6858872 pid8 0" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2[order(df2$pid),]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Sort by pid, then by expid" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
41000150.39370871.233976pid151
5100017-0.88643670.4120223pid170
6100018-1.3189380.7935853pid181
7100020-0.4321298-0.2288958pid200
2100007-0.7717918-1.070068pid70
31000080.28654861.685887pid80
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & expid & gene1 & gene2 & pid & trt\\\\\n", "\\hline\n", "\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n", "\t4 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n", "\t5 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n", "\t6 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n", "\t7 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n", "\t2 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n", "\t3 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " expid gene1 gene2 pid trt\n", "1 100001 -0.5996083 1.6898725 pid1 0\n", "4 100015 0.3937087 1.2339762 pid15 1\n", "5 100017 -0.8864367 0.4120223 pid17 0\n", "6 100018 -1.3189376 0.7935853 pid18 1\n", "7 100020 -0.4321298 -0.2288958 pid20 0\n", "2 100007 -0.7717918 -1.0700682 pid7 0\n", "3 100008 0.2865486 1.6858872 pid8 0" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2[order(df2$pid, df2$expid),]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Sort by gene1 in decreasing order" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidgene1gene2pidtrt
41000150.39370871.233976pid151
31000080.28654861.685887pid80
7100020-0.4321298-0.2288958pid200
1100001-0.59960831.689873pid10
2100007-0.7717918-1.070068pid70
5100017-0.88643670.4120223pid170
6100018-1.3189380.7935853pid181
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & expid & gene1 & gene2 & pid & trt\\\\\n", "\\hline\n", "\t4 & 100015 & 0.3937087 & 1.233976 & pid15 & 1\\\\\n", "\t3 & 100008 & 0.2865486 & 1.685887 & pid8 & 0\\\\\n", "\t7 & 100020 & -0.4321298 & -0.2288958 & pid20 & 0\\\\\n", "\t1 & 100001 & -0.5996083 & 1.689873 & pid1 & 0\\\\\n", "\t2 & 100007 & -0.7717918 & -1.070068 & pid7 & 0\\\\\n", "\t5 & 100017 & -0.8864367 & 0.4120223 & pid17 & 0\\\\\n", "\t6 & 100018 & -1.318938 & 0.7935853 & pid18 & 1\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " expid gene1 gene2 pid trt\n", "4 100015 0.3937087 1.2339762 pid15 1\n", "3 100008 0.2865486 1.6858872 pid8 0\n", "7 100020 -0.4321298 -0.2288958 pid20 0\n", "1 100001 -0.5996083 1.6898725 pid1 0\n", "2 100007 -0.7717918 -1.0700682 pid7 0\n", "5 100017 -0.8864367 0.4120223 pid17 0\n", "6 100018 -1.3189376 0.7935853 pid18 1" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2[order(df2$gene1, decreasing = TRUE),]" ] } ], "metadata": { "kernelspec": { "display_name": "R", "language": "R", "name": "ir" }, "language_info": { "codemirror_mode": "r", "file_extension": ".r", "mimetype": "text/x-r-source", "name": "R", "pygments_lexer": "r", "version": "3.1.2" } }, "nbformat": 4, "nbformat_minor": 0 }