{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Import count data using basic tools" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We want to read in the count files using more basic tools (important if the count files are not generated fro,m htseq-count). We will import the files as matrices. We start with some of the preliminary steps" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "library(tools)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "phfile=\"sampletable.txt\"\n", "cntdir=\"COUNTS\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "sampletable.txt: '10cdaaa6e5fc76ab92b6d845d33a2a2f'" ], "text/latex": [ "\\textbf{sampletable.txt:} '10cdaaa6e5fc76ab92b6d845d33a2a2f'" ], "text/markdown": [ "**sampletable.txt:** '10cdaaa6e5fc76ab92b6d845d33a2a2f'" ], "text/plain": [ " sampletable.txt \n", "\"10cdaaa6e5fc76ab92b6d845d33a2a2f\" " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "md5sum(phfile)\n", "phdata=read.table(phfile,sep=\",\",stringsAsFactor=FALSE)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
filenamesampidtrtmd5sum
1AGTCAA_counts.tsvAGTCAA0a9eaa959aba1b02b3831583c2a9751c8
2AGTTCC_counts.tsvAGTTCC04183767e4eeb75dc582bcf438af13500
3ATGTCA_counts.tsvATGTCA026fbba06520758e5a3acd9bd432ebed4
4CCGTCC_counts.tsvCCGTCC150036a88fd48645f740a31f4f4352cfb
5GTCCGC_counts.tsvGTCCGC1bb1cecd886127159157e9431d072cad5
6GTGAAA_counts.tsvGTGAAA1fa544c0a076eedb54937c7189f4e1fbc
\n" ], "text/latex": [ "\\begin{tabular}{r|llll}\n", " & filename & sampid & trt & md5sum\\\\\n", "\\hline\n", "\t1 & AGTCAA_counts.tsv & AGTCAA & 0 & a9eaa959aba1b02b3831583c2a9751c8\\\\\n", "\t2 & AGTTCC_counts.tsv & AGTTCC & 0 & 4183767e4eeb75dc582bcf438af13500\\\\\n", "\t3 & ATGTCA_counts.tsv & ATGTCA & 0 & 26fbba06520758e5a3acd9bd432ebed4\\\\\n", "\t4 & CCGTCC_counts.tsv & CCGTCC & 1 & 50036a88fd48645f740a31f4f4352cfb\\\\\n", "\t5 & GTCCGC_counts.tsv & GTCCGC & 1 & bb1cecd886127159157e9431d072cad5\\\\\n", "\t6 & GTGAAA_counts.tsv & GTGAAA & 1 & fa544c0a076eedb54937c7189f4e1fbc\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " filename sampid trt md5sum\n", "1 AGTCAA_counts.tsv AGTCAA 0 a9eaa959aba1b02b3831583c2a9751c8\n", "2 AGTTCC_counts.tsv AGTTCC 0 4183767e4eeb75dc582bcf438af13500\n", "3 ATGTCA_counts.tsv ATGTCA 0 26fbba06520758e5a3acd9bd432ebed4\n", "4 CCGTCC_counts.tsv CCGTCC 1 50036a88fd48645f740a31f4f4352cfb\n", "5 GTCCGC_counts.tsv GTCCGC 1 bb1cecd886127159157e9431d072cad5\n", "6 GTGAAA_counts.tsv GTGAAA 1 fa544c0a076eedb54937c7189f4e1fbc" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "colnames(phdata)=c(\"filename\",\"sampid\",\"trt\")\n", "phdata[[\"md5sum\"]]=md5sum(file.path(cntdir,phdata[[\"filename\"]]))\n", "phdata" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
sampidfilenametrtmd5sum
1AGTCAAAGTCAA_counts.tsv0a9eaa959aba1b02b3831583c2a9751c8
2AGTTCCAGTTCC_counts.tsv04183767e4eeb75dc582bcf438af13500
3ATGTCAATGTCA_counts.tsv026fbba06520758e5a3acd9bd432ebed4
4CCGTCCCCGTCC_counts.tsv150036a88fd48645f740a31f4f4352cfb
5GTCCGCGTCCGC_counts.tsv1bb1cecd886127159157e9431d072cad5
6GTGAAAGTGAAA_counts.tsv1fa544c0a076eedb54937c7189f4e1fbc
\n" ], "text/latex": [ "\\begin{tabular}{r|llll}\n", " & sampid & filename & trt & md5sum\\\\\n", "\\hline\n", "\t1 & AGTCAA & AGTCAA_counts.tsv & 0 & a9eaa959aba1b02b3831583c2a9751c8\\\\\n", "\t2 & AGTTCC & AGTTCC_counts.tsv & 0 & 4183767e4eeb75dc582bcf438af13500\\\\\n", "\t3 & ATGTCA & ATGTCA_counts.tsv & 0 & 26fbba06520758e5a3acd9bd432ebed4\\\\\n", "\t4 & CCGTCC & CCGTCC_counts.tsv & 1 & 50036a88fd48645f740a31f4f4352cfb\\\\\n", "\t5 & GTCCGC & GTCCGC_counts.tsv & 1 & bb1cecd886127159157e9431d072cad5\\\\\n", "\t6 & GTGAAA & GTGAAA_counts.tsv & 1 & fa544c0a076eedb54937c7189f4e1fbc\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " sampid filename trt md5sum\n", "1 AGTCAA AGTCAA_counts.tsv 0 a9eaa959aba1b02b3831583c2a9751c8\n", "2 AGTTCC AGTTCC_counts.tsv 0 4183767e4eeb75dc582bcf438af13500\n", "3 ATGTCA ATGTCA_counts.tsv 0 26fbba06520758e5a3acd9bd432ebed4\n", "4 CCGTCC CCGTCC_counts.tsv 1 50036a88fd48645f740a31f4f4352cfb\n", "5 GTCCGC GTCCGC_counts.tsv 1 bb1cecd886127159157e9431d072cad5\n", "6 GTGAAA GTGAAA_counts.tsv 1 fa544c0a076eedb54937c7189f4e1fbc" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "phdata=phdata[c(\"sampid\",\"filename\",\"trt\",\"md5sum\")]\n", "phdata" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "phdata[[\"trt\"]]=as.factor(phdata[[\"trt\"]])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This function reads the data from one file and returns the results as a matrix. It is designed to only keep the rows for which the gene id starts with the prefix \"GeneID\"" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "readcnts=function(fname,sep=\"\\t\",prefix=\"GeneID\",collab=\"V1\",header=FALSE)\n", " {\n", " ### Import text file\n", " dat=read.table(fname,sep=sep,header=header,stringsAsFactor=FALSE)\n", " ### Only keep rows for which the gene id matches with the prefix\n", " dat=dat[substr(dat[[collab]],1,nchar(prefix))==prefix,]\n", " return(dat)\n", " }\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's read in a file" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "file1=readcnts(\"COUNTS/AGTCAA_counts.tsv\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
  1. 4436
  2. \n", "\t
  3. 2
  4. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 4436\n", "\\item 2\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 4436\n", "2. 2\n", "\n", "\n" ], "text/plain": [ "[1] 4436 2" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dim(file1)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
V1V2
1GeneID:12930114118
2GeneID:1293011530
3GeneID:1293011615
4GeneID:1293011712
5GeneID:12930118122
6GeneID:1293011960
\n" ], "text/latex": [ "\\begin{tabular}{r|ll}\n", " & V1 & V2\\\\\n", "\\hline\n", "\t1 & GeneID:12930114 & 118\\\\\n", "\t2 & GeneID:12930115 & 30\\\\\n", "\t3 & GeneID:12930116 & 15\\\\\n", "\t4 & GeneID:12930117 & 12\\\\\n", "\t5 & GeneID:12930118 & 122\\\\\n", "\t6 & GeneID:12930119 & 60\\\\\n", "\\end{tabular}\n" ], "text/plain": [ " V1 V2\n", "1 GeneID:12930114 118\n", "2 GeneID:12930115 30\n", "3 GeneID:12930116 15\n", "4 GeneID:12930117 12\n", "5 GeneID:12930118 122\n", "6 GeneID:12930119 60" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head(file1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now read in all files " ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "## Read in the first file\n", "countdat=readcnts(file.path(cntdir,phdata$filename[1]))\n", "### Assign the sample id as the column name\n", "names(countdat)[2]=phdata$sampid[1]\n", "### Repeat the last two steps for files 2 ... 6 and merge\n", "### along each step\n", "for(i in 2:nrow(phdata))\n", " {\n", " dat2=readcnts(file.path(cntdir,phdata$filename[i]))\n", " names(dat2)[2]=phdata$sampid[i]\n", " countdat=merge(countdat,dat2,by=\"V1\")\n", " }\n", "\n", "geneid=countdat$V1\n", "countdat=as.matrix(countdat[,-1])\n", "row.names(countdat)=geneid" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
AGTCAAAGTTCCATGTCACCGTCCGTCCGCGTGAAA
GeneID:12930114118137149120161174
GeneID:12930115304225183234
GeneID:12930116155537493627
GeneID:1293011712121311 7 6
GeneID:12930118122137 94 48131 69
GeneID:12930119608878534329
\n" ], "text/latex": [ "\\begin{tabular}{r|llllll}\n", " & AGTCAA & AGTTCC & ATGTCA & CCGTCC & GTCCGC & GTGAAA\\\\\n", "\\hline\n", "\tGeneID:12930114 & 118 & 137 & 149 & 120 & 161 & 174\\\\\n", "\tGeneID:12930115 & 30 & 42 & 25 & 18 & 32 & 34\\\\\n", "\tGeneID:12930116 & 15 & 55 & 37 & 49 & 36 & 27\\\\\n", "\tGeneID:12930117 & 12 & 12 & 13 & 11 & 7 & 6\\\\\n", "\tGeneID:12930118 & 122 & 137 & 94 & 48 & 131 & 69\\\\\n", "\tGeneID:12930119 & 60 & 88 & 78 & 53 & 43 & 29\\\\\n", "\\end{tabular}\n" ], "text/markdown": [ "1. 118\n", "2. 30\n", "3. 15\n", "4. 12\n", "5. 122\n", "6. 60\n", "7. 137\n", "8. 42\n", "9. 55\n", "10. 12\n", "11. 137\n", "12. 88\n", "13. 149\n", "14. 25\n", "15. 37\n", "16. 13\n", "17. 94\n", "18. 78\n", "19. 120\n", "20. 18\n", "21. 49\n", "22. 11\n", "23. 48\n", "24. 53\n", "25. 161\n", "26. 32\n", "27. 36\n", "28. 7\n", "29. 131\n", "30. 43\n", "31. 174\n", "32. 34\n", "33. 27\n", "34. 6\n", "35. 69\n", "36. 29\n", "\n", "\n" ], "text/plain": [ " AGTCAA AGTTCC ATGTCA CCGTCC GTCCGC GTGAAA\n", "GeneID:12930114 118 137 149 120 161 174\n", "GeneID:12930115 30 42 25 18 32 34\n", "GeneID:12930116 15 55 37 49 36 27\n", "GeneID:12930117 12 12 13 11 7 6\n", "GeneID:12930118 122 137 94 48 131 69\n", "GeneID:12930119 60 88 78 53 43 29" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head(countdat)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading required package: S4Vectors\n", "Loading required package: stats4\n", "Loading required package: BiocGenerics\n", "Loading required package: parallel\n", "\n", "Attaching package: ‘BiocGenerics’\n", "\n", "The following objects are masked from ‘package:parallel’:\n", "\n", " clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n", " clusterExport, clusterMap, parApply, parCapply, parLapply,\n", " parLapplyLB, parRapply, parSapply, parSapplyLB\n", "\n", "The following objects are masked from ‘package:stats’:\n", "\n", " IQR, mad, xtabs\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " anyDuplicated, append, as.data.frame, as.vector, cbind, colnames,\n", " do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl,\n", " intersect, is.unsorted, lapply, lengths, Map, mapply, match, mget,\n", " order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,\n", " rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,\n", " union, unique, unlist, unsplit\n", "\n", "Loading required package: IRanges\n", "Loading required package: GenomicRanges\n", "Loading required package: GenomeInfoDb\n", "Loading required package: SummarizedExperiment\n", "Loading required package: Biobase\n", "Welcome to Bioconductor\n", "\n", " Vignettes contain introductory material; view with\n", " 'browseVignettes()'. To cite Bioconductor, see\n", " 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n", "\n", "Loading required package: Rcpp\n", "Loading required package: RcppArmadillo\n" ] } ], "source": [ "library(DESeq2)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dds=DESeqDataSetFromMatrix(countdat,DataFrame(phdata),design=~trt)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "class: DESeqDataSet \n", "dim: 4436 6 \n", "metadata(0):\n", "assays(1): counts\n", "rownames(4436): GeneID:12930114 GeneID:12930115 ... GeneID:13406005\n", " GeneID:13406006\n", "rowRanges metadata column names(0):\n", "colnames(6): AGTCAA AGTTCC ... GTCCGC GTGAAA\n", "colData names(4): sampid filename trt md5sum" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dds" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "R version 3.2.3 (2015-12-10)\n", "Platform: x86_64-apple-darwin13.4.0 (64-bit)\n", "Running under: OS X 10.11.3 (El Capitan)\n", "\n", "locale:\n", "[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n", "\n", "attached base packages:\n", " [1] parallel stats4 tools stats graphics grDevices utils \n", " [8] datasets methods base \n", "\n", "other attached packages:\n", " [1] DESeq2_1.10.1 RcppArmadillo_0.6.500.4.0 \n", " [3] Rcpp_0.12.3 SummarizedExperiment_1.0.2\n", " [5] Biobase_2.30.0 GenomicRanges_1.22.4 \n", " [7] GenomeInfoDb_1.6.3 IRanges_2.4.6 \n", " [9] S4Vectors_0.8.11 BiocGenerics_0.16.1 \n", "\n", "loaded via a namespace (and not attached):\n", " [1] RColorBrewer_1.1-2 plyr_1.8.3 futile.logger_1.4.1 \n", " [4] XVector_0.10.0 base64enc_0.1-3 futile.options_1.0.0\n", " [7] zlibbioc_1.16.0 rpart_4.1-10 digest_0.6.9 \n", "[10] uuid_0.1-2 RSQLite_1.0.0 annotate_1.48.0 \n", "[13] jsonlite_0.9.19 evaluate_0.8 gtable_0.1.2 \n", "[16] lattice_0.20-33 DBI_0.3.1 IRdisplay_0.3 \n", "[19] IRkernel_0.5 gridExtra_2.0.0 rzmq_0.7.7 \n", "[22] genefilter_1.52.1 cluster_2.0.3 repr_0.4 \n", "[25] stringr_1.0.0 locfit_1.5-9.1 nnet_7.3-12 \n", "[28] grid_3.2.3 AnnotationDbi_1.32.3 XML_3.98-1.3 \n", "[31] survival_2.38-3 BiocParallel_1.4.3 foreign_0.8-66 \n", "[34] latticeExtra_0.6-26 Formula_1.2-1 geneplotter_1.48.0 \n", "[37] ggplot2_2.0.0 lambda.r_1.1.7 magrittr_1.5 \n", "[40] Hmisc_3.17-1 scales_0.3.0 splines_3.2.3 \n", "[43] xtable_1.8-2 colorspace_1.2-6 stringi_1.0-1 \n", "[46] acepack_1.3-3.3 munsell_0.4.2 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sessionInfo()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "R", "language": "R", "name": "ir" }, "language_info": { "codemirror_mode": "r", "file_extension": ".r", "mimetype": "text/x-r-source", "name": "R", "pygments_lexer": "r", "version": "3.2.3" } }, "nbformat": 4, "nbformat_minor": 0 }