{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import count data using basic tools"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We want to read in the count files using more basic tools (important if the count files are not generated fro,m htseq-count). We will import the files as matrices. We start with some of the preliminary steps"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"library(tools)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"phfile=\"sampletable.txt\"\n",
"cntdir=\"COUNTS\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"sampletable.txt: '10cdaaa6e5fc76ab92b6d845d33a2a2f'"
],
"text/latex": [
"\\textbf{sampletable.txt:} '10cdaaa6e5fc76ab92b6d845d33a2a2f'"
],
"text/markdown": [
"**sampletable.txt:** '10cdaaa6e5fc76ab92b6d845d33a2a2f'"
],
"text/plain": [
" sampletable.txt \n",
"\"10cdaaa6e5fc76ab92b6d845d33a2a2f\" "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"md5sum(phfile)\n",
"phdata=read.table(phfile,sep=\",\",stringsAsFactor=FALSE)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
" | filename | sampid | trt | md5sum |
\n",
"\n",
"\t1 | AGTCAA_counts.tsv | AGTCAA | 0 | a9eaa959aba1b02b3831583c2a9751c8 |
\n",
"\t2 | AGTTCC_counts.tsv | AGTTCC | 0 | 4183767e4eeb75dc582bcf438af13500 |
\n",
"\t3 | ATGTCA_counts.tsv | ATGTCA | 0 | 26fbba06520758e5a3acd9bd432ebed4 |
\n",
"\t4 | CCGTCC_counts.tsv | CCGTCC | 1 | 50036a88fd48645f740a31f4f4352cfb |
\n",
"\t5 | GTCCGC_counts.tsv | GTCCGC | 1 | bb1cecd886127159157e9431d072cad5 |
\n",
"\t6 | GTGAAA_counts.tsv | GTGAAA | 1 | fa544c0a076eedb54937c7189f4e1fbc |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|llll}\n",
" & filename & sampid & trt & md5sum\\\\\n",
"\\hline\n",
"\t1 & AGTCAA_counts.tsv & AGTCAA & 0 & a9eaa959aba1b02b3831583c2a9751c8\\\\\n",
"\t2 & AGTTCC_counts.tsv & AGTTCC & 0 & 4183767e4eeb75dc582bcf438af13500\\\\\n",
"\t3 & ATGTCA_counts.tsv & ATGTCA & 0 & 26fbba06520758e5a3acd9bd432ebed4\\\\\n",
"\t4 & CCGTCC_counts.tsv & CCGTCC & 1 & 50036a88fd48645f740a31f4f4352cfb\\\\\n",
"\t5 & GTCCGC_counts.tsv & GTCCGC & 1 & bb1cecd886127159157e9431d072cad5\\\\\n",
"\t6 & GTGAAA_counts.tsv & GTGAAA & 1 & fa544c0a076eedb54937c7189f4e1fbc\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" filename sampid trt md5sum\n",
"1 AGTCAA_counts.tsv AGTCAA 0 a9eaa959aba1b02b3831583c2a9751c8\n",
"2 AGTTCC_counts.tsv AGTTCC 0 4183767e4eeb75dc582bcf438af13500\n",
"3 ATGTCA_counts.tsv ATGTCA 0 26fbba06520758e5a3acd9bd432ebed4\n",
"4 CCGTCC_counts.tsv CCGTCC 1 50036a88fd48645f740a31f4f4352cfb\n",
"5 GTCCGC_counts.tsv GTCCGC 1 bb1cecd886127159157e9431d072cad5\n",
"6 GTGAAA_counts.tsv GTGAAA 1 fa544c0a076eedb54937c7189f4e1fbc"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"colnames(phdata)=c(\"filename\",\"sampid\",\"trt\")\n",
"phdata[[\"md5sum\"]]=md5sum(file.path(cntdir,phdata[[\"filename\"]]))\n",
"phdata"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | sampid | filename | trt | md5sum |
\n",
"\n",
"\t1 | AGTCAA | AGTCAA_counts.tsv | 0 | a9eaa959aba1b02b3831583c2a9751c8 |
\n",
"\t2 | AGTTCC | AGTTCC_counts.tsv | 0 | 4183767e4eeb75dc582bcf438af13500 |
\n",
"\t3 | ATGTCA | ATGTCA_counts.tsv | 0 | 26fbba06520758e5a3acd9bd432ebed4 |
\n",
"\t4 | CCGTCC | CCGTCC_counts.tsv | 1 | 50036a88fd48645f740a31f4f4352cfb |
\n",
"\t5 | GTCCGC | GTCCGC_counts.tsv | 1 | bb1cecd886127159157e9431d072cad5 |
\n",
"\t6 | GTGAAA | GTGAAA_counts.tsv | 1 | fa544c0a076eedb54937c7189f4e1fbc |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|llll}\n",
" & sampid & filename & trt & md5sum\\\\\n",
"\\hline\n",
"\t1 & AGTCAA & AGTCAA_counts.tsv & 0 & a9eaa959aba1b02b3831583c2a9751c8\\\\\n",
"\t2 & AGTTCC & AGTTCC_counts.tsv & 0 & 4183767e4eeb75dc582bcf438af13500\\\\\n",
"\t3 & ATGTCA & ATGTCA_counts.tsv & 0 & 26fbba06520758e5a3acd9bd432ebed4\\\\\n",
"\t4 & CCGTCC & CCGTCC_counts.tsv & 1 & 50036a88fd48645f740a31f4f4352cfb\\\\\n",
"\t5 & GTCCGC & GTCCGC_counts.tsv & 1 & bb1cecd886127159157e9431d072cad5\\\\\n",
"\t6 & GTGAAA & GTGAAA_counts.tsv & 1 & fa544c0a076eedb54937c7189f4e1fbc\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" sampid filename trt md5sum\n",
"1 AGTCAA AGTCAA_counts.tsv 0 a9eaa959aba1b02b3831583c2a9751c8\n",
"2 AGTTCC AGTTCC_counts.tsv 0 4183767e4eeb75dc582bcf438af13500\n",
"3 ATGTCA ATGTCA_counts.tsv 0 26fbba06520758e5a3acd9bd432ebed4\n",
"4 CCGTCC CCGTCC_counts.tsv 1 50036a88fd48645f740a31f4f4352cfb\n",
"5 GTCCGC GTCCGC_counts.tsv 1 bb1cecd886127159157e9431d072cad5\n",
"6 GTGAAA GTGAAA_counts.tsv 1 fa544c0a076eedb54937c7189f4e1fbc"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"phdata=phdata[c(\"sampid\",\"filename\",\"trt\",\"md5sum\")]\n",
"phdata"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"phdata[[\"trt\"]]=as.factor(phdata[[\"trt\"]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This function reads the data from one file and returns the results as a matrix. It is designed to only keep the rows for which the gene id starts with the prefix \"GeneID\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"readcnts=function(fname,sep=\"\\t\",prefix=\"GeneID\",collab=\"V1\",header=FALSE)\n",
" {\n",
" ### Import text file\n",
" dat=read.table(fname,sep=sep,header=header,stringsAsFactor=FALSE)\n",
" ### Only keep rows for which the gene id matches with the prefix\n",
" dat=dat[substr(dat[[collab]],1,nchar(prefix))==prefix,]\n",
" return(dat)\n",
" }\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's read in a file"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"file1=readcnts(\"COUNTS/AGTCAA_counts.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\t- 4436
\n",
"\t- 2
\n",
"
\n"
],
"text/latex": [
"\\begin{enumerate*}\n",
"\\item 4436\n",
"\\item 2\n",
"\\end{enumerate*}\n"
],
"text/markdown": [
"1. 4436\n",
"2. 2\n",
"\n",
"\n"
],
"text/plain": [
"[1] 4436 2"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dim(file1)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | V1 | V2 |
\n",
"\n",
"\t1 | GeneID:12930114 | 118 |
\n",
"\t2 | GeneID:12930115 | 30 |
\n",
"\t3 | GeneID:12930116 | 15 |
\n",
"\t4 | GeneID:12930117 | 12 |
\n",
"\t5 | GeneID:12930118 | 122 |
\n",
"\t6 | GeneID:12930119 | 60 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|ll}\n",
" & V1 & V2\\\\\n",
"\\hline\n",
"\t1 & GeneID:12930114 & 118\\\\\n",
"\t2 & GeneID:12930115 & 30\\\\\n",
"\t3 & GeneID:12930116 & 15\\\\\n",
"\t4 & GeneID:12930117 & 12\\\\\n",
"\t5 & GeneID:12930118 & 122\\\\\n",
"\t6 & GeneID:12930119 & 60\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
" V1 V2\n",
"1 GeneID:12930114 118\n",
"2 GeneID:12930115 30\n",
"3 GeneID:12930116 15\n",
"4 GeneID:12930117 12\n",
"5 GeneID:12930118 122\n",
"6 GeneID:12930119 60"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head(file1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now read in all files "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"## Read in the first file\n",
"countdat=readcnts(file.path(cntdir,phdata$filename[1]))\n",
"### Assign the sample id as the column name\n",
"names(countdat)[2]=phdata$sampid[1]\n",
"### Repeat the last two steps for files 2 ... 6 and merge\n",
"### along each step\n",
"for(i in 2:nrow(phdata))\n",
" {\n",
" dat2=readcnts(file.path(cntdir,phdata$filename[i]))\n",
" names(dat2)[2]=phdata$sampid[i]\n",
" countdat=merge(countdat,dat2,by=\"V1\")\n",
" }\n",
"\n",
"geneid=countdat$V1\n",
"countdat=as.matrix(countdat[,-1])\n",
"row.names(countdat)=geneid"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | AGTCAA | AGTTCC | ATGTCA | CCGTCC | GTCCGC | GTGAAA |
\n",
"\n",
"\tGeneID:12930114 | 118 | 137 | 149 | 120 | 161 | 174 |
\n",
"\tGeneID:12930115 | 30 | 42 | 25 | 18 | 32 | 34 |
\n",
"\tGeneID:12930116 | 15 | 55 | 37 | 49 | 36 | 27 |
\n",
"\tGeneID:12930117 | 12 | 12 | 13 | 11 | 7 | 6 |
\n",
"\tGeneID:12930118 | 122 | 137 | 94 | 48 | 131 | 69 |
\n",
"\tGeneID:12930119 | 60 | 88 | 78 | 53 | 43 | 29 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|llllll}\n",
" & AGTCAA & AGTTCC & ATGTCA & CCGTCC & GTCCGC & GTGAAA\\\\\n",
"\\hline\n",
"\tGeneID:12930114 & 118 & 137 & 149 & 120 & 161 & 174\\\\\n",
"\tGeneID:12930115 & 30 & 42 & 25 & 18 & 32 & 34\\\\\n",
"\tGeneID:12930116 & 15 & 55 & 37 & 49 & 36 & 27\\\\\n",
"\tGeneID:12930117 & 12 & 12 & 13 & 11 & 7 & 6\\\\\n",
"\tGeneID:12930118 & 122 & 137 & 94 & 48 & 131 & 69\\\\\n",
"\tGeneID:12930119 & 60 & 88 & 78 & 53 & 43 & 29\\\\\n",
"\\end{tabular}\n"
],
"text/markdown": [
"1. 118\n",
"2. 30\n",
"3. 15\n",
"4. 12\n",
"5. 122\n",
"6. 60\n",
"7. 137\n",
"8. 42\n",
"9. 55\n",
"10. 12\n",
"11. 137\n",
"12. 88\n",
"13. 149\n",
"14. 25\n",
"15. 37\n",
"16. 13\n",
"17. 94\n",
"18. 78\n",
"19. 120\n",
"20. 18\n",
"21. 49\n",
"22. 11\n",
"23. 48\n",
"24. 53\n",
"25. 161\n",
"26. 32\n",
"27. 36\n",
"28. 7\n",
"29. 131\n",
"30. 43\n",
"31. 174\n",
"32. 34\n",
"33. 27\n",
"34. 6\n",
"35. 69\n",
"36. 29\n",
"\n",
"\n"
],
"text/plain": [
" AGTCAA AGTTCC ATGTCA CCGTCC GTCCGC GTGAAA\n",
"GeneID:12930114 118 137 149 120 161 174\n",
"GeneID:12930115 30 42 25 18 32 34\n",
"GeneID:12930116 15 55 37 49 36 27\n",
"GeneID:12930117 12 12 13 11 7 6\n",
"GeneID:12930118 122 137 94 48 131 69\n",
"GeneID:12930119 60 88 78 53 43 29"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head(countdat)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading required package: S4Vectors\n",
"Loading required package: stats4\n",
"Loading required package: BiocGenerics\n",
"Loading required package: parallel\n",
"\n",
"Attaching package: ‘BiocGenerics’\n",
"\n",
"The following objects are masked from ‘package:parallel’:\n",
"\n",
" clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n",
" clusterExport, clusterMap, parApply, parCapply, parLapply,\n",
" parLapplyLB, parRapply, parSapply, parSapplyLB\n",
"\n",
"The following objects are masked from ‘package:stats’:\n",
"\n",
" IQR, mad, xtabs\n",
"\n",
"The following objects are masked from ‘package:base’:\n",
"\n",
" anyDuplicated, append, as.data.frame, as.vector, cbind, colnames,\n",
" do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl,\n",
" intersect, is.unsorted, lapply, lengths, Map, mapply, match, mget,\n",
" order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,\n",
" rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,\n",
" union, unique, unlist, unsplit\n",
"\n",
"Loading required package: IRanges\n",
"Loading required package: GenomicRanges\n",
"Loading required package: GenomeInfoDb\n",
"Loading required package: SummarizedExperiment\n",
"Loading required package: Biobase\n",
"Welcome to Bioconductor\n",
"\n",
" Vignettes contain introductory material; view with\n",
" 'browseVignettes()'. To cite Bioconductor, see\n",
" 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n",
"\n",
"Loading required package: Rcpp\n",
"Loading required package: RcppArmadillo\n"
]
}
],
"source": [
"library(DESeq2)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"dds=DESeqDataSetFromMatrix(countdat,DataFrame(phdata),design=~trt)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"class: DESeqDataSet \n",
"dim: 4436 6 \n",
"metadata(0):\n",
"assays(1): counts\n",
"rownames(4436): GeneID:12930114 GeneID:12930115 ... GeneID:13406005\n",
" GeneID:13406006\n",
"rowRanges metadata column names(0):\n",
"colnames(6): AGTCAA AGTTCC ... GTCCGC GTGAAA\n",
"colData names(4): sampid filename trt md5sum"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dds"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"R version 3.2.3 (2015-12-10)\n",
"Platform: x86_64-apple-darwin13.4.0 (64-bit)\n",
"Running under: OS X 10.11.3 (El Capitan)\n",
"\n",
"locale:\n",
"[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n",
"\n",
"attached base packages:\n",
" [1] parallel stats4 tools stats graphics grDevices utils \n",
" [8] datasets methods base \n",
"\n",
"other attached packages:\n",
" [1] DESeq2_1.10.1 RcppArmadillo_0.6.500.4.0 \n",
" [3] Rcpp_0.12.3 SummarizedExperiment_1.0.2\n",
" [5] Biobase_2.30.0 GenomicRanges_1.22.4 \n",
" [7] GenomeInfoDb_1.6.3 IRanges_2.4.6 \n",
" [9] S4Vectors_0.8.11 BiocGenerics_0.16.1 \n",
"\n",
"loaded via a namespace (and not attached):\n",
" [1] RColorBrewer_1.1-2 plyr_1.8.3 futile.logger_1.4.1 \n",
" [4] XVector_0.10.0 base64enc_0.1-3 futile.options_1.0.0\n",
" [7] zlibbioc_1.16.0 rpart_4.1-10 digest_0.6.9 \n",
"[10] uuid_0.1-2 RSQLite_1.0.0 annotate_1.48.0 \n",
"[13] jsonlite_0.9.19 evaluate_0.8 gtable_0.1.2 \n",
"[16] lattice_0.20-33 DBI_0.3.1 IRdisplay_0.3 \n",
"[19] IRkernel_0.5 gridExtra_2.0.0 rzmq_0.7.7 \n",
"[22] genefilter_1.52.1 cluster_2.0.3 repr_0.4 \n",
"[25] stringr_1.0.0 locfit_1.5-9.1 nnet_7.3-12 \n",
"[28] grid_3.2.3 AnnotationDbi_1.32.3 XML_3.98-1.3 \n",
"[31] survival_2.38-3 BiocParallel_1.4.3 foreign_0.8-66 \n",
"[34] latticeExtra_0.6-26 Formula_1.2-1 geneplotter_1.48.0 \n",
"[37] ggplot2_2.0.0 lambda.r_1.1.7 magrittr_1.5 \n",
"[40] Hmisc_3.17-1 scales_0.3.0 splines_3.2.3 \n",
"[43] xtable_1.8-2 colorspace_1.2-6 stringi_1.0-1 \n",
"[46] acepack_1.3-3.3 munsell_0.4.2 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sessionInfo()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.2.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}