{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Set up environment" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading tidyverse: ggplot2\n", "Loading tidyverse: tibble\n", "Loading tidyverse: tidyr\n", "Loading tidyverse: readr\n", "Loading tidyverse: purrr\n", "Loading tidyverse: dplyr\n", "Conflicts with tidy packages ---------------------------------------------------\n", "filter(): dplyr, stats\n", "lag(): dplyr, stats\n", "\n", "Attaching package: ‘foreach’\n", "\n", "The following objects are masked from ‘package:purrr’:\n", "\n", " accumulate, when\n", "\n", "Loading required package: S4Vectors\n", "Loading required package: stats4\n", "Loading required package: BiocGenerics\n", "Loading required package: parallel\n", "\n", "Attaching package: ‘BiocGenerics’\n", "\n", "The following objects are masked from ‘package:parallel’:\n", "\n", " clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n", " clusterExport, clusterMap, parApply, parCapply, parLapply,\n", " parLapplyLB, parRapply, parSapply, parSapplyLB\n", "\n", "The following objects are masked from ‘package:dplyr’:\n", "\n", " combine, intersect, setdiff, union\n", "\n", "The following objects are masked from ‘package:stats’:\n", "\n", " IQR, mad, sd, var, xtabs\n", "\n", "The following objects are masked from ‘package:base’:\n", "\n", " anyDuplicated, append, as.data.frame, cbind, colMeans, colnames,\n", " colSums, do.call, duplicated, eval, evalq, Filter, Find, get, grep,\n", " grepl, intersect, is.unsorted, lapply, lengths, Map, mapply, match,\n", " mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,\n", " rbind, Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,\n", " table, tapply, union, unique, unsplit, which, which.max, which.min\n", "\n", "\n", "Attaching package: ‘S4Vectors’\n", "\n", "The following objects are masked from ‘package:dplyr’:\n", "\n", " first, rename\n", "\n", "The following object is masked from ‘package:tidyr’:\n", "\n", " expand\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " expand.grid\n", "\n", "Loading required package: IRanges\n", "\n", "Attaching package: ‘IRanges’\n", "\n", "The following objects are masked from ‘package:dplyr’:\n", "\n", " collapse, desc, slice\n", "\n", "The following object is masked from ‘package:purrr’:\n", "\n", " reduce\n", "\n", "Loading required package: GenomicRanges\n", "Loading required package: GenomeInfoDb\n", "Loading required package: SummarizedExperiment\n", "Loading required package: Biobase\n", "Welcome to Bioconductor\n", "\n", " Vignettes contain introductory material; view with\n", " 'browseVignettes()'. To cite Bioconductor, see\n", " 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n", "\n", "Loading required package: DelayedArray\n", "Loading required package: matrixStats\n", "\n", "Attaching package: ‘matrixStats’\n", "\n", "The following objects are masked from ‘package:Biobase’:\n", "\n", " anyMissing, rowMedians\n", "\n", "The following object is masked from ‘package:dplyr’:\n", "\n", " count\n", "\n", "\n", "Attaching package: ‘DelayedArray’\n", "\n", "The following objects are masked from ‘package:matrixStats’:\n", "\n", " colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " apply\n", "\n", "\n", "Attaching package: ‘limma’\n", "\n", "The following object is masked from ‘package:DESeq2’:\n", "\n", " plotMA\n", "\n", "The following object is masked from ‘package:BiocGenerics’:\n", "\n", " plotMA\n", "\n", "\n", "Attaching package: ‘gridExtra’\n", "\n", "The following object is masked from ‘package:Biobase’:\n", "\n", " combine\n", "\n", "The following object is masked from ‘package:BiocGenerics’:\n", "\n", " combine\n", "\n", "The following object is masked from ‘package:dplyr’:\n", "\n", " combine\n", "\n", "\n", "---------------------\n", "Welcome to dendextend version 1.8.0\n", "Type citation('dendextend') for how to cite the package.\n", "\n", "Type browseVignettes(package = 'dendextend') for the package vignette.\n", "The github page is: https://github.com/talgalili/dendextend/\n", "\n", "Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues\n", "Or contact: \n", "\n", "\tTo suppress this message use: suppressPackageStartupMessages(library(dendextend))\n", "---------------------\n", "\n", "\n", "Attaching package: ‘dendextend’\n", "\n", "The following object is masked from ‘package:stats’:\n", "\n", " cutree\n", "\n", "\n", "Attaching package: ‘plotly’\n", "\n", "The following object is masked from ‘package:IRanges’:\n", "\n", " slice\n", "\n", "The following object is masked from ‘package:S4Vectors’:\n", "\n", " rename\n", "\n", "The following object is masked from ‘package:ggplot2’:\n", "\n", " last_plot\n", "\n", "The following object is masked from ‘package:stats’:\n", "\n", " filter\n", "\n", "The following object is masked from ‘package:graphics’:\n", "\n", " layout\n", "\n" ] } ], "source": [ "source(\"pilot_config.R\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# get files of all star output files\n", "stardirs <- list.files(DATDIR)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Read in the count data output from STAR" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are 204 files under this folder. Each file is generated from a fastq file using STAR." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "204" ], "text/latex": [ "204" ], "text/markdown": [ "204" ], "text/plain": [ "[1] 204" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "length(stardirs)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
  1. '1_MA_J_S18_L001_ReadsPerGene.out.tab'
  2. \n", "\t
  3. '1_MA_J_S18_L002_ReadsPerGene.out.tab'
  4. \n", "\t
  5. '1_MA_J_S18_L003_ReadsPerGene.out.tab'
  6. \n", "\t
  7. '1_MA_J_S18_L004_ReadsPerGene.out.tab'
  8. \n", "\t
  9. '1_RZ_J_S26_L001_ReadsPerGene.out.tab'
  10. \n", "\t
  11. '1_RZ_J_S26_L002_ReadsPerGene.out.tab'
  12. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item '1\\_MA\\_J\\_S18\\_L001\\_ReadsPerGene.out.tab'\n", "\\item '1\\_MA\\_J\\_S18\\_L002\\_ReadsPerGene.out.tab'\n", "\\item '1\\_MA\\_J\\_S18\\_L003\\_ReadsPerGene.out.tab'\n", "\\item '1\\_MA\\_J\\_S18\\_L004\\_ReadsPerGene.out.tab'\n", "\\item '1\\_RZ\\_J\\_S26\\_L001\\_ReadsPerGene.out.tab'\n", "\\item '1\\_RZ\\_J\\_S26\\_L002\\_ReadsPerGene.out.tab'\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. '1_MA_J_S18_L001_ReadsPerGene.out.tab'\n", "2. '1_MA_J_S18_L002_ReadsPerGene.out.tab'\n", "3. '1_MA_J_S18_L003_ReadsPerGene.out.tab'\n", "4. '1_MA_J_S18_L004_ReadsPerGene.out.tab'\n", "5. '1_RZ_J_S26_L001_ReadsPerGene.out.tab'\n", "6. '1_RZ_J_S26_L002_ReadsPerGene.out.tab'\n", "\n", "\n" ], "text/plain": [ "[1] \"1_MA_J_S18_L001_ReadsPerGene.out.tab\"\n", "[2] \"1_MA_J_S18_L002_ReadsPerGene.out.tab\"\n", "[3] \"1_MA_J_S18_L003_ReadsPerGene.out.tab\"\n", "[4] \"1_MA_J_S18_L004_ReadsPerGene.out.tab\"\n", "[5] \"1_RZ_J_S26_L001_ReadsPerGene.out.tab\"\n", "[6] \"1_RZ_J_S26_L002_ReadsPerGene.out.tab\"" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "head(stardirs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "View the head of one file (1_MA_J_S18_L001_ReadsPerGene.out.tab). Notice that there are four columns and the columns we want is the **first** and the **fourth** columns." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
    1. \n", "\t
    2. 'N_unmapped'
    3. \n", "\t
    4. '2690'
    5. \n", "\t
    6. '2690'
    7. \n", "\t
    8. '2690'
    9. \n", "
    \n", "
  1. \n", "\t
    1. \n", "\t
    2. 'N_multimapping'
    3. \n", "\t
    4. '66100'
    5. \n", "\t
    6. '66100'
    7. \n", "\t
    8. '66100'
    9. \n", "
    \n", "
  2. \n", "\t
    1. \n", "\t
    2. 'N_noFeature'
    3. \n", "\t
    4. '10626'
    5. \n", "\t
    6. '2238382'
    7. \n", "\t
    8. '20347'
    9. \n", "
    \n", "
  3. \n", "\t
    1. \n", "\t
    2. 'N_ambiguous'
    3. \n", "\t
    4. '173170'
    5. \n", "\t
    6. '1622'
    7. \n", "\t
    8. '647'
    9. \n", "
    \n", "
  4. \n", "\t
    1. \n", "\t
    2. 'CNAG_04548'
    3. \n", "\t
    4. '0'
    5. \n", "\t
    6. '0'
    7. \n", "\t
    8. '0'
    9. \n", "
    \n", "
  5. \n", "\t
    1. \n", "\t
    2. 'CNAG_07303'
    3. \n", "\t
    4. '0'
    5. \n", "\t
    6. '0'
    7. \n", "\t
    8. '0'
    9. \n", "
    \n", "
  6. \n", "\t
    1. \n", "\t
    2. 'CNAG_07304'
    3. \n", "\t
    4. '8'
    5. \n", "\t
    6. '0'
    7. \n", "\t
    8. '8'
    9. \n", "
    \n", "
  7. \n", "\t
    1. \n", "\t
    2. 'CNAG_00001'
    3. \n", "\t
    4. '0'
    5. \n", "\t
    6. '0'
    7. \n", "\t
    8. '0'
    9. \n", "
    \n", "
  8. \n", "\t
    1. \n", "\t
    2. 'CNAG_07305'
    3. \n", "\t
    4. '0'
    5. \n", "\t
    6. '0'
    7. \n", "\t
    8. '0'
    9. \n", "
    \n", "
  9. \n", "\t
    1. \n", "\t
    2. 'CNAG_00002'
    3. \n", "\t
    4. '66'
    5. \n", "\t
    6. '0'
    7. \n", "\t
    8. '66'
    9. \n", "
    \n", "
  10. \n", "
\n" ], "text/latex": [ "\\begin{enumerate}\n", "\\item \\begin{enumerate*}\n", "\\item 'N\\_unmapped'\n", "\\item '2690'\n", "\\item '2690'\n", "\\item '2690'\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 'N\\_multimapping'\n", "\\item '66100'\n", "\\item '66100'\n", "\\item '66100'\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 'N\\_noFeature'\n", "\\item '10626'\n", "\\item '2238382'\n", "\\item '20347'\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 'N\\_ambiguous'\n", "\\item '173170'\n", "\\item '1622'\n", "\\item '647'\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 'CNAG\\_04548'\n", "\\item '0'\n", "\\item '0'\n", "\\item '0'\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 'CNAG\\_07303'\n", "\\item '0'\n", "\\item '0'\n", "\\item '0'\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 'CNAG\\_07304'\n", "\\item '8'\n", "\\item '0'\n", "\\item '8'\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 'CNAG\\_00001'\n", "\\item '0'\n", "\\item '0'\n", "\\item '0'\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 'CNAG\\_07305'\n", "\\item '0'\n", "\\item '0'\n", "\\item '0'\n", "\\end{enumerate*}\n", "\n", "\\item \\begin{enumerate*}\n", "\\item 'CNAG\\_00002'\n", "\\item '66'\n", "\\item '0'\n", "\\item '66'\n", "\\end{enumerate*}\n", "\n", "\\end{enumerate}\n" ], "text/markdown": [ "1. 1. 'N_unmapped'\n", "2. '2690'\n", "3. '2690'\n", "4. '2690'\n", "\n", "\n", "\n", "2. 1. 'N_multimapping'\n", "2. '66100'\n", "3. '66100'\n", "4. '66100'\n", "\n", "\n", "\n", "3. 1. 'N_noFeature'\n", "2. '10626'\n", "3. '2238382'\n", "4. '20347'\n", "\n", "\n", "\n", "4. 1. 'N_ambiguous'\n", "2. '173170'\n", "3. '1622'\n", "4. '647'\n", "\n", "\n", "\n", "5. 1. 'CNAG_04548'\n", "2. '0'\n", "3. '0'\n", "4. '0'\n", "\n", "\n", "\n", "6. 1. 'CNAG_07303'\n", "2. '0'\n", "3. '0'\n", "4. '0'\n", "\n", "\n", "\n", "7. 1. 'CNAG_07304'\n", "2. '8'\n", "3. '0'\n", "4. '8'\n", "\n", "\n", "\n", "8. 1. 'CNAG_00001'\n", "2. '0'\n", "3. '0'\n", "4. '0'\n", "\n", "\n", "\n", "9. 1. 'CNAG_07305'\n", "2. '0'\n", "3. '0'\n", "4. '0'\n", "\n", "\n", "\n", "10. 1. 'CNAG_00002'\n", "2. '66'\n", "3. '0'\n", "4. '66'\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "[[1]]\n", "[1] \"N_unmapped\" \"2690\" \"2690\" \"2690\" \n", "\n", "[[2]]\n", "[1] \"N_multimapping\" \"66100\" \"66100\" \"66100\" \n", "\n", "[[3]]\n", "[1] \"N_noFeature\" \"10626\" \"2238382\" \"20347\" \n", "\n", "[[4]]\n", "[1] \"N_ambiguous\" \"173170\" \"1622\" \"647\" \n", "\n", "[[5]]\n", "[1] \"CNAG_04548\" \"0\" \"0\" \"0\" \n", "\n", "[[6]]\n", "[1] \"CNAG_07303\" \"0\" \"0\" \"0\" \n", "\n", "[[7]]\n", "[1] \"CNAG_07304\" \"8\" \"0\" \"8\" \n", "\n", "[[8]]\n", "[1] \"CNAG_00001\" \"0\" \"0\" \"0\" \n", "\n", "[[9]]\n", "[1] \"CNAG_07305\" \"0\" \"0\" \"0\" \n", "\n", "[[10]]\n", "[1] \"CNAG_00002\" \"66\" \"0\" \"66\" \n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "cmdstr <- paste(\"head\", file.path(DATDIR, stardirs[1]))\n", "cmdout <- system(cmdstr, intern = TRUE)\n", "str_split(cmdout, pattern = \"\\t\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Construct a matrix that gathers all the count files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "helper functions to read the count files" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "mycombine <- function(df1, df2) {\n", " # Combine two data frames by gene names\n", " #\n", " # Args:\n", " # df1 (Dataframe): the first count data\n", " # df2 (Dataframe): the second count data\n", " #\n", " # Returns:\n", " # (Dataframe) The combined data frame of df1 and df2\n", " full_join(df1, df2, by = \"gene\")\n", "}\n", "\n", "myfile <- function(filedir, filename) {\n", " # Get the absolute paths of a file\n", " #\n", " # Args:\n", " # filedir (Character): the directory of the folder\n", " # filename (Character): the filename\n", " #\n", " # Returns:\n", " # (Character) the directory of the input file\n", " file.path(filedir, filename)\n", "}\n", "\n", "# Data type for each column\n", "coltypes <- list(col_character(), col_integer(), col_integer(), col_integer())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "read the count files and combine them" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "out <- foreach(stardir = stardirs, .combine = mycombine) %do% {\n", " \n", " # get a directory of each count file\n", " cntfile <- myfile(DATDIR, stardir)\n", " \n", " # read in the count file\n", " readr::read_tsv(cntfile, col_names = FALSE, col_types = coltypes) %>%\n", " dplyr::select(X1, X4) %>% # get the 1st and 4th columns\n", " dplyr::rename_(.dots=setNames(names(.), c(\"gene\",stardir)))\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are 205 columns (204 count files + 1 rowname)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
    \n", "\t
  1. 8501
  2. \n", "\t
  3. 205
  4. \n", "
\n" ], "text/latex": [ "\\begin{enumerate*}\n", "\\item 8501\n", "\\item 205\n", "\\end{enumerate*}\n" ], "text/markdown": [ "1. 8501\n", "2. 205\n", "\n", "\n" ], "text/plain": [ "[1] 8501 205" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dim(out)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
gene1_MA_J_S18_L001_ReadsPerGene.out.tab1_MA_J_S18_L002_ReadsPerGene.out.tab1_MA_J_S18_L003_ReadsPerGene.out.tab1_MA_J_S18_L004_ReadsPerGene.out.tab1_RZ_J_S26_L001_ReadsPerGene.out.tab
N_unmapped 2690 2684 2672 2585 7218
N_multimapping66100 65234 66538 65066 395848
N_noFeature 20347 20004 20549 20505 768146
N_ambiguous 647 652 697 616 1431
CNAG_04548 0 0 0 1 0
CNAG_07303 0 0 0 0 0
\n" ], "text/latex": [ "\\begin{tabular}{r|llllll}\n", " gene & 1\\_MA\\_J\\_S18\\_L001\\_ReadsPerGene.out.tab & 1\\_MA\\_J\\_S18\\_L002\\_ReadsPerGene.out.tab & 1\\_MA\\_J\\_S18\\_L003\\_ReadsPerGene.out.tab & 1\\_MA\\_J\\_S18\\_L004\\_ReadsPerGene.out.tab & 1\\_RZ\\_J\\_S26\\_L001\\_ReadsPerGene.out.tab\\\\\n", "\\hline\n", "\t N\\_unmapped & 2690 & 2684 & 2672 & 2585 & 7218 \\\\\n", "\t N\\_multimapping & 66100 & 65234 & 66538 & 65066 & 395848 \\\\\n", "\t N\\_noFeature & 20347 & 20004 & 20549 & 20505 & 768146 \\\\\n", "\t N\\_ambiguous & 647 & 652 & 697 & 616 & 1431 \\\\\n", "\t CNAG\\_04548 & 0 & 0 & 0 & 1 & 0 \\\\\n", "\t CNAG\\_07303 & 0 & 0 & 0 & 0 & 0 \\\\\n", "\\end{tabular}\n" ], "text/markdown": [ "\n", "gene | 1_MA_J_S18_L001_ReadsPerGene.out.tab | 1_MA_J_S18_L002_ReadsPerGene.out.tab | 1_MA_J_S18_L003_ReadsPerGene.out.tab | 1_MA_J_S18_L004_ReadsPerGene.out.tab | 1_RZ_J_S26_L001_ReadsPerGene.out.tab | \n", "|---|---|---|---|---|---|\n", "| N_unmapped | 2690 | 2684 | 2672 | 2585 | 7218 | \n", "| N_multimapping | 66100 | 65234 | 66538 | 65066 | 395848 | \n", "| N_noFeature | 20347 | 20004 | 20549 | 20505 | 768146 | \n", "| N_ambiguous | 647 | 652 | 697 | 616 | 1431 | \n", "| CNAG_04548 | 0 | 0 | 0 | 1 | 0 | \n", "| CNAG_07303 | 0 | 0 | 0 | 0 | 0 | \n", "\n", "\n" ], "text/plain": [ " gene 1_MA_J_S18_L001_ReadsPerGene.out.tab\n", "1 N_unmapped 2690 \n", "2 N_multimapping 66100 \n", "3 N_noFeature 20347 \n", "4 N_ambiguous 647 \n", "5 CNAG_04548 0 \n", "6 CNAG_07303 0 \n", " 1_MA_J_S18_L002_ReadsPerGene.out.tab 1_MA_J_S18_L003_ReadsPerGene.out.tab\n", "1 2684 2672 \n", "2 65234 66538 \n", "3 20004 20549 \n", "4 652 697 \n", "5 0 0 \n", "6 0 0 \n", " 1_MA_J_S18_L004_ReadsPerGene.out.tab 1_RZ_J_S26_L001_ReadsPerGene.out.tab\n", "1 2585 7218 \n", "2 65066 395848 \n", "3 20505 768146 \n", "4 616 1431 \n", "5 1 0 \n", "6 0 0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "out[1:6, 1:6]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Arrange the results from the count files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Separate the first four rows (-> nmisc) and others (-> genecounts)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see that our count matrix contains both summarizing counts and specific counts of each gene for each sample. Note that our 'out' matrix is in biological format (i.e. the samples are the columns and the variables are the rows). Let's split this matrix up into two matrices: `nmisc` and `genecounts`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For `nmisc`, we will take the first 4 rows of `out` since those are the summarizing features. Next, we want to transform the data frame so that it is in statistical format (the samples are the rows and the feature types are the columns). Using a combination of gather and spread, we can transpose our matrix into the desired format." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "### Gather and spread the first four rows\n", "out %>%\n", " dplyr::slice(1:4) %>%\n", " gather(expid, value, -gene) %>% \n", " spread(gene, value) %>%\n", " rename_(.dots = setNames(names(.), c(\"expid\", \"namb\", \"nmulti\", \"nnofeat\",\"nunmap\"))) ->\n", " nmisc" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidnambnmultinnofeatnunmap
1_MA_J_S18_L001_ReadsPerGene.out.tab 647 66100 20347 2690
1_MA_J_S18_L002_ReadsPerGene.out.tab 652 65234 20004 2684
1_MA_J_S18_L003_ReadsPerGene.out.tab 697 66538 20549 2672
1_MA_J_S18_L004_ReadsPerGene.out.tab 616 65066 20505 2585
1_RZ_J_S26_L001_ReadsPerGene.out.tab1431 395848 768146 7218
1_RZ_J_S26_L002_ReadsPerGene.out.tab1337 388079 755654 7022
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " expid & namb & nmulti & nnofeat & nunmap\\\\\n", "\\hline\n", "\t 1\\_MA\\_J\\_S18\\_L001\\_ReadsPerGene.out.tab & 647 & 66100 & 20347 & 2690 \\\\\n", "\t 1\\_MA\\_J\\_S18\\_L002\\_ReadsPerGene.out.tab & 652 & 65234 & 20004 & 2684 \\\\\n", "\t 1\\_MA\\_J\\_S18\\_L003\\_ReadsPerGene.out.tab & 697 & 66538 & 20549 & 2672 \\\\\n", "\t 1\\_MA\\_J\\_S18\\_L004\\_ReadsPerGene.out.tab & 616 & 65066 & 20505 & 2585 \\\\\n", "\t 1\\_RZ\\_J\\_S26\\_L001\\_ReadsPerGene.out.tab & 1431 & 395848 & 768146 & 7218 \\\\\n", "\t 1\\_RZ\\_J\\_S26\\_L002\\_ReadsPerGene.out.tab & 1337 & 388079 & 755654 & 7022 \\\\\n", "\\end{tabular}\n" ], "text/markdown": [ "\n", "expid | namb | nmulti | nnofeat | nunmap | \n", "|---|---|---|---|---|---|\n", "| 1_MA_J_S18_L001_ReadsPerGene.out.tab | 647 | 66100 | 20347 | 2690 | \n", "| 1_MA_J_S18_L002_ReadsPerGene.out.tab | 652 | 65234 | 20004 | 2684 | \n", "| 1_MA_J_S18_L003_ReadsPerGene.out.tab | 697 | 66538 | 20549 | 2672 | \n", "| 1_MA_J_S18_L004_ReadsPerGene.out.tab | 616 | 65066 | 20505 | 2585 | \n", "| 1_RZ_J_S26_L001_ReadsPerGene.out.tab | 1431 | 395848 | 768146 | 7218 | \n", "| 1_RZ_J_S26_L002_ReadsPerGene.out.tab | 1337 | 388079 | 755654 | 7022 | \n", "\n", "\n" ], "text/plain": [ " expid namb nmulti nnofeat nunmap\n", "1 1_MA_J_S18_L001_ReadsPerGene.out.tab 647 66100 20347 2690 \n", "2 1_MA_J_S18_L002_ReadsPerGene.out.tab 652 65234 20004 2684 \n", "3 1_MA_J_S18_L003_ReadsPerGene.out.tab 697 66538 20549 2672 \n", "4 1_MA_J_S18_L004_ReadsPerGene.out.tab 616 65066 20505 2585 \n", "5 1_RZ_J_S26_L001_ReadsPerGene.out.tab 1431 395848 768146 7218 \n", "6 1_RZ_J_S26_L002_ReadsPerGene.out.tab 1337 388079 755654 7022 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "nmisc %>% head" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To obtain the counts for specific genes, we will use the rest of our `out` matrix since it contains the gene counts. However, we still want to transform the data frame into statistical format, which we will accomplish using gather and spread." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "### Gather and spread the genes to get a count matrix\n", "out %>%\n", " dplyr::slice(-(1:4)) %>%\n", " gather(expid, value, -gene) %>% \n", " spread(gene, value) -> genecounts" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidCNAG_00001CNAG_00002CNAG_00003CNAG_00004CNAG_00005
1_MA_J_S18_L001_ReadsPerGene.out.tab0 66 38 74 33
1_MA_J_S18_L002_ReadsPerGene.out.tab0 59 25 79 25
1_MA_J_S18_L003_ReadsPerGene.out.tab0 74 27 79 32
1_MA_J_S18_L004_ReadsPerGene.out.tab0 66 22 69 24
1_RZ_J_S26_L001_ReadsPerGene.out.tab0 50 16 51 26
1_RZ_J_S26_L002_ReadsPerGene.out.tab0 45 7 51 31
\n" ], "text/latex": [ "\\begin{tabular}{r|llllll}\n", " expid & CNAG\\_00001 & CNAG\\_00002 & CNAG\\_00003 & CNAG\\_00004 & CNAG\\_00005\\\\\n", "\\hline\n", "\t 1\\_MA\\_J\\_S18\\_L001\\_ReadsPerGene.out.tab & 0 & 66 & 38 & 74 & 33 \\\\\n", "\t 1\\_MA\\_J\\_S18\\_L002\\_ReadsPerGene.out.tab & 0 & 59 & 25 & 79 & 25 \\\\\n", "\t 1\\_MA\\_J\\_S18\\_L003\\_ReadsPerGene.out.tab & 0 & 74 & 27 & 79 & 32 \\\\\n", "\t 1\\_MA\\_J\\_S18\\_L004\\_ReadsPerGene.out.tab & 0 & 66 & 22 & 69 & 24 \\\\\n", "\t 1\\_RZ\\_J\\_S26\\_L001\\_ReadsPerGene.out.tab & 0 & 50 & 16 & 51 & 26 \\\\\n", "\t 1\\_RZ\\_J\\_S26\\_L002\\_ReadsPerGene.out.tab & 0 & 45 & 7 & 51 & 31 \\\\\n", "\\end{tabular}\n" ], "text/markdown": [ "\n", "expid | CNAG_00001 | CNAG_00002 | CNAG_00003 | CNAG_00004 | CNAG_00005 | \n", "|---|---|---|---|---|---|\n", "| 1_MA_J_S18_L001_ReadsPerGene.out.tab | 0 | 66 | 38 | 74 | 33 | \n", "| 1_MA_J_S18_L002_ReadsPerGene.out.tab | 0 | 59 | 25 | 79 | 25 | \n", "| 1_MA_J_S18_L003_ReadsPerGene.out.tab | 0 | 74 | 27 | 79 | 32 | \n", "| 1_MA_J_S18_L004_ReadsPerGene.out.tab | 0 | 66 | 22 | 69 | 24 | \n", "| 1_RZ_J_S26_L001_ReadsPerGene.out.tab | 0 | 50 | 16 | 51 | 26 | \n", "| 1_RZ_J_S26_L002_ReadsPerGene.out.tab | 0 | 45 | 7 | 51 | 31 | \n", "\n", "\n" ], "text/plain": [ " expid CNAG_00001 CNAG_00002 CNAG_00003\n", "1 1_MA_J_S18_L001_ReadsPerGene.out.tab 0 66 38 \n", "2 1_MA_J_S18_L002_ReadsPerGene.out.tab 0 59 25 \n", "3 1_MA_J_S18_L003_ReadsPerGene.out.tab 0 74 27 \n", "4 1_MA_J_S18_L004_ReadsPerGene.out.tab 0 66 22 \n", "5 1_RZ_J_S26_L001_ReadsPerGene.out.tab 0 50 16 \n", "6 1_RZ_J_S26_L002_ReadsPerGene.out.tab 0 45 7 \n", " CNAG_00004 CNAG_00005\n", "1 74 33 \n", "2 79 25 \n", "3 79 32 \n", "4 69 24 \n", "5 51 26 \n", "6 51 31 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "genecounts[1:6,1:6]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### For each samples, sum up all the counts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can create a variable denoting the number of total genes mapped for each sample by summing across the rows." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "### Sum across the rows for a total gene count variable\n", "genecounts %>% \n", " mutate(ngenemap = rowSums(.[-1])) %>%\n", " select(expid, ngenemap) -> ngene" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Summarize the results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will create a comprehensive data frame `mapresults` which will combine `ngene` with `nmisc`. This data frame will have summarizing mapping features in addition to proportion features. " ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "### Merge in the 4 misc counts and add summaries\n", "ngene %>%\n", " full_join(nmisc, by = \"expid\") %>%\n", " mutate(depth = as.integer(ngenemap + namb + nmulti + nnofeat + nunmap)) %>%\n", " mutate(prop.gene = ngenemap / depth) %>%\n", " mutate(prop.nofeat = nnofeat / depth) %>%\n", " mutate(prop.unique = (ngenemap + nnofeat) / depth) ->\n", " mapresults" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Store the results" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
expidngenemapnambnmultinnofeatnunmapdepthprop.geneprop.nofeatprop.unique
1_MA_J_S18_L001_ReadsPerGene.out.tab2399781 647 66100 20347 2690 2489565 0.9639359 0.008172914 0.9721088
1_MA_J_S18_L002_ReadsPerGene.out.tab2362228 652 65234 20004 2684 2450802 0.9638592 0.008162226 0.9720214
1_MA_J_S18_L003_ReadsPerGene.out.tab2436776 697 66538 20549 2672 2527232 0.9642075 0.008131030 0.9723385
1_MA_J_S18_L004_ReadsPerGene.out.tab2417485 616 65066 20505 2585 2506257 0.9645798 0.008181523 0.9727614
1_RZ_J_S26_L001_ReadsPerGene.out.tab2366742 1431 395848 768146 7218 3539385 0.6686874 0.217028100 0.8857155
1_RZ_J_S26_L002_ReadsPerGene.out.tab2331658 1337 388079 755654 7022 3483750 0.6692954 0.216908217 0.8862037
\n" ], "text/latex": [ "\\begin{tabular}{r|llllllllll}\n", " expid & ngenemap & namb & nmulti & nnofeat & nunmap & depth & prop.gene & prop.nofeat & prop.unique\\\\\n", "\\hline\n", "\t 1\\_MA\\_J\\_S18\\_L001\\_ReadsPerGene.out.tab & 2399781 & 647 & 66100 & 20347 & 2690 & 2489565 & 0.9639359 & 0.008172914 & 0.9721088 \\\\\n", "\t 1\\_MA\\_J\\_S18\\_L002\\_ReadsPerGene.out.tab & 2362228 & 652 & 65234 & 20004 & 2684 & 2450802 & 0.9638592 & 0.008162226 & 0.9720214 \\\\\n", "\t 1\\_MA\\_J\\_S18\\_L003\\_ReadsPerGene.out.tab & 2436776 & 697 & 66538 & 20549 & 2672 & 2527232 & 0.9642075 & 0.008131030 & 0.9723385 \\\\\n", "\t 1\\_MA\\_J\\_S18\\_L004\\_ReadsPerGene.out.tab & 2417485 & 616 & 65066 & 20505 & 2585 & 2506257 & 0.9645798 & 0.008181523 & 0.9727614 \\\\\n", "\t 1\\_RZ\\_J\\_S26\\_L001\\_ReadsPerGene.out.tab & 2366742 & 1431 & 395848 & 768146 & 7218 & 3539385 & 0.6686874 & 0.217028100 & 0.8857155 \\\\\n", "\t 1\\_RZ\\_J\\_S26\\_L002\\_ReadsPerGene.out.tab & 2331658 & 1337 & 388079 & 755654 & 7022 & 3483750 & 0.6692954 & 0.216908217 & 0.8862037 \\\\\n", "\\end{tabular}\n" ], "text/markdown": [ "\n", "expid | ngenemap | namb | nmulti | nnofeat | nunmap | depth | prop.gene | prop.nofeat | prop.unique | \n", "|---|---|---|---|---|---|\n", "| 1_MA_J_S18_L001_ReadsPerGene.out.tab | 2399781 | 647 | 66100 | 20347 | 2690 | 2489565 | 0.9639359 | 0.008172914 | 0.9721088 | \n", "| 1_MA_J_S18_L002_ReadsPerGene.out.tab | 2362228 | 652 | 65234 | 20004 | 2684 | 2450802 | 0.9638592 | 0.008162226 | 0.9720214 | \n", "| 1_MA_J_S18_L003_ReadsPerGene.out.tab | 2436776 | 697 | 66538 | 20549 | 2672 | 2527232 | 0.9642075 | 0.008131030 | 0.9723385 | \n", "| 1_MA_J_S18_L004_ReadsPerGene.out.tab | 2417485 | 616 | 65066 | 20505 | 2585 | 2506257 | 0.9645798 | 0.008181523 | 0.9727614 | \n", "| 1_RZ_J_S26_L001_ReadsPerGene.out.tab | 2366742 | 1431 | 395848 | 768146 | 7218 | 3539385 | 0.6686874 | 0.217028100 | 0.8857155 | \n", "| 1_RZ_J_S26_L002_ReadsPerGene.out.tab | 2331658 | 1337 | 388079 | 755654 | 7022 | 3483750 | 0.6692954 | 0.216908217 | 0.8862037 | \n", "\n", "\n" ], "text/plain": [ " expid ngenemap namb nmulti nnofeat nunmap\n", "1 1_MA_J_S18_L001_ReadsPerGene.out.tab 2399781 647 66100 20347 2690 \n", "2 1_MA_J_S18_L002_ReadsPerGene.out.tab 2362228 652 65234 20004 2684 \n", "3 1_MA_J_S18_L003_ReadsPerGene.out.tab 2436776 697 66538 20549 2672 \n", "4 1_MA_J_S18_L004_ReadsPerGene.out.tab 2417485 616 65066 20505 2585 \n", "5 1_RZ_J_S26_L001_ReadsPerGene.out.tab 2366742 1431 395848 768146 7218 \n", "6 1_RZ_J_S26_L002_ReadsPerGene.out.tab 2331658 1337 388079 755654 7022 \n", " depth prop.gene prop.nofeat prop.unique\n", "1 2489565 0.9639359 0.008172914 0.9721088 \n", "2 2450802 0.9638592 0.008162226 0.9720214 \n", "3 2527232 0.9642075 0.008131030 0.9723385 \n", "4 2506257 0.9645798 0.008181523 0.9727614 \n", "5 3539385 0.6686874 0.217028100 0.8857155 \n", "6 3483750 0.6692954 0.216908217 0.8862037 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "head(mapresults)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "outfile <- file.path(OUTDIR, \"hts-pilot-2018.RData\")\n", "save(mapresults, genecounts, file = outfile)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "/home/jovyan/work/scratch/analysis_output/out/hts-pilot-2018.RData: 'e8d7991f674b7a9284abae8f38656c2e'" ], "text/latex": [ "\\textbf{/home/jovyan/work/scratch/analysis\\textbackslash{}\\_output/out/hts-pilot-2018.RData:} 'e8d7991f674b7a9284abae8f38656c2e'" ], "text/markdown": [ "**/home/jovyan/work/scratch/analysis_output/out/hts-pilot-2018.RData:** 'e8d7991f674b7a9284abae8f38656c2e'" ], "text/plain": [ "/home/jovyan/work/scratch/analysis_output/out/hts-pilot-2018.RData \n", " \"e8d7991f674b7a9284abae8f38656c2e\" " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "tools::md5sum(path.expand(outfile))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# The End" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "R version 3.4.1 (2017-06-30)\n", "Platform: x86_64-pc-linux-gnu (64-bit)\n", "Running under: Debian GNU/Linux 8 (jessie)\n", "\n", "Matrix products: default\n", "BLAS: /opt/conda/lib/R/lib/libRblas.so\n", "LAPACK: /opt/conda/lib/R/lib/libRlapack.so\n", "\n", "locale:\n", " [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C \n", " [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 \n", " [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 \n", " [7] LC_PAPER=en_US.UTF-8 LC_NAME=C \n", " [9] LC_ADDRESS=C LC_TELEPHONE=C \n", "[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C \n", "\n", "attached base packages:\n", " [1] tools parallel stats4 stats graphics grDevices utils \n", " [8] datasets methods base \n", "\n", "other attached packages:\n", " [1] bindrcpp_0.2 plotly_4.8.0 \n", " [3] dendextend_1.8.0 gridExtra_2.3 \n", " [5] RColorBrewer_1.1-2 qvalue_2.10.0 \n", " [7] limma_3.34.9 DESeq2_1.18.1 \n", " [9] SummarizedExperiment_1.8.0 DelayedArray_0.4.1 \n", "[11] matrixStats_0.53.1 Biobase_2.38.0 \n", "[13] GenomicRanges_1.30.3 GenomeInfoDb_1.14.0 \n", "[15] IRanges_2.12.0 S4Vectors_0.16.0 \n", "[17] BiocGenerics_0.24.0 haven_1.1.1 \n", "[19] stringr_1.3.0 foreach_1.4.4 \n", "[21] dplyr_0.7.4 purrr_0.2.4 \n", "[23] readr_1.1.1 tidyr_0.8.1 \n", "[25] tibble_1.4.2 ggplot2_3.0.0 \n", "[27] tidyverse_1.1.1 \n", "\n", "loaded via a namespace (and not attached):\n", " [1] colorspace_1.3-2 class_7.3-14 modeltools_0.2-22 \n", " [4] mclust_5.4.1 IRdisplay_0.4.4 htmlTable_1.9 \n", " [7] XVector_0.18.0 base64enc_0.1-3 flexmix_2.3-14 \n", " [10] bit64_0.9-5 mvtnorm_1.0-8 AnnotationDbi_1.40.0 \n", " [13] lubridate_1.7.4 xml2_1.2.0 codetools_0.2-15 \n", " [16] splines_3.4.1 mnormt_1.5-5 robustbase_0.92-7 \n", " [19] geneplotter_1.56.0 knitr_1.20 IRkernel_0.8.11 \n", " [22] Formula_1.2-1 jsonlite_1.5 broom_0.4.4 \n", " [25] annotate_1.56.0 kernlab_0.9-25 cluster_2.0.6 \n", " [28] compiler_3.4.1 httr_1.3.1 backports_1.1.2 \n", " [31] assertthat_0.2.0 Matrix_1.2-14 lazyeval_0.2.1 \n", " [34] acepack_1.4.1 htmltools_0.3.6 gtable_0.2.0 \n", " [37] glue_1.2.0 GenomeInfoDbData_0.99.0 reshape2_1.4.3 \n", " [40] Rcpp_0.12.15 trimcluster_0.1-2.1 cellranger_1.1.0 \n", " [43] nlme_3.1-131 fpc_2.1-11.1 iterators_1.0.9 \n", " [46] psych_1.8.4 rvest_0.3.2 XML_3.98-1.11 \n", " [49] DEoptimR_1.0-8 MASS_7.3-48 zlibbioc_1.24.0 \n", " [52] scales_0.5.0 hms_0.3 memoise_1.1.0 \n", " [55] rpart_4.1-13 latticeExtra_0.6-28 stringi_1.1.7 \n", " [58] RSQLite_2.0 genefilter_1.60.0 checkmate_1.8.5 \n", " [61] BiocParallel_1.12.0 repr_0.15.0 prabclus_2.2-6 \n", " [64] rlang_0.2.1 pkgconfig_2.0.1 bitops_1.0-6 \n", " [67] evaluate_0.10.1 lattice_0.20-34 bindr_0.1.1 \n", " [70] htmlwidgets_1.2 tidyselect_0.2.4 bit_1.1-12 \n", " [73] plyr_1.8.4 magrittr_1.5 R6_2.2.2 \n", " [76] Hmisc_4.0-3 pbdZMQ_0.3-2 DBI_1.0.0 \n", " [79] pillar_1.2.2 whisker_0.3-2 foreign_0.8-67 \n", " [82] withr_2.1.1 survival_2.40-1 RCurl_1.95-4.8 \n", " [85] nnet_7.3-12 modelr_0.1.2 crayon_1.3.4 \n", " [88] uuid_0.1-2 viridis_0.5.1 locfit_1.5-9.1 \n", " [91] grid_3.4.1 readxl_1.1.0 data.table_1.10.4 \n", " [94] blob_1.1.1 forcats_0.3.0 diptest_0.75-7 \n", " [97] digest_0.6.15 xtable_1.8-2 munsell_0.5.0 \n", "[100] viridisLite_0.3.0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sessionInfo()" ] } ], "metadata": { "kernelspec": { "display_name": "R", "language": "R", "name": "ir" }, "language_info": { "codemirror_mode": "r", "file_extension": ".r", "mimetype": "text/x-r-source", "name": "R", "pygments_lexer": "r", "version": "3.4.1" } }, "nbformat": 4, "nbformat_minor": 2 }