Set up environment

In [1]:
# load required libraries
library(tidyverse)
library(foreach)
library(stringr)
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------
filter(): dplyr, stats
lag():    dplyr, stats

Attaching package: ‘foreach’

The following objects are masked from ‘package:purrr’:

    accumulate, when

Your Folder Structure:
└── HTS2018
    ├── out
    └── img
In [2]:
# set directories
datadir <- "/data/hts2018_pilot/star_counts"
outdir  <- "/home/jovyan/work/HTS2018/out"

# get files of all star output files
stardirs <- list.files(datadir)

Read in the count data output from STAR

There are 204 files under this folder. Each file is generated from a fastq file using STAR.

In [3]:
length(stardirs)
204
In [4]:
head(stardirs)
  1. '1_MA_J_S18_L001_ReadsPerGene.out.tab'
  2. '1_MA_J_S18_L002_ReadsPerGene.out.tab'
  3. '1_MA_J_S18_L003_ReadsPerGene.out.tab'
  4. '1_MA_J_S18_L004_ReadsPerGene.out.tab'
  5. '1_RZ_J_S26_L001_ReadsPerGene.out.tab'
  6. '1_RZ_J_S26_L002_ReadsPerGene.out.tab'

View the head of one file (1_MA_J_S18_L001_ReadsPerGene.out.tab). Notice that there are four columns and the columns we want is the first and the fourth columns.

In [5]:
cmdstr <- paste("head", file.path(datadir, stardirs[1]))
cmdout <- system(cmdstr, intern = TRUE)
str_split(cmdout, pattern = "\t")
    1. 'N_unmapped'
    2. '2690'
    3. '2690'
    4. '2690'
    1. 'N_multimapping'
    2. '66100'
    3. '66100'
    4. '66100'
    1. 'N_noFeature'
    2. '10626'
    3. '2238382'
    4. '20347'
    1. 'N_ambiguous'
    2. '173170'
    3. '1622'
    4. '647'
    1. 'CNAG_04548'
    2. '0'
    3. '0'
    4. '0'
    1. 'CNAG_07303'
    2. '0'
    3. '0'
    4. '0'
    1. 'CNAG_07304'
    2. '8'
    3. '0'
    4. '8'
    1. 'CNAG_00001'
    2. '0'
    3. '0'
    4. '0'
    1. 'CNAG_07305'
    2. '0'
    3. '0'
    4. '0'
    1. 'CNAG_00002'
    2. '66'
    3. '0'
    4. '66'

Construct a matrix that gathers all the count files

In [6]:
mycombine <- function(df1, df2) {
    # Combine two data frames by gene names
    #
    # Args:
    #   df1 (Dataframe): the first count data
    #   df2 (Dataframe): the second count data
    #
    # Returns:
    #   (Dataframe) The combined data frame of df1 and df2
    full_join(df1, df2, by = "gene")
}

myfile <- function(filedir, filename) {
    # Get the absolute paths of a file
    #
    # Args:
    #   filedir  (Character): the directory of the folder
    #   filename (Character): the filename
    #
    # Returns:
    #   (Character) the directory of the input file
    file.path(filedir, filename)
}

# Data type for each column
coltypes <- list(col_character(), col_integer(), col_integer(), col_integer())
In [7]:
out <- foreach(stardir = stardirs, .combine = mycombine) %do% {

    # get a directory of each count file
    cntfile <- myfile(datadir, stardir)

    # read in the count file
    readr::read_tsv(cntfile, col_names = FALSE, col_types = coltypes) %>%
        dplyr::select(X1, X4) %>% # get the 1st and 4th columns
            dplyr::rename_(.dots=setNames(names(.), c("gene",stardir)))
}

There are 205 columns (204 count files + 1 rowname)

In [8]:
dim(out)
  1. 8501
  2. 205
In [9]:
out[1:6, 1:6]
gene1_MA_J_S18_L001_ReadsPerGene.out.tab1_MA_J_S18_L002_ReadsPerGene.out.tab1_MA_J_S18_L003_ReadsPerGene.out.tab1_MA_J_S18_L004_ReadsPerGene.out.tab1_RZ_J_S26_L001_ReadsPerGene.out.tab
N_unmapped 2690 2684 2672 2585 7218
N_multimapping66100 65234 66538 65066 395848
N_noFeature 20347 20004 20549 20505 768146
N_ambiguous 647 652 697 616 1431
CNAG_04548 0 0 0 1 0
CNAG_07303 0 0 0 0 0

Arrange the results from the count files

Separate the first four rows (-> nmisc) and others (-> genecounts)

In [10]:
### Gather and spread the first four rows
out %>%
    dplyr::slice(1:4) %>%
    gather(expid, value, -gene) %>%
    spread(gene, value) %>%
    rename_(.dots = setNames(names(.), c("expid", "namb", "nmulti", "nnofeat","nunmap"))) ->
    nmisc

### Gather and spread the genes to get a count matrix
out %>%
    dplyr::slice(-(1:4)) %>%
    gather(expid, value, -gene) %>%
    spread(gene, value) -> genecounts

For each samples, sum up all the counts

In [11]:
### Gather and spread the gene rows
genecounts %>%
    mutate(ngenemap = rowSums(.[-1])) %>%
    select(expid, ngenemap) -> ngene

Summarize the results

In [12]:
### Merge in the 4 misc counts and add summaries
ngene %>%
    full_join(nmisc, by = "expid") %>%
    mutate(depth = as.integer(ngenemap + namb + nmulti + nnofeat + nunmap)) %>%
    mutate(prob.gene = ngenemap / depth) %>%
    mutate(prob.nofeat = nnofeat / depth) %>%
    mutate(prob.unique = (ngenemap + nnofeat) / depth) ->
    mapresults

Store the results

In [13]:
head(mapresults)
expidngenemapnambnmultinnofeatnunmapdepthprob.geneprob.nofeatprob.unique
1_MA_J_S18_L001_ReadsPerGene.out.tab2399781 647 66100 20347 2690 2489565 0.9639359 0.008172914 0.9721088
1_MA_J_S18_L002_ReadsPerGene.out.tab2362228 652 65234 20004 2684 2450802 0.9638592 0.008162226 0.9720214
1_MA_J_S18_L003_ReadsPerGene.out.tab2436776 697 66538 20549 2672 2527232 0.9642075 0.008131030 0.9723385
1_MA_J_S18_L004_ReadsPerGene.out.tab2417485 616 65066 20505 2585 2506257 0.9645798 0.008181523 0.9727614
1_RZ_J_S26_L001_ReadsPerGene.out.tab2366742 1431 395848 768146 7218 3539385 0.6686874 0.217028100 0.8857155
1_RZ_J_S26_L002_ReadsPerGene.out.tab2331658 1337 388079 755654 7022 3483750 0.6692954 0.216908217 0.8862037
In [14]:
genecounts[1:6, 1:6]
expidCNAG_00001CNAG_00002CNAG_00003CNAG_00004CNAG_00005
1_MA_J_S18_L001_ReadsPerGene.out.tab0 66 38 74 33
1_MA_J_S18_L002_ReadsPerGene.out.tab0 59 25 79 25
1_MA_J_S18_L003_ReadsPerGene.out.tab0 74 27 79 32
1_MA_J_S18_L004_ReadsPerGene.out.tab0 66 22 69 24
1_RZ_J_S26_L001_ReadsPerGene.out.tab0 50 16 51 26
1_RZ_J_S26_L002_ReadsPerGene.out.tab0 45 7 51 31
In [15]:
outfile <- file.path(outdir, "hts-pilot-2018.RData")
save(mapresults, genecounts, file = outfile)
In [16]:
tools::md5sum(outfile)
/home/jovyan/work/HTS2018/out/hts-pilot-2018.RData: '488ca13061c39b67843257f4ace35132'