Set up environment¶

In [1]:

# load required libraries
library(tidyverse)
library(foreach)
library(stringr)

Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------
filter(): dplyr, stats
lag():    dplyr, stats

Attaching package: ‘foreach’

The following objects are masked from ‘package:purrr’:

    accumulate, when

Your Folder Structure:
└── HTS2018
    ├── out
    └── img

In [2]:

# set directories
datadir <- "/data/hts2018_pilot/star_counts"
outdir  <- "/home/jovyan/work/HTS2018/out"

# get files of all star output files
stardirs <- list.files(datadir)

Read in the count data output from STAR¶

There are 204 files under this folder. Each file is generated from a fastq file using STAR.

In [3]:

length(stardirs)

204

In [4]:

head(stardirs)

'1_MA_J_S18_L001_ReadsPerGene.out.tab'
'1_MA_J_S18_L002_ReadsPerGene.out.tab'
'1_MA_J_S18_L003_ReadsPerGene.out.tab'
'1_MA_J_S18_L004_ReadsPerGene.out.tab'
'1_RZ_J_S26_L001_ReadsPerGene.out.tab'
'1_RZ_J_S26_L002_ReadsPerGene.out.tab'

View the head of one file (1_MA_J_S18_L001_ReadsPerGene.out.tab). Notice that there are four columns and the columns we want is the first and the fourth columns.

In [5]:

cmdstr <- paste("head", file.path(datadir, stardirs[1]))
cmdout <- system(cmdstr, intern = TRUE)
str_split(cmdout, pattern = "\t")

1. 'N_unmapped'
2. '2690'
3. '2690'
4. '2690'
1. 'N_multimapping'
2. '66100'
3. '66100'
4. '66100'
1. 'N_noFeature'
2. '10626'
3. '2238382'
4. '20347'
1. 'N_ambiguous'
2. '173170'
3. '1622'
4. '647'
1. 'CNAG_04548'
2. '0'
3. '0'
4. '0'
1. 'CNAG_07303'
2. '0'
3. '0'
4. '0'
1. 'CNAG_07304'
2. '8'
3. '0'
4. '8'
1. 'CNAG_00001'
2. '0'
3. '0'
4. '0'
1. 'CNAG_07305'
2. '0'
3. '0'
4. '0'
1. 'CNAG_00002'
2. '66'
3. '0'
4. '66'

Construct a matrix that gathers all the count files¶

In [6]:

mycombine <- function(df1, df2) {
    # Combine two data frames by gene names
    #
    # Args:
    #   df1 (Dataframe): the first count data
    #   df2 (Dataframe): the second count data
    #
    # Returns:
    #   (Dataframe) The combined data frame of df1 and df2
    full_join(df1, df2, by = "gene")
}

myfile <- function(filedir, filename) {
    # Get the absolute paths of a file
    #
    # Args:
    #   filedir  (Character): the directory of the folder
    #   filename (Character): the filename
    #
    # Returns:
    #   (Character) the directory of the input file
    file.path(filedir, filename)
}

# Data type for each column
coltypes <- list(col_character(), col_integer(), col_integer(), col_integer())

In [7]:

out <- foreach(stardir = stardirs, .combine = mycombine) %do% {

    # get a directory of each count file
    cntfile <- myfile(datadir, stardir)

    # read in the count file
    readr::read_tsv(cntfile, col_names = FALSE, col_types = coltypes) %>%
        dplyr::select(X1, X4) %>% # get the 1st and 4th columns
            dplyr::rename_(.dots=setNames(names(.), c("gene",stardir)))
}

There are 205 columns (204 count files + 1 rowname)

In [8]:

dim(out)

8501
205

In [9]:

out[1:6, 1:6]

gene	1_MA_J_S18_L001_ReadsPerGene.out.tab	1_MA_J_S18_L002_ReadsPerGene.out.tab	1_MA_J_S18_L003_ReadsPerGene.out.tab	1_MA_J_S18_L004_ReadsPerGene.out.tab	1_RZ_J_S26_L001_ReadsPerGene.out.tab
N_unmapped	2690	2684	2672	2585	7218
N_multimapping	66100	65234	66538	65066	395848
N_noFeature	20347	20004	20549	20505	768146
N_ambiguous	647	652	697	616	1431
CNAG_04548	0	0	0	1	0
CNAG_07303	0	0	0	0	0

Arrange the results from the count files¶

Separate the first four rows (-> nmisc) and others (-> genecounts)¶

In [10]:

### Gather and spread the first four rows
out %>%
    dplyr::slice(1:4) %>%
    gather(expid, value, -gene) %>%
    spread(gene, value) %>%
    rename_(.dots = setNames(names(.), c("expid", "namb", "nmulti", "nnofeat","nunmap"))) ->
    nmisc

### Gather and spread the genes to get a count matrix
out %>%
    dplyr::slice(-(1:4)) %>%
    gather(expid, value, -gene) %>%
    spread(gene, value) -> genecounts

For each samples, sum up all the counts¶

In [11]:

### Gather and spread the gene rows
genecounts %>%
    mutate(ngenemap = rowSums(.[-1])) %>%
    select(expid, ngenemap) -> ngene

Summarize the results¶

In [12]:

### Merge in the 4 misc counts and add summaries
ngene %>%
    full_join(nmisc, by = "expid") %>%
    mutate(depth = as.integer(ngenemap + namb + nmulti + nnofeat + nunmap)) %>%
    mutate(prob.gene = ngenemap / depth) %>%
    mutate(prob.nofeat = nnofeat / depth) %>%
    mutate(prob.unique = (ngenemap + nnofeat) / depth) ->
    mapresults

Store the results¶

In [13]:

head(mapresults)

expid	ngenemap	namb	nmulti	nnofeat	nunmap	depth	prob.gene	prob.nofeat	prob.unique
1_MA_J_S18_L001_ReadsPerGene.out.tab	2399781	647	66100	20347	2690	2489565	0.9639359	0.008172914	0.9721088
1_MA_J_S18_L002_ReadsPerGene.out.tab	2362228	652	65234	20004	2684	2450802	0.9638592	0.008162226	0.9720214
1_MA_J_S18_L003_ReadsPerGene.out.tab	2436776	697	66538	20549	2672	2527232	0.9642075	0.008131030	0.9723385
1_MA_J_S18_L004_ReadsPerGene.out.tab	2417485	616	65066	20505	2585	2506257	0.9645798	0.008181523	0.9727614
1_RZ_J_S26_L001_ReadsPerGene.out.tab	2366742	1431	395848	768146	7218	3539385	0.6686874	0.217028100	0.8857155
1_RZ_J_S26_L002_ReadsPerGene.out.tab	2331658	1337	388079	755654	7022	3483750	0.6692954	0.216908217	0.8862037

In [14]:

genecounts[1:6, 1:6]

expid	CNAG_00002	CNAG_00003	CNAG_00004	CNAG_00005
1_MA_J_S18_L001_ReadsPerGene.out.tab	66	38	74	33
1_MA_J_S18_L002_ReadsPerGene.out.tab	59	25	79	25
1_MA_J_S18_L003_ReadsPerGene.out.tab	74	27	79	32
1_MA_J_S18_L004_ReadsPerGene.out.tab	66	22	69	24
1_RZ_J_S26_L001_ReadsPerGene.out.tab	50	16	51	26
1_RZ_J_S26_L002_ReadsPerGene.out.tab	45	7	51	31

In [15]:

outfile <- file.path(outdir, "hts-pilot-2018.RData")
save(mapresults, genecounts, file = outfile)

In [16]:

tools::md5sum(outfile)

/home/jovyan/work/HTS2018/out/hts-pilot-2018.RData: '488ca13061c39b67843257f4ace35132'