Set up environment¶
In [1]:
# load required libraries
library(tidyverse)
library(foreach)
library(stringr)
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------
filter(): dplyr, stats
lag(): dplyr, stats
Attaching package: ‘foreach’
The following objects are masked from ‘package:purrr’:
accumulate, when
Your Folder Structure:
└── HTS2018
├── out
└── img
In [2]:
# set directories
datadir <- "/data/hts2018_pilot/star_counts"
outdir <- "/home/jovyan/work/HTS2018/out"
# get files of all star output files
stardirs <- list.files(datadir)
Read in the count data output from STAR¶
There are 204 files under this folder. Each file is generated from a fastq file using STAR.
In [3]:
length(stardirs)
204
In [4]:
head(stardirs)
- '1_MA_J_S18_L001_ReadsPerGene.out.tab'
- '1_MA_J_S18_L002_ReadsPerGene.out.tab'
- '1_MA_J_S18_L003_ReadsPerGene.out.tab'
- '1_MA_J_S18_L004_ReadsPerGene.out.tab'
- '1_RZ_J_S26_L001_ReadsPerGene.out.tab'
- '1_RZ_J_S26_L002_ReadsPerGene.out.tab'
View the head of one file (1_MA_J_S18_L001_ReadsPerGene.out.tab). Notice that there are four columns and the columns we want is the first and the fourth columns.
In [5]:
cmdstr <- paste("head", file.path(datadir, stardirs[1]))
cmdout <- system(cmdstr, intern = TRUE)
str_split(cmdout, pattern = "\t")
- 'N_unmapped'
- '2690'
- '2690'
- '2690'
- 'N_multimapping'
- '66100'
- '66100'
- '66100'
- 'N_noFeature'
- '10626'
- '2238382'
- '20347'
- 'N_ambiguous'
- '173170'
- '1622'
- '647'
- 'CNAG_04548'
- '0'
- '0'
- '0'
- 'CNAG_07303'
- '0'
- '0'
- '0'
- 'CNAG_07304'
- '8'
- '0'
- '8'
- 'CNAG_00001'
- '0'
- '0'
- '0'
- 'CNAG_07305'
- '0'
- '0'
- '0'
- 'CNAG_00002'
- '66'
- '0'
- '66'
Construct a matrix that gathers all the count files¶
In [6]:
mycombine <- function(df1, df2) {
# Combine two data frames by gene names
#
# Args:
# df1 (Dataframe): the first count data
# df2 (Dataframe): the second count data
#
# Returns:
# (Dataframe) The combined data frame of df1 and df2
full_join(df1, df2, by = "gene")
}
myfile <- function(filedir, filename) {
# Get the absolute paths of a file
#
# Args:
# filedir (Character): the directory of the folder
# filename (Character): the filename
#
# Returns:
# (Character) the directory of the input file
file.path(filedir, filename)
}
# Data type for each column
coltypes <- list(col_character(), col_integer(), col_integer(), col_integer())
In [7]:
out <- foreach(stardir = stardirs, .combine = mycombine) %do% {
# get a directory of each count file
cntfile <- myfile(datadir, stardir)
# read in the count file
readr::read_tsv(cntfile, col_names = FALSE, col_types = coltypes) %>%
dplyr::select(X1, X4) %>% # get the 1st and 4th columns
dplyr::rename_(.dots=setNames(names(.), c("gene",stardir)))
}
There are 205 columns (204 count files + 1 rowname)
In [8]:
dim(out)
- 8501
- 205
In [9]:
out[1:6, 1:6]
gene | 1_MA_J_S18_L001_ReadsPerGene.out.tab | 1_MA_J_S18_L002_ReadsPerGene.out.tab | 1_MA_J_S18_L003_ReadsPerGene.out.tab | 1_MA_J_S18_L004_ReadsPerGene.out.tab | 1_RZ_J_S26_L001_ReadsPerGene.out.tab |
---|---|---|---|---|---|
N_unmapped | 2690 | 2684 | 2672 | 2585 | 7218 |
N_multimapping | 66100 | 65234 | 66538 | 65066 | 395848 |
N_noFeature | 20347 | 20004 | 20549 | 20505 | 768146 |
N_ambiguous | 647 | 652 | 697 | 616 | 1431 |
CNAG_04548 | 0 | 0 | 0 | 1 | 0 |
CNAG_07303 | 0 | 0 | 0 | 0 | 0 |
Arrange the results from the count files¶
Separate the first four rows (-> nmisc) and others (-> genecounts)¶
In [10]:
### Gather and spread the first four rows
out %>%
dplyr::slice(1:4) %>%
gather(expid, value, -gene) %>%
spread(gene, value) %>%
rename_(.dots = setNames(names(.), c("expid", "namb", "nmulti", "nnofeat","nunmap"))) ->
nmisc
### Gather and spread the genes to get a count matrix
out %>%
dplyr::slice(-(1:4)) %>%
gather(expid, value, -gene) %>%
spread(gene, value) -> genecounts
For each samples, sum up all the counts¶
In [11]:
### Gather and spread the gene rows
genecounts %>%
mutate(ngenemap = rowSums(.[-1])) %>%
select(expid, ngenemap) -> ngene
Summarize the results¶
In [12]:
### Merge in the 4 misc counts and add summaries
ngene %>%
full_join(nmisc, by = "expid") %>%
mutate(depth = as.integer(ngenemap + namb + nmulti + nnofeat + nunmap)) %>%
mutate(prob.gene = ngenemap / depth) %>%
mutate(prob.nofeat = nnofeat / depth) %>%
mutate(prob.unique = (ngenemap + nnofeat) / depth) ->
mapresults
Store the results¶
In [13]:
head(mapresults)
expid | ngenemap | namb | nmulti | nnofeat | nunmap | depth | prob.gene | prob.nofeat | prob.unique |
---|---|---|---|---|---|---|---|---|---|
1_MA_J_S18_L001_ReadsPerGene.out.tab | 2399781 | 647 | 66100 | 20347 | 2690 | 2489565 | 0.9639359 | 0.008172914 | 0.9721088 |
1_MA_J_S18_L002_ReadsPerGene.out.tab | 2362228 | 652 | 65234 | 20004 | 2684 | 2450802 | 0.9638592 | 0.008162226 | 0.9720214 |
1_MA_J_S18_L003_ReadsPerGene.out.tab | 2436776 | 697 | 66538 | 20549 | 2672 | 2527232 | 0.9642075 | 0.008131030 | 0.9723385 |
1_MA_J_S18_L004_ReadsPerGene.out.tab | 2417485 | 616 | 65066 | 20505 | 2585 | 2506257 | 0.9645798 | 0.008181523 | 0.9727614 |
1_RZ_J_S26_L001_ReadsPerGene.out.tab | 2366742 | 1431 | 395848 | 768146 | 7218 | 3539385 | 0.6686874 | 0.217028100 | 0.8857155 |
1_RZ_J_S26_L002_ReadsPerGene.out.tab | 2331658 | 1337 | 388079 | 755654 | 7022 | 3483750 | 0.6692954 | 0.216908217 | 0.8862037 |
In [14]:
genecounts[1:6, 1:6]
expid | CNAG_00001 | CNAG_00002 | CNAG_00003 | CNAG_00004 | CNAG_00005 |
---|---|---|---|---|---|
1_MA_J_S18_L001_ReadsPerGene.out.tab | 0 | 66 | 38 | 74 | 33 |
1_MA_J_S18_L002_ReadsPerGene.out.tab | 0 | 59 | 25 | 79 | 25 |
1_MA_J_S18_L003_ReadsPerGene.out.tab | 0 | 74 | 27 | 79 | 32 |
1_MA_J_S18_L004_ReadsPerGene.out.tab | 0 | 66 | 22 | 69 | 24 |
1_RZ_J_S26_L001_ReadsPerGene.out.tab | 0 | 50 | 16 | 51 | 26 |
1_RZ_J_S26_L002_ReadsPerGene.out.tab | 0 | 45 | 7 | 51 | 31 |
In [15]:
outfile <- file.path(outdir, "hts-pilot-2018.RData")
save(mapresults, genecounts, file = outfile)
In [16]:
tools::md5sum(outfile)
/home/jovyan/work/HTS2018/out/hts-pilot-2018.RData: '488ca13061c39b67843257f4ace35132'