{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**Note**: Make a copy of this notebook to work in." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Bash Scratchpad" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Hack for SIGPIPE error in Jupyter notebook" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "cleanup () { \n", " :\n", "}\n", "\n", "trap \"cleanup\" SIGPIPE" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Safety first" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "set -u" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Challenge\n", "\n", "You will download and extract some information about Cryptococcus neoformans by investigating its GTF file located at\n", "\n", "ftp://ftp.ensemblgenomes.org/pub/release-39/fungi/gtf/fungi_basidiomycota1_collection/cryptococcus_neoformans_var_grubii_h99/Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf.gz\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## File and directory management" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/local_data/notebooks/cliburn/HTS2018-notebooks/cliburn\n" ] } ], "source": [ "pwd" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Bash_Exercise_1.ipynb 'R_Graphic_ ggplot2.ipynb'\n", " Bash_Exercise_1_Solutions.ipynb R_Graphics_Base.ipynb\n", " Bash_Exercise_2.ipynb R_Graphics_Exercise.ipynb\n", " Bash_Exercise_2_Solutoins.ipynb R_Graphics_Exercise_Solutions.ipynb\n", " Bash_in_Jupyter.ipynb R_Graphics_Overview.ipynb\n", "'Bash Tutorial.ipynb' R_tidyverse_1.ipynb\n", " data R_tidyverse_2.ipynb\n", " figs R_tidyverse_3.ipynb\n", " Process-RNA-seq-counts.ipynb R_tidyyverse_Exercise.ipynb\n", " ref 
R_tidyyverse_Exercise_Solutions.ipynb\n" ] } ], "source": [ "ls" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "rm -rf ref" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "mkdir ref" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "cd ref" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using variables" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "FOO=foo" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "bash: FOObar: unbound variable\n" ] }, { "ename": "", "evalue": "1", "output_type": "error", "traceback": [] } ], "source": [ "echo $FOObar" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "foobar\n" ] } ], "source": [ "echo ${FOO}bar" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Downloading files" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "URL='ftp://ftp.ensemblgenomes.org/pub/release-39/fungi/gtf/fungi_basidiomycota1_collection/cryptococcus_neoformans_var_grubii_h99/Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf.gz'" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ftp://ftp.ensemblgenomes.org/pub/release-39/fungi/gtf/fungi_basidiomycota1_collection/cryptococcus_neoformans_var_grubii_h99/Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf.gz\n" ] } ], "source": [ "echo $URL" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2018-07-09 08:18:10-- 
ftp://ftp.ensemblgenomes.org/pub/release-39/fungi/gtf/fungi_basidiomycota1_collection/cryptococcus_neoformans_var_grubii_h99/Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf.gz\n", " => ‘Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf.gz’\n", "Resolving ftp.ensemblgenomes.org (ftp.ensemblgenomes.org)... 193.62.197.94\n", "Connecting to ftp.ensemblgenomes.org (ftp.ensemblgenomes.org)|193.62.197.94|:21... connected.\n", "Logging in as anonymous ... Logged in!\n", "==> SYST ... done. ==> PWD ... done.\n", "==> TYPE I ... done. ==> CWD (1) /pub/release-39/fungi/gtf/fungi_basidiomycota1_collection/cryptococcus_neoformans_var_grubii_h99 ... done.\n", "==> SIZE Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf.gz ... 1796344\n", "==> PASV ... done. ==> RETR Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf.gz ... done.\n", "Length: 1796344 (1.7M) (unauthoritative)\n", "\n", "Cryptococcus_neofor 100%[===================>] 1.71M 1.06MB/s in 1.6s \n", "\n", "2018-07-09 08:18:13 (1.06 MB/s) - ‘Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf.gz’ saved [1796344]\n", "\n" ] } ], "source": [ "wget $URL" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Working with compressed files" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/local_data/notebooks/cliburn/HTS2018-notebooks/cliburn/ref\n" ] } ], "source": [ "pwd" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf.gz\n" ] } ], "source": [ "ls" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "gunzip Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf.gz" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf\n" 
] } ], "source": [ "ls" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Inspecting the GTF file\n", "\n", "A GTF file has some header lines, followed by tabular data in 9 columns:\n", "\n", "1.\tchromosome name\t\n", " > chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,M}\n", "2.\tannotation source\t\n", " > {ENSEMBL,HAVANA}\n", "3.\tfeature-type\t\n", " > {gene,transcript,exon,CDS,UTR,start_codon,stop_codon,Selenocysteine}\n", "4.\tgenomic start location\t\n", " > integer-value (1-based)\n", "5.\tgenomic end location\t\n", " > integer-value\n", "6.\tscore (not used) \t\n", " > .\n", "7.\tgenomic strand\t\n", " > {+,-}\n", "8.\tgenomic phase (for CDS features) \t\n", " > {0,1,2,.}\n", "9.\tadditional information as key-value pairs \n", " > (format: key \"value\";)\n", "\n", "See the [GFF3 specification](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md) in the Sequence Ontology for the closely related GFF3 format.\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "GTF=$(ls)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf\n" ] } ], "source": [ "echo $GTF" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#!genome-build CNA3\n", "#!genome-version CNA3\n", "#!genome-date 2015-11\n", "#!genome-build-accession GCA_000149245.3\n", "#!genebuild-last-updated 2015-11\n", "1\tena\tgene\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; gene_source \"ena\"; gene_biotype \"protein_coding\";\n", "1\tena\ttranscript\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "1\tena\texon\t5494\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number 
\"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; exon_id \"AFR92135-1\";\n", "1\tena\tCDS\t5494\t5645\t.\t-\t0\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; protein_id \"AFR92135\"; protein_version \"1\";\n", "1\tena\tstart_codon\t5643\t5645\t.\t-\t0\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n" ] } ], "source": [ "head $GTF" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mt\tena\tstop_codon\t23840\t23842\t.\t+\t0\tgene_id \"CNAG_09011\"; transcript_id \"AFR99113\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "Mt\tEnsembl_Fungi\tgene\t23909\t23980\t.\t+\t.\tgene_id \"ENSRNA049545749\"; gene_name \"tRNA-Val\"; gene_source \"Ensembl_Fungi\"; gene_biotype \"tRNA\";\n", "Mt\tEnsembl_Fungi\ttranscript\t23909\t23980\t.\t+\t.\tgene_id \"ENSRNA049545749\"; transcript_id \"ENSRNA049545749-T1\"; gene_name \"tRNA-Val\"; gene_source \"Ensembl_Fungi\"; gene_biotype \"tRNA\"; transcript_source \"Ensembl_Fungi\"; transcript_biotype \"tRNA\";\n", "Mt\tEnsembl_Fungi\texon\t23909\t23980\t.\t+\t.\tgene_id \"ENSRNA049545749\"; transcript_id \"ENSRNA049545749-T1\"; exon_number \"1\"; gene_name \"tRNA-Val\"; gene_source \"Ensembl_Fungi\"; gene_biotype \"tRNA\"; transcript_source \"Ensembl_Fungi\"; transcript_biotype \"tRNA\"; exon_id \"ENSRNA049545749-E1\";\n", "Mt\tena\tgene\t24096\t24851\t.\t+\t.\tgene_id \"CNAG_09012\"; gene_source \"ena\"; gene_biotype \"protein_coding\";\n", "Mt\tena\ttranscript\t24096\t24851\t.\t+\t.\tgene_id 
\"CNAG_09012\"; transcript_id \"AFR99114\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "Mt\tena\texon\t24096\t24851\t.\t+\t.\tgene_id \"CNAG_09012\"; transcript_id \"AFR99114\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; exon_id \"AFR99114-1\";\n", "Mt\tena\tCDS\t24096\t24848\t.\t+\t0\tgene_id \"CNAG_09012\"; transcript_id \"AFR99114\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; protein_id \"AFR99114\"; protein_version \"1\";\n", "Mt\tena\tstart_codon\t24096\t24098\t.\t+\t0\tgene_id \"CNAG_09012\"; transcript_id \"AFR99114\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "Mt\tena\tstop_codon\t24849\t24851\t.\t+\t0\tgene_id \"CNAG_09012\"; transcript_id \"AFR99114\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n" ] } ], "source": [ "tail $GTF" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Combining operations with cat and pipe" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#!genome-build CNA3\n", "#!genome-version CNA3\n", "#!genome-date 2015-11\n", "#!genome-build-accession GCA_000149245.3\n", "#!genebuild-last-updated 2015-11\n", "1\tena\tgene\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; gene_source \"ena\"; gene_biotype \"protein_coding\";\n", "1\tena\ttranscript\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "1\tena\texon\t5494\t5645\t.\t-\t.\tgene_id 
\"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; exon_id \"AFR92135-1\";\n" ] } ], "source": [ "cat $GTF | head -8" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\tena\tgene\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; gene_source \"ena\"; gene_biotype \"protein_coding\";\n", "1\tena\ttranscript\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "1\tena\texon\t5494\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; exon_id \"AFR92135-1\";\n" ] } ], "source": [ "cat $GTF | head -8 | tail -3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Filtering comment lines" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using `tail`" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\tena\tgene\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; gene_source \"ena\"; gene_biotype \"protein_coding\";\n", "1\tena\ttranscript\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "1\tena\texon\t5494\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; exon_id \"AFR92135-1\";\n" ] } ], "source": [ "cat $GTF | tail +6 | head -3" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "### Using `cat` and regular expressions" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "cat << EOF > vietnam.txt\n", "In 1965 Vietnam seemed like just another foreign war but it wasn't\n", "It was different in many ways, as so were those that did the fighting\n", "In World War II the average age of the combat soldier was 26\n", "In Vietnam he was 19\n", "EOF" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "In 1965 Vietnam seemed like just another foreign war but it wasn't\n", "It was different in many ways, as so were those that did the fighting\n", "In World War II the average age of the combat soldier was 26\n", "In Vietnam he was 19\n" ] } ], "source": [ "cat vietnam.txt" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "In World War II the average age of the combat soldier was 26\n" ] } ], "source": [ "cat vietnam.txt | egrep 'combat'" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "It was different in many ways, as so were those that did the fighting\n" ] } ], "source": [ "cat vietnam.txt | egrep 'wa.s'" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "In World War II the average age of the combat soldier was 26\n" ] } ], "source": [ "cat vietnam.txt | egrep 'c.*'" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "In 1965 Vietnam seemed like just another foreign war but it wasn't\n" ] } ], "source": [ "cat vietnam.txt | egrep 'it'" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", 
"text": [ "In 1965 Vietnam seemed like just another foreign war but it wasn't\n", "It was different in many ways, as so were those that did the fighting\n" ] } ], "source": [ "cat vietnam.txt | egrep -i 'it'" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "It was different in many ways, as so were those that did the fighting\n" ] } ], "source": [ "cat vietnam.txt | egrep -i '^it'" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "In 1965 Vietnam seemed like just another foreign war but it wasn't\n", "In World War II the average age of the combat soldier was 26\n", "In Vietnam he was 19\n" ] } ], "source": [ "cat vietnam.txt | egrep -i '[0-9]+'" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "It was different in many ways, as so were those that did the fighting\n" ] } ], "source": [ "cat vietnam.txt | egrep -i -v '[0-9]+'" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1965\n", "26\n", "19\n" ] } ], "source": [ "cat vietnam.txt | egrep -i -o '[0-9]+'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using regular expressions with `grep`" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\tena\tgene\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; gene_source \"ena\"; gene_biotype \"protein_coding\";\n", "1\tena\ttranscript\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "1\tena\texon\t5494\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; 
gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; exon_id \"AFR92135-1\";\n" ] } ], "source": [ "cat $GTF |\n", "grep -v '^#' | \n", "head -3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creating new files with redirection operators" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "cat $GTF | grep '^#' > header.txt" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "cat $GTF | grep -v '^#' > info.txt" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\tena\tgene\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; gene_source \"ena\"; gene_biotype \"protein_coding\";\n", "1\tena\ttranscript\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "1\tena\texon\t5494\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; exon_id \"AFR92135-1\";\n" ] } ], "source": [ "head -3 info.txt" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "cat info.txt >> header.txt " ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#!genome-build CNA3\n", "#!genome-version CNA3\n", "#!genome-date 2015-11\n", "#!genome-build-accession GCA_000149245.3\n", "#!genebuild-last-updated 2015-11\n", "1\tena\tgene\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; gene_source \"ena\"; gene_biotype \"protein_coding\";\n", "1\tena\ttranscript\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; gene_source \"ena\"; 
gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "1\tena\texon\t5494\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; exon_id \"AFR92135-1\";\n", "1\tena\tCDS\t5494\t5645\t.\t-\t0\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; protein_id \"AFR92135\"; protein_version \"1\";\n", "1\tena\tstart_codon\t5643\t5645\t.\t-\t0\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n" ] } ], "source": [ "head header.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cutting columns from tabular data" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "gene\n", "transcript\n", "exon\n", "CDS\n", "start_codon\n", "exon\n", "CDS\n", "exon\n", "CDS\n", "exon\n" ] } ], "source": [ "cat info.txt | cut -f3 | head -10" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\tgene\t100\t5645\n", "1\ttranscript\t100\t5645\n", "1\texon\t5494\t5645\n" ] } ], "source": [ "cat info.txt | cut -f1,3-5 | head -3" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "cat info.txt | cut -f1,3-5 | tr '\\t' ',' > info.csv" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1,gene,100,5645\n", "1,transcript,100,5645\n", "1,exon,5494,5645\n", "1,CDS,5494,5645\n", "1,start_codon,5643,5645\n", 
"1,exon,5322,5422\n", "1,CDS,5322,5422\n", "1,exon,3958,5263\n", "1,CDS,3958,5263\n", "1,exon,3206,3890\n" ] } ], "source": [ "head info.csv" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "gene\n", "transcript\n", "exon\n" ] } ], "source": [ "cat info.csv | cut -f2 -d',' | head -3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sorting and counting" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CDS\n", "CDS\n", "CDS\n", "exon\n", "exon\n", "exon\n", "exon\n", "gene\n", "start_codon\n", "transcript\n" ] } ], "source": [ "cat info.txt | cut -f3 | head -10 | sort" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "transcript\n", "start_codon\n", "gene\n", "exon\n", "exon\n", "exon\n", "exon\n", "CDS\n", "CDS\n", "CDS\n" ] } ], "source": [ "cat info.txt | cut -f3 | head -10 | sort -r" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CDS\n", "exon\n", "five_prime_utr\n", "gene\n", "start_codon\n", "stop_codon\n", "three_prime_utr\n", "transcript\n" ] } ], "source": [ "cat info.txt | cut -f3 | sort | uniq" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 49063 CDS\n", " 52036 exon\n", " 6923 five_prime_utr\n", " 8497 gene\n", " 7860 start_codon\n", " 3167 stop_codon\n", " 7034 three_prime_utr\n", " 9348 transcript\n" ] } ], "source": [ "cat info.txt | cut -f3 | sort | uniq -c" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Features on a chromosome - using `awk`" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"Mt\tena\tgene\t20\t2196\t.\t+\t.\tgene_id \"CNAG_09000\"; gene_source \"ena\"; gene_biotype \"protein_coding\";\n", "Mt\tena\ttranscript\t20\t2196\t.\t+\t.\tgene_id \"CNAG_09000\"; transcript_id \"AFR99102\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "Mt\tena\texon\t20\t233\t.\t+\t.\tgene_id \"CNAG_09000\"; transcript_id \"AFR99102\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; exon_id \"AFR99102-1\";\n" ] } ], "source": [ "cat info.txt | awk -F '\\t' '$1==\"Mt\"' | head -3" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 4241 CDS\n", " 4434 exon\n", " 597 five_prime_utr\n", " 706 gene\n", " 681 start_codon\n", " 278 stop_codon\n", " 606 three_prime_utr\n", " 780 transcript\n" ] } ], "source": [ "cat info.txt | awk -F '\\t' '$1==\"2\" {print $3}' | sort | uniq -c" ] } ], "metadata": { "kernelspec": { "display_name": "Bash", "language": "bash", "name": "bash" }, "language_info": { "codemirror_mode": "shell", "file_extension": ".sh", "mimetype": "text/x-sh", "name": "bash" } }, "nbformat": 4, "nbformat_minor": 2 }