{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# The Unix Shell: File and Directory Management" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Listing files" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Bash_Exercise_1.ipynb\n", "Bash_Exercise_1_Solutions.ipynb\n", "Bash_Exercise_1_Solutions.sh\n", "Bash_Exercise_2.ipynb\n", "Bash_Exercise_2_Solutoins.ipynb\n", "Bash_Exercise_2_Solutoins.sh\n", "Bash_Exercise_Solutions.ipynb\n", "Bash in Jupyter.ipynb\n", "Bash_in_Jupyter.ipynb\n", "Bash_tutorial-Copy1.ipynb\n", "Bash_tutorial.ipynb\n", "Bash_tutorial_prep.ipynb\n", "\u001b[0m\u001b[01;34mdata\u001b[0m\n", "\u001b[01;34mdata2\u001b[0m\n", "\u001b[01;34mfigs\u001b[0m\n", "hello.txt\n", "nursery.txt\n", "Process-RNA-seq-counts.ipynb\n", "\u001b[01;34mref\u001b[0m\n", "R_Graphic_ ggplot2.ipynb\n", "R Graphics Base.ipynb\n", "R_Graphics_Base.ipynb\n", "R_Graphics_Exercise.ipynb\n", "R_Graphics_Exercise_Solutions.ipynb\n", "R_Graphics_Exercise_Solutions.r\n", "R Graphics ggplot2.ipynb\n", "R Graphics Overview.ipynb\n", "R_Graphics_Overview.ipynb\n", "R_tidyverse_1.ipynb\n", "R_tidyverse_2.ipynb\n", "R_tidyverse_3.ipynb\n", "R_tidyyverse_Exercise.ipynb\n", "R_tidyyverse_Exercise_Solutions.ipynb\n", "R_tidyyverse_Exercise_Solutions.r\n", "stderr.txt\n", "stdout.txt\n", "The_Unix_Shell_01___File_and_Directory_Management.ipynb\n", "The_Unix_Shell_02___Working_with_Text.ipynb\n", "The_Unix_Shell_03___Finding_Stuff.ipynb\n", "The_Unix_Shell_04___Regular_Expresssions.ipynb\n", "The_Unix_Shell_05___Shell_Scripts.ipynb\n", "The_Unix_Shell___Exercises.ipynb\n", "The_Unix_Shell___Exercises_Solutions.ipynb\n" ] } ], "source": [ "ls" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cn.txt foo_Copy.ipynb\n", "Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf 
header.txt\n" ] } ], "source": [ "ls ref" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Include hidden files" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0m\u001b[01;34m.\u001b[0m\n", "\u001b[01;34m..\u001b[0m\n", "Bash_Exercise_1.ipynb\n", "Bash_Exercise_1_Solutions.ipynb\n", "Bash_Exercise_1_Solutions.sh\n", "Bash_Exercise_2.ipynb\n", "Bash_Exercise_2_Solutoins.ipynb\n", "Bash_Exercise_2_Solutoins.sh\n", "Bash_Exercise_Solutions.ipynb\n", "Bash in Jupyter.ipynb\n", "Bash_in_Jupyter.ipynb\n", "Bash_tutorial-Copy1.ipynb\n", "Bash_tutorial.ipynb\n", "Bash_tutorial_prep.ipynb\n", "\u001b[01;34mdata\u001b[0m\n", "\u001b[01;34mdata2\u001b[0m\n", "\u001b[01;34mfigs\u001b[0m\n", "hello.txt\n", "\u001b[01;34m.ipynb_checkpoints\u001b[0m\n", "nursery.txt\n", "Process-RNA-seq-counts.ipynb\n", "\u001b[01;34mref\u001b[0m\n", "R_Graphic_ ggplot2.ipynb\n", "R Graphics Base.ipynb\n", "R_Graphics_Base.ipynb\n", "R_Graphics_Exercise.ipynb\n", "R_Graphics_Exercise_Solutions.ipynb\n", "R_Graphics_Exercise_Solutions.r\n", "R Graphics ggplot2.ipynb\n", "R Graphics Overview.ipynb\n", "R_Graphics_Overview.ipynb\n", "R_tidyverse_1.ipynb\n", "R_tidyverse_2.ipynb\n", "R_tidyverse_3.ipynb\n", "R_tidyyverse_Exercise.ipynb\n", "R_tidyyverse_Exercise_Solutions.ipynb\n", "R_tidyyverse_Exercise_Solutions.r\n", "stderr.txt\n", "stdout.txt\n", "The_Unix_Shell_01___File_and_Directory_Management.ipynb\n", "The_Unix_Shell_02___Working_with_Text.ipynb\n", "The_Unix_Shell_03___Finding_Stuff.ipynb\n", "The_Unix_Shell_04___Regular_Expresssions.ipynb\n", "The_Unix_Shell_05___Shell_Scripts.ipynb\n", "The_Unix_Shell___Exercises.ipynb\n", "The_Unix_Shell___Exercises_Solutions.ipynb\n" ] } ], "source": [ "ls -a" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Exclude current and parent directory" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ "Bash_Exercise_1.ipynb\n", "Bash_Exercise_1_Solutions.ipynb\n", "Bash_Exercise_1_Solutions.sh\n", "Bash_Exercise_2.ipynb\n", "Bash_Exercise_2_Solutoins.ipynb\n", "Bash_Exercise_2_Solutoins.sh\n", "Bash_Exercise_Solutions.ipynb\n", "Bash in Jupyter.ipynb\n", "Bash_in_Jupyter.ipynb\n", "Bash_tutorial-Copy1.ipynb\n", "Bash_tutorial.ipynb\n", "Bash_tutorial_prep.ipynb\n", "\u001b[0m\u001b[01;34mdata\u001b[0m\n", "\u001b[01;34mdata2\u001b[0m\n", "\u001b[01;34mfigs\u001b[0m\n", "hello.txt\n", "\u001b[01;34m.ipynb_checkpoints\u001b[0m\n", "nursery.txt\n", "Process-RNA-seq-counts.ipynb\n", "\u001b[01;34mref\u001b[0m\n", "R_Graphic_ ggplot2.ipynb\n", "R Graphics Base.ipynb\n", "R_Graphics_Base.ipynb\n", "R_Graphics_Exercise.ipynb\n", "R_Graphics_Exercise_Solutions.ipynb\n", "R_Graphics_Exercise_Solutions.r\n", "R Graphics ggplot2.ipynb\n", "R Graphics Overview.ipynb\n", "R_Graphics_Overview.ipynb\n", "R_tidyverse_1.ipynb\n", "R_tidyverse_2.ipynb\n", "R_tidyverse_3.ipynb\n", "R_tidyyverse_Exercise.ipynb\n", "R_tidyyverse_Exercise_Solutions.ipynb\n", "R_tidyyverse_Exercise_Solutions.r\n", "stderr.txt\n", "stdout.txt\n", "The_Unix_Shell_01___File_and_Directory_Management.ipynb\n", "The_Unix_Shell_02___Working_with_Text.ipynb\n", "The_Unix_Shell_03___Finding_Stuff.ipynb\n", "The_Unix_Shell_04___Regular_Expresssions.ipynb\n", "The_Unix_Shell_05___Shell_Scripts.ipynb\n", "The_Unix_Shell___Exercises.ipynb\n", "The_Unix_Shell___Exercises_Solutions.ipynb\n" ] } ], "source": [ "ls -A" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Show details" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 4380\n", "-rw-r--r-- 1 jovyan 1000 9683 Jul 9 12:39 Bash_Exercise_1.ipynb\n", "-rw-r--r-- 1 jovyan 1000 15460 Jul 9 12:39 Bash_Exercise_1_Solutions.ipynb\n", "-rw-r--r-- 1 jovyan 1000 1323 Jul 9 12:39 Bash_Exercise_1_Solutions.sh\n", 
"-rw-r--r-- 1 jovyan 1000 6217 Jul 9 12:39 Bash_Exercise_2.ipynb\n", "-rw-r--r-- 1 jovyan 1000 12757 Jul 9 12:39 Bash_Exercise_2_Solutoins.ipynb\n", "-rw-r--r-- 1 jovyan 1000 1126 Jul 9 12:39 Bash_Exercise_2_Solutoins.sh\n", "-rw-r--r-- 1 jovyan 1000 15448 Jul 1 21:42 Bash_Exercise_Solutions.ipynb\n", "-rw-r--r-- 1 jovyan 1000 65260 Jun 27 21:28 Bash in Jupyter.ipynb\n", "-rw-r--r-- 1 jovyan 1000 65260 Jul 9 12:39 Bash_in_Jupyter.ipynb\n", "-rw-r--r-- 1 jovyan users 36489 Jul 9 14:29 Bash_tutorial-Copy1.ipynb\n", "-rw-r--r-- 1 jovyan 1000 5967 Jul 9 12:39 Bash_tutorial.ipynb\n", "-rw-r--r-- 1 jovyan 1000 29522 Jul 9 12:39 Bash_tutorial_prep.ipynb\n", "drwxr-xr-x 2 jovyan users 4096 Jul 5 15:01 \u001b[0m\u001b[01;34mdata\u001b[0m\n", "drwxr-xr-x 2 jovyan users 4096 Jul 9 14:59 \u001b[01;34mdata2\u001b[0m\n", "drwxr-xr-x 2 jovyan 1000 4096 Jul 5 15:01 \u001b[01;34mfigs\u001b[0m\n", "-rw-r--r-- 1 jovyan users 45 Jul 9 15:15 hello.txt\n", "-rw-r--r-- 1 jovyan users 24 Jul 9 15:17 nursery.txt\n", "-rw-r--r-- 1 jovyan 1000 12634 Jul 9 12:39 Process-RNA-seq-counts.ipynb\n", "drwxr-xr-x 2 jovyan users 4096 Jul 9 15:00 \u001b[01;34mref\u001b[0m\n", "-rw-r--r-- 1 jovyan 1000 1398902 Jul 9 12:39 R_Graphic_ ggplot2.ipynb\n", "-rw-r--r-- 1 jovyan 1000 155396 Jun 27 21:28 R Graphics Base.ipynb\n", "-rw-r--r-- 1 jovyan 1000 155396 Jul 9 12:39 R_Graphics_Base.ipynb\n", "-rw-r--r-- 1 jovyan 1000 10670 Jul 9 12:39 R_Graphics_Exercise.ipynb\n", "-rw-r--r-- 1 jovyan 1000 195947 Jul 9 12:39 R_Graphics_Exercise_Solutions.ipynb\n", "-rw-r--r-- 1 jovyan 1000 938 Jul 9 12:39 R_Graphics_Exercise_Solutions.r\n", "-rw-r--r-- 1 jovyan 1000 1398902 Jun 27 21:28 R Graphics ggplot2.ipynb\n", "-rw-r--r-- 1 jovyan 1000 158687 Jun 27 21:28 R Graphics Overview.ipynb\n", "-rw-r--r-- 1 jovyan 1000 158687 Jul 9 12:39 R_Graphics_Overview.ipynb\n", "-rw-r--r-- 1 jovyan 1000 106176 Jul 9 12:39 R_tidyverse_1.ipynb\n", "-rw-r--r-- 1 jovyan 1000 82867 Jul 9 12:39 R_tidyverse_2.ipynb\n", "-rw-r--r-- 1 jovyan 
1000 126650 Jul 9 12:39 R_tidyverse_3.ipynb\n", "-rw-r--r-- 1 jovyan 1000 29862 Jul 9 12:39 R_tidyyverse_Exercise.ipynb\n", "-rw-r--r-- 1 jovyan 1000 7260 Jul 9 12:39 R_tidyyverse_Exercise_Solutions.ipynb\n", "-rw-r--r-- 1 jovyan 1000 70 Jul 9 12:39 R_tidyyverse_Exercise_Solutions.r\n", "-rw-r--r-- 1 jovyan users 76 Jul 9 15:22 stderr.txt\n", "-rw-r--r-- 1 jovyan users 0 Jul 9 15:22 stdout.txt\n", "-rw-r--r-- 1 jovyan 1000 53864 Jul 9 15:12 The_Unix_Shell_01___File_and_Directory_Management.ipynb\n", "-rw-r--r-- 1 jovyan 1000 7306 Jul 9 15:25 The_Unix_Shell_02___Working_with_Text.ipynb\n", "-rw-r--r-- 1 jovyan 1000 19041 Jul 9 15:35 The_Unix_Shell_03___Finding_Stuff.ipynb\n", "-rw-r--r-- 1 jovyan 1000 16736 Jul 9 15:51 The_Unix_Shell_04___Regular_Expresssions.ipynb\n", "-rw-r--r-- 1 jovyan 1000 13236 Jul 9 12:39 The_Unix_Shell_05___Shell_Scripts.ipynb\n", "-rw-r--r-- 1 jovyan 1000 4928 Jul 9 12:39 The_Unix_Shell___Exercises.ipynb\n", "-rw-r--r-- 1 jovyan 1000 9293 Jul 9 12:39 The_Unix_Shell___Exercises_Solutions.ipynb\n" ] } ], "source": [ "ls -l" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Show only directories" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0m\u001b[01;34mdata/\u001b[0m \u001b[01;34mdata2/\u001b[0m \u001b[01;34mfigs/\u001b[0m \u001b[01;34mref/\u001b[0m\n" ] } ], "source": [ "ls -d */" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Alternative using grep" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "drwxr-xr-x 2 jovyan users 4096 Jul 5 15:01 data\n", "drwxr-xr-x 2 jovyan users 4096 Jul 9 14:59 data2\n", "drwxr-xr-x 2 jovyan 1000 4096 Jul 5 15:01 figs\n", "drwxr-xr-x 2 jovyan users 4096 Jul 9 15:00 ref\n" ] } ], "source": [ "ls -l | grep -E '^d'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Show only
files" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 4380\n", "-rw-r--r-- 1 jovyan 1000 9683 Jul 9 12:39 Bash_Exercise_1.ipynb\n", "-rw-r--r-- 1 jovyan 1000 15460 Jul 9 12:39 Bash_Exercise_1_Solutions.ipynb\n", "-rw-r--r-- 1 jovyan 1000 1323 Jul 9 12:39 Bash_Exercise_1_Solutions.sh\n", "-rw-r--r-- 1 jovyan 1000 6217 Jul 9 12:39 Bash_Exercise_2.ipynb\n", "-rw-r--r-- 1 jovyan 1000 12757 Jul 9 12:39 Bash_Exercise_2_Solutoins.ipynb\n", "-rw-r--r-- 1 jovyan 1000 1126 Jul 9 12:39 Bash_Exercise_2_Solutoins.sh\n", "-rw-r--r-- 1 jovyan 1000 15448 Jul 1 21:42 Bash_Exercise_Solutions.ipynb\n", "-rw-r--r-- 1 jovyan 1000 65260 Jun 27 21:28 Bash in Jupyter.ipynb\n", "-rw-r--r-- 1 jovyan 1000 65260 Jul 9 12:39 Bash_in_Jupyter.ipynb\n", "-rw-r--r-- 1 jovyan users 36489 Jul 9 14:29 Bash_tutorial-Copy1.ipynb\n", "-rw-r--r-- 1 jovyan 1000 5967 Jul 9 12:39 Bash_tutorial.ipynb\n", "-rw-r--r-- 1 jovyan 1000 29522 Jul 9 12:39 Bash_tutorial_prep.ipynb\n", "-rw-r--r-- 1 jovyan users 45 Jul 9 15:15 hello.txt\n", "-rw-r--r-- 1 jovyan users 24 Jul 9 15:17 nursery.txt\n", "-rw-r--r-- 1 jovyan 1000 12634 Jul 9 12:39 Process-RNA-seq-counts.ipynb\n", "-rw-r--r-- 1 jovyan 1000 1398902 Jul 9 12:39 R_Graphic_ ggplot2.ipynb\n", "-rw-r--r-- 1 jovyan 1000 155396 Jun 27 21:28 R Graphics Base.ipynb\n", "-rw-r--r-- 1 jovyan 1000 155396 Jul 9 12:39 R_Graphics_Base.ipynb\n", "-rw-r--r-- 1 jovyan 1000 10670 Jul 9 12:39 R_Graphics_Exercise.ipynb\n", "-rw-r--r-- 1 jovyan 1000 195947 Jul 9 12:39 R_Graphics_Exercise_Solutions.ipynb\n", "-rw-r--r-- 1 jovyan 1000 938 Jul 9 12:39 R_Graphics_Exercise_Solutions.r\n", "-rw-r--r-- 1 jovyan 1000 1398902 Jun 27 21:28 R Graphics ggplot2.ipynb\n", "-rw-r--r-- 1 jovyan 1000 158687 Jun 27 21:28 R Graphics Overview.ipynb\n", "-rw-r--r-- 1 jovyan 1000 158687 Jul 9 12:39 R_Graphics_Overview.ipynb\n", "-rw-r--r-- 1 jovyan 1000 106176 Jul 9 12:39 R_tidyverse_1.ipynb\n", "-rw-r--r-- 1 
jovyan 1000 82867 Jul 9 12:39 R_tidyverse_2.ipynb\n", "-rw-r--r-- 1 jovyan 1000 126650 Jul 9 12:39 R_tidyverse_3.ipynb\n", "-rw-r--r-- 1 jovyan 1000 29862 Jul 9 12:39 R_tidyyverse_Exercise.ipynb\n", "-rw-r--r-- 1 jovyan 1000 7260 Jul 9 12:39 R_tidyyverse_Exercise_Solutions.ipynb\n", "-rw-r--r-- 1 jovyan 1000 70 Jul 9 12:39 R_tidyyverse_Exercise_Solutions.r\n", "-rw-r--r-- 1 jovyan users 76 Jul 9 15:22 stderr.txt\n", "-rw-r--r-- 1 jovyan users 0 Jul 9 15:22 stdout.txt\n", "-rw-r--r-- 1 jovyan 1000 53864 Jul 9 15:12 The_Unix_Shell_01___File_and_Directory_Management.ipynb\n", "-rw-r--r-- 1 jovyan 1000 7306 Jul 9 15:25 The_Unix_Shell_02___Working_with_Text.ipynb\n", "-rw-r--r-- 1 jovyan 1000 19041 Jul 9 15:35 The_Unix_Shell_03___Finding_Stuff.ipynb\n", "-rw-r--r-- 1 jovyan 1000 16736 Jul 9 15:51 The_Unix_Shell_04___Regular_Expresssions.ipynb\n", "-rw-r--r-- 1 jovyan 1000 13236 Jul 9 12:39 The_Unix_Shell_05___Shell_Scripts.ipynb\n", "-rw-r--r-- 1 jovyan 1000 4928 Jul 9 12:39 The_Unix_Shell___Exercises.ipynb\n", "-rw-r--r-- 1 jovyan 1000 9293 Jul 9 12:39 The_Unix_Shell___Exercises_Solutions.ipynb\n" ] } ], "source": [ "ls -l | grep -Ev '^d'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sort by last modified time" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 4380\n", "-rw-r--r-- 1 jovyan 1000 16736 Jul 9 15:51 The_Unix_Shell_04___Regular_Expresssions.ipynb\n", "-rw-r--r-- 1 jovyan 1000 19041 Jul 9 15:35 The_Unix_Shell_03___Finding_Stuff.ipynb\n" ] } ], "source": [ "ls -lt | head -3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Human readable output" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 4.3M\n", "-rw-r--r-- 1 jovyan 1000 17K Jul 9 15:51 The_Unix_Shell_04___Regular_Expresssions.ipynb\n", "-rw-r--r-- 1 jovyan 1000 19K Jul 9 15:35 
The_Unix_Shell_03___Finding_Stuff.ipynb\n" ] } ], "source": [ "ls -lth | head -3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Recursive listing" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ".:\n", "Bash_Exercise_1.ipynb\n", "Bash_Exercise_1_Solutions.ipynb\n", "Bash_Exercise_1_Solutions.sh\n", "Bash_Exercise_2.ipynb\n", "Bash_Exercise_2_Solutoins.ipynb\n", "Bash_Exercise_2_Solutoins.sh\n", "Bash_Exercise_Solutions.ipynb\n", "Bash in Jupyter.ipynb\n", "Bash_in_Jupyter.ipynb\n", "Bash_tutorial-Copy1.ipynb\n", "Bash_tutorial.ipynb\n", "Bash_tutorial_prep.ipynb\n", "\u001b[0m\u001b[01;34mdata\u001b[0m\n", "\u001b[01;34mdata2\u001b[0m\n", "\u001b[01;34mfigs\u001b[0m\n", "hello.txt\n", "nursery.txt\n", "Process-RNA-seq-counts.ipynb\n", "\u001b[01;34mref\u001b[0m\n", "R_Graphic_ ggplot2.ipynb\n", "R Graphics Base.ipynb\n", "R_Graphics_Base.ipynb\n", "R_Graphics_Exercise.ipynb\n", "R_Graphics_Exercise_Solutions.ipynb\n", "R_Graphics_Exercise_Solutions.r\n", "R Graphics ggplot2.ipynb\n", "R Graphics Overview.ipynb\n", "R_Graphics_Overview.ipynb\n", "R_tidyverse_1.ipynb\n", "R_tidyverse_2.ipynb\n", "R_tidyverse_3.ipynb\n", "R_tidyyverse_Exercise.ipynb\n", "R_tidyyverse_Exercise_Solutions.ipynb\n", "R_tidyyverse_Exercise_Solutions.r\n", "stderr.txt\n", "stdout.txt\n", "The_Unix_Shell_01___File_and_Directory_Management.ipynb\n", "The_Unix_Shell_02___Working_with_Text.ipynb\n", "The_Unix_Shell_03___Finding_Stuff.ipynb\n", "The_Unix_Shell_04___Regular_Expresssions.ipynb\n", "The_Unix_Shell_05___Shell_Scripts.ipynb\n", "The_Unix_Shell___Exercises.ipynb\n", "The_Unix_Shell___Exercises_Solutions.ipynb\n", "\n", "./data:\n", "duke_demographics.tsv duke_proteins_v2.tsv unc_genes_v2.tsv\n", "duke_genes_v1.tsv gene_counts.txt unc_proteins_v1.tsv\n", "duke_genes_v2.tsv unc_demographics.tsv unc_proteins_v2.tsv\n", "duke_proteins_v1.tsv unc_genes_v1.tsv\n", "\n", "./data2:\n",
"duke_demographics.tsv duke_proteins_v2.tsv unc_genes_v2.tsv\n", "duke_genes_v1.tsv gene_counts.txt unc_proteins_v1.tsv\n", "duke_genes_v2.tsv unc_demographics.tsv unc_proteins_v2.tsv\n", "duke_proteins_v1.tsv unc_genes_v1.tsv\n", "\n", "./figs:\n", "\u001b[01;35mfig1.png\u001b[0m \u001b[01;35mfig2.png\u001b[0m \u001b[01;35mfig3.png\u001b[0m \u001b[01;35mfig4.png\u001b[0m \u001b[01;35mfig5.png\u001b[0m\n", "\n", "./ref:\n", "cn.txt foo_Copy.ipynb\n", "Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf header.txt\n" ] } ], "source": [ "ls -R" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Globbing\n", "\n", "The use of wildcards to specify Unix paths is known as globbing." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `*` represents any number of characters (including none)" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Bash_Exercise_1.ipynb\n", "Bash_Exercise_1_Solutions.ipynb\n", "Bash_Exercise_2.ipynb\n", "Bash_Exercise_2_Solutoins.ipynb\n", "Bash_Exercise_Solutions.ipynb\n", "Bash in Jupyter.ipynb\n", "Bash_in_Jupyter.ipynb\n", "Bash_tutorial-Copy1.ipynb\n", "Bash_tutorial.ipynb\n", "Bash_tutorial_prep.ipynb\n", "Process-RNA-seq-counts.ipynb\n", "R_Graphic_ ggplot2.ipynb\n", "R Graphics Base.ipynb\n", "R_Graphics_Base.ipynb\n", "R_Graphics_Exercise.ipynb\n", "R_Graphics_Exercise_Solutions.ipynb\n", "R Graphics ggplot2.ipynb\n", "R Graphics Overview.ipynb\n", "R_Graphics_Overview.ipynb\n", "R_tidyverse_1.ipynb\n", "R_tidyverse_2.ipynb\n", "R_tidyverse_3.ipynb\n", "R_tidyyverse_Exercise.ipynb\n", "R_tidyyverse_Exercise_Solutions.ipynb\n", "The_Unix_Shell_01___File_and_Directory_Management.ipynb\n", "The_Unix_Shell_02___Working_with_Text.ipynb\n", "The_Unix_Shell_03___Finding_Stuff.ipynb\n", "The_Unix_Shell_04___Regular_Expresssions.ipynb\n", "The_Unix_Shell_05___Shell_Scripts.ipynb\n", "The_Unix_Shell___Exercises.ipynb\n",
"The_Unix_Shell___Exercises_Solutions.ipynb\n" ] } ], "source": [ "ls *ipynb" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The_Unix_Shell_02___Working_with_Text.ipynb\n" ] } ], "source": [ "ls *Text*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `?` represents exactly one character" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "duke_demographics.tsv duke_proteins_v2.tsv unc_genes_v2.tsv\n", "duke_genes_v1.tsv gene_counts.txt unc_proteins_v1.tsv\n", "duke_genes_v2.tsv unc_demographics.tsv unc_proteins_v2.tsv\n", "duke_proteins_v1.tsv unc_genes_v1.tsv\n" ] } ], "source": [ "ls data" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data/unc_genes_v1.tsv data/unc_genes_v2.tsv\n" ] } ], "source": [ "ls data/unc_genes_v?.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Character sets\n", "\n", "- `[abc]` represents a or b or c\n", "- `[a-z]` represents any lowercase letter\n", "- `!` at the start of a set negates it, e.g. `[!a-d]` matches any character except a, b, c or d" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "duke_demographics.tsv duke_proteins_v2.tsv unc_genes_v2.tsv\n", "duke_genes_v1.tsv gene_counts.txt unc_proteins_v1.tsv\n", "duke_genes_v2.tsv unc_demographics.tsv unc_proteins_v2.tsv\n", "duke_proteins_v1.tsv unc_genes_v1.tsv\n" ] } ], "source": [ "ls data" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data/duke_demographics.tsv data/duke_proteins_v1.tsv\n", "data/duke_genes_v1.tsv data/duke_proteins_v2.tsv\n", "data/duke_genes_v2.tsv\n" ] } ], "source": [ "ls data/[a-d]*" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs":
[ { "name": "stdout", "output_type": "stream", "text": [ "data/gene_counts.txt data/unc_genes_v1.tsv data/unc_proteins_v1.tsv\n", "data/unc_demographics.tsv data/unc_genes_v2.tsv data/unc_proteins_v2.tsv\n" ] } ], "source": [ "ls data/[!a-d]*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Directory navigation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Show current directory" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/jovyan/work/HTS2018-notebooks/cliburn\n" ] } ], "source": [ "pwd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Move to parent directory" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [], "source": [ "cd .." ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/jovyan/work/HTS2018-notebooks\n" ] } ], "source": [ "pwd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Move back to last directory" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/jovyan/work/HTS2018-notebooks/cliburn\n" ] } ], "source": [ "cd -" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Move using relative addressing" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [], "source": [ "cd data" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/jovyan/work/HTS2018-notebooks/cliburn/data\n" ] } ], "source": [ "pwd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Move using absolute addressing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Move to data folder" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {},
"outputs": [], "source": [ "cd /home/jovyan/work/HTS2018-notebooks/cliburn/data" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/jovyan/work/HTS2018-notebooks/cliburn/data\n" ] } ], "source": [ "pwd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Move back to cliburn directory" ] }, { "cell_type": "code", "execution_count": 138, "metadata": {}, "outputs": [], "source": [ "cd /home/jovyan/work/HTS2018-notebooks/cliburn/" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/jovyan/work/HTS2018-notebooks/cliburn\n" ] } ], "source": [ "pwd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Making and removing new directories" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [], "source": [ "mkdir foo" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0m\u001b[01;34mdata/\u001b[0m \u001b[01;34mdata2/\u001b[0m \u001b[01;34mfigs/\u001b[0m \u001b[01;34mfoo/\u001b[0m \u001b[01;34mref/\u001b[0m\n" ] } ], "source": [ "ls -d */" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Making intermediate directories automatically" ] }, { "cell_type": "code", "execution_count": 142, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mkdir: cannot create directory ‘a/b/c/d’: No such file or directory\n" ] }, { "ename": "", "evalue": "1", "output_type": "error", "traceback": [] } ], "source": [ "mkdir a/b/c/d" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [], "source": [ "mkdir -p a/b/c/d" ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a:\n", "\u001b[0m\u001b[01;34mb\u001b[0m\n",
"\n", "a/b:\n", "\u001b[01;34mc\u001b[0m\n", "\n", "a/b/c:\n", "\u001b[01;34md\u001b[0m\n", "\n", "a/b/c/d:\n" ] } ], "source": [ "ls -R a" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Deleting directories" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [], "source": [ "rmdir foo" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Only works if directory is empty\n", "\n", "The `| cat` part is not necessary on the command line; it is used here only so that Run All Cells keeps going, because Jupyter stops on non-zero exit codes. The `| cat` syntax \"pipes\" the output of `rmdir data` to the `cat` program, so the exit code of the cell is that of `cat` (zero) rather than that of the failed `rmdir`." ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rmdir: failed to remove ‘data’: Directory not empty\n" ] } ], "source": [ "rmdir data | cat" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Remove intermediate directories as well" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [], "source": [ "rmdir -p a/b/c/d" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0m\u001b[01;34mdata/\u001b[0m \u001b[01;34mdata2/\u001b[0m \u001b[01;34mfigs/\u001b[0m \u001b[01;34mref/\u001b[0m\n" ] } ], "source": [ "ls -d */" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Working with files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Making an empty file" ] }, { "cell_type": "code", "execution_count": 149, "metadata": {}, "outputs": [], "source": [ "touch foo.txt" ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "foo.txt hello.txt nursery.txt stderr.txt stdout.txt\n" ] } ], "source": [ "ls *txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Deleting a
file" ] }, { "cell_type": "code", "execution_count": 151, "metadata": {}, "outputs": [], "source": [ "rm foo.txt" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hello.txt nursery.txt stderr.txt stdout.txt\n" ] } ], "source": [ "ls *txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Viewing a file" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "duke_demographics.tsv duke_proteins_v2.tsv unc_genes_v2.tsv\n", "duke_genes_v1.tsv gene_counts.txt unc_proteins_v1.tsv\n", "duke_genes_v2.tsv unc_demographics.tsv unc_proteins_v2.tsv\n", "duke_proteins_v1.tsv unc_genes_v1.tsv\n" ] } ], "source": [ "ls data" ] }, { "cell_type": "code", "execution_count": 154, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cat: date/duke_genes: No such file or directory\n" ] } ], "source": [ "cat date/duke_genes | head" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#!genome-build CNA3\n", "#!genome-version CNA3\n", "#!genome-date 2015-11\n" ] } ], "source": [ "head -n 3 ref/header.txt" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mt\tena\tCDS\t24096\t24848\t.\t+\t0\tgene_id \"CNAG_09012\"; transcript_id \"AFR99114\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; protein_id \"AFR99114\"; protein_version \"1\";\n", "Mt\tena\tstart_codon\t24096\t24098\t.\t+\t0\tgene_id \"CNAG_09012\"; transcript_id \"AFR99114\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n",
"Mt\tena\tstop_codon\t24849\t24851\t.\t+\t0\tgene_id \"CNAG_09012\"; transcript_id \"AFR99114\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n" ] } ], "source": [ "tail -n 3 ref/header.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Can start tail from a specified line number with (+)" ] }, { "cell_type": "code", "execution_count": 157, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "#!genome-build-accession GCA_000149245.3\n", "#!genebuild-last-updated 2015-11\n", "1\tena\tgene\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; gene_source \"ena\"; gene_biotype \"protein_coding\";\n", "1\tena\ttranscript\t100\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "1\tena\texon\t5494\t5645\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; exon_id \"AFR92135-1\";\n", "1\tena\tCDS\t5494\t5645\t.\t-\t0\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; protein_id \"AFR92135\"; protein_version \"1\";\n", "1\tena\tstart_codon\t5643\t5645\t.\t-\t0\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"1\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\";\n", "1\tena\texon\t5322\t5422\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"2\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; exon_id \"AFR92135-2\";\n",
"1\tena\tCDS\t5322\t5422\t.\t-\t1\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"2\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; protein_id \"AFR92135\"; protein_version \"1\";\n", "1\tena\texon\t3958\t5263\t.\t-\t.\tgene_id \"CNAG_04548\"; transcript_id \"AFR92135\"; exon_number \"3\"; gene_source \"ena\"; gene_biotype \"protein_coding\"; transcript_source \"ena\"; transcript_biotype \"protein_coding\"; exon_id \"AFR92135-3\";\n", "tail: error writing ‘standard output’: Broken pipe\n" ] } ], "source": [ "tail -n +4 ref/header.txt | head " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Copying and moving files" ] }, { "cell_type": "code", "execution_count": 158, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Bash_Exercise_1.ipynb\n", "Bash_Exercise_1_Solutions.ipynb\n", "Bash_Exercise_1_Solutions.sh\n", "Bash_Exercise_2.ipynb\n", "Bash_Exercise_2_Solutoins.ipynb\n", "Bash_Exercise_2_Solutoins.sh\n", "Bash_Exercise_Solutions.ipynb\n", "Bash in Jupyter.ipynb\n", "Bash_in_Jupyter.ipynb\n", "Bash_tutorial-Copy1.ipynb\n", "Bash_tutorial.ipynb\n", "Bash_tutorial_prep.ipynb\n", "\u001b[0m\u001b[01;34mdata\u001b[0m\n", "\u001b[01;34mdata2\u001b[0m\n", "\u001b[01;34mfigs\u001b[0m\n", "hello.txt\n", "nursery.txt\n", "Process-RNA-seq-counts.ipynb\n", "\u001b[01;34mref\u001b[0m\n", "R_Graphic_ ggplot2.ipynb\n", "R Graphics Base.ipynb\n", "R_Graphics_Base.ipynb\n", "R_Graphics_Exercise.ipynb\n", "R_Graphics_Exercise_Solutions.ipynb\n", "R_Graphics_Exercise_Solutions.r\n", "R Graphics ggplot2.ipynb\n", "R Graphics Overview.ipynb\n", "R_Graphics_Overview.ipynb\n", "R_tidyverse_1.ipynb\n", "R_tidyverse_2.ipynb\n", "R_tidyverse_3.ipynb\n", "R_tidyyverse_Exercise.ipynb\n", "R_tidyyverse_Exercise_Solutions.ipynb\n", "R_tidyyverse_Exercise_Solutions.r\n", "stderr.txt\n", "stdout.txt\n", 
"The_Unix_Shell_01___File_and_Directory_Management.ipynb\n", "The_Unix_Shell_02___Working_with_Text.ipynb\n", "The_Unix_Shell_03___Finding_Stuff.ipynb\n", "The_Unix_Shell_04___Regular_Expresssions.ipynb\n", "The_Unix_Shell_05___Shell_Scripts.ipynb\n", "The_Unix_Shell___Exercises.ipynb\n", "The_Unix_Shell___Exercises_Solutions.ipynb\n" ] } ], "source": [ "ls" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Copying files" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [], "source": [ "cp \"The_Unix_Shell_01___File_and_Directory_Management.ipynb\" foo.ipynb" ] }, { "cell_type": "code", "execution_count": 160, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "foo.ipynb\n" ] } ], "source": [ "ls f*ipynb" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Copying directories (Recursive copy)" ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [], "source": [ "cp -R data data2" ] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data2:\n", "\u001b[0m\u001b[01;34mdata\u001b[0m duke_proteins_v1.tsv unc_genes_v1.tsv\n", "duke_demographics.tsv duke_proteins_v2.tsv unc_genes_v2.tsv\n", "duke_genes_v1.tsv gene_counts.txt unc_proteins_v1.tsv\n", "duke_genes_v2.tsv unc_demographics.tsv unc_proteins_v2.tsv\n", "\n", "data2/data:\n", "duke_demographics.tsv duke_proteins_v2.tsv unc_genes_v2.tsv\n", "duke_genes_v1.tsv gene_counts.txt unc_proteins_v1.tsv\n", "duke_genes_v2.tsv unc_demographics.tsv unc_proteins_v2.tsv\n", "duke_proteins_v1.tsv unc_genes_v1.tsv\n" ] } ], "source": [ "ls -R data2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Renaming a file" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [], "source": [ "mv foo.ipynb foo_Copy.ipynb" ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": 
[ { "name": "stdout", "output_type": "stream", "text": [ "foo_Copy.ipynb\n" ] } ], "source": [ "ls foo*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Move a file to a new location" ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [], "source": [ "mv foo_Copy.ipynb ref/" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cn.txt foo_Copy.ipynb\n", "Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf header.txt\n" ] } ], "source": [ "ls ref" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## File compression and archival" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Combine multiple files into single file" ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "duke_demographics.tsv duke_proteins_v2.tsv unc_genes_v2.tsv\n", "duke_genes_v1.tsv gene_counts.txt unc_proteins_v1.tsv\n", "duke_genes_v2.tsv unc_demographics.tsv unc_proteins_v2.tsv\n", "duke_proteins_v1.tsv unc_genes_v1.tsv\n" ] } ], "source": [ "ls data" ] }, { "cell_type": "code", "execution_count": 168, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "bash: man: command not found\n" ] } ], "source": [ "man tar | head -n 20" ] }, { "cell_type": "code", "execution_count": 169, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data/\n", "data/unc_proteins_v2.tsv\n", "data/duke_genes_v1.tsv\n", "data/duke_demographics.tsv\n", "data/unc_genes_v2.tsv\n", "data/gene_counts.txt\n", "data/unc_proteins_v1.tsv\n", "data/duke_proteins_v2.tsv\n", "data/unc_demographics.tsv\n", "data/unc_genes_v1.tsv\n", "data/duke_proteins_v1.tsv\n", "data/duke_genes_v2.tsv\n" ] } ], "source": [ "tar -cvf data.tar data" ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "\u001b[0m\u001b[01;31mdata.tar\u001b[0m\n" ] } ], "source": [ "ls *tar" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0m\u001b[01;34mdata/\u001b[0m \u001b[01;34mdata2/\u001b[0m \u001b[01;34mfigs/\u001b[0m \u001b[01;34mref/\u001b[0m\n" ] } ], "source": [ "ls -d */" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [], "source": [ "rm -rf data/" ] }, { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0m\u001b[01;34mdata2/\u001b[0m \u001b[01;34mfigs/\u001b[0m \u001b[01;34mref/\u001b[0m\n" ] } ], "source": [ "ls -d */" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Compress concatenated file" ] }, { "cell_type": "code", "execution_count": 174, "metadata": {}, "outputs": [], "source": [ "gzip data.tar" ] }, { "cell_type": "code", "execution_count": 175, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0m\u001b[01;31mdata.tar.gz\u001b[0m\n" ] } ], "source": [ "ls *gz" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Uncompress" ] }, { "cell_type": "code", "execution_count": 176, "metadata": {}, "outputs": [], "source": [ "gunzip data.tar.gz" ] }, { "cell_type": "code", "execution_count": 177, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0m\u001b[01;31mdata.tar\u001b[0m\n" ] } ], "source": [ "ls *tar" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Recover original files" ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data/\n", "data/unc_proteins_v2.tsv\n", "data/duke_genes_v1.tsv\n", "data/duke_demographics.tsv\n", "data/unc_genes_v2.tsv\n", "data/gene_counts.txt\n", "data/unc_proteins_v1.tsv\n", 
"data/duke_proteins_v2.tsv\n", "data/unc_demographics.tsv\n", "data/unc_genes_v1.tsv\n", "data/duke_proteins_v1.tsv\n", "data/duke_genes_v2.tsv\n" ] } ], "source": [ "tar -xvf data.tar" ] }, { "cell_type": "code", "execution_count": 179, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "duke_demographics.tsv duke_proteins_v2.tsv unc_genes_v2.tsv\n", "duke_genes_v1.tsv gene_counts.txt unc_proteins_v1.tsv\n", "duke_genes_v2.tsv unc_demographics.tsv unc_proteins_v2.tsv\n", "duke_proteins_v1.tsv unc_genes_v1.tsv\n" ] } ], "source": [ "ls data/" ] }, { "cell_type": "code", "execution_count": 180, "metadata": {}, "outputs": [], "source": [ "rm data.tar" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "### Concatenate and compress" ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data/\n", "data/unc_proteins_v2.tsv\n", "data/duke_genes_v1.tsv\n", "data/duke_demographics.tsv\n", "data/unc_genes_v2.tsv\n", "data/gene_counts.txt\n", "data/unc_proteins_v1.tsv\n", "data/duke_proteins_v2.tsv\n", "data/unc_demographics.tsv\n", "data/unc_genes_v1.tsv\n", "data/duke_proteins_v1.tsv\n", "data/duke_genes_v2.tsv\n" ] } ], "source": [ "tar -cvzf data.tar.gz data" ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [], "source": [ "rm -rf data/" ] }, { "cell_type": "code", "execution_count": 183, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0m\u001b[01;31mdata.tar.gz\u001b[0m\n", "\n", "data2:\n", "\u001b[01;34mdata\u001b[0m duke_proteins_v1.tsv unc_genes_v1.tsv\n", "duke_demographics.tsv duke_proteins_v2.tsv unc_genes_v2.tsv\n", "duke_genes_v1.tsv gene_counts.txt unc_proteins_v1.tsv\n", "duke_genes_v2.tsv unc_demographics.tsv unc_proteins_v2.tsv\n" ] } ], "source": [ "ls data*" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Uncompress and recover" ] }, { 
"cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data/\n", "data/unc_proteins_v2.tsv\n", "data/duke_genes_v1.tsv\n", "data/duke_demographics.tsv\n", "data/unc_genes_v2.tsv\n", "data/gene_counts.txt\n", "data/unc_proteins_v1.tsv\n", "data/duke_proteins_v2.tsv\n", "data/unc_demographics.tsv\n", "data/unc_genes_v1.tsv\n", "data/duke_proteins_v1.tsv\n", "data/duke_genes_v2.tsv\n" ] } ], "source": [ "tar -xvzf data.tar.gz" ] }, { "cell_type": "code", "execution_count": 185, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0m\u001b[01;31mdata.tar.gz\u001b[0m\n", "\n", "data:\n", "duke_demographics.tsv duke_proteins_v2.tsv unc_genes_v2.tsv\n", "duke_genes_v1.tsv gene_counts.txt unc_proteins_v1.tsv\n", "duke_genes_v2.tsv unc_demographics.tsv unc_proteins_v2.tsv\n", "duke_proteins_v1.tsv unc_genes_v1.tsv\n", "\n", "data2:\n", "\u001b[01;34mdata\u001b[0m duke_proteins_v1.tsv unc_genes_v1.tsv\n", "duke_demographics.tsv duke_proteins_v2.tsv unc_genes_v2.tsv\n", "duke_genes_v1.tsv gene_counts.txt unc_proteins_v1.tsv\n", "duke_genes_v2.tsv unc_demographics.tsv unc_proteins_v2.tsv\n" ] } ], "source": [ "ls data*" ] }, { "cell_type": "code", "execution_count": 186, "metadata": {}, "outputs": [], "source": [ "rm data.tar.gz" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### Checksums\n", "\n", "When working with genomic data, we deal with very large files. There is a small risk that these files will be corrupted over time or during data transfer. To ensure that files are not changed, we use a \"checksum\" function. This is a function that generates an long, essentially random number called a checksum that represents the contents of the file. When the file contents change, so will the checksum. 
In theory, there is a very small probability that two different files generate the same checksum, but in practice the probability is too small to worry about.\n", "\n", "There are several different algorithms for generating the checksums, and at least 3 Unix commands to do so, but they all work very similarly for our purposes.\n", "\n", "The strategy is:\n", "\n", "- Generate and store a checksum together with a data file whose integrity you care about\n", "- When you use or download the data, re-generate the checksum (using the same algorithm, e.g. MD5) and compare it with the stored checksum" ] }, { "cell_type": "code", "execution_count": 193, "metadata": {}, "outputs": [], "source": [ "cat << EOF > hello.txt\n", "1 Hello, bash\n", "2 Hello, again\n", "3 Hello\n", "4 again\n", "EOF" ] }, { "cell_type": "code", "execution_count": 194, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 Hello, bash\n", "2 Hello, again\n", "3 Hello\n", "4 again\n" ] } ], "source": [ "cat hello.txt" ] }, { "cell_type": "code", "execution_count": 195, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1567754519 45 hello.txt\n" ] } ], "source": [ "cksum hello.txt" ] }, { "cell_type": "code", "execution_count": 196, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a68554400613f5445c13c57907e976ed hello.txt\n" ] } ], "source": [ "md5sum hello.txt" ] }, { "cell_type": "code", "execution_count": 197, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "57eae725420bf0075d17f849cc8e75379bea6eb6 hello.txt\n" ] } ], "source": [ "sha1sum hello.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### If we alter hello.txt in any way the checksum will be different" ] }, { "cell_type": "code", "execution_count": 198, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 Hello, bash\n", "2 Hello, again\n", "3 Hello\n", "4 
again\n" ] } ], "source": [ "cat hello.txt" ] }, { "cell_type": "code", "execution_count": 199, "metadata": {}, "outputs": [], "source": [ "md5sum hello.txt > hello.md5" ] }, { "cell_type": "code", "execution_count": 200, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a68554400613f5445c13c57907e976ed hello.txt\n" ] } ], "source": [ "cat hello.md5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now make a small change to `hello.txt`" ] }, { "cell_type": "code", "execution_count": 201, "metadata": {}, "outputs": [], "source": [ "cat > test1.txt << EOF\n", "One, two buckle my shoe\n", "Three, four lock the door\n", "EOF" ] }, { "cell_type": "code", "execution_count": 202, "metadata": {}, "outputs": [], "source": [ "cat > hello.txt << EOF\n", "1 Hello, bash\n", "2 Hella, again\n", "3 Hello\n", "4 again\n", "EOF" ] }, { "cell_type": "code", "execution_count": 203, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 Hello, bash\n", "2 Hella, again\n", "3 Hello\n", "4 again\n" ] } ], "source": [ "cat hello.txt" ] }, { "cell_type": "code", "execution_count": 204, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0d8c8172f2a69f5845f21cb03a436be3 hello.txt\n" ] } ], "source": [ "md5sum hello.txt" ] }, { "cell_type": "code", "execution_count": 205, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hello.txt: FAILED\n", "md5sum: WARNING: 1 computed checksum did NOT match\n" ] }, { "ename": "", "evalue": "1", "output_type": "error", "traceback": [] } ], "source": [ "md5sum -c hello.md5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Restore original text" ] }, { "cell_type": "code", "execution_count": 206, "metadata": {}, "outputs": [], "source": [ "cat > hello.txt << EOF\n", "1 Hello, bash\n", "2 Hello, again\n", "3 Hello\n", "4 again\n", "EOF" ] }, { "cell_type": "code", "execution_count": 207, 
"metadata": {}, "outputs": [], "source": [ "md5sum hello.txt > test.md5" ] }, { "cell_type": "code", "execution_count": 208, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hello.txt: OK\n" ] } ], "source": [ "md5sum -c hello.md5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Checksums for multiple files" ] }, { "cell_type": "code", "execution_count": 209, "metadata": {}, "outputs": [], "source": [ "echo \"aaaaa\" > a.txt\n", "echo \"bbbbb\" > b.txt\n", "echo \"ccccc\" > c.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Generate md5 checksum file" ] }, { "cell_type": "code", "execution_count": 210, "metadata": {}, "outputs": [], "source": [ "md5sum a.txt b.txt c.txt > MD5_CHECKSUM" ] }, { "cell_type": "code", "execution_count": 211, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4c850c5b3b2756e67a91bad8e046ddac a.txt\n", "369d9bb6f2313be57f7a55502eb420ba b.txt\n", "34d9ae3c9b1fa64d91bdb00f3c0d6cd5 c.txt\n" ] } ], "source": [ "cat MD5_CHECKSUM" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Modify one file" ] }, { "cell_type": "code", "execution_count": 212, "metadata": {}, "outputs": [], "source": [ "echo \"bbcbb\" > b.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Check file integrity for all files" ] }, { "cell_type": "code", "execution_count": 213, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a.txt: OK\n", "b.txt: FAILED\n", "c.txt: OK\n", "md5sum: WARNING: 1 computed checksum did NOT match\n" ] }, { "ename": "", "evalue": "1", "output_type": "error", "traceback": [] } ], "source": [ "md5sum -c MD5_CHECKSUM" ] } ], "metadata": { "kernelspec": { "display_name": "Bash", "language": "bash", "name": "bash" }, "language_info": { "codemirror_mode": "shell", "file_extension": ".sh", "mimetype": "text/x-sh", "name": "bash" } }, "nbformat": 4, "nbformat_minor": 2 }