{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# S02 RDBMS and SQL" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For more SQL examples in the SQLite3 dialect, seee [SQLite3 tutorial](https://www.techonthenet.com/sqlite/index.php). \n", "\n", "For a deep dive, see [SQL Queries for Mere Mortals](https://www.amazon.com/SQL-Queries-Mere-Mortals-Hands/dp/0134858336/ref=dp_ob_title_bk)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## A. More RDBMS concepts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### OLTP and OLAP\n", "\n", "- OLTP\n", " - Normalized schema\n", "- OLAP\n", " - Denormalized schema\n", " - Star\n", " - Facts\n", " - Dimensions\n", " - Snowflake \n", " - Generated from OLTP databases by ETL (Extract-Transform-Load) operations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Types of REBMS\n", " \n", "- Data lake\n", "- Data warehouse\n", "- Data mart \n", " \n", "Data marts typically use a star schema that is customized for the analysis needs. For example, the finance department in a hospital may be most interested in Facts about Claims.\n", " \n", "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Robustness and scaling\n", "\n", "- Replication\n", "- Sharding" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## B. Basic SQL queries" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data we will work with in Part B" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext sql" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Connected: None@data/faculty.db'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql sqlite:///data/faculty.db" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "
type | \n", "name | \n", "tbl_name | \n", "rootpage | \n", "sql | \n", "
---|---|---|---|---|
table | \n", "person | \n", "person | \n", "2 | \n", "CREATE TABLE person ( \t"index" BIGINT, \tperson_id BIGINT, \tfirst TEXT, \tlast TEXT, \tage BIGINT, \theight FLOAT, \tweight BIGINT, \tcountry_id TEXT, \tgender_id BIGINT ) | \n",
"
table | \n", "confidential | \n", "confidential | \n", "18 | \n", "CREATE TABLE confidential ( \t"index" BIGINT, \tperson_id BIGINT, \tsalary BIGINT ) | \n",
"
table | \n", "person_language | \n", "person_language | \n", "33 | \n", "CREATE TABLE person_language ( \t"index" BIGINT, \tperson_id BIGINT, \tlanguage_id BIGINT ) | \n",
"
table | \n", "language | \n", "language | \n", "50 | \n", "CREATE TABLE language ( \t"index" BIGINT, \tlanguage_id BIGINT, \tlanguage_name TEXT ) | \n",
"
table | \n", "gender | \n", "gender | \n", "55 | \n", "CREATE TABLE gender ( \t"index" BIGINT, \tgender_id BIGINT, \tgender TEXT ) | \n",
"
table | \n", "country | \n", "country | \n", "57 | \n", "CREATE TABLE country ( \t"index" BIGINT, \tcountry_id TEXT, \tcountry TEXT, \tnationality TEXT ) | \n",
"
language_name | \n", "
---|
PHP | \n", "
Clojure | \n", "
Dylan | \n", "
GNU Octave | \n", "
D | \n", "
language_name | \n", "
---|
ASP | \n", "
Assembly | \n", "
AutoIt | \n", "
Awk | \n", "
Bash | \n", "
first | \n", "last | \n", "age | \n", "
---|---|---|
Antoine | \n", "Beard | \n", "16 | \n", "
Augustine | \n", "Mejia | \n", "16 | \n", "
Boris | \n", "Mejia | \n", "16 | \n", "
Brain | \n", "Haney | \n", "16 | \n", "
Burl | \n", "Mayo | \n", "17 | \n", "
first | \n", "last | \n", "language_name | \n", "
---|---|---|
Aaron | \n", "Alexander | \n", "Haskell | \n", "
Aaron | \n", "Kirby | \n", "GNU Octave | \n", "
Aaron | \n", "Kirby | \n", "haXe | \n", "
Aaron | \n", "Kirby | \n", "Falcon | \n", "
Abram | \n", "Allen | \n", "TypeScript | \n", "
Abram | \n", "Boyer | \n", "Io | \n", "
Abram | \n", "Boyer | \n", "Lua | \n", "
Abram | \n", "Boyer | \n", "Falcon | \n", "
Adan | \n", "Brown | \n", "F# | \n", "
Adolph | \n", "Dalton | \n", "Dart | \n", "
first | \n", "last | \n", "language_name | \n", "
---|---|---|
Aaron | \n", "Alexander | \n", "Haskell | \n", "
Aaron | \n", "Kirby | \n", "GNU Octave | \n", "
Aaron | \n", "Kirby | \n", "haXe | \n", "
Aaron | \n", "Kirby | \n", "Falcon | \n", "
Abram | \n", "Allen | \n", "TypeScript | \n", "
Abram | \n", "Boyer | \n", "Io | \n", "
Abram | \n", "Boyer | \n", "Lua | \n", "
Abram | \n", "Boyer | \n", "Falcon | \n", "
Adan | \n", "Brown | \n", "F# | \n", "
Adolph | \n", "Dalton | \n", "Dart | \n", "
first | \n", "last | \n", "language_name | \n", "
---|---|---|
Aaron | \n", "Alexander | \n", "Haskell | \n", "
Andree | \n", "Douglas | \n", "Haskell | \n", "
Arlie | \n", "Terrell | \n", "Python | \n", "
Boyd | \n", "Blackwell | \n", "Haskell | \n", "
Buck | \n", "Howe | \n", "Haskell | \n", "
Carlton | \n", "Richard | \n", "Haskell | \n", "
Carylon | \n", "Zamora | \n", "Python | \n", "
Clarisa | \n", "Rodgers | \n", "Python | \n", "
Dinorah | \n", "O'brien | \n", "Haskell | \n", "
Dorian | \n", "Lloyd | \n", "Haskell | \n", "
first | \n", "last | \n", "language_name | \n", "
---|---|---|
Aaron | \n", "Alexander | \n", "Haskell | \n", "
Andree | \n", "Douglas | \n", "Haskell | \n", "
Arlie | \n", "Terrell | \n", "Python | \n", "
Boyd | \n", "Blackwell | \n", "Haskell | \n", "
Buck | \n", "Howe | \n", "Haskell | \n", "
Carlton | \n", "Richard | \n", "Haskell | \n", "
Carylon | \n", "Zamora | \n", "Python | \n", "
Clarisa | \n", "Rodgers | \n", "Python | \n", "
Dinorah | \n", "O'brien | \n", "Haskell | \n", "
Dorian | \n", "Lloyd | \n", "Haskell | \n", "
first | \n", "last | \n", "language_name | \n", "
---|---|---|
Aaron | \n", "Alexander | \n", "Io | \n", "
Aaron | \n", "Kirby | \n", "Io | \n", "
Abram | \n", "Allen | \n", "Io | \n", "
Abram | \n", "Boyer | \n", "Io | \n", "
Adan | \n", "Brown | \n", "Io | \n", "
Adolph | \n", "Dalton | \n", "Io | \n", "
Adrian | \n", "Blevins | \n", "Io | \n", "
Agustin | \n", "Fulton | \n", "Io | \n", "
Agustin | \n", "Mcdonald | \n", "Io | \n", "
Alberto | \n", "Dudley | \n", "Io | \n", "
count(language_name) | \n", "
---|
2297 | \n", "
language_name | \n", "n | \n", "
---|---|
AutoIt | \n", "61 | \n", "
Bash | \n", "48 | \n", "
ECMAScript | \n", "48 | \n", "
GNU Octave | \n", "49 | \n", "
JavaScript | \n", "48 | \n", "
Perl | \n", "55 | \n", "
PowerShell | \n", "50 | \n", "
Prolog | \n", "50 | \n", "
first | \n", "last | \n", "language_name | \n", "outcome | \n", "
---|---|---|---|
Aaron | \n", "Alexander | \n", "Haskell | \n", "Hire | \n", "
Aaron | \n", "Kirby | \n", "GNU Octave | \n", "FIRE | \n", "
Aaron | \n", "Kirby | \n", "haXe | \n", "Hire | \n", "
Aaron | \n", "Kirby | \n", "Falcon | \n", "FIRE | \n", "
Abram | \n", "Allen | \n", "TypeScript | \n", "FIRE | \n", "
Abram | \n", "Boyer | \n", "Io | \n", "FIRE | \n", "
Abram | \n", "Boyer | \n", "Lua | \n", "FIRE | \n", "
Abram | \n", "Boyer | \n", "Falcon | \n", "FIRE | \n", "
Adan | \n", "Brown | \n", "F# | \n", "FIRE | \n", "
Adolph | \n", "Dalton | \n", "Dart | \n", "FIRE | \n", "
\n", " | person | \n", "time | \n", "bsl | \n", "
---|---|---|---|
8 | \n", "A | \n", "0 | \n", "115 | \n", "
5 | \n", "A | \n", "2 | \n", "237 | \n", "
2 | \n", "A | \n", "3 | \n", "129 | \n", "
7 | \n", "B | \n", "5 | \n", "86 | \n", "
3 | \n", "B | \n", "6 | \n", "396 | \n", "
4 | \n", "C | \n", "1 | \n", "107 | \n", "
1 | \n", "C | \n", "9 | \n", "347 | \n", "
6 | \n", "D | \n", "5 | \n", "89 | \n", "
9 | \n", "D | \n", "5 | \n", "221 | \n", "
0 | \n", "D | \n", "7 | \n", "98 | \n", "
person | \n", "time | \n", "bsl | \n", "row_number | \n", "
---|---|---|---|
D | \n", "7 | \n", "98 | \n", "1 | \n", "
C | \n", "9 | \n", "347 | \n", "2 | \n", "
A | \n", "3 | \n", "129 | \n", "3 | \n", "
B | \n", "6 | \n", "396 | \n", "4 | \n", "
C | \n", "1 | \n", "107 | \n", "5 | \n", "
A | \n", "2 | \n", "237 | \n", "6 | \n", "
D | \n", "5 | \n", "89 | \n", "7 | \n", "
B | \n", "5 | \n", "86 | \n", "8 | \n", "
A | \n", "0 | \n", "115 | \n", "9 | \n", "
D | \n", "5 | \n", "221 | \n", "10 | \n", "
person | \n", "time | \n", "bsl | \n", "row_number | \n", "
---|---|---|---|
A | \n", "0 | \n", "115 | \n", "1 | \n", "
A | \n", "2 | \n", "237 | \n", "2 | \n", "
A | \n", "3 | \n", "129 | \n", "3 | \n", "
B | \n", "5 | \n", "86 | \n", "4 | \n", "
B | \n", "6 | \n", "396 | \n", "5 | \n", "
C | \n", "1 | \n", "107 | \n", "6 | \n", "
C | \n", "9 | \n", "347 | \n", "7 | \n", "
D | \n", "5 | \n", "221 | \n", "8 | \n", "
D | \n", "5 | \n", "89 | \n", "9 | \n", "
D | \n", "7 | \n", "98 | \n", "10 | \n", "
person | \n", "time | \n", "bsl | \n", "row_number | \n", "
---|---|---|---|
A | \n", "0 | \n", "115 | \n", "1 | \n", "
A | \n", "2 | \n", "237 | \n", "2 | \n", "
A | \n", "3 | \n", "129 | \n", "3 | \n", "
B | \n", "5 | \n", "86 | \n", "1 | \n", "
B | \n", "6 | \n", "396 | \n", "2 | \n", "
C | \n", "1 | \n", "107 | \n", "1 | \n", "
C | \n", "9 | \n", "347 | \n", "2 | \n", "
D | \n", "5 | \n", "221 | \n", "1 | \n", "
D | \n", "5 | \n", "89 | \n", "2 | \n", "
D | \n", "7 | \n", "98 | \n", "3 | \n", "
person | \n", "time | \n", "bsl | \n", "string_agg | \n", "
---|---|---|---|
A | \n", "0 | \n", "115 | \n", "115 | \n", "
A | \n", "2 | \n", "237 | \n", "115, 237 | \n", "
A | \n", "3 | \n", "129 | \n", "115, 237, 129 | \n", "
B | \n", "5 | \n", "86 | \n", "86 | \n", "
B | \n", "6 | \n", "396 | \n", "86, 396 | \n", "
C | \n", "1 | \n", "107 | \n", "107 | \n", "
C | \n", "9 | \n", "347 | \n", "107, 347 | \n", "
D | \n", "5 | \n", "221 | \n", "221, 89 | \n", "
D | \n", "5 | \n", "89 | \n", "221, 89 | \n", "
D | \n", "7 | \n", "98 | \n", "221, 89, 98 | \n", "
person | \n", "time | \n", "bsl | \n", "string_agg | \n", "
---|---|---|---|
A | \n", "0 | \n", "115 | \n", "115, 237 | \n", "
A | \n", "2 | \n", "237 | \n", "115, 237, 129 | \n", "
A | \n", "3 | \n", "129 | \n", "237, 129 | \n", "
B | \n", "5 | \n", "86 | \n", "86, 396 | \n", "
B | \n", "6 | \n", "396 | \n", "86, 396 | \n", "
C | \n", "1 | \n", "107 | \n", "107, 347 | \n", "
C | \n", "9 | \n", "347 | \n", "107, 347 | \n", "
D | \n", "5 | \n", "221 | \n", "221, 89 | \n", "
D | \n", "5 | \n", "89 | \n", "221, 89, 98 | \n", "
D | \n", "7 | \n", "98 | \n", "89, 98 | \n", "
person | \n", "time | \n", "bsl | \n", "row_number | \n", "rank | \n", "dense_rank | \n", "percent_rank | \n", "cume_dist | \n", "
---|---|---|---|---|---|---|---|
A | \n", "0 | \n", "115 | \n", "1 | \n", "1 | \n", "1 | \n", "0.0 | \n", "0.3 | \n", "
A | \n", "2 | \n", "237 | \n", "2 | \n", "1 | \n", "1 | \n", "0.0 | \n", "0.3 | \n", "
A | \n", "3 | \n", "129 | \n", "3 | \n", "1 | \n", "1 | \n", "0.0 | \n", "0.3 | \n", "
B | \n", "6 | \n", "396 | \n", "4 | \n", "4 | \n", "2 | \n", "0.333333333333333 | \n", "0.5 | \n", "
B | \n", "5 | \n", "86 | \n", "5 | \n", "4 | \n", "2 | \n", "0.333333333333333 | \n", "0.5 | \n", "
C | \n", "1 | \n", "107 | \n", "6 | \n", "6 | \n", "3 | \n", "0.555555555555556 | \n", "0.7 | \n", "
C | \n", "9 | \n", "347 | \n", "7 | \n", "6 | \n", "3 | \n", "0.555555555555556 | \n", "0.7 | \n", "
D | \n", "5 | \n", "221 | \n", "8 | \n", "8 | \n", "4 | \n", "0.777777777777778 | \n", "1.0 | \n", "
D | \n", "5 | \n", "89 | \n", "9 | \n", "8 | \n", "4 | \n", "0.777777777777778 | \n", "1.0 | \n", "
D | \n", "7 | \n", "98 | \n", "10 | \n", "8 | \n", "4 | \n", "0.777777777777778 | \n", "1.0 | \n", "
person | \n", "time | \n", "bsl | \n", "bsl_sum | \n", "bsl_avg | \n", "bsl_min | \n", "bsl_max | \n", "bsl_start | \n", "bsl_end | \n", "
---|---|---|---|---|---|---|---|---|
A | \n", "0 | \n", "115 | \n", "115 | \n", "115.0000000000000000 | \n", "115 | \n", "115 | \n", "115 | \n", "115 | \n", "
A | \n", "2 | \n", "237 | \n", "352 | \n", "176.0000000000000000 | \n", "115 | \n", "237 | \n", "115 | \n", "237 | \n", "
A | \n", "3 | \n", "129 | \n", "481 | \n", "160.3333333333333333 | \n", "115 | \n", "237 | \n", "115 | \n", "129 | \n", "
B | \n", "5 | \n", "86 | \n", "86 | \n", "86.0000000000000000 | \n", "86 | \n", "86 | \n", "86 | \n", "86 | \n", "
B | \n", "6 | \n", "396 | \n", "482 | \n", "241.0000000000000000 | \n", "86 | \n", "396 | \n", "86 | \n", "396 | \n", "
C | \n", "1 | \n", "107 | \n", "107 | \n", "107.0000000000000000 | \n", "107 | \n", "107 | \n", "107 | \n", "107 | \n", "
C | \n", "9 | \n", "347 | \n", "454 | \n", "227.0000000000000000 | \n", "107 | \n", "347 | \n", "107 | \n", "347 | \n", "
D | \n", "5 | \n", "221 | \n", "310 | \n", "155.0000000000000000 | \n", "89 | \n", "221 | \n", "221 | \n", "89 | \n", "
D | \n", "5 | \n", "89 | \n", "310 | \n", "155.0000000000000000 | \n", "89 | \n", "221 | \n", "221 | \n", "89 | \n", "
D | \n", "7 | \n", "98 | \n", "408 | \n", "136.0000000000000000 | \n", "89 | \n", "221 | \n", "221 | \n", "98 | \n", "
person | \n", "time | \n", "bsl | \n", "vals | \n", "bsl_sum | \n", "bsl_avg | \n", "
---|---|---|---|---|---|
A | \n", "0 | \n", "115 | \n", "115, 237 | \n", "352 | \n", "176.0000000000000000 | \n", "
A | \n", "2 | \n", "237 | \n", "115, 237, 129 | \n", "481 | \n", "160.3333333333333333 | \n", "
A | \n", "3 | \n", "129 | \n", "237, 129 | \n", "366 | \n", "183.0000000000000000 | \n", "
B | \n", "5 | \n", "86 | \n", "86, 396 | \n", "482 | \n", "241.0000000000000000 | \n", "
B | \n", "6 | \n", "396 | \n", "86, 396 | \n", "482 | \n", "241.0000000000000000 | \n", "
C | \n", "1 | \n", "107 | \n", "107, 347 | \n", "454 | \n", "227.0000000000000000 | \n", "
C | \n", "9 | \n", "347 | \n", "107, 347 | \n", "454 | \n", "227.0000000000000000 | \n", "
D | \n", "5 | \n", "221 | \n", "221, 89 | \n", "310 | \n", "155.0000000000000000 | \n", "
D | \n", "5 | \n", "89 | \n", "221, 89, 98 | \n", "408 | \n", "136.0000000000000000 | \n", "
D | \n", "7 | \n", "98 | \n", "89, 98 | \n", "187 | \n", "93.5000000000000000 | \n", "
person | \n", "time | \n", "bsl | \n", "vals | \n", "bsl_sum | \n", "bsl_avg | \n", "
---|---|---|---|---|---|
A | \n", "0 | \n", "115 | \n", "115 | \n", "115 | \n", "115.0000000000000000 | \n", "
A | \n", "2 | \n", "237 | \n", "115, 237 | \n", "352 | \n", "176.0000000000000000 | \n", "
A | \n", "3 | \n", "129 | \n", "115, 237, 129 | \n", "481 | \n", "160.3333333333333333 | \n", "
B | \n", "5 | \n", "86 | \n", "115, 237, 129, 396, 86 | \n", "963 | \n", "192.6000000000000000 | \n", "
B | \n", "6 | \n", "396 | \n", "115, 237, 129, 396 | \n", "877 | \n", "219.2500000000000000 | \n", "
C | \n", "1 | \n", "107 | \n", "115, 237, 129, 396, 86, 107 | \n", "1070 | \n", "178.3333333333333333 | \n", "
C | \n", "9 | \n", "347 | \n", "115, 237, 129, 396, 86, 107, 347 | \n", "1417 | \n", "202.4285714285714286 | \n", "
D | \n", "5 | \n", "221 | \n", "115, 237, 129, 396, 86, 107, 347, 221 | \n", "1638 | \n", "204.7500000000000000 | \n", "
D | \n", "5 | \n", "89 | \n", "115, 237, 129, 396, 86, 107, 347, 221, 89 | \n", "1727 | \n", "191.8888888888888889 | \n", "
D | \n", "7 | \n", "98 | \n", "115, 237, 129, 396, 86, 107, 347, 221, 89, 98 | \n", "1825 | \n", "182.5000000000000000 | \n", "
person | \n", "time | \n", "bsl | \n", "vals | \n", "bsl_sum | \n", "bsl_avg | \n", "
---|---|---|---|---|---|
A | \n", "0 | \n", "115 | \n", "115, 237, 129 | \n", "481 | \n", "160.3333333333333333 | \n", "
A | \n", "2 | \n", "237 | \n", "115, 237, 129 | \n", "481 | \n", "160.3333333333333333 | \n", "
A | \n", "3 | \n", "129 | \n", "115, 237, 129 | \n", "481 | \n", "160.3333333333333333 | \n", "
B | \n", "5 | \n", "86 | \n", "115, 237, 129, 396, 86 | \n", "963 | \n", "192.6000000000000000 | \n", "
B | \n", "6 | \n", "396 | \n", "115, 237, 129, 396, 86 | \n", "963 | \n", "192.6000000000000000 | \n", "
C | \n", "1 | \n", "107 | \n", "115, 237, 129, 396, 86, 107, 347 | \n", "1417 | \n", "202.4285714285714286 | \n", "
C | \n", "9 | \n", "347 | \n", "115, 237, 129, 396, 86, 107, 347 | \n", "1417 | \n", "202.4285714285714286 | \n", "
D | \n", "5 | \n", "221 | \n", "115, 237, 129, 396, 86, 107, 347, 221, 89, 98 | \n", "1825 | \n", "182.5000000000000000 | \n", "
D | \n", "5 | \n", "89 | \n", "115, 237, 129, 396, 86, 107, 347, 221, 89, 98 | \n", "1825 | \n", "182.5000000000000000 | \n", "
D | \n", "7 | \n", "98 | \n", "115, 237, 129, 396, 86, 107, 347, 221, 89, 98 | \n", "1825 | \n", "182.5000000000000000 | \n", "