{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Archival and Management (Part 1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Relational databases via `sqlite3` package"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sqlite3\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"con = sqlite3.connect('Chinook_Sqlite.sqlite')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Creating a cursor\n",
"\n",
"A cursor is used to execute commands. Actually, you can also execute commands directly from the connection object in `sqlite3`, but this is not true for all DB API drivers."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"c = con.cursor()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Find tables in a database\n",
"\n",
"Metadata is stored in a special table `sql_master`."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Album',),\n",
" ('Artist',),\n",
" ('Customer',),\n",
" ('Employee',),\n",
" ('Genre',),\n",
" ('Invoice',),\n",
" ('InvoiceLine',),\n",
" ('MediaType',),\n",
" ('Playlist',),\n",
" ('PlaylistTrack',),\n",
" ('Track',)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT name \n",
"FROM sqlite_master \n",
"WHERE type='table';\n",
"\"\"\"\n",
"c.execute(q)\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### View structure of a particular table"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Method 1: Using `table_info` pragma"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0, 'InvoiceId', 'INTEGER', 1, None, 1),\n",
" (1, 'CustomerId', 'INTEGER', 1, None, 0),\n",
" (2, 'InvoiceDate', 'DATETIME', 1, None, 0),\n",
" (3, 'BillingAddress', 'NVARCHAR(70)', 0, None, 0),\n",
" (4, 'BillingCity', 'NVARCHAR(40)', 0, None, 0),\n",
" (5, 'BillingState', 'NVARCHAR(40)', 0, None, 0),\n",
" (6, 'BillingCountry', 'NVARCHAR(40)', 0, None, 0),\n",
" (7, 'BillingPostalCode', 'NVARCHAR(10)', 0, None, 0),\n",
" (8, 'Total', 'NUMERIC(10,2)', 1, None, 0)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c.execute(\"PRAGMA table_info(Invoice)\")\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Method 2: Looking at SQL definition of table"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CREATE TABLE [Album]\n",
"(\n",
" [AlbumId] INTEGER NOT NULL,\n",
" [Title] NVARCHAR(160) NOT NULL,\n",
" [ArtistId] INTEGER NOT NULL,\n",
" CONSTRAINT [PK_Album] PRIMARY KEY ([AlbumId]),\n",
" FOREIGN KEY ([ArtistId]) REFERENCES [Artist] ([ArtistId]) \n",
"\t\tON DELETE NO ACTION ON UPDATE NO ACTION\n",
")\n"
]
}
],
"source": [
"q = \"\"\"\n",
"SELECT sql \n",
"FROM sqlite_master \n",
"WHERE name='Album';\n",
"\"\"\"\n",
"c.execute(q)\n",
"print(c.fetchone()[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Method 3: Fetch a row and see description (only column names)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(('InvoiceId', None, None, None, None, None, None),\n",
" ('CustomerId', None, None, None, None, None, None),\n",
" ('InvoiceDate', None, None, None, None, None, None),\n",
" ('BillingAddress', None, None, None, None, None, None),\n",
" ('BillingCity', None, None, None, None, None, None),\n",
" ('BillingState', None, None, None, None, None, None),\n",
" ('BillingCountry', None, None, None, None, None, None),\n",
" ('BillingPostalCode', None, None, None, None, None, None),\n",
" ('Total', None, None, None, None, None, None))"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c.execute(\"Select * from Invoice;\")\n",
"c.fetchone()\n",
"c.description"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### View all columns of a table"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 'For Those About To Rock We Salute You', 1),\n",
" (2, 'Balls to the Wall', 2),\n",
" (3, 'Restless and Wild', 2),\n",
" (4, 'Let There Be Rock', 1),\n",
" (5, 'Big Ones', 3)]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT * \n",
"FROM Album \n",
"LIMIT 5;\n",
"\"\"\"\n",
"c.execute(q)\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Select specific columns"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 'For Those About To Rock We Salute You'),\n",
" (2, 'Balls to the Wall'),\n",
" (3, 'Restless and Wild'),\n",
" (4, 'Let There Be Rock'),\n",
" (5, 'Big Ones')]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT AlbumId, Title\n",
"FROM Album \n",
"LIMIT 5;\n",
"\"\"\"\n",
"c.execute(q)\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Filter rows by some criteria"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 'For Those About To Rock We Salute You', 1),\n",
" (4, 'Let There Be Rock', 1),\n",
" (59, 'Deep Purple In Rock', 58),\n",
" (108, 'Rock In Rio [CD1]', 90),\n",
" (109, 'Rock In Rio [CD2]', 90)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT *\n",
"FROM Album \n",
"WHERE Title LIKE '%Rock%'\n",
"LIMIT 5;\n",
"\"\"\"\n",
"c.execute(q)\n",
"c.fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(5, 'Big Ones', 3),\n",
" (7, 'Facelift', 5),\n",
" (28, 'Na Pista', 20),\n",
" (60, 'Fireball', 58),\n",
" (68, 'Outbreak', 79)]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT *\n",
"FROM Album \n",
"WHERE length(Title) < 10\n",
"LIMIT 5;\n",
"\"\"\"\n",
"c.execute(q)\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sorting "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(308,\n",
" \"Tchaikovsky: 1812 Festival Overture, Op.49, Capriccio Italien & Beethoven: Wellington's Victory\",\n",
" 243),\n",
" (335,\n",
" 'J.S. Bach: Chaconne, Suite in E Minor, Partita in E Major & Prelude, Fugue and Allegro',\n",
" 265),\n",
" (294,\n",
" \"Great Performances - Barber's Adagio and Other Romantic Favorites for Strings\",\n",
" 228),\n",
" (213,\n",
" 'Pure Cult: The Best Of The Cult (For Rockers, Ravers, Lovers & Sinners) [UK]',\n",
" 139),\n",
" (257,\n",
" '20th Century Masters - The Millennium Collection: The Best of Scorpions',\n",
" 179)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT *\n",
"FROM Album \n",
"ORDER BY length(Title) DESC\n",
"LIMIT 5;\n",
"\"\"\"\n",
"c.execute(q)\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Summarization"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(412, 5.651941747572825, 0.99, 25.86, 2328.600000000004)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT COUNT(*), AVG(Total), MIN(Total), MAX(Total), SUM(Total)\n",
"FROM Invoice ;\n",
"\"\"\"\n",
"c.execute(q)\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Grouping and summarization"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Argentina', 'Buenos Aires', 5.3742857142857146),\n",
" ('Australia', 'Sidney', 5.3742857142857146),\n",
" ('Austria', 'Vienne', 6.088571428571428),\n",
" ('Belgium', 'Brussels', 5.374285714285714),\n",
" ('Brazil', 'Brasília', 5.3742857142857146)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT BillingCountry, BillingCity, AVG(Total)\n",
"FROM Invoice \n",
"GROUP BY BillingCountry, BillingCity\n",
"LIMIT 5;\n",
"\"\"\"\n",
"c.execute(q)\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Summarization with conditions on summary"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Chile', 'Santiago', 6.659999999999999),\n",
" ('Czech Republic', 'Prague', 6.445714285714287)]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT BillingCountry, BillingCity, AVG(Total)\n",
"FROM Invoice \n",
"GROUP BY BillingCountry, BillingCity\n",
"HAVING BillingCountry like 'C%' AND AVG(Total) > 6\n",
"LIMIT 5;\n",
"\"\"\"\n",
"c.execute(q)\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Joining tables"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0, 'ArtistId', 'INTEGER', 1, None, 1),\n",
" (1, 'Name', 'NVARCHAR(120)', 0, None, 0)]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c.execute(\"PRAGMA table_info(Artist)\")\n",
"c.fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0, 'AlbumId', 'INTEGER', 1, None, 1),\n",
" (1, 'Title', 'NVARCHAR(160)', 1, None, 0),\n",
" (2, 'ArtistId', 'INTEGER', 1, None, 0)]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c.execute(\"PRAGMA table_info(Album)\")\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### No join gives the Cartesian prodcut"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('For Those About To Rock We Salute You', 'AC/DC'),\n",
" ('For Those About To Rock We Salute You', 'Accept'),\n",
" ('For Those About To Rock We Salute You', 'Aerosmith'),\n",
" ('For Those About To Rock We Salute You', 'Alanis Morissette'),\n",
" ('For Those About To Rock We Salute You', 'Alice In Chains')]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT Album.Title, Artist.Name\n",
"FROM Album, Artist\n",
"LIMIT 5;\n",
"\"\"\"\n",
"c.execute(q)\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Inner Join"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('For Those About To Rock We Salute You', 'AC/DC'),\n",
" ('Balls to the Wall', 'Accept'),\n",
" ('Restless and Wild', 'Accept'),\n",
" ('Let There Be Rock', 'AC/DC'),\n",
" ('Big Ones', 'Aerosmith')]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT Album.Title, Artist.Name\n",
"FROM Album\n",
"INNER JOIN Artist ON Album.ArtistId = Artist.ArtistId\n",
"LIMIT 5;\n",
"\"\"\"\n",
"c.execute(q)\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Left Join"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('For Those About To Rock We Salute You', 'AC/DC'),\n",
" ('Balls to the Wall', 'Accept'),\n",
" ('Restless and Wild', 'Accept'),\n",
" ('Let There Be Rock', 'AC/DC'),\n",
" ('Big Ones', 'Aerosmith')]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT Album.Title, Artist.Name\n",
"FROM Album\n",
"LEFT JOIN Artist ON Album.ArtistId = Artist.ArtistId\n",
"LIMIT 5;\n",
"\"\"\"\n",
"c.execute(q)\n",
"c.fetchall()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using `sqlite3` and `pandas`"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" BillingCountry | \n",
" BillingCity | \n",
" Mean | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Argentina | \n",
" Buenos Aires | \n",
" 5.374286 | \n",
"
\n",
" \n",
" 1 | \n",
" Australia | \n",
" Sidney | \n",
" 5.374286 | \n",
"
\n",
" \n",
" 2 | \n",
" Austria | \n",
" Vienne | \n",
" 6.088571 | \n",
"
\n",
" \n",
" 3 | \n",
" Belgium | \n",
" Brussels | \n",
" 5.374286 | \n",
"
\n",
" \n",
" 4 | \n",
" Brazil | \n",
" Brasília | \n",
" 5.374286 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" BillingCountry BillingCity Mean\n",
"0 Argentina Buenos Aires 5.374286\n",
"1 Australia Sidney 5.374286\n",
"2 Austria Vienne 6.088571\n",
"3 Belgium Brussels 5.374286\n",
"4 Brazil Brasília 5.374286"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q = \"\"\"\n",
"SELECT BillingCountry, BillingCity, AVG(Total) AS Mean\n",
"FROM Invoice \n",
"GROUP BY BillingCountry, BillingCity\n",
"LIMIT 5;\n",
"\"\"\"\n",
"\n",
"df = pd.read_sql(q, con)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Free resources when no longer needed"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"c.close()\n",
"con.close()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Creating database tables\n",
"\n",
"Concepts introduced\n",
"\n",
"- In memory database\n",
"- Primary and foreign keys\n",
"- Constraints\n",
"- Modeling many:many relationships with a linker table"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SQLite3 databases can reside in memory"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"con = sqlite3.connect(\":memory:\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Creating a database"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"s1 = \"\"\"\n",
"CREATE TABLE IF NOT EXISTS student (\n",
"student_id integer PRIMARY KEY,\n",
"first varchar NOT NULL,\n",
"last varchar NOT NULL,\n",
"email VARCHAR NOT NULL UNIQUE\n",
");\n",
"\"\"\"\n",
"\n",
"s2 = \"\"\"\n",
"CREATE TABLE IF NOT EXISTS subject (\n",
"subject_id integer PRIMARY KEY,\n",
"code varchar NOT NULL,\n",
"name varchar NOT NULL\n",
");\n",
"\"\"\"\n",
"\n",
"s3 = \"\"\"\n",
"CREATE TABLE IF NOT EXISTS grade (\n",
"student_id integer,\n",
"subject_id integer,\n",
"score float,\n",
"FOREIGN KEY (student_id) REFERENCES student (student_id),\n",
"FOREIGN KEY (subject_id) REFERENCES subject (subject_id),\n",
"CHECK (score BETWEEN 0 and 100)\n",
");\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(s1)\n",
"con.execute(s2)\n",
"con.execute(s3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Populate the database"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with con:\n",
" con.execute(\"insert into student(first, last, email) values (?, ?, ?)\", \n",
" (\"John\", \"Smith\", \"john.smith@duke.edu\"))\n",
" con.execute(\"insert into student(first, last, email) values (?, ?, ?)\", \n",
" (\"Jane\", \"Smith\", \"jane.smith@duke.edu\"))\n",
" con.execute(\"insert into student(first, last, email) values (?, ?, ?)\", \n",
" (\"Joe\", \"Smith\", \"joe.smith@duke.edu\"))\n",
" \n",
" con.execute(\"insert into subject(code, name) values (?, ?)\", \n",
" (\"BIOS 703\", \"Biology for unsuspecting statisticians\"))\n",
" con.execute(\"insert into subject(code, name) values (?, ?)\", \n",
" (\"BIOS 821\", \"Data science for mere mortals\"))\n",
" \n",
" con.execute(\"insert into grade(student_id, subject_id, score) values (?, ?, ?)\",\n",
" (1, 1, np.random.randint(0, 101)))\n",
" con.execute(\"insert into grade(student_id, subject_id, score) values (?, ?, ?)\",\n",
" (1, 2, np.random.randint(0, 101)))\n",
" con.execute(\"insert into grade(student_id, subject_id, score) values (?, ?, ?)\",\n",
" (2, 1, np.random.randint(0, 101)))\n",
" con.execute(\"insert into grade(student_id, subject_id, score) values (?, ?, ?)\",\n",
" (2, 2, np.random.randint(0, 101)))\n",
" con.execute(\"insert into grade(student_id, subject_id, score) values (?, ?, ?)\",\n",
" (3, 1, np.random.randint(0, 101)))\n",
" con.execute(\"insert into grade(student_id, subject_id, score) values (?, ?, ?)\",\n",
" (3, 2, np.random.randint(0, 101)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Query the database"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1, 'John', 'Smith', 'john.smith@duke.edu')\n",
"(2, 'Jane', 'Smith', 'jane.smith@duke.edu')\n",
"(3, 'Joe', 'Smith', 'joe.smith@duke.edu')\n"
]
}
],
"source": [
"for row in con.execute(\"select * from student;\"):\n",
" print(row)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1, 'BIOS 703', 'Biology for unsuspecting statisticians')\n",
"(2, 'BIOS 821', 'Data science for mere mortals')\n"
]
}
],
"source": [
"for row in con.execute(\"select * from subject;\"):\n",
" print(row)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1, 1, 59.0)\n",
"(1, 2, 51.0)\n",
"(2, 1, 21.0)\n",
"(2, 2, 24.0)\n",
"(3, 1, 49.0)\n",
"(3, 2, 29.0)\n"
]
}
],
"source": [
"for row in con.execute(\"select * from grade;\"):\n",
" print(row)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Joins involving a linker table"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"q = \"\"\"\n",
"SELECT student.first, student.last, student.email, subject.code, subject.name, grade.score\n",
"FROM student\n",
"JOIN grade ON grade.student_id = student.student_id\n",
"JOIN subject ON grade.subject_id = subject.subject_id;\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('John', 'Smith', 'john.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 59.0)\n",
"('John', 'Smith', 'john.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 51.0)\n",
"('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 21.0)\n",
"('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 24.0)\n",
"('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 49.0)\n",
"('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 29.0)\n"
]
}
],
"source": [
"for row in con.execute(q):\n",
" print(row)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Modify the database\n",
"\n",
"Joe showed me a cute cat picture. I'm going to give him 100 for all subjects."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(3,)\n"
]
}
],
"source": [
"for row in con.execute('SELECT student_id from student where first = \"Joe\"'):\n",
" print(row)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"s = \"\"\"\n",
"UPDATE grade\n",
"SET score = 100\n",
"WHERE grade.student_id =\n",
"(\n",
" SELECT student_id from student where first = \"Joe\"\n",
")\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(s)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Check that update worked"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"q = \"\"\"\n",
"SELECT student.first, student.last, student.email, subject.code, subject.name, grade.score\n",
"FROM student\n",
"JOIN grade ON grade.student_id = student.student_id\n",
"JOIN subject ON grade.subject_id = subject.subject_id\n",
"WHERE student.first = \"Joe\"\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 100.0)\n",
"('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 100.0)\n"
]
}
],
"source": [
"for row in con.execute(q):\n",
" print(row)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Saving in-memory database to disk"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with open('dump.sql', 'w') as f:\n",
" for line in con.iterdump():\n",
" f.write('%s\\n' % line)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"BEGIN TRANSACTION;\r\n",
"CREATE TABLE grade (\r\n",
"student_id integer,\r\n",
"subject_id integer,\r\n",
"score float,\r\n",
"FOREIGN KEY (student_id) REFERENCES student (student_id),\r\n",
"FOREIGN KEY (subject_id) REFERENCES subject (subject_id),\r\n",
"CHECK (score BETWEEN 0 and 100)\r\n",
");\r\n",
"INSERT INTO \"grade\" VALUES(1,1,59.0);\r\n",
"INSERT INTO \"grade\" VALUES(1,2,51.0);\r\n",
"INSERT INTO \"grade\" VALUES(2,1,21.0);\r\n",
"INSERT INTO \"grade\" VALUES(2,2,24.0);\r\n",
"INSERT INTO \"grade\" VALUES(3,1,100.0);\r\n",
"INSERT INTO \"grade\" VALUES(3,2,100.0);\r\n",
"CREATE TABLE student (\r\n",
"student_id integer PRIMARY KEY,\r\n",
"first varchar NOT NULL,\r\n",
"last varchar NOT NULL,\r\n",
"email VARCHAR NOT NULL UNIQUE\r\n",
");\r\n",
"INSERT INTO \"student\" VALUES(1,'John','Smith','john.smith@duke.edu');\r\n",
"INSERT INTO \"student\" VALUES(2,'Jane','Smith','jane.smith@duke.edu');\r\n",
"INSERT INTO \"student\" VALUES(3,'Joe','Smith','joe.smith@duke.edu');\r\n",
"CREATE TABLE subject (\r\n",
"subject_id integer PRIMARY KEY,\r\n",
"code varchar NOT NULL,\r\n",
"name varchar NOT NULL\r\n",
");\r\n",
"INSERT INTO \"subject\" VALUES(1,'BIOS 703','Biology for unsuspecting statisticians');\r\n",
"INSERT INTO \"subject\" VALUES(2,'BIOS 821','Data science for mere mortals');\r\n",
"COMMIT;\r\n"
]
}
],
"source": [
"! cat dump.sql"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Convert to SQLite3 database"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error: near line 2: table grade already exists\r\n",
"Error: near line 16: table student already exists\r\n",
"Error: near line 22: UNIQUE constraint failed: student.student_id\r\n",
"Error: near line 23: UNIQUE constraint failed: student.student_id\r\n",
"Error: near line 24: UNIQUE constraint failed: student.student_id\r\n",
"Error: near line 25: table subject already exists\r\n",
"Error: near line 30: UNIQUE constraint failed: subject.subject_id\r\n",
"Error: near line 31: UNIQUE constraint failed: subject.subject_id\r\n"
]
}
],
"source": [
"! cat dump.sql | sqlite3 college.db"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"con1 = sqlite3.connect('college.db')"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"q = \"\"\"\n",
"SELECT student.first, student.last, student.email, subject.code, subject.name, grade.score\n",
"FROM student\n",
"JOIN grade ON grade.student_id = student.student_id\n",
"JOIN subject ON grade.subject_id = subject.subject_id;\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('John', 'Smith', 'john.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 29.0)\n",
"('John', 'Smith', 'john.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 20.0)\n",
"('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 49.0)\n",
"('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 96.0)\n",
"('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 100.0)\n",
"('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 100.0)\n",
"('John', 'Smith', 'john.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 59.0)\n",
"('John', 'Smith', 'john.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 51.0)\n",
"('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 21.0)\n",
"('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 24.0)\n",
"('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 100.0)\n",
"('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 100.0)\n"
]
}
],
"source": [
"for row in con1.execute(q):\n",
" print(row)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Close connections"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"con.close()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"con1.close()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Using SQL magic functions\n",
"\n",
"We will use the [ipython-sql](https://github.com/catherinedevlin/ipython-sql) notebook extension for convenience. This will only work in notebooks and IPython scripts with the .ipy extension."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"with warnings.catch_warnings():\n",
" warnings.simplefilter('ignore')\n",
" %load_ext sql"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Configuring the SqlMagic extension"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SqlMagic options\n",
"--------------\n",
"SqlMagic.autolimit=\n",
" Current: 0\n",
" Automatically limit the size of the returned result sets\n",
"SqlMagic.autopandas=\n",
" Current: False\n",
" Return Pandas DataFrames instead of regular result sets\n",
"SqlMagic.column_local_vars=\n",
" Current: False\n",
" Return data into local variables from column names\n",
"SqlMagic.displaylimit=\n",
" Current: 0\n",
" Automatically limit the number of rows displayed (full result set is still\n",
" stored)\n",
"SqlMagic.dsn_filename=\n",
" Current: 'odbc.ini'\n",
" Path to DSN file. When the first argument is of the form [section], a\n",
" sqlalchemy connection string is formed from the matching section in the DSN\n",
" file.\n",
"SqlMagic.feedback=\n",
" Current: True\n",
" Print number of rows affected by DML\n",
"SqlMagic.short_errors=\n",
" Current: True\n",
" Don't display the full traceback on SQL Programming Error\n",
"SqlMagic.style=\n",
" Current: 'DEFAULT'\n",
" Set the table printing style to any of prettytable's defined styles\n",
" (currently DEFAULT, MSWORD_FRIENDLY, PLAIN_COLUMNS, RANDOM)\n"
]
}
],
"source": [
"%config SqlMagic"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%config SqlMagic.displaylimit=10"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Connect to SQLite3 database\n",
"\n",
"See [SQLAlchemy connection strings](http://docs.sqlalchemy.org/en/latest/core/engines.html) for how to connect to other databases such as Oracle, MySQL or PostgreSQL."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Connected: None@Chinook_Sqlite.sqlite'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql sqlite:///Chinook_Sqlite.sqlite"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" AlbumId | \n",
" Title | \n",
" ArtistId | \n",
"
\n",
" \n",
" 1 | \n",
" For Those About To Rock We Salute You | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" Balls to the Wall | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" Restless and Wild | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" Let There Be Rock | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" Big Ones | \n",
" 3 | \n",
"
\n",
"
"
],
"text/plain": [
"[(1, 'For Those About To Rock We Salute You', 1),\n",
" (2, 'Balls to the Wall', 2),\n",
" (3, 'Restless and Wild', 2),\n",
" (4, 'Let There Be Rock', 1),\n",
" (5, 'Big Ones', 3)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql SELECT * from Album LIMIT 5;"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" ArtistId | \n",
" Name | \n",
"
\n",
" \n",
" 1 | \n",
" AC/DC | \n",
"
\n",
" \n",
" 2 | \n",
" Accept | \n",
"
\n",
" \n",
" 3 | \n",
" Aerosmith | \n",
"
\n",
" \n",
" 4 | \n",
" Alanis Morissette | \n",
"
\n",
" \n",
" 5 | \n",
" Alice In Chains | \n",
"
\n",
"
"
],
"text/plain": [
"[(1, 'AC/DC'),\n",
" (2, 'Accept'),\n",
" (3, 'Aerosmith'),\n",
" (4, 'Alanis Morissette'),\n",
" (5, 'Alice In Chains')]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql SELECT * from Artist LIMIT 5;"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" Name | \n",
" Title | \n",
"
\n",
" \n",
" AC/DC | \n",
" For Those About To Rock We Salute You | \n",
"
\n",
" \n",
" AC/DC | \n",
" Let There Be Rock | \n",
"
\n",
" \n",
" Aaron Copland & London Symphony Orchestra | \n",
" A Copland Celebration, Vol. I | \n",
"
\n",
" \n",
" Aaron Goldberg | \n",
" Worlds | \n",
"
\n",
" \n",
" Academy of St. Martin in the Fields & Sir Neville Marriner | \n",
" The World of Classical Favourites | \n",
"
\n",
"
"
],
"text/plain": [
"[('AC/DC', 'For Those About To Rock We Salute You'),\n",
" ('AC/DC', 'Let There Be Rock'),\n",
" ('Aaron Copland & London Symphony Orchestra', 'A Copland Celebration, Vol. I'),\n",
" ('Aaron Goldberg', 'Worlds'),\n",
" ('Academy of St. Martin in the Fields & Sir Neville Marriner', 'The World of Classical Favourites')]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT Artist.Name, Album.Title\n",
"FROM Album\n",
"INNER JOIN Artist on Album.ArtistId = Artist.ArtistId\n",
"ORDER BY Artist.Name ASC\n",
"LIMIT 5;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### You can assign results of queries to Python names\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
}
],
"source": [
"result = %sql SELECT * from Album;"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"sql.run.ResultSet"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(result)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(3, 'Restless and Wild', 2), (4, 'Let There Be Rock', 1)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result[2:4]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### You can use Python variables in your queries\n",
"\n",
"Use `:varname` where you want to use a Python variable in your query. "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" ArtistId | \n",
" Name | \n",
"
\n",
" \n",
" 1 | \n",
" AC/DC | \n",
"
\n",
" \n",
" 2 | \n",
" Accept | \n",
"
\n",
" \n",
" 3 | \n",
" Aerosmith | \n",
"
\n",
" \n",
" 4 | \n",
" Alanis Morissette | \n",
"
\n",
" \n",
" 5 | \n",
" Alice In Chains | \n",
"
\n",
" \n",
" 6 | \n",
" Antônio Carlos Jobim | \n",
"
\n",
" \n",
" 7 | \n",
" Apocalyptica | \n",
"
\n",
" \n",
" 8 | \n",
" Audioslave | \n",
"
\n",
" \n",
" 9 | \n",
" BackBeat | \n",
"
\n",
"
"
],
"text/plain": [
"[(1, 'AC/DC'),\n",
" (2, 'Accept'),\n",
" (3, 'Aerosmith'),\n",
" (4, 'Alanis Morissette'),\n",
" (5, 'Alice In Chains'),\n",
" (6, 'Antônio Carlos Jobim'),\n",
" (7, 'Apocalyptica'),\n",
" (8, 'Audioslave'),\n",
" (9, 'BackBeat')]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"artist_id = 10\n",
"\n",
"%sql select * from Artist where ArtistId < :artist_id;"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" AlbumId | \n",
" Title | \n",
" ArtistId | \n",
"
\n",
" \n",
" 1 | \n",
" For Those About To Rock We Salute You | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" Let There Be Rock | \n",
" 1 | \n",
"
\n",
" \n",
" 59 | \n",
" Deep Purple In Rock | \n",
" 58 | \n",
"
\n",
" \n",
" 108 | \n",
" Rock In Rio [CD1] | \n",
" 90 | \n",
"
\n",
" \n",
" 109 | \n",
" Rock In Rio [CD2] | \n",
" 90 | \n",
"
\n",
" \n",
" 213 | \n",
" Pure Cult: The Best Of The Cult (For Rockers, Ravers, Lovers & Sinners) [UK] | \n",
" 139 | \n",
"
\n",
" \n",
" 216 | \n",
" Hot Rocks, 1964-1971 (Disc 1) | \n",
" 142 | \n",
"
\n",
"
"
],
"text/plain": [
"[(1, 'For Those About To Rock We Salute You', 1),\n",
" (4, 'Let There Be Rock', 1),\n",
" (59, 'Deep Purple In Rock', 58),\n",
" (108, 'Rock In Rio [CD1]', 90),\n",
" (109, 'Rock In Rio [CD2]', 90),\n",
" (213, 'Pure Cult: The Best Of The Cult (For Rockers, Ravers, Lovers & Sinners) [UK]', 139),\n",
" (216, 'Hot Rocks, 1964-1971 (Disc 1)', 142)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word = '%rock%'\n",
"\n",
"%sql select * from Album WHERE Title LIKE :word;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Convert to `pandas` dataframe"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" AlbumId | \n",
" Title | \n",
" ArtistId | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" For Those About To Rock We Salute You | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Balls to the Wall | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Restless and Wild | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Let There Be Rock | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Big Ones | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" AlbumId Title ArtistId\n",
"0 1 For Those About To Rock We Salute You 1\n",
"1 2 Balls to the Wall 2\n",
"2 3 Restless and Wild 2\n",
"3 4 Let There Be Rock 1\n",
"4 5 Big Ones 3"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = result.DataFrame()\n",
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}