{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Archival and Management (Part 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Relational databases via `sqlite3` package" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import sqlite3\n", "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "con = sqlite3.connect('Chinook_Sqlite.sqlite')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Creating a cursor\n", "\n", "A cursor is used to execute commands. Actually, you can also execute commands directly from the connection object in `sqlite3`, but this is not true for all DB API drivers." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "c = con.cursor()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Find tables in a database\n", "\n", "Metadata is stored in a special table `sql_master`." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Album',),\n", " ('Artist',),\n", " ('Customer',),\n", " ('Employee',),\n", " ('Genre',),\n", " ('Invoice',),\n", " ('InvoiceLine',),\n", " ('MediaType',),\n", " ('Playlist',),\n", " ('PlaylistTrack',),\n", " ('Track',)]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT name \n", "FROM sqlite_master \n", "WHERE type='table';\n", "\"\"\"\n", "c.execute(q)\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### View structure of a particular table" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Method 1: Using `table_info` pragma" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, 'InvoiceId', 'INTEGER', 1, None, 1),\n", " (1, 'CustomerId', 'INTEGER', 1, None, 0),\n", " (2, 'InvoiceDate', 'DATETIME', 1, None, 0),\n", " (3, 'BillingAddress', 'NVARCHAR(70)', 0, None, 0),\n", " (4, 'BillingCity', 'NVARCHAR(40)', 0, None, 0),\n", " (5, 'BillingState', 'NVARCHAR(40)', 0, None, 0),\n", " (6, 'BillingCountry', 'NVARCHAR(40)', 0, None, 0),\n", " (7, 'BillingPostalCode', 'NVARCHAR(10)', 0, None, 0),\n", " (8, 'Total', 'NUMERIC(10,2)', 1, None, 0)]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c.execute(\"PRAGMA table_info(Invoice)\")\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Method 2: Looking at SQL definition of table" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CREATE TABLE [Album]\n", "(\n", " [AlbumId] INTEGER NOT NULL,\n", " [Title] NVARCHAR(160) NOT NULL,\n", " [ArtistId] INTEGER NOT NULL,\n", " CONSTRAINT [PK_Album] PRIMARY KEY ([AlbumId]),\n", " FOREIGN KEY ([ArtistId]) REFERENCES [Artist] ([ArtistId]) \n", "\t\tON DELETE NO ACTION ON UPDATE NO ACTION\n", ")\n" ] } ], "source": [ "q = \"\"\"\n", "SELECT sql \n", "FROM sqlite_master \n", "WHERE name='Album';\n", "\"\"\"\n", "c.execute(q)\n", "print(c.fetchone()[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Method 3: Fetch a row and see description (only column names)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(('InvoiceId', None, None, None, None, None, None),\n", " ('CustomerId', None, None, None, None, None, None),\n", " ('InvoiceDate', None, None, None, None, None, None),\n", " ('BillingAddress', None, None, None, None, None, None),\n", " ('BillingCity', None, None, None, None, None, None),\n", " ('BillingState', None, None, None, None, None, None),\n", " ('BillingCountry', None, None, None, None, None, None),\n", " ('BillingPostalCode', None, None, None, None, None, None),\n", " ('Total', None, None, None, None, None, None))" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c.execute(\"Select * from Invoice;\")\n", "c.fetchone()\n", "c.description" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### View all columns of a table" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(1, 'For Those About To Rock We Salute You', 1),\n", " (2, 'Balls to the Wall', 2),\n", " (3, 'Restless and Wild', 2),\n", " (4, 'Let There Be Rock', 1),\n", " (5, 'Big Ones', 3)]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT * \n", "FROM Album \n", "LIMIT 5;\n", "\"\"\"\n", "c.execute(q)\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Select specific columns" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(1, 'For Those About To Rock We Salute You'),\n", " (2, 'Balls to the Wall'),\n", " (3, 'Restless and Wild'),\n", " (4, 'Let There Be Rock'),\n", " (5, 'Big Ones')]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT AlbumId, Title\n", "FROM Album \n", "LIMIT 5;\n", "\"\"\"\n", "c.execute(q)\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Filter rows by some criteria" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(1, 'For Those About To Rock We Salute You', 1),\n", " (4, 'Let There Be Rock', 1),\n", " (59, 'Deep Purple In Rock', 58),\n", " (108, 'Rock In Rio [CD1]', 90),\n", " (109, 'Rock In Rio [CD2]', 90)]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT *\n", "FROM Album \n", "WHERE Title LIKE '%Rock%'\n", "LIMIT 5;\n", "\"\"\"\n", "c.execute(q)\n", "c.fetchall()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(5, 'Big Ones', 3),\n", " (7, 'Facelift', 5),\n", " (28, 'Na Pista', 20),\n", " (60, 'Fireball', 58),\n", " (68, 'Outbreak', 79)]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT *\n", "FROM Album \n", "WHERE length(Title) < 10\n", "LIMIT 5;\n", "\"\"\"\n", "c.execute(q)\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sorting " ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(308,\n", " \"Tchaikovsky: 1812 Festival Overture, Op.49, Capriccio Italien & Beethoven: Wellington's Victory\",\n", " 243),\n", " (335,\n", " 'J.S. Bach: Chaconne, Suite in E Minor, Partita in E Major & Prelude, Fugue and Allegro',\n", " 265),\n", " (294,\n", " \"Great Performances - Barber's Adagio and Other Romantic Favorites for Strings\",\n", " 228),\n", " (213,\n", " 'Pure Cult: The Best Of The Cult (For Rockers, Ravers, Lovers & Sinners) [UK]',\n", " 139),\n", " (257,\n", " '20th Century Masters - The Millennium Collection: The Best of Scorpions',\n", " 179)]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT *\n", "FROM Album \n", "ORDER BY length(Title) DESC\n", "LIMIT 5;\n", "\"\"\"\n", "c.execute(q)\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Summarization" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(412, 5.651941747572825, 0.99, 25.86, 2328.600000000004)]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT COUNT(*), AVG(Total), MIN(Total), MAX(Total), SUM(Total)\n", "FROM Invoice ;\n", "\"\"\"\n", "c.execute(q)\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Grouping and summarization" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Argentina', 'Buenos Aires', 5.3742857142857146),\n", " ('Australia', 'Sidney', 5.3742857142857146),\n", " ('Austria', 'Vienne', 6.088571428571428),\n", " ('Belgium', 'Brussels', 5.374285714285714),\n", " ('Brazil', 'Brasília', 5.3742857142857146)]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT BillingCountry, BillingCity, AVG(Total)\n", "FROM Invoice \n", "GROUP BY BillingCountry, BillingCity\n", "LIMIT 5;\n", "\"\"\"\n", "c.execute(q)\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Summarization with conditions on summary" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Chile', 'Santiago', 6.659999999999999),\n", " ('Czech Republic', 'Prague', 6.445714285714287)]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT BillingCountry, BillingCity, AVG(Total)\n", "FROM Invoice \n", "GROUP BY BillingCountry, BillingCity\n", "HAVING BillingCountry like 'C%' AND AVG(Total) > 6\n", "LIMIT 5;\n", "\"\"\"\n", "c.execute(q)\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Joining tables" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, 'ArtistId', 'INTEGER', 1, None, 1),\n", " (1, 'Name', 'NVARCHAR(120)', 0, None, 0)]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c.execute(\"PRAGMA table_info(Artist)\")\n", "c.fetchall()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, 'AlbumId', 'INTEGER', 1, None, 1),\n", " (1, 'Title', 'NVARCHAR(160)', 1, None, 0),\n", " (2, 'ArtistId', 'INTEGER', 1, None, 0)]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c.execute(\"PRAGMA table_info(Album)\")\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### No join gives the Cartesian prodcut" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('For Those About To Rock We Salute You', 'AC/DC'),\n", " ('For Those About To Rock We Salute You', 'Accept'),\n", " ('For Those About To Rock We Salute You', 'Aerosmith'),\n", " ('For Those About To Rock We Salute You', 'Alanis Morissette'),\n", " ('For Those About To Rock We Salute You', 'Alice In Chains')]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT Album.Title, Artist.Name\n", "FROM Album, Artist\n", "LIMIT 5;\n", "\"\"\"\n", "c.execute(q)\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Inner Join" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('For Those About To Rock We Salute You', 'AC/DC'),\n", " ('Balls to the Wall', 'Accept'),\n", " ('Restless and Wild', 'Accept'),\n", " ('Let There Be Rock', 'AC/DC'),\n", " ('Big Ones', 'Aerosmith')]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT Album.Title, Artist.Name\n", "FROM Album\n", "INNER JOIN Artist ON Album.ArtistId = Artist.ArtistId\n", "LIMIT 5;\n", "\"\"\"\n", "c.execute(q)\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Left Join" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('For Those About To Rock We Salute You', 'AC/DC'),\n", " ('Balls to the Wall', 'Accept'),\n", " ('Restless and Wild', 'Accept'),\n", " ('Let There Be Rock', 'AC/DC'),\n", " ('Big Ones', 'Aerosmith')]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT Album.Title, Artist.Name\n", "FROM Album\n", "LEFT JOIN Artist ON Album.ArtistId = Artist.ArtistId\n", "LIMIT 5;\n", "\"\"\"\n", "c.execute(q)\n", "c.fetchall()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using `sqlite3` and `pandas`" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BillingCountryBillingCityMean
0ArgentinaBuenos Aires5.374286
1AustraliaSidney5.374286
2AustriaVienne6.088571
3BelgiumBrussels5.374286
4BrazilBrasília5.374286
\n", "
" ], "text/plain": [ " BillingCountry BillingCity Mean\n", "0 Argentina Buenos Aires 5.374286\n", "1 Australia Sidney 5.374286\n", "2 Austria Vienne 6.088571\n", "3 Belgium Brussels 5.374286\n", "4 Brazil Brasília 5.374286" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q = \"\"\"\n", "SELECT BillingCountry, BillingCity, AVG(Total) AS Mean\n", "FROM Invoice \n", "GROUP BY BillingCountry, BillingCity\n", "LIMIT 5;\n", "\"\"\"\n", "\n", "df = pd.read_sql(q, con)\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Free resources when no longer needed" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "c.close()\n", "con.close()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Creating database tables\n", "\n", "Concepts introduced\n", "\n", "- In memory database\n", "- Primary and foreign keys\n", "- Constraints\n", "- Modeling many:many relationships with a linker table" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### SQLite3 databases can reside in memory" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "con = sqlite3.connect(\":memory:\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Creating a database" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "s1 = \"\"\"\n", "CREATE TABLE IF NOT EXISTS student (\n", "student_id integer PRIMARY KEY,\n", "first varchar NOT NULL,\n", "last varchar NOT NULL,\n", "email VARCHAR NOT NULL UNIQUE\n", ");\n", "\"\"\"\n", "\n", "s2 = \"\"\"\n", "CREATE TABLE IF NOT EXISTS subject (\n", "subject_id integer PRIMARY KEY,\n", "code varchar NOT NULL,\n", "name varchar NOT NULL\n", ");\n", "\"\"\"\n", "\n", "s3 = \"\"\"\n", "CREATE TABLE IF NOT EXISTS grade (\n", "student_id integer,\n", "subject_id integer,\n", "score float,\n", "FOREIGN KEY (student_id) REFERENCES student (student_id),\n", "FOREIGN KEY (subject_id) REFERENCES subject (subject_id),\n", "CHECK (score BETWEEN 0 and 100)\n", ");\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(s1)\n", "con.execute(s2)\n", "con.execute(s3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Populate the database" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with con:\n", " con.execute(\"insert into student(first, last, email) values (?, ?, ?)\", \n", " (\"John\", \"Smith\", \"john.smith@duke.edu\"))\n", " con.execute(\"insert into student(first, last, email) values (?, ?, ?)\", \n", " (\"Jane\", \"Smith\", \"jane.smith@duke.edu\"))\n", " con.execute(\"insert into student(first, last, email) values (?, ?, ?)\", \n", " (\"Joe\", \"Smith\", \"joe.smith@duke.edu\"))\n", " \n", " con.execute(\"insert into subject(code, name) values (?, ?)\", \n", " (\"BIOS 703\", \"Biology for unsuspecting statisticians\"))\n", " con.execute(\"insert into subject(code, name) values (?, ?)\", \n", " (\"BIOS 821\", \"Data science for mere mortals\"))\n", " \n", " con.execute(\"insert into grade(student_id, subject_id, score) values (?, ?, ?)\",\n", " (1, 1, np.random.randint(0, 101)))\n", " con.execute(\"insert into grade(student_id, subject_id, score) values (?, ?, ?)\",\n", " (1, 2, np.random.randint(0, 101)))\n", " con.execute(\"insert into grade(student_id, subject_id, score) values (?, ?, ?)\",\n", " (2, 1, np.random.randint(0, 101)))\n", " con.execute(\"insert into grade(student_id, subject_id, score) values (?, ?, ?)\",\n", " (2, 2, np.random.randint(0, 101)))\n", " con.execute(\"insert into grade(student_id, subject_id, score) values (?, ?, ?)\",\n", " (3, 1, np.random.randint(0, 101)))\n", " con.execute(\"insert into grade(student_id, subject_id, score) values (?, ?, ?)\",\n", " (3, 2, np.random.randint(0, 101)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Query the database" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1, 'John', 'Smith', 'john.smith@duke.edu')\n", "(2, 'Jane', 'Smith', 'jane.smith@duke.edu')\n", "(3, 'Joe', 'Smith', 'joe.smith@duke.edu')\n" ] } ], "source": [ "for row in con.execute(\"select * from student;\"):\n", " print(row)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1, 'BIOS 703', 'Biology for unsuspecting statisticians')\n", "(2, 'BIOS 821', 'Data science for mere mortals')\n" ] } ], "source": [ "for row in con.execute(\"select * from subject;\"):\n", " print(row)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1, 1, 59.0)\n", "(1, 2, 51.0)\n", "(2, 1, 21.0)\n", "(2, 2, 24.0)\n", "(3, 1, 49.0)\n", "(3, 2, 29.0)\n" ] } ], "source": [ "for row in con.execute(\"select * from grade;\"):\n", " print(row)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Joins involving a linker table" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": true }, "outputs": [], "source": [ "q = \"\"\"\n", "SELECT student.first, student.last, student.email, subject.code, subject.name, grade.score\n", "FROM student\n", "JOIN grade ON grade.student_id = student.student_id\n", "JOIN subject ON grade.subject_id = subject.subject_id;\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('John', 'Smith', 'john.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 59.0)\n", "('John', 'Smith', 'john.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 51.0)\n", "('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 21.0)\n", "('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 24.0)\n", "('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 49.0)\n", "('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 29.0)\n" ] } ], "source": [ "for row in con.execute(q):\n", " print(row)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Modify the database\n", "\n", "Joe showed me a cute cat picture. I'm going to give him 100 for all subjects." ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(3,)\n" ] } ], "source": [ "for row in con.execute('SELECT student_id from student where first = \"Joe\"'):\n", " print(row)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "s = \"\"\"\n", "UPDATE grade\n", "SET score = 100\n", "WHERE grade.student_id =\n", "(\n", " SELECT student_id from student where first = \"Joe\"\n", ")\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(s)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Check that update worked" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true }, "outputs": [], "source": [ "q = \"\"\"\n", "SELECT student.first, student.last, student.email, subject.code, subject.name, grade.score\n", "FROM student\n", "JOIN grade ON grade.student_id = student.student_id\n", "JOIN subject ON grade.subject_id = subject.subject_id\n", "WHERE student.first = \"Joe\"\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 100.0)\n", "('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 100.0)\n" ] } ], "source": [ "for row in con.execute(q):\n", " print(row)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Saving in-memory database to disk" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": true }, "outputs": [], "source": [ "with open('dump.sql', 'w') as f:\n", " for line in con.iterdump():\n", " f.write('%s\\n' % line)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BEGIN TRANSACTION;\r\n", "CREATE TABLE grade (\r\n", "student_id integer,\r\n", "subject_id integer,\r\n", "score float,\r\n", "FOREIGN KEY (student_id) REFERENCES student (student_id),\r\n", "FOREIGN KEY (subject_id) REFERENCES subject (subject_id),\r\n", "CHECK (score BETWEEN 0 and 100)\r\n", ");\r\n", "INSERT INTO \"grade\" VALUES(1,1,59.0);\r\n", "INSERT INTO \"grade\" VALUES(1,2,51.0);\r\n", "INSERT INTO \"grade\" VALUES(2,1,21.0);\r\n", "INSERT INTO \"grade\" VALUES(2,2,24.0);\r\n", "INSERT INTO \"grade\" VALUES(3,1,100.0);\r\n", "INSERT INTO \"grade\" VALUES(3,2,100.0);\r\n", "CREATE TABLE student (\r\n", "student_id integer PRIMARY KEY,\r\n", "first varchar NOT NULL,\r\n", "last varchar NOT NULL,\r\n", "email VARCHAR NOT NULL UNIQUE\r\n", ");\r\n", "INSERT INTO \"student\" VALUES(1,'John','Smith','john.smith@duke.edu');\r\n", "INSERT INTO \"student\" VALUES(2,'Jane','Smith','jane.smith@duke.edu');\r\n", "INSERT INTO \"student\" VALUES(3,'Joe','Smith','joe.smith@duke.edu');\r\n", "CREATE TABLE subject (\r\n", "subject_id integer PRIMARY KEY,\r\n", "code varchar NOT NULL,\r\n", "name varchar NOT NULL\r\n", ");\r\n", "INSERT INTO \"subject\" VALUES(1,'BIOS 703','Biology for unsuspecting statisticians');\r\n", "INSERT INTO \"subject\" VALUES(2,'BIOS 821','Data science for mere mortals');\r\n", "COMMIT;\r\n" ] } ], "source": [ "! cat dump.sql" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Convert to SQLite3 database" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Error: near line 2: table grade already exists\r\n", "Error: near line 16: table student already exists\r\n", "Error: near line 22: UNIQUE constraint failed: student.student_id\r\n", "Error: near line 23: UNIQUE constraint failed: student.student_id\r\n", "Error: near line 24: UNIQUE constraint failed: student.student_id\r\n", "Error: near line 25: table subject already exists\r\n", "Error: near line 30: UNIQUE constraint failed: subject.subject_id\r\n", "Error: near line 31: UNIQUE constraint failed: subject.subject_id\r\n" ] } ], "source": [ "! cat dump.sql | sqlite3 college.db" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": true }, "outputs": [], "source": [ "con1 = sqlite3.connect('college.db')" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": true }, "outputs": [], "source": [ "q = \"\"\"\n", "SELECT student.first, student.last, student.email, subject.code, subject.name, grade.score\n", "FROM student\n", "JOIN grade ON grade.student_id = student.student_id\n", "JOIN subject ON grade.subject_id = subject.subject_id;\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('John', 'Smith', 'john.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 29.0)\n", "('John', 'Smith', 'john.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 20.0)\n", "('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 49.0)\n", "('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 96.0)\n", "('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 100.0)\n", "('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 100.0)\n", "('John', 'Smith', 'john.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 59.0)\n", "('John', 'Smith', 'john.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 51.0)\n", "('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 21.0)\n", "('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 24.0)\n", "('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 100.0)\n", "('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 100.0)\n" ] } ], "source": [ "for row in con1.execute(q):\n", " print(row)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Close connections" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": true }, "outputs": [], "source": [ "con.close()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": true }, "outputs": [], "source": [ "con1.close()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### Using SQL magic functions\n", "\n", "We will use the [ipython-sql](https://github.com/catherinedevlin/ipython-sql) notebook extension for convenience. This will only work in notebooks and IPython scripts with the .ipy extension." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import warnings\n", "\n", "with warnings.catch_warnings():\n", " warnings.simplefilter('ignore')\n", " %load_ext sql" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Configuring the SqlMagic extension" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SqlMagic options\n", "--------------\n", "SqlMagic.autolimit=\n", " Current: 0\n", " Automatically limit the size of the returned result sets\n", "SqlMagic.autopandas=\n", " Current: False\n", " Return Pandas DataFrames instead of regular result sets\n", "SqlMagic.column_local_vars=\n", " Current: False\n", " Return data into local variables from column names\n", "SqlMagic.displaylimit=\n", " Current: 0\n", " Automatically limit the number of rows displayed (full result set is still\n", " stored)\n", "SqlMagic.dsn_filename=\n", " Current: 'odbc.ini'\n", " Path to DSN file. When the first argument is of the form [section], a\n", " sqlalchemy connection string is formed from the matching section in the DSN\n", " file.\n", "SqlMagic.feedback=\n", " Current: True\n", " Print number of rows affected by DML\n", "SqlMagic.short_errors=\n", " Current: True\n", " Don't display the full traceback on SQL Programming Error\n", "SqlMagic.style=\n", " Current: 'DEFAULT'\n", " Set the table printing style to any of prettytable's defined styles\n", " (currently DEFAULT, MSWORD_FRIENDLY, PLAIN_COLUMNS, RANDOM)\n" ] } ], "source": [ "%config SqlMagic" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%config SqlMagic.displaylimit=10" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Connect to SQLite3 database\n", "\n", "See [SQLAlchemy connection strings](http://docs.sqlalchemy.org/en/latest/core/engines.html) for how to connect to other databases such as Oracle, MySQL or PostgreSQL." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Connected: None@Chinook_Sqlite.sqlite'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql sqlite:///Chinook_Sqlite.sqlite" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AlbumIdTitleArtistId
1For Those About To Rock We Salute You1
2Balls to the Wall2
3Restless and Wild2
4Let There Be Rock1
5Big Ones3
" ], "text/plain": [ "[(1, 'For Those About To Rock We Salute You', 1),\n", " (2, 'Balls to the Wall', 2),\n", " (3, 'Restless and Wild', 2),\n", " (4, 'Let There Be Rock', 1),\n", " (5, 'Big Ones', 3)]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql SELECT * from Album LIMIT 5;" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ArtistIdName
1AC/DC
2Accept
3Aerosmith
4Alanis Morissette
5Alice In Chains
" ], "text/plain": [ "[(1, 'AC/DC'),\n", " (2, 'Accept'),\n", " (3, 'Aerosmith'),\n", " (4, 'Alanis Morissette'),\n", " (5, 'Alice In Chains')]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql SELECT * from Artist LIMIT 5;" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameTitle
AC/DCFor Those About To Rock We Salute You
AC/DCLet There Be Rock
Aaron Copland & London Symphony OrchestraA Copland Celebration, Vol. I
Aaron GoldbergWorlds
Academy of St. Martin in the Fields & Sir Neville MarrinerThe World of Classical Favourites
" ], "text/plain": [ "[('AC/DC', 'For Those About To Rock We Salute You'),\n", " ('AC/DC', 'Let There Be Rock'),\n", " ('Aaron Copland & London Symphony Orchestra', 'A Copland Celebration, Vol. I'),\n", " ('Aaron Goldberg', 'Worlds'),\n", " ('Academy of St. Martin in the Fields & Sir Neville Marriner', 'The World of Classical Favourites')]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "SELECT Artist.Name, Album.Title\n", "FROM Album\n", "INNER JOIN Artist on Album.ArtistId = Artist.ArtistId\n", "ORDER BY Artist.Name ASC\n", "LIMIT 5;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### You can assign results of queries to Python names\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] } ], "source": [ "result = %sql SELECT * from Album;" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "sql.run.ResultSet" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(result)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(3, 'Restless and Wild', 2), (4, 'Let There Be Rock', 1)]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result[2:4]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### You can use Python variables in your queries\n", "\n", "Use `:varname` where you want to use a Python variable in your query. " ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ArtistIdName
1AC/DC
2Accept
3Aerosmith
4Alanis Morissette
5Alice In Chains
6Antônio Carlos Jobim
7Apocalyptica
8Audioslave
9BackBeat
" ], "text/plain": [ "[(1, 'AC/DC'),\n", " (2, 'Accept'),\n", " (3, 'Aerosmith'),\n", " (4, 'Alanis Morissette'),\n", " (5, 'Alice In Chains'),\n", " (6, 'Antônio Carlos Jobim'),\n", " (7, 'Apocalyptica'),\n", " (8, 'Audioslave'),\n", " (9, 'BackBeat')]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "artist_id = 10\n", "\n", "%sql select * from Artist where ArtistId < :artist_id;" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AlbumIdTitleArtistId
1For Those About To Rock We Salute You1
4Let There Be Rock1
59Deep Purple In Rock58
108Rock In Rio [CD1]90
109Rock In Rio [CD2]90
213Pure Cult: The Best Of The Cult (For Rockers, Ravers, Lovers & Sinners) [UK]139
216Hot Rocks, 1964-1971 (Disc 1)142
" ], "text/plain": [ "[(1, 'For Those About To Rock We Salute You', 1),\n", " (4, 'Let There Be Rock', 1),\n", " (59, 'Deep Purple In Rock', 58),\n", " (108, 'Rock In Rio [CD1]', 90),\n", " (109, 'Rock In Rio [CD2]', 90),\n", " (213, 'Pure Cult: The Best Of The Cult (For Rockers, Ravers, Lovers & Sinners) [UK]', 139),\n", " (216, 'Hot Rocks, 1964-1971 (Disc 1)', 142)]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "word = '%rock%'\n", "\n", "%sql select * from Album WHERE Title LIKE :word;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Convert to `pandas` dataframe" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AlbumIdTitleArtistId
01For Those About To Rock We Salute You1
12Balls to the Wall2
23Restless and Wild2
34Let There Be Rock1
45Big Ones3
\n", "
" ], "text/plain": [ " AlbumId Title ArtistId\n", "0 1 For Those About To Rock We Salute You 1\n", "1 2 Balls to the Wall 2\n", "2 3 Restless and Wild 2\n", "3 4 Let There Be Rock 1\n", "4 5 Big Ones 3" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = result.DataFrame()\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }