Data Archival and Management (Part 1)

Relational databases via sqlite3 package

In [1]:
import sqlite3
import numpy as np
import pandas as pd
In [2]:
con = sqlite3.connect('Chinook_Sqlite.sqlite')

Creating a cursor

A cursor is used to execute commands. Actually, you can also execute commands directly from the connection object in sqlite3, but this is not true for all DB API drivers.

In [3]:
c = con.cursor()

Find tables in a database

Metadata is stored in a special table sql_master.

In [4]:
q = """
SELECT name
FROM sqlite_master
WHERE type='table';
"""
c.execute(q)
c.fetchall()
Out[4]:
[('Album',),
 ('Artist',),
 ('Customer',),
 ('Employee',),
 ('Genre',),
 ('Invoice',),
 ('InvoiceLine',),
 ('MediaType',),
 ('Playlist',),
 ('PlaylistTrack',),
 ('Track',)]

View structure of a particular table

Method 1: Using table_info pragma

In [5]:
c.execute("PRAGMA table_info(Invoice)")
c.fetchall()
Out[5]:
[(0, 'InvoiceId', 'INTEGER', 1, None, 1),
 (1, 'CustomerId', 'INTEGER', 1, None, 0),
 (2, 'InvoiceDate', 'DATETIME', 1, None, 0),
 (3, 'BillingAddress', 'NVARCHAR(70)', 0, None, 0),
 (4, 'BillingCity', 'NVARCHAR(40)', 0, None, 0),
 (5, 'BillingState', 'NVARCHAR(40)', 0, None, 0),
 (6, 'BillingCountry', 'NVARCHAR(40)', 0, None, 0),
 (7, 'BillingPostalCode', 'NVARCHAR(10)', 0, None, 0),
 (8, 'Total', 'NUMERIC(10,2)', 1, None, 0)]

Method 2: Looking at SQL definition of table

In [6]:
q = """
SELECT sql
FROM sqlite_master
WHERE name='Album';
"""
c.execute(q)
print(c.fetchone()[0])
CREATE TABLE [Album]
(
    [AlbumId] INTEGER  NOT NULL,
    [Title] NVARCHAR(160)  NOT NULL,
    [ArtistId] INTEGER  NOT NULL,
    CONSTRAINT [PK_Album] PRIMARY KEY  ([AlbumId]),
    FOREIGN KEY ([ArtistId]) REFERENCES [Artist] ([ArtistId])
                ON DELETE NO ACTION ON UPDATE NO ACTION
)

Method 3: Fetch a row and see description (only column names)

In [7]:
c.execute("Select * from Invoice;")
c.fetchone()
c.description
Out[7]:
(('InvoiceId', None, None, None, None, None, None),
 ('CustomerId', None, None, None, None, None, None),
 ('InvoiceDate', None, None, None, None, None, None),
 ('BillingAddress', None, None, None, None, None, None),
 ('BillingCity', None, None, None, None, None, None),
 ('BillingState', None, None, None, None, None, None),
 ('BillingCountry', None, None, None, None, None, None),
 ('BillingPostalCode', None, None, None, None, None, None),
 ('Total', None, None, None, None, None, None))

View all columns of a table

In [8]:
q = """
SELECT *
FROM Album
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[8]:
[(1, 'For Those About To Rock We Salute You', 1),
 (2, 'Balls to the Wall', 2),
 (3, 'Restless and Wild', 2),
 (4, 'Let There Be Rock', 1),
 (5, 'Big Ones', 3)]

Select specific columns

In [9]:
q = """
SELECT AlbumId, Title
FROM Album
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[9]:
[(1, 'For Those About To Rock We Salute You'),
 (2, 'Balls to the Wall'),
 (3, 'Restless and Wild'),
 (4, 'Let There Be Rock'),
 (5, 'Big Ones')]

Filter rows by some criteria

In [10]:
q = """
SELECT *
FROM Album
WHERE Title LIKE '%Rock%'
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[10]:
[(1, 'For Those About To Rock We Salute You', 1),
 (4, 'Let There Be Rock', 1),
 (59, 'Deep Purple In Rock', 58),
 (108, 'Rock In Rio [CD1]', 90),
 (109, 'Rock In Rio [CD2]', 90)]
In [11]:
q = """
SELECT *
FROM Album
WHERE length(Title) < 10
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[11]:
[(5, 'Big Ones', 3),
 (7, 'Facelift', 5),
 (28, 'Na Pista', 20),
 (60, 'Fireball', 58),
 (68, 'Outbreak', 79)]

Sorting

In [12]:
q = """
SELECT *
FROM Album
ORDER BY length(Title) DESC
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[12]:
[(308,
  "Tchaikovsky: 1812 Festival Overture, Op.49, Capriccio Italien & Beethoven: Wellington's Victory",
  243),
 (335,
  'J.S. Bach: Chaconne, Suite in E Minor, Partita in E Major & Prelude, Fugue and Allegro',
  265),
 (294,
  "Great Performances - Barber's Adagio and Other Romantic Favorites for Strings",
  228),
 (213,
  'Pure Cult: The Best Of The Cult (For Rockers, Ravers, Lovers & Sinners) [UK]',
  139),
 (257,
  '20th Century Masters - The Millennium Collection: The Best of Scorpions',
  179)]

Summarization

In [13]:
q = """
SELECT COUNT(*), AVG(Total), MIN(Total), MAX(Total), SUM(Total)
FROM Invoice ;
"""
c.execute(q)
c.fetchall()
Out[13]:
[(412, 5.651941747572825, 0.99, 25.86, 2328.600000000004)]

Grouping and summarization

In [14]:
q = """
SELECT BillingCountry, BillingCity, AVG(Total)
FROM Invoice
GROUP BY BillingCountry, BillingCity
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[14]:
[('Argentina', 'Buenos Aires', 5.3742857142857146),
 ('Australia', 'Sidney', 5.3742857142857146),
 ('Austria', 'Vienne', 6.088571428571428),
 ('Belgium', 'Brussels', 5.374285714285714),
 ('Brazil', 'Brasília', 5.3742857142857146)]

Summarization with conditions on summary

In [15]:
q = """
SELECT BillingCountry, BillingCity, AVG(Total)
FROM Invoice
GROUP BY BillingCountry, BillingCity
HAVING BillingCountry like 'C%' AND AVG(Total) > 6
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[15]:
[('Chile', 'Santiago', 6.659999999999999),
 ('Czech Republic', 'Prague', 6.445714285714287)]

Joining tables

In [16]:
c.execute("PRAGMA table_info(Artist)")
c.fetchall()
Out[16]:
[(0, 'ArtistId', 'INTEGER', 1, None, 1),
 (1, 'Name', 'NVARCHAR(120)', 0, None, 0)]
In [17]:
c.execute("PRAGMA table_info(Album)")
c.fetchall()
Out[17]:
[(0, 'AlbumId', 'INTEGER', 1, None, 1),
 (1, 'Title', 'NVARCHAR(160)', 1, None, 0),
 (2, 'ArtistId', 'INTEGER', 1, None, 0)]

No join gives the Cartesian prodcut

In [18]:
q = """
SELECT Album.Title, Artist.Name
FROM Album, Artist
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[18]:
[('For Those About To Rock We Salute You', 'AC/DC'),
 ('For Those About To Rock We Salute You', 'Accept'),
 ('For Those About To Rock We Salute You', 'Aerosmith'),
 ('For Those About To Rock We Salute You', 'Alanis Morissette'),
 ('For Those About To Rock We Salute You', 'Alice In Chains')]

Inner Join

In [19]:
q = """
SELECT Album.Title, Artist.Name
FROM Album
INNER JOIN Artist ON Album.ArtistId = Artist.ArtistId
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[19]:
[('For Those About To Rock We Salute You', 'AC/DC'),
 ('Balls to the Wall', 'Accept'),
 ('Restless and Wild', 'Accept'),
 ('Let There Be Rock', 'AC/DC'),
 ('Big Ones', 'Aerosmith')]

Left Join

In [20]:
q = """
SELECT Album.Title, Artist.Name
FROM Album
LEFT JOIN Artist ON Album.ArtistId = Artist.ArtistId
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[20]:
[('For Those About To Rock We Salute You', 'AC/DC'),
 ('Balls to the Wall', 'Accept'),
 ('Restless and Wild', 'Accept'),
 ('Let There Be Rock', 'AC/DC'),
 ('Big Ones', 'Aerosmith')]

Using sqlite3 and pandas

In [21]:
q = """
SELECT BillingCountry, BillingCity, AVG(Total) AS Mean
FROM Invoice
GROUP BY BillingCountry, BillingCity
LIMIT 5;
"""

df = pd.read_sql(q, con)
df.head()
Out[21]:
BillingCountry BillingCity Mean
0 Argentina Buenos Aires 5.374286
1 Australia Sidney 5.374286
2 Austria Vienne 6.088571
3 Belgium Brussels 5.374286
4 Brazil Brasília 5.374286

Free resources when no longer needed

In [22]:
c.close()
con.close()

Creating database tables

Concepts introduced

  • In memory database
  • Primary and foreign keys
  • Constraints
  • Modeling many:many relationships with a linker table
In [23]:
con = sqlite3.connect(":memory:")

Creating a database

In [24]:
s1 = """
CREATE TABLE IF NOT EXISTS student (
student_id integer PRIMARY KEY,
first varchar NOT NULL,
last varchar NOT NULL,
email VARCHAR NOT NULL UNIQUE
);
"""

s2 = """
CREATE TABLE IF NOT EXISTS subject (
subject_id integer PRIMARY KEY,
code varchar NOT NULL,
name varchar NOT NULL
);
"""

s3 = """
CREATE TABLE IF NOT EXISTS grade (
student_id integer,
subject_id integer,
score float,
FOREIGN KEY (student_id) REFERENCES student (student_id),
FOREIGN KEY (subject_id) REFERENCES subject (subject_id),
CHECK (score BETWEEN 0 and 100)
);
"""
In [25]:
con.execute(s1)
con.execute(s2)
con.execute(s3)
Out[25]:
<sqlite3.Cursor at 0x10a5a8b20>

Populate the database

In [26]:
with con:
    con.execute("insert into student(first, last, email) values (?, ?, ?)",
                ("John", "Smith", "john.smith@duke.edu"))
    con.execute("insert into student(first, last, email) values (?, ?, ?)",
                ("Jane", "Smith", "jane.smith@duke.edu"))
    con.execute("insert into student(first, last, email) values (?, ?, ?)",
                ("Joe", "Smith", "joe.smith@duke.edu"))

    con.execute("insert into subject(code, name) values (?, ?)",
                ("BIOS 703", "Biology for unsuspecting statisticians"))
    con.execute("insert into subject(code, name) values (?, ?)",
                ("BIOS 821", "Data science for mere mortals"))

    con.execute("insert into grade(student_id, subject_id, score) values (?, ?, ?)",
               (1, 1, np.random.randint(0, 101)))
    con.execute("insert into grade(student_id, subject_id, score) values (?, ?, ?)",
               (1, 2, np.random.randint(0, 101)))
    con.execute("insert into grade(student_id, subject_id, score) values (?, ?, ?)",
               (2, 1, np.random.randint(0, 101)))
    con.execute("insert into grade(student_id, subject_id, score) values (?, ?, ?)",
               (2, 2, np.random.randint(0, 101)))
    con.execute("insert into grade(student_id, subject_id, score) values (?, ?, ?)",
               (3, 1, np.random.randint(0, 101)))
    con.execute("insert into grade(student_id, subject_id, score) values (?, ?, ?)",
               (3, 2, np.random.randint(0, 101)))

Query the database

In [27]:
for row in con.execute("select * from student;"):
    print(row)
(1, 'John', 'Smith', 'john.smith@duke.edu')
(2, 'Jane', 'Smith', 'jane.smith@duke.edu')
(3, 'Joe', 'Smith', 'joe.smith@duke.edu')
In [28]:
for row in con.execute("select * from subject;"):
    print(row)
(1, 'BIOS 703', 'Biology for unsuspecting statisticians')
(2, 'BIOS 821', 'Data science for mere mortals')
In [29]:
for row in con.execute("select * from grade;"):
    print(row)
(1, 1, 59.0)
(1, 2, 51.0)
(2, 1, 21.0)
(2, 2, 24.0)
(3, 1, 49.0)
(3, 2, 29.0)

Joins involving a linker table

In [30]:
q = """
SELECT student.first, student.last, student.email, subject.code, subject.name, grade.score
FROM student
JOIN grade ON grade.student_id = student.student_id
JOIN subject ON grade.subject_id = subject.subject_id;
"""
In [31]:
for row in con.execute(q):
    print(row)
('John', 'Smith', 'john.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 59.0)
('John', 'Smith', 'john.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 51.0)
('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 21.0)
('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 24.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 49.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 29.0)

Modify the database

Joe showed me a cute cat picture. I’m going to give him 100 for all subjects.

In [32]:
for row in con.execute('SELECT student_id from student where first = "Joe"'):
    print(row)
(3,)
In [33]:
s = """
UPDATE grade
SET score = 100
WHERE grade.student_id  =
(
    SELECT student_id from student where first = "Joe"
)
"""
In [34]:
con.execute(s)
Out[34]:
<sqlite3.Cursor at 0x10a5e77a0>

Check that update worked

In [35]:
q = """
SELECT student.first, student.last, student.email, subject.code, subject.name, grade.score
FROM student
JOIN grade ON grade.student_id = student.student_id
JOIN subject ON grade.subject_id = subject.subject_id
WHERE student.first = "Joe"
"""
In [36]:
for row in con.execute(q):
    print(row)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 100.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 100.0)

Saving in-memory database to disk

In [37]:
with open('dump.sql', 'w') as f:
    for line in con.iterdump():
        f.write('%s\n' % line)
In [38]:
! cat dump.sql
BEGIN TRANSACTION;
CREATE TABLE grade (
student_id integer,
subject_id integer,
score float,
FOREIGN KEY (student_id) REFERENCES student (student_id),
FOREIGN KEY (subject_id) REFERENCES subject (subject_id),
CHECK (score BETWEEN 0 and 100)
);
INSERT INTO "grade" VALUES(1,1,59.0);
INSERT INTO "grade" VALUES(1,2,51.0);
INSERT INTO "grade" VALUES(2,1,21.0);
INSERT INTO "grade" VALUES(2,2,24.0);
INSERT INTO "grade" VALUES(3,1,100.0);
INSERT INTO "grade" VALUES(3,2,100.0);
CREATE TABLE student (
student_id integer PRIMARY KEY,
first varchar NOT NULL,
last varchar NOT NULL,
email VARCHAR NOT NULL UNIQUE
);
INSERT INTO "student" VALUES(1,'John','Smith','john.smith@duke.edu');
INSERT INTO "student" VALUES(2,'Jane','Smith','jane.smith@duke.edu');
INSERT INTO "student" VALUES(3,'Joe','Smith','joe.smith@duke.edu');
CREATE TABLE subject (
subject_id integer PRIMARY KEY,
code varchar NOT NULL,
name varchar NOT NULL
);
INSERT INTO "subject" VALUES(1,'BIOS 703','Biology for unsuspecting statisticians');
INSERT INTO "subject" VALUES(2,'BIOS 821','Data science for mere mortals');
COMMIT;

Convert to SQLite3 database

In [39]:
! cat dump.sql | sqlite3 college.db
Error: near line 2: table grade already exists
Error: near line 16: table student already exists
Error: near line 22: UNIQUE constraint failed: student.student_id
Error: near line 23: UNIQUE constraint failed: student.student_id
Error: near line 24: UNIQUE constraint failed: student.student_id
Error: near line 25: table subject already exists
Error: near line 30: UNIQUE constraint failed: subject.subject_id
Error: near line 31: UNIQUE constraint failed: subject.subject_id
In [40]:
con1 = sqlite3.connect('college.db')
In [41]:
q = """
SELECT student.first, student.last, student.email, subject.code, subject.name, grade.score
FROM student
JOIN grade ON grade.student_id = student.student_id
JOIN subject ON grade.subject_id = subject.subject_id;
"""
In [42]:
for row in con1.execute(q):
    print(row)
('John', 'Smith', 'john.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 29.0)
('John', 'Smith', 'john.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 20.0)
('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 49.0)
('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 96.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 100.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 100.0)
('John', 'Smith', 'john.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 59.0)
('John', 'Smith', 'john.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 51.0)
('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 21.0)
('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 24.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 100.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 100.0)

Close connections

In [43]:
con.close()
In [44]:
con1.close()

Using SQL magic functions

We will use the ipython-sql notebook extension for convenience. This will only work in notebooks and IPython scripts with the .ipy extension.

In [1]:
import warnings

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    %load_ext sql

Configuring the SqlMagic extension

In [2]:
%config SqlMagic
SqlMagic options
--------------
SqlMagic.autolimit=<Int>
    Current: 0
    Automatically limit the size of the returned result sets
SqlMagic.autopandas=<Bool>
    Current: False
    Return Pandas DataFrames instead of regular result sets
SqlMagic.column_local_vars=<Bool>
    Current: False
    Return data into local variables from column names
SqlMagic.displaylimit=<Int>
    Current: 0
    Automatically limit the number of rows displayed (full result set is still
    stored)
SqlMagic.dsn_filename=<Unicode>
    Current: 'odbc.ini'
    Path to DSN file. When the first argument is of the form [section], a
    sqlalchemy connection string is formed from the matching section in the DSN
    file.
SqlMagic.feedback=<Bool>
    Current: True
    Print number of rows affected by DML
SqlMagic.short_errors=<Bool>
    Current: True
    Don't display the full traceback on SQL Programming Error
SqlMagic.style=<Unicode>
    Current: 'DEFAULT'
    Set the table printing style to any of prettytable's defined styles
    (currently DEFAULT, MSWORD_FRIENDLY, PLAIN_COLUMNS, RANDOM)
In [3]:
%config SqlMagic.displaylimit=10

Connect to SQLite3 database

See SQLAlchemy connection strings for how to connect to other databases such as Oracle, MySQL or PostgreSQL.

In [6]:
%sql sqlite:///Chinook_Sqlite.sqlite
Out[6]:
'Connected: None@Chinook_Sqlite.sqlite'
In [7]:
%sql SELECT * from Album LIMIT 5;
Done.
Out[7]:
AlbumId Title ArtistId
1 For Those About To Rock We Salute You 1
2 Balls to the Wall 2
3 Restless and Wild 2
4 Let There Be Rock 1
5 Big Ones 3
In [8]:
%sql SELECT * from Artist LIMIT 5;
Done.
Out[8]:
ArtistId Name
1 AC/DC
2 Accept
3 Aerosmith
4 Alanis Morissette
5 Alice In Chains
In [9]:
%%sql
SELECT Artist.Name, Album.Title
FROM Album
INNER JOIN Artist on Album.ArtistId = Artist.ArtistId
ORDER BY Artist.Name ASC
LIMIT 5;
Done.
Out[9]:
Name Title
AC/DC For Those About To Rock We Salute You
AC/DC Let There Be Rock
Aaron Copland & London Symphony Orchestra A Copland Celebration, Vol. I
Aaron Goldberg Worlds
Academy of St. Martin in the Fields & Sir Neville Marriner The World of Classical Favourites

You can assign results of queries to Python names

In [10]:
result = %sql SELECT * from Album;
Done.
In [11]:
type(result)
Out[11]:
sql.run.ResultSet
In [12]:
result[2:4]
Out[12]:
[(3, 'Restless and Wild', 2), (4, 'Let There Be Rock', 1)]

You can use Python variables in your queries

Use :varname where you want to use a Python variable in your query.

In [13]:
artist_id = 10

%sql select * from Artist where ArtistId < :artist_id;
Done.
Out[13]:
ArtistId Name
1 AC/DC
2 Accept
3 Aerosmith
4 Alanis Morissette
5 Alice In Chains
6 Antônio Carlos Jobim
7 Apocalyptica
8 Audioslave
9 BackBeat
In [14]:
word = '%rock%'

%sql select * from Album WHERE Title LIKE :word;
Done.
Out[14]:
AlbumId Title ArtistId
1 For Those About To Rock We Salute You 1
4 Let There Be Rock 1
59 Deep Purple In Rock 58
108 Rock In Rio [CD1] 90
109 Rock In Rio [CD2] 90
213 Pure Cult: The Best Of The Cult (For Rockers, Ravers, Lovers & Sinners) [UK] 139
216 Hot Rocks, 1964-1971 (Disc 1) 142

Convert to pandas dataframe

In [16]:
df = result.DataFrame()
df.head(5)
Out[16]:
AlbumId Title ArtistId
0 1 For Those About To Rock We Salute You 1
1 2 Balls to the Wall 2
2 3 Restless and Wild 2
3 4 Let There Be Rock 1
4 5 Big Ones 3
In [ ]: