Data Archival and Management (Part 1)¶
Relational databases via sqlite3
package¶
In [1]:
import sqlite3
import numpy as np
import pandas as pd
In [2]:
con = sqlite3.connect('Chinook_Sqlite.sqlite')
Creating a cursor¶
A cursor is used to execute commands. Actually, you can also execute
commands directly from the connection object in sqlite3
, but this is
not true for all DB API drivers.
In [3]:
c = con.cursor()
Find tables in a database¶
Metadata is stored in a special table sql_master
.
In [4]:
q = """
SELECT name
FROM sqlite_master
WHERE type='table';
"""
c.execute(q)
c.fetchall()
Out[4]:
[('Album',),
('Artist',),
('Customer',),
('Employee',),
('Genre',),
('Invoice',),
('InvoiceLine',),
('MediaType',),
('Playlist',),
('PlaylistTrack',),
('Track',)]
View structure of a particular table¶
Method 1: Using table_info
pragma¶
In [5]:
c.execute("PRAGMA table_info(Invoice)")
c.fetchall()
Out[5]:
[(0, 'InvoiceId', 'INTEGER', 1, None, 1),
(1, 'CustomerId', 'INTEGER', 1, None, 0),
(2, 'InvoiceDate', 'DATETIME', 1, None, 0),
(3, 'BillingAddress', 'NVARCHAR(70)', 0, None, 0),
(4, 'BillingCity', 'NVARCHAR(40)', 0, None, 0),
(5, 'BillingState', 'NVARCHAR(40)', 0, None, 0),
(6, 'BillingCountry', 'NVARCHAR(40)', 0, None, 0),
(7, 'BillingPostalCode', 'NVARCHAR(10)', 0, None, 0),
(8, 'Total', 'NUMERIC(10,2)', 1, None, 0)]
Method 2: Looking at SQL definition of table¶
In [6]:
q = """
SELECT sql
FROM sqlite_master
WHERE name='Album';
"""
c.execute(q)
print(c.fetchone()[0])
CREATE TABLE [Album]
(
[AlbumId] INTEGER NOT NULL,
[Title] NVARCHAR(160) NOT NULL,
[ArtistId] INTEGER NOT NULL,
CONSTRAINT [PK_Album] PRIMARY KEY ([AlbumId]),
FOREIGN KEY ([ArtistId]) REFERENCES [Artist] ([ArtistId])
ON DELETE NO ACTION ON UPDATE NO ACTION
)
Method 3: Fetch a row and see description (only column names)¶
In [7]:
c.execute("Select * from Invoice;")
c.fetchone()
c.description
Out[7]:
(('InvoiceId', None, None, None, None, None, None),
('CustomerId', None, None, None, None, None, None),
('InvoiceDate', None, None, None, None, None, None),
('BillingAddress', None, None, None, None, None, None),
('BillingCity', None, None, None, None, None, None),
('BillingState', None, None, None, None, None, None),
('BillingCountry', None, None, None, None, None, None),
('BillingPostalCode', None, None, None, None, None, None),
('Total', None, None, None, None, None, None))
View all columns of a table¶
In [8]:
q = """
SELECT *
FROM Album
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[8]:
[(1, 'For Those About To Rock We Salute You', 1),
(2, 'Balls to the Wall', 2),
(3, 'Restless and Wild', 2),
(4, 'Let There Be Rock', 1),
(5, 'Big Ones', 3)]
Select specific columns¶
In [9]:
q = """
SELECT AlbumId, Title
FROM Album
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[9]:
[(1, 'For Those About To Rock We Salute You'),
(2, 'Balls to the Wall'),
(3, 'Restless and Wild'),
(4, 'Let There Be Rock'),
(5, 'Big Ones')]
Filter rows by some criteria¶
In [10]:
q = """
SELECT *
FROM Album
WHERE Title LIKE '%Rock%'
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[10]:
[(1, 'For Those About To Rock We Salute You', 1),
(4, 'Let There Be Rock', 1),
(59, 'Deep Purple In Rock', 58),
(108, 'Rock In Rio [CD1]', 90),
(109, 'Rock In Rio [CD2]', 90)]
In [11]:
q = """
SELECT *
FROM Album
WHERE length(Title) < 10
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[11]:
[(5, 'Big Ones', 3),
(7, 'Facelift', 5),
(28, 'Na Pista', 20),
(60, 'Fireball', 58),
(68, 'Outbreak', 79)]
Sorting¶
In [12]:
q = """
SELECT *
FROM Album
ORDER BY length(Title) DESC
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[12]:
[(308,
"Tchaikovsky: 1812 Festival Overture, Op.49, Capriccio Italien & Beethoven: Wellington's Victory",
243),
(335,
'J.S. Bach: Chaconne, Suite in E Minor, Partita in E Major & Prelude, Fugue and Allegro',
265),
(294,
"Great Performances - Barber's Adagio and Other Romantic Favorites for Strings",
228),
(213,
'Pure Cult: The Best Of The Cult (For Rockers, Ravers, Lovers & Sinners) [UK]',
139),
(257,
'20th Century Masters - The Millennium Collection: The Best of Scorpions',
179)]
Summarization¶
In [13]:
q = """
SELECT COUNT(*), AVG(Total), MIN(Total), MAX(Total), SUM(Total)
FROM Invoice ;
"""
c.execute(q)
c.fetchall()
Out[13]:
[(412, 5.651941747572825, 0.99, 25.86, 2328.600000000004)]
Grouping and summarization¶
In [14]:
q = """
SELECT BillingCountry, BillingCity, AVG(Total)
FROM Invoice
GROUP BY BillingCountry, BillingCity
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[14]:
[('Argentina', 'Buenos Aires', 5.3742857142857146),
('Australia', 'Sidney', 5.3742857142857146),
('Austria', 'Vienne', 6.088571428571428),
('Belgium', 'Brussels', 5.374285714285714),
('Brazil', 'Brasília', 5.3742857142857146)]
Summarization with conditions on summary¶
In [15]:
q = """
SELECT BillingCountry, BillingCity, AVG(Total)
FROM Invoice
GROUP BY BillingCountry, BillingCity
HAVING BillingCountry like 'C%' AND AVG(Total) > 6
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[15]:
[('Chile', 'Santiago', 6.659999999999999),
('Czech Republic', 'Prague', 6.445714285714287)]
Joining tables¶
In [16]:
c.execute("PRAGMA table_info(Artist)")
c.fetchall()
Out[16]:
[(0, 'ArtistId', 'INTEGER', 1, None, 1),
(1, 'Name', 'NVARCHAR(120)', 0, None, 0)]
In [17]:
c.execute("PRAGMA table_info(Album)")
c.fetchall()
Out[17]:
[(0, 'AlbumId', 'INTEGER', 1, None, 1),
(1, 'Title', 'NVARCHAR(160)', 1, None, 0),
(2, 'ArtistId', 'INTEGER', 1, None, 0)]
No join gives the Cartesian prodcut¶
In [18]:
q = """
SELECT Album.Title, Artist.Name
FROM Album, Artist
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[18]:
[('For Those About To Rock We Salute You', 'AC/DC'),
('For Those About To Rock We Salute You', 'Accept'),
('For Those About To Rock We Salute You', 'Aerosmith'),
('For Those About To Rock We Salute You', 'Alanis Morissette'),
('For Those About To Rock We Salute You', 'Alice In Chains')]
Inner Join¶
In [19]:
q = """
SELECT Album.Title, Artist.Name
FROM Album
INNER JOIN Artist ON Album.ArtistId = Artist.ArtistId
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[19]:
[('For Those About To Rock We Salute You', 'AC/DC'),
('Balls to the Wall', 'Accept'),
('Restless and Wild', 'Accept'),
('Let There Be Rock', 'AC/DC'),
('Big Ones', 'Aerosmith')]
Left Join¶
In [20]:
q = """
SELECT Album.Title, Artist.Name
FROM Album
LEFT JOIN Artist ON Album.ArtistId = Artist.ArtistId
LIMIT 5;
"""
c.execute(q)
c.fetchall()
Out[20]:
[('For Those About To Rock We Salute You', 'AC/DC'),
('Balls to the Wall', 'Accept'),
('Restless and Wild', 'Accept'),
('Let There Be Rock', 'AC/DC'),
('Big Ones', 'Aerosmith')]
Using sqlite3
and pandas
¶
In [21]:
q = """
SELECT BillingCountry, BillingCity, AVG(Total) AS Mean
FROM Invoice
GROUP BY BillingCountry, BillingCity
LIMIT 5;
"""
df = pd.read_sql(q, con)
df.head()
Out[21]:
BillingCountry | BillingCity | Mean | |
---|---|---|---|
0 | Argentina | Buenos Aires | 5.374286 |
1 | Australia | Sidney | 5.374286 |
2 | Austria | Vienne | 6.088571 |
3 | Belgium | Brussels | 5.374286 |
4 | Brazil | Brasília | 5.374286 |
Free resources when no longer needed¶
In [22]:
c.close()
con.close()
Creating database tables¶
Concepts introduced
- In memory database
- Primary and foreign keys
- Constraints
- Modeling many:many relationships with a linker table
In [23]:
con = sqlite3.connect(":memory:")
Creating a database¶
In [24]:
s1 = """
CREATE TABLE IF NOT EXISTS student (
student_id integer PRIMARY KEY,
first varchar NOT NULL,
last varchar NOT NULL,
email VARCHAR NOT NULL UNIQUE
);
"""
s2 = """
CREATE TABLE IF NOT EXISTS subject (
subject_id integer PRIMARY KEY,
code varchar NOT NULL,
name varchar NOT NULL
);
"""
s3 = """
CREATE TABLE IF NOT EXISTS grade (
student_id integer,
subject_id integer,
score float,
FOREIGN KEY (student_id) REFERENCES student (student_id),
FOREIGN KEY (subject_id) REFERENCES subject (subject_id),
CHECK (score BETWEEN 0 and 100)
);
"""
In [25]:
con.execute(s1)
con.execute(s2)
con.execute(s3)
Out[25]:
<sqlite3.Cursor at 0x10a5a8b20>
Populate the database¶
In [26]:
with con:
con.execute("insert into student(first, last, email) values (?, ?, ?)",
("John", "Smith", "john.smith@duke.edu"))
con.execute("insert into student(first, last, email) values (?, ?, ?)",
("Jane", "Smith", "jane.smith@duke.edu"))
con.execute("insert into student(first, last, email) values (?, ?, ?)",
("Joe", "Smith", "joe.smith@duke.edu"))
con.execute("insert into subject(code, name) values (?, ?)",
("BIOS 703", "Biology for unsuspecting statisticians"))
con.execute("insert into subject(code, name) values (?, ?)",
("BIOS 821", "Data science for mere mortals"))
con.execute("insert into grade(student_id, subject_id, score) values (?, ?, ?)",
(1, 1, np.random.randint(0, 101)))
con.execute("insert into grade(student_id, subject_id, score) values (?, ?, ?)",
(1, 2, np.random.randint(0, 101)))
con.execute("insert into grade(student_id, subject_id, score) values (?, ?, ?)",
(2, 1, np.random.randint(0, 101)))
con.execute("insert into grade(student_id, subject_id, score) values (?, ?, ?)",
(2, 2, np.random.randint(0, 101)))
con.execute("insert into grade(student_id, subject_id, score) values (?, ?, ?)",
(3, 1, np.random.randint(0, 101)))
con.execute("insert into grade(student_id, subject_id, score) values (?, ?, ?)",
(3, 2, np.random.randint(0, 101)))
Query the database¶
In [27]:
for row in con.execute("select * from student;"):
print(row)
(1, 'John', 'Smith', 'john.smith@duke.edu')
(2, 'Jane', 'Smith', 'jane.smith@duke.edu')
(3, 'Joe', 'Smith', 'joe.smith@duke.edu')
In [28]:
for row in con.execute("select * from subject;"):
print(row)
(1, 'BIOS 703', 'Biology for unsuspecting statisticians')
(2, 'BIOS 821', 'Data science for mere mortals')
In [29]:
for row in con.execute("select * from grade;"):
print(row)
(1, 1, 59.0)
(1, 2, 51.0)
(2, 1, 21.0)
(2, 2, 24.0)
(3, 1, 49.0)
(3, 2, 29.0)
Joins involving a linker table¶
In [30]:
q = """
SELECT student.first, student.last, student.email, subject.code, subject.name, grade.score
FROM student
JOIN grade ON grade.student_id = student.student_id
JOIN subject ON grade.subject_id = subject.subject_id;
"""
In [31]:
for row in con.execute(q):
print(row)
('John', 'Smith', 'john.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 59.0)
('John', 'Smith', 'john.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 51.0)
('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 21.0)
('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 24.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 49.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 29.0)
Modify the database¶
Joe showed me a cute cat picture. I’m going to give him 100 for all subjects.
In [32]:
for row in con.execute('SELECT student_id from student where first = "Joe"'):
print(row)
(3,)
In [33]:
s = """
UPDATE grade
SET score = 100
WHERE grade.student_id =
(
SELECT student_id from student where first = "Joe"
)
"""
In [34]:
con.execute(s)
Out[34]:
<sqlite3.Cursor at 0x10a5e77a0>
Check that update worked¶
In [35]:
q = """
SELECT student.first, student.last, student.email, subject.code, subject.name, grade.score
FROM student
JOIN grade ON grade.student_id = student.student_id
JOIN subject ON grade.subject_id = subject.subject_id
WHERE student.first = "Joe"
"""
In [36]:
for row in con.execute(q):
print(row)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 100.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 100.0)
Saving in-memory database to disk¶
In [37]:
with open('dump.sql', 'w') as f:
for line in con.iterdump():
f.write('%s\n' % line)
In [38]:
! cat dump.sql
BEGIN TRANSACTION;
CREATE TABLE grade (
student_id integer,
subject_id integer,
score float,
FOREIGN KEY (student_id) REFERENCES student (student_id),
FOREIGN KEY (subject_id) REFERENCES subject (subject_id),
CHECK (score BETWEEN 0 and 100)
);
INSERT INTO "grade" VALUES(1,1,59.0);
INSERT INTO "grade" VALUES(1,2,51.0);
INSERT INTO "grade" VALUES(2,1,21.0);
INSERT INTO "grade" VALUES(2,2,24.0);
INSERT INTO "grade" VALUES(3,1,100.0);
INSERT INTO "grade" VALUES(3,2,100.0);
CREATE TABLE student (
student_id integer PRIMARY KEY,
first varchar NOT NULL,
last varchar NOT NULL,
email VARCHAR NOT NULL UNIQUE
);
INSERT INTO "student" VALUES(1,'John','Smith','john.smith@duke.edu');
INSERT INTO "student" VALUES(2,'Jane','Smith','jane.smith@duke.edu');
INSERT INTO "student" VALUES(3,'Joe','Smith','joe.smith@duke.edu');
CREATE TABLE subject (
subject_id integer PRIMARY KEY,
code varchar NOT NULL,
name varchar NOT NULL
);
INSERT INTO "subject" VALUES(1,'BIOS 703','Biology for unsuspecting statisticians');
INSERT INTO "subject" VALUES(2,'BIOS 821','Data science for mere mortals');
COMMIT;
Convert to SQLite3 database¶
In [39]:
! cat dump.sql | sqlite3 college.db
Error: near line 2: table grade already exists
Error: near line 16: table student already exists
Error: near line 22: UNIQUE constraint failed: student.student_id
Error: near line 23: UNIQUE constraint failed: student.student_id
Error: near line 24: UNIQUE constraint failed: student.student_id
Error: near line 25: table subject already exists
Error: near line 30: UNIQUE constraint failed: subject.subject_id
Error: near line 31: UNIQUE constraint failed: subject.subject_id
In [40]:
con1 = sqlite3.connect('college.db')
In [41]:
q = """
SELECT student.first, student.last, student.email, subject.code, subject.name, grade.score
FROM student
JOIN grade ON grade.student_id = student.student_id
JOIN subject ON grade.subject_id = subject.subject_id;
"""
In [42]:
for row in con1.execute(q):
print(row)
('John', 'Smith', 'john.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 29.0)
('John', 'Smith', 'john.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 20.0)
('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 49.0)
('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 96.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 100.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 100.0)
('John', 'Smith', 'john.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 59.0)
('John', 'Smith', 'john.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 51.0)
('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 21.0)
('Jane', 'Smith', 'jane.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 24.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 703', 'Biology for unsuspecting statisticians', 100.0)
('Joe', 'Smith', 'joe.smith@duke.edu', 'BIOS 821', 'Data science for mere mortals', 100.0)
Close connections¶
In [43]:
con.close()
In [44]:
con1.close()
Using SQL magic functions¶
We will use the ipython-sql notebook extension for convenience. This will only work in notebooks and IPython scripts with the .ipy extension.
In [1]:
import warnings
with warnings.catch_warnings():
warnings.simplefilter('ignore')
%load_ext sql
Configuring the SqlMagic extension¶
In [2]:
%config SqlMagic
SqlMagic options
--------------
SqlMagic.autolimit=<Int>
Current: 0
Automatically limit the size of the returned result sets
SqlMagic.autopandas=<Bool>
Current: False
Return Pandas DataFrames instead of regular result sets
SqlMagic.column_local_vars=<Bool>
Current: False
Return data into local variables from column names
SqlMagic.displaylimit=<Int>
Current: 0
Automatically limit the number of rows displayed (full result set is still
stored)
SqlMagic.dsn_filename=<Unicode>
Current: 'odbc.ini'
Path to DSN file. When the first argument is of the form [section], a
sqlalchemy connection string is formed from the matching section in the DSN
file.
SqlMagic.feedback=<Bool>
Current: True
Print number of rows affected by DML
SqlMagic.short_errors=<Bool>
Current: True
Don't display the full traceback on SQL Programming Error
SqlMagic.style=<Unicode>
Current: 'DEFAULT'
Set the table printing style to any of prettytable's defined styles
(currently DEFAULT, MSWORD_FRIENDLY, PLAIN_COLUMNS, RANDOM)
In [3]:
%config SqlMagic.displaylimit=10
Connect to SQLite3 database¶
See SQLAlchemy connection strings for how to connect to other databases such as Oracle, MySQL or PostgreSQL.
In [6]:
%sql sqlite:///Chinook_Sqlite.sqlite
Out[6]:
'Connected: None@Chinook_Sqlite.sqlite'
In [7]:
%sql SELECT * from Album LIMIT 5;
Done.
Out[7]:
AlbumId | Title | ArtistId |
---|---|---|
1 | For Those About To Rock We Salute You | 1 |
2 | Balls to the Wall | 2 |
3 | Restless and Wild | 2 |
4 | Let There Be Rock | 1 |
5 | Big Ones | 3 |
In [8]:
%sql SELECT * from Artist LIMIT 5;
Done.
Out[8]:
ArtistId | Name |
---|---|
1 | AC/DC |
2 | Accept |
3 | Aerosmith |
4 | Alanis Morissette |
5 | Alice In Chains |
In [9]:
%%sql
SELECT Artist.Name, Album.Title
FROM Album
INNER JOIN Artist on Album.ArtistId = Artist.ArtistId
ORDER BY Artist.Name ASC
LIMIT 5;
Done.
Out[9]:
Name | Title |
---|---|
AC/DC | For Those About To Rock We Salute You |
AC/DC | Let There Be Rock |
Aaron Copland & London Symphony Orchestra | A Copland Celebration, Vol. I |
Aaron Goldberg | Worlds |
Academy of St. Martin in the Fields & Sir Neville Marriner | The World of Classical Favourites |
You can assign results of queries to Python names¶
In [10]:
result = %sql SELECT * from Album;
Done.
In [11]:
type(result)
Out[11]:
sql.run.ResultSet
In [12]:
result[2:4]
Out[12]:
[(3, 'Restless and Wild', 2), (4, 'Let There Be Rock', 1)]
You can use Python variables in your queries¶
Use :varname
where you want to use a Python variable in your query.
In [13]:
artist_id = 10
%sql select * from Artist where ArtistId < :artist_id;
Done.
Out[13]:
ArtistId | Name |
---|---|
1 | AC/DC |
2 | Accept |
3 | Aerosmith |
4 | Alanis Morissette |
5 | Alice In Chains |
6 | Antônio Carlos Jobim |
7 | Apocalyptica |
8 | Audioslave |
9 | BackBeat |
In [14]:
word = '%rock%'
%sql select * from Album WHERE Title LIKE :word;
Done.
Out[14]:
AlbumId | Title | ArtistId |
---|---|---|
1 | For Those About To Rock We Salute You | 1 |
4 | Let There Be Rock | 1 |
59 | Deep Purple In Rock | 58 |
108 | Rock In Rio [CD1] | 90 |
109 | Rock In Rio [CD2] | 90 |
213 | Pure Cult: The Best Of The Cult (For Rockers, Ravers, Lovers & Sinners) [UK] | 139 |
216 | Hot Rocks, 1964-1971 (Disc 1) | 142 |
Convert to pandas
dataframe¶
In [16]:
df = result.DataFrame()
df.head(5)
Out[16]:
AlbumId | Title | ArtistId | |
---|---|---|---|
0 | 1 | For Those About To Rock We Salute You | 1 |
1 | 2 | Balls to the Wall | 2 |
2 | 3 | Restless and Wild | 2 |
3 | 4 | Let There Be Rock | 1 |
4 | 5 | Big Ones | 3 |
In [ ]: