%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
%precision 4
import os, sys, glob

Working with structured data

Using SQLite3

Working example dataset

This dataset contains the survival time after receiving a heart transplant, the age of the patient, and whether or not the survival time was censored.

  • Number of Observations - 69
  • Number of Variables - 3

Variable name definitions:

  • death - days after surgery until death
  • age - age at the time of surgery
  • censored - indicates if an observation is censored; 1 is uncensored

(In the loaded DataFrame the corresponding columns are named survival, censors and age.)

import statsmodels.api as sm
heart = sm.datasets.heart.load_pandas().data
heart.take(np.random.choice(len(heart), 6))
survival censors age
66 110 0 23.7
24 1367 0 48.6
30 897 1 46.1
67 13 0 28.9
49 499 0 52.2
35 322 1 48.1
import sqlite3
conn = sqlite3.connect('heart.db')

Creating and populating a table

c = conn.cursor()

c.execute('''CREATE TABLE IF NOT EXISTS transplant
             (survival integer, censors integer, age real)''')

c.executemany("insert into transplant(survival, censors, age) values (?, ?, ?)", heart.values)
conn.commit()

SQL queries

SQL queries take the general form

select (distinct) ... from ...
where ...
group by ...
having ...
order by ...
limit ...

where everything apart from the select ... from ... is optional.
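
For example, several of the optional clauses can be combined in one query (a minimal sketch against the transplant table created above):

# average survival by censoring status for patients over 40,
# sorted by average survival, at most 2 rows returned
query = '''
select censors, count(*), avg(survival) from transplant
where age > 40
group by censors
order by avg(survival) desc
limit 2;
'''
for row in c.execute(query):
    print row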

Selecting all columns, first 5 rows

for row in c.execute('''select * from transplant limit 5;'''):
    print row
(15, 1, 54.3)
(3, 1, 40.4)
(624, 1, 51.0)
(46, 1, 42.5)
(127, 1, 48.0)

Using where to filter rows

# only find censored data for subjects < 40 years old
for row in c.execute('''
select * from transplant
where censors=0 and age < 40 limit 5;'''):
    print row
(1775, 0, 33.3)
(1106, 0, 36.8)
(875, 0, 38.9)
(815, 0, 32.7)
(592, 0, 26.7)

Using SQL functions

for row in c.execute('''select count(*), avg(age) from transplant where censors=0 and age < 40;'''):
    print row
(9, 31.43333333333333)
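
Other SQLite aggregates such as min, max and sum work the same way; a brief sketch:

for row in c.execute('''select min(age), max(age), sum(survival) from transplant;'''):
    print row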

Using group by to find the number of censored and uncensored subjects and their average age

query = '''
select censors, count(*), avg(age) from transplant
group by censors;
'''
for row in c.execute(query):
    print row
(0, 24, 41.729166666666664)
(1, 45, 48.484444444444456)

Using having to filter grouped results

query = '''
select censors, count(*), avg(age) from transplant
group by censors
having avg(age) < 45;
'''
for row in c.execute(query):
    print row
(0, 24, 41.729166666666664)

Using order by to sort results

query = '''
select * from transplant
where age < 40
order by age desc;
'''
for row in c.execute(query):
    print row
(875, 0, 38.9)
(1106, 0, 36.8)
(44, 1, 36.2)
(1, 0, 35.2)
(1775, 0, 33.3)
(815, 0, 32.7)
(12, 1, 29.2)
(13, 0, 28.9)
(592, 0, 26.7)
(167, 0, 26.7)
(110, 0, 23.7)
(228, 1, 19.7)

Reading into a numpy structured array

result = c.execute(query).fetchall()
arr = np.fromiter(result, dtype='i4,i4,f4')
arr.dtype.names = ['survival', 'censors', 'age']
print '\n'.join(map(str, arr))
(875, 0, 38.900001525878906)
(1106, 0, 36.79999923706055)
(44, 1, 36.20000076293945)
(1, 0, 35.20000076293945)
(1775, 0, 33.29999923706055)
(815, 0, 32.70000076293945)
(12, 1, 29.200000762939453)
(13, 0, 28.899999618530273)
(592, 0, 26.700000762939453)
(167, 0, 26.700000762939453)
(110, 0, 23.700000762939453)
(228, 1, 19.700000762939453)
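
The point of a structured array is that fields can be accessed by name rather than position; for example:

# each field behaves as a regular array of its own dtype
print arr['age']
print arr['survival'].mean()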

Reading into a numpy regular array

from itertools import chain
result = c.execute(query).fetchall()
arr = np.fromiter(chain.from_iterable(result), dtype=float)
print arr.reshape(-1,3)
[[  8.7500e+02   0.0000e+00   3.8900e+01]
 [  1.1060e+03   0.0000e+00   3.6800e+01]
 [  4.4000e+01   1.0000e+00   3.6200e+01]
 [  1.0000e+00   0.0000e+00   3.5200e+01]
 [  1.7750e+03   0.0000e+00   3.3300e+01]
 [  8.1500e+02   0.0000e+00   3.2700e+01]
 [  1.2000e+01   1.0000e+00   2.9200e+01]
 [  1.3000e+01   0.0000e+00   2.8900e+01]
 [  5.9200e+02   0.0000e+00   2.6700e+01]
 [  1.6700e+02   0.0000e+00   2.6700e+01]
 [  1.1000e+02   0.0000e+00   2.3700e+01]
 [  2.2800e+02   1.0000e+00   1.9700e+01]]
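
Since fetchall already returns a list of equal-length tuples, np.array offers a simpler alternative when the result comfortably fits in memory:

# np.array converts the list of row tuples directly into a 2D float array
arr2 = np.array(result, dtype=float)
print arr2.shape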

Working with multiple tables in SQL

We will construct a new database with two tables to illustrate the concept of joins.

conn1 = sqlite3.connect('samples.db')
c1 = conn1.cursor()

c1.execute(
'''
CREATE TABLE IF NOT EXISTS t1(
  ID TEXT,
  Name TEXT,
  Value Real);
''')

c1.execute('''
CREATE TABLE IF NOT EXISTS t2(
  ID TEXT,
  Name TEXT,
  Value Real,
  Age INTEGER);
''');

from string import ascii_lowercase
for i in range(5):
    # use parameterized queries rather than string interpolation
    c1.execute("insert into t1(ID, Name, Value) values (?, ?, ?)",
               (str(i), ascii_lowercase[i], float(i*i)))
    c1.execute("insert into t2(ID, Name, Value, Age) values (?, ?, ?, ?)",
               (str(i*2), ascii_lowercase[i*2], float(i*i + 5), 10*i))
conn1.commit()

Cartesian product

# without specifying a join condition, the result is the Cartesian product:
# all possible combinations of rows
query = '''
select t1.ID, t2.ID from t1, t2;
'''
for row in c1.execute(query):
    print row
(u'0', u'0')
(u'0', u'2')
(u'0', u'4')
(u'0', u'6')
(u'0', u'8')
(u'1', u'0')
(u'1', u'2')
(u'1', u'4')
(u'1', u'6')
(u'1', u'8')
(u'2', u'0')
(u'2', u'2')
(u'2', u'4')
(u'2', u'6')
(u'2', u'8')
(u'3', u'0')
(u'3', u'2')
(u'3', u'4')
(u'3', u'6')
(u'3', u'8')
(u'4', u'0')
(u'4', u'2')
(u'4', u'4')
(u'4', u'6')
(u'4', u'8')

Inner joins

# Inner join (intersection)
query = '''
select t1.ID, t2.ID, t1.value, t2.value, t1.value * t2.value from t1, t2
where t1.ID = t2.ID;
'''
for row in c1.execute(query):
    print row
(u'0', u'0', 0.0, 5.0, 0.0)
(u'2', u'2', 4.0, 6.0, 24.0)
(u'4', u'4', 16.0, 9.0, 144.0)
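
The implicit join above (comma-separated tables with a where condition) can equivalently be written with explicit inner join syntax:

query = '''
select t1.ID, t2.ID, t1.value, t2.value, t1.value * t2.value
from t1 inner join t2 on t1.ID = t2.ID;
'''
for row in c1.execute(query):
    print row
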
# left join keeps all values from the left table (t2)
# and values from the right (t1) where there is a match
query = '''
select t1.id, t2.ID, t1.value, t2.value from t2 left join t1 on t1.ID = t2.ID
'''
for row in c1.execute(query):
    print row
(u'0', u'0', 0.0, 5.0)
(u'2', u'2', 4.0, 6.0)
(u'4', u'4', 16.0, 9.0)
(None, u'6', None, 14.0)
(None, u'8', None, 21.0)
# the same join, but with the left and right tables switched
query = '''
select t1.ID, t2.ID, t1.value, t2.value from t1 left join t2 on t1.ID = t2.ID
'''
for row in c1.execute(query):
    print row
(u'0', u'0', 0.0, 5.0)
(u'1', None, 1.0, None)
(u'2', u'2', 4.0, 6.0)
(u'3', None, 9.0, None)
(u'4', u'4', 16.0, 9.0)

Self-joins

# we can join a table to itself by using aliases
# let's add a few more rows to t1 with the same ID and name but different values

for i in range(5):
    c1.execute("insert into t1(ID, Name, Value) values (?, ?, ?)",
               (str(i), ascii_lowercase[i], float(i*i*i)))
conn1.commit()

for row in c1.execute('select * from t1;'):
    print row
(u'0', u'a', 0.0)
(u'1', u'b', 1.0)
(u'2', u'c', 4.0)
(u'3', u'd', 9.0)
(u'4', u'e', 16.0)
(u'0', u'a', 0.0)
(u'1', u'b', 1.0)
(u'2', u'c', 8.0)
(u'3', u'd', 27.0)
(u'4', u'e', 64.0)
# Now use a self-join to find paired values for the same ID and name

query = '''
select t1a.ID, t1a.Name, t1a.value, t1b.value from t1 as t1a, t1 as t1b
where t1a.Name = t1b.Name and t1a.Value < t1b.Value
order by t1a.ID ASC;
'''
for row in c1.execute(query):
    print row
(u'2', u'c', 4.0, 8.0)
(u'3', u'd', 9.0, 27.0)
(u'4', u'e', 16.0, 64.0)

Basic concepts of database normalization

In which we convert a dataframe into a normalized database.

import pandas as pd
names = ['ann', 'bob', 'ann', 'bob', 'carl', 'delia', 'ann']
tests = ['wbc', 'wbc', 'rbc', 'rbc', 'wbc', 'rbc', 'platelets']
values1 = [10, 11.2, 300, 204, 9.8, 340, 125]
values2 = [10.6, 13.2, 322, 214, 10.3, 343, 145]
df = pd.DataFrame([names, tests, values1, values2]).T
df.columns = ['names', 'tests', 'values1', 'values2']
df
names tests values1 values2
0 ann wbc 10 10.6
1 bob wbc 11.2 13.2
2 ann rbc 300 322
3 bob rbc 204 214
4 carl wbc 9.8 10.3
5 delia rbc 340 343
6 ann platelets 125 145
# names are put into their own table so there is no duplication

name_table = pd.DataFrame(df['names'].unique(), columns=['name'])
name_table['name_id'] = name_table.index
columns = ['name_id', 'name']
name_table[columns]
name_id name
0 0 ann
1 1 bob
2 2 carl
3 3 delia
# tests are put into their own table so there is no duplication

test_table = pd.DataFrame(df['tests'].unique(), columns=['test'])
test_table['test_id'] = test_table.index
columns = ['test_id', 'test']
test_table[columns]
test_id test
0 0 wbc
1 1 rbc
2 2 platelets
# values1 and values2 correspond to visits 1 and 2, so
# we create a visits table

visit_table = pd.DataFrame([1,2], columns=['visit'])
visit_table['visit_id'] = visit_table.index
columns = ['visit_id', 'visit']
visit_table[columns]
visit_id visit
0 0 1
1 1 2
# finally, we link each value to a triple (name_id, test_id, visit_id)

value_table = pd.DataFrame([
    [0,0,0,10], [1,0,0,11.2], [0,1,0,300], [1,1,0,204], [2,0,0,9.8], [3,1,0,340], [0,2,0,125],
   [0,0,1,10.6], [1,0,1,13.2], [0,1,1,322], [1,1,1,214], [2,0,1,10.3], [3,1,1,343], [0,2,1,145]
], columns=['name_id', 'test_id', 'visit_id', 'value'])
value_table
name_id test_id visit_id value
0 0 0 0 10.0
1 1 0 0 11.2
2 0 1 0 300.0
3 1 1 0 204.0
4 2 0 0 9.8
5 3 1 0 340.0
6 0 2 0 125.0
7 0 0 1 10.6
8 1 0 1 13.2
9 0 1 1 322.0
10 1 1 1 214.0
11 2 0 1 10.3
12 3 1 1 343.0
13 0 2 1 145.0

At the end of the normalization, we have gone from 1 dataframe with multiple redundancies to 4 tables with unique entries in each row. This organization helps maintain data integrity and is necessary for efficiency as the number of test values grows, possibly into millions of rows. As we have seen, we can use SQL queries (or pandas merges) to recreate the original data format if that is more convenient for analysis.
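
As a concrete illustration, here is a minimal sketch that reassembles the original wide format from the four tables with pandas merges (the equivalent SQL query would join the value table to its three lookup tables):

# join the value table back to its lookup tables, then pivot visits into columns
recreated = (value_table
             .merge(name_table, on='name_id')
             .merge(test_table, on='test_id')
             .merge(visit_table, on='visit_id'))
print recreated.pivot_table(index=['name', 'test'], columns='visit', values='value')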

Using HDF5

When your data consists of many numerical arrays and matrices, each of which is relatively independent, relational databases offer little benefit, and it is more efficient to use HDF5 (Hierarchical Data Format) for storage. For example, your data may come from a simulation that generates a 3D matrix and a list of count data at every iteration.

import h5py

f = h5py.File('simulation.h5', 'w') # open in write mode, truncating any existing file
for i in range(10): # iterations in simulation
    xs = np.random.random((100,100,100))
    ys = np.random.randint(0, 100, (i+1)*10)
    group = f.create_group('Iteration%03d' % i) # one group per iteration
    group.create_dataset('xs', data=xs)
    group.create_dataset('ys', data=ys)
f.keys()
[u'Iteration000',
 u'Iteration001',
 u'Iteration002',
 u'Iteration003',
 u'Iteration004',
 u'Iteration005',
 u'Iteration006',
 u'Iteration007',
 u'Iteration008',
 u'Iteration009']
f['Iteration008'].keys()
[u'xs', u'ys']
g8 = f['Iteration008']
print g8['xs'][2:5,2:5,2:5]
print g8['ys'][-10:]
[[[ 0.0367  0.2883  0.5562]
  [ 0.9494  0.5614  0.1159]
  [ 0.8887  0.7396  0.891 ]]

 [[ 0.7552  0.1539  0.216 ]
  [ 0.6671  0.4682  0.9107]
  [ 0.5565  0.5443  0.1665]]

 [[ 0.3972  0.1205  0.9487]
  [ 0.7874  0.3466  0.2818]
  [ 0.1248  0.0161  0.6898]]]
[37 69  5 15 10 44 20 73 74 24]

Interfacing with pandas

import pandas as pd
df = pd.read_sql('select * from transplant;', conn)
df.take(np.random.randint(0, len(df), 6))
survival censors age
8 23 1 56.9
38 815 0 32.7
12 730 1 58.4
58 339 0 54.4
53 439 0 52.9
27 994 1 48.6
df1 = pd.read_sql('select t1.name, t2.value, t2.age from t1, t2 where t1.name = t2.name;', conn1)
df1
Name Value Age
0 a 5 0
1 c 6 10
2 e 9 20
3 a 5 0
4 c 6 10
5 e 9 20
c.close()
c1.close()
conn.close()
conn1.close()
store = pd.HDFStore('dump.h5')
store['transplant'] = df
store['tables'] = df1
store.close()
/Users/cliburn/anaconda/lib/python2.7/site-packages/pandas/io/pytables.py:2453: PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block2_values] [items->['Name']]

  warnings.warn(ws, PerformanceWarning)
transplant_df = pd.read_hdf('dump.h5', 'transplant')
transplant_df.take(np.random.randint(0, len(df), 6))
survival censors age
50 305 0 49.3
3 46 1 42.5
0 15 1 54.3
22 1 1 41.5
47 63 1 56.4
19 1549 0 40.6
table_df = pd.read_hdf('dump.h5', 'tables')
table_df
Name Value Age
0 a 5 0
1 c 6 10
2 e 9 20
3 a 5 0
4 c 6 10
5 e 9 20
store
<class 'pandas.io.pytables.HDFStore'>
File path: dump.h5
File is CLOSED
store = pd.HDFStore('dump.h5')
store.keys()
['/tables', '/transplant']
store.close()
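
As a final aside, a sketch assuming a pandas version that supports the table format: storing a frame with format='table' and data_columns=True makes it queryable on disk via a where expression.

store = pd.HDFStore('dump.h5')
store.put('transplant_table', df, format='table', data_columns=True) # queryable format
store.close()
print pd.read_hdf('dump.h5', 'transplant_table', where='age > 50') # filter on disk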