%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
%precision 4
import os, sys, glob
This data contains the survival time after receiving a heart transplant, the age of the patient and whether or not the survival time was censored
Variable name definitions:: * death - Days after surgery until death * age - age at the time of surgery * censored - indicates if an observation is censored. 1 is uncensored
import statsmodels.api as sm
heart = sm.datasets.heart.load_pandas().data
heart.take(np.random.choice(len(heart), 6))
survival | censors | age | |
---|---|---|---|
66 | 110 | 0 | 23.7 |
24 | 1367 | 0 | 48.6 |
30 | 897 | 1 | 46.1 |
67 | 13 | 0 | 28.9 |
49 | 499 | 0 | 52.2 |
35 | 322 | 1 | 48.1 |
import sqlite3
conn = sqlite3.connect('heart.db')
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS transplant
(survival integer, censors integer, age real)''')
c.executemany("insert into transplant(survival, censors, age) values (?, ?, ?)", heart.values);
SQL Queries take the form
select (distinct) ... from ... (limit ...)
where ...
groupby ..
order by ...
where most of the query apart from the select ... from ...
are
optional.
for row in c.execute('''select * from transplant limit 5;'''):
print row
(15, 1, 54.3)
(3, 1, 40.4)
(624, 1, 51.0)
(46, 1, 42.5)
(127, 1, 48.0)
# only find censored data for subjects < 40 years old
for row in c.execute('''
select * from transplant
where censors=0 and age < 40 limit 5;'''):
print row
(1775, 0, 33.3)
(1106, 0, 36.8)
(875, 0, 38.9)
(815, 0, 32.7)
(592, 0, 26.7)
for row in c.execute('''select count(*), avg(age) from transplant where censors=0 and age < 40;'''):
print row
(9, 31.43333333333333)
query = '''
select censors, count(*), avg(age) from transplant
group by censors;
'''
for row in c.execute(query):
print row
(0, 24, 41.729166666666664)
(1, 45, 48.484444444444456)
query = '''
select censors, count(*), avg(age) from transplant
group by censors
having avg(age) < 45;
'''
for row in c.execute(query):
print row
(0, 24, 41.729166666666664)
query = '''
select * from transplant
where age < 40
order by age desc;
'''
for row in c.execute(query):
print row
(875, 0, 38.9)
(1106, 0, 36.8)
(44, 1, 36.2)
(1, 0, 35.2)
(1775, 0, 33.3)
(815, 0, 32.7)
(12, 1, 29.2)
(13, 0, 28.9)
(592, 0, 26.7)
(167, 0, 26.7)
(110, 0, 23.7)
(228, 1, 19.7)
result = c.execute(query).fetchall()
arr = np.fromiter(result, dtype='i4,i4,f4')
arr.dtype.names = ['survival', 'censors', 'age']
print '\n'.join(map(str, arr))
(875, 0, 38.900001525878906)
(1106, 0, 36.79999923706055)
(44, 1, 36.20000076293945)
(1, 0, 35.20000076293945)
(1775, 0, 33.29999923706055)
(815, 0, 32.70000076293945)
(12, 1, 29.200000762939453)
(13, 0, 28.899999618530273)
(592, 0, 26.700000762939453)
(167, 0, 26.700000762939453)
(110, 0, 23.700000762939453)
(228, 1, 19.700000762939453)
from itertools import chain
result = c.execute(query).fetchall()
arr = np.fromiter(chain.from_iterable(result), dtype=np.float)
print arr.reshape(-1,3)
[[ 8.7500e+02 0.0000e+00 3.8900e+01]
[ 1.1060e+03 0.0000e+00 3.6800e+01]
[ 4.4000e+01 1.0000e+00 3.6200e+01]
[ 1.0000e+00 0.0000e+00 3.5200e+01]
[ 1.7750e+03 0.0000e+00 3.3300e+01]
[ 8.1500e+02 0.0000e+00 3.2700e+01]
[ 1.2000e+01 1.0000e+00 2.9200e+01]
[ 1.3000e+01 0.0000e+00 2.8900e+01]
[ 5.9200e+02 0.0000e+00 2.6700e+01]
[ 1.6700e+02 0.0000e+00 2.6700e+01]
[ 1.1000e+02 0.0000e+00 2.3700e+01]
[ 2.2800e+02 1.0000e+00 1.9700e+01]]
We will consturct a new database with 2 tables to illustrate the concept of joins.
conn1 = sqlite3.connect('samples.db')
c1 = conn1.cursor()
c1.execute(
'''
CREATE TABLE IF NOT EXISTS t1(
ID TEXT,
Name TEXT,
Value Real);
''')
c1.execute('''
CREATE TABLE IF NOT EXISTS t2(
ID TEXT,
Name TEXT,
Value Real,
Age INTEGER);
''');
from string import ascii_lowercase
for i in range(5):
c1.execute('''insert into t1(ID, Name, Value) values (%d, '%s', %.2f)''' % (i, ascii_lowercase[i], i*i));
c1.execute('''insert into t2(ID, Name, Value, Age) values (%d, '%s', %.2f, %d)''' % (i*2, ascii_lowercase[i*2], i*i+5, 10*i));
# Without specifiying a join, the result is all possible combinations
query = '''
select t1.ID, t2.ID from t1, t2;
'''
for row in c1.execute(query):
print row
(u'0', u'0')
(u'0', u'2')
(u'0', u'4')
(u'0', u'6')
(u'0', u'8')
(u'1', u'0')
(u'1', u'2')
(u'1', u'4')
(u'1', u'6')
(u'1', u'8')
(u'2', u'0')
(u'2', u'2')
(u'2', u'4')
(u'2', u'6')
(u'2', u'8')
(u'3', u'0')
(u'3', u'2')
(u'3', u'4')
(u'3', u'6')
(u'3', u'8')
(u'4', u'0')
(u'4', u'2')
(u'4', u'4')
(u'4', u'6')
(u'4', u'8')
# Inner join (intersection)
query = '''
select t1.ID, t2.ID, t1.value, t2.value, t1.value * t2.value from t1, t2
where t1.ID = t2.ID;
'''
for row in c1.execute(query):
print row
(u'0', u'0', 0.0, 5.0, 0.0)
(u'2', u'2', 4.0, 6.0, 24.0)
(u'4', u'4', 16.0, 9.0, 144.0)
# left join keeps all values from the left table (t2)
# and values from the right (t1) where there is a match
query = '''
select t1.id, t2.ID, t1.value, t2.value from t2 left join t1 on t1.ID = t2.ID
'''
for row in c1.execute(query):
print row
(u'0', u'0', 0.0, 5.0)
(u'2', u'2', 4.0, 6.0)
(u'4', u'4', 16.0, 9.0)
(None, u'6', None, 14.0)
(None, u'8', None, 21.0)
# same join but we swtich left and right tables
query = '''
select t1.ID, t2.ID, t1.value, t2.value from t1 left join t2 on t1.ID = t2.ID
'''
for row in c1.execute(query):
print row
(u'0', u'0', 0.0, 5.0)
(u'1', None, 1.0, None)
(u'2', u'2', 4.0, 6.0)
(u'3', None, 9.0, None)
(u'4', u'4', 16.0, 9.0)
# we can join a table to itself by using aliases
# lets add a few more rows to t1 which may have the same id and name but different values
for i in range(5):
c1.execute('''insert into t1(ID, Name, Value) values (%d, '%s', %.2f)''' % (i, ascii_lowercase[i], i*i*i));
for row in c1.execute('select * from t1;'):
print row
(u'0', u'a', 0.0)
(u'1', u'b', 1.0)
(u'2', u'c', 4.0)
(u'3', u'd', 9.0)
(u'4', u'e', 16.0)
(u'0', u'a', 0.0)
(u'1', u'b', 1.0)
(u'2', u'c', 8.0)
(u'3', u'd', 27.0)
(u'4', u'e', 64.0)
# Now use a self-join to find paired values for the same ID and name
query = '''
select t1a.ID, t1a.Name, t1a.value, t1b.value from t1 as t1a, t1 as t1b
where t1a.Name = t1b.Name and t1a.Value < t1b.Value
order by t1a.ID ASC;
'''
for row in c1.execute(query):
print row
(u'2', u'c', 4.0, 8.0)
(u'3', u'd', 9.0, 27.0)
(u'4', u'e', 16.0, 64.0)
In which we convert a dataframe into a normalized database.
names = ['ann', 'bob', 'ann', 'bob', 'carl', 'delia', 'ann']
tests = ['wbc', 'wbc', 'rbc', 'rbc', 'wbc', 'rbc', 'platelets']
values1 = [10, 11.2, 300, 204, 9.8, 340, 125]
values2 = [10.6, 13.2, 322, 214, 10.3, 343, 145]
df = pd.DataFrame([names, tests, values1, values2]).T
df.columns = ['names', 'tests', 'values1', 'values2']
df
names | tests | values1 | values2 | |
---|---|---|---|---|
0 | ann | wbc | 10 | 10.6 |
1 | bob | wbc | 11.2 | 13.2 |
2 | ann | rbc | 300 | 322 |
3 | bob | rbc | 204 | 214 |
4 | carl | wbc | 9.8 | 10.3 |
5 | delia | rbc | 340 | 343 |
6 | ann | platelets | 125 | 145 |
# names are put into their own table so there is no dubplication
name_table = pd.DataFrame(df['names'].unique(), columns=['name'])
name_table['name_id'] = name_table.index
columns = ['name_id', 'name']
name_table[columns]
name_id | name | |
---|---|---|
0 | 0 | ann |
1 | 1 | bob |
2 | 2 | carl |
3 | 3 | delia |
# tests are put inot their own table so there is no duplication
test_table = pd.DataFrame(df['tests'].unique(), columns=['test'])
test_table['test_id'] = test_table.index
columns = ['test_id', 'test']
test_table[columns]
test_id | test | |
---|---|---|
0 | 0 | wbc |
1 | 1 | rbc |
2 | 2 | platelets |
# the values1 and values2 correspond to visit 1 and 2, so
# we create a visits table
visit_table = pd.DataFrame([1,2], columns=['visit'])
visit_table['visit_id'] = visit_table.index
columns = ['visit_id', 'visit']
visit_table[columns]
visit_id | visit | |
---|---|---|
0 | 0 | 1 |
1 | 1 | 2 |
# finally, we link each value to a triple(name_id, test_id, visit_id)
value_table = pd.DataFrame([
[0,0,0,10], [1,0,0,11.2], [0,1,0,300], [1,1,0,204], [2,0,0,9.8], [3,1,0,340], [0,2,0,125],
[0,0,1,10.6], [1,0,1,13.2], [0,1,1,322], [1,1,1,214], [2,0,1,10.3], [3,1,1,343], [0,2,1,145]
], columns=['name_id', 'test_id', 'visit_id', 'value'])
value_table
name_id | test_id | visit_id | value | |
---|---|---|---|---|
0 | 0 | 0 | 0 | 10.0 |
1 | 1 | 0 | 0 | 11.2 |
2 | 0 | 1 | 0 | 300.0 |
3 | 1 | 1 | 0 | 204.0 |
4 | 2 | 0 | 0 | 9.8 |
5 | 3 | 1 | 0 | 340.0 |
6 | 0 | 2 | 0 | 125.0 |
7 | 0 | 0 | 1 | 10.6 |
8 | 1 | 0 | 1 | 13.2 |
9 | 0 | 1 | 1 | 322.0 |
10 | 1 | 1 | 1 | 214.0 |
11 | 2 | 0 | 1 | 10.3 |
12 | 3 | 1 | 1 | 343.0 |
13 | 0 | 2 | 1 | 145.0 |
At the end of the normalizaiton, we have gone from 1 dataframe with multiple redundancies to 4 tables with unique entries in each row. This organization helps maintain data integrity and is necesssary for effficeincy as the number of test values grows, possibly into millions of rows. As we have seen, we can use SQL queries to recreate the origianl dataformat if that is more convenient for analysis.
When your data consists of many numerical and matrices, each of which is relatively independent, relational databases offer little benefit, and it is more efficient to use HDF5 (Hierarchical Data Format) for storage. For example, your data may come from a simulation which generates a 3D matrix and a list of count data at every iteration.
import h5py
f = h5py.File('simulation.h5')
for i in range(10): # iterations in simulation
xs = np.random.random((100,100,100))
ys = np.random.randint(0,100,(i+1)*10)
group = f.create_group('Iteration%03d' % i)
group.create_dataset('xs', data=xs)
group.create_dataset('ys', data=ys)
f.keys()
[u'Iteration000',
u'Iteration001',
u'Iteration002',
u'Iteration003',
u'Iteration004',
u'Iteration005',
u'Iteration006',
u'Iteration007',
u'Iteration008',
u'Iteration009']
f['Iteration008'].keys()
[u'xs', u'ys']
g8 = f['Iteration008']
print g8['xs'][2:5,2:5,2:5]
print g8['ys'][-10:]
[[[ 0.0367 0.2883 0.5562]
[ 0.9494 0.5614 0.1159]
[ 0.8887 0.7396 0.891 ]]
[[ 0.7552 0.1539 0.216 ]
[ 0.6671 0.4682 0.9107]
[ 0.5565 0.5443 0.1665]]
[[ 0.3972 0.1205 0.9487]
[ 0.7874 0.3466 0.2818]
[ 0.1248 0.0161 0.6898]]]
[37 69 5 15 10 44 20 73 74 24]
import pandas as pd
df = pd.read_sql('select * from transplant;', conn)
df.take(np.random.randint(0, len(df), 6))
survival | censors | age | |
---|---|---|---|
8 | 23 | 1 | 56.9 |
38 | 815 | 0 | 32.7 |
12 | 730 | 1 | 58.4 |
58 | 339 | 0 | 54.4 |
53 | 439 | 0 | 52.9 |
27 | 994 | 1 | 48.6 |
df1 = pd.read_sql('select t1.name, t2.value, t2.age from t1, t2 where t1.name = t2.name;', conn1)
df1
Name | Value | Age | |
---|---|---|---|
0 | a | 5 | 0 |
1 | c | 6 | 10 |
2 | e | 9 | 20 |
3 | a | 5 | 0 |
4 | c | 6 | 10 |
5 | e | 9 | 20 |
c.close()
c1.close()
conn.close()
conn1.close()
store = pd.HDFStore('dump.h5')
store['transplant'] = df
store['tables'] = df1
store.close()
/Users/cliburn/anaconda/lib/python2.7/site-packages/pandas/io/pytables.py:2453: PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->block2_values] [items->['Name']]
warnings.warn(ws, PerformanceWarning)
transplant_df = pd.read_hdf('dump.h5', 'transplant')
transplant_df.take(np.random.randint(0, len(df), 6))
survival | censors | age | |
---|---|---|---|
50 | 305 | 0 | 49.3 |
3 | 46 | 1 | 42.5 |
0 | 15 | 1 | 54.3 |
22 | 1 | 1 | 41.5 |
47 | 63 | 1 | 56.4 |
19 | 1549 | 0 | 40.6 |
table_df = pd.read_hdf('dump.h5', 'tables')
table_df
Name | Value | Age | |
---|---|---|---|
0 | a | 5 | 0 |
1 | c | 6 | 10 |
2 | e | 9 | 20 |
3 | a | 5 | 0 |
4 | c | 6 | 10 |
5 | e | 9 | 20 |
store
<class 'pandas.io.pytables.HDFStore'>
File path: dump.h5
File is CLOSED
store = pd.HDFStore('dump.h5')
store.keys()
['/tables', '/transplant']
store.close()