SQL Queries 02

[1]:
import pandas as pd
[2]:
import numpy as np
[3]:
%load_ext sql
[4]:
%sql sqlite://

Create toy data set

[5]:
from faker import Faker
[6]:
fake = Faker()
[7]:
# Pool of unique fake names. The set de-duplicates; `sorted` pins the order so
# that the seeded np.random.choice calls below are reproducible across runs
# (set iteration order depends on string hashing, which varies per process).
names = sorted({fake.name() for _ in range(100)})
[8]:
# Seed NumPy's global RNG so the sampled rosters and grades repeat across runs.
# NOTE(review): full reproducibility also requires a stable order for `names`;
# it is built from a set above, whose order can vary between processes.
np.random.seed(123)

# Roster for BIOS821: 30 distinct students with grades in [50, 100].
n1 = 30
bios821 = pd.DataFrame(dict(
    name=np.random.choice(names, n1, replace=False),
    grade=np.random.randint(50, 101, n1)))

# Roster for BIOS823: 30 distinct students sampled independently, so the two
# classes overlap only partially.
n2 = 30
bios823 = pd.DataFrame(dict(
    name=np.random.choice(names, n2, replace=False),
    grade=np.random.randint(50, 101, n2)))
[9]:
%%sql

-- Remove any stale copies before re-persisting the DataFrames.
DROP TABLE IF EXISTS bios821;
DROP TABLE IF EXISTS bios823;
 * sqlite://
Done.
Done.
[9]:
[]
[10]:
%sql -p bios821
 * sqlite://
[10]:
'Persisted bios821'
[11]:
%sql -p bios823
 * sqlite://
[11]:
'Persisted bios823'
[12]:
%sql SELECT * FROM bios821 LIMIT 3
 * sqlite://
Done.
[12]:
index name grade
0 Mrs. Michele Ingram 65
1 Mark Shaffer 90
2 Alexandra James 75
[13]:
%sql SELECT * FROM bios823 LIMIT 3
 * sqlite://
Done.
[13]:
index name grade
0 Loretta Smith 59
1 Allison Kirby 90
2 Robert Wong 91

Subqueries

There are 3 ways to use a sub-query:

  • with in

  • with exists

  • with a comparison operator

1. What students take both bios821 and bios823?

Using set operations.

[14]:
%%sql

SELECT name FROM bios821
INTERSECT
SELECT name FROM bios823
 * sqlite://
Done.
[14]:
name
Alexandra James
Cindy Clark
Dr. Scott Mendoza MD
Jennifer Hunt
Kathy Cochran
Nancy Castaneda
Robert Wong
Sharon Porter

Using EQUIJOIN.

[15]:
%%sql

-- Students in both courses via an implicit comma (cross) join filtered in WHERE.
-- NOTE(review): ANSI style would be an explicit INNER JOIN ... ON; kept this way
-- to illustrate the classic EQUIJOIN form.
SELECT DISTINCT bios821.name
FROM bios821, bios823
WHERE bios821.name = bios823.name
 * sqlite://
Done.
[15]:
name
Alexandra James
Nancy Castaneda
Jennifer Hunt
Dr. Scott Mendoza MD
Kathy Cochran
Cindy Clark
Robert Wong
Sharon Porter

Using sub-query.

[16]:
%%sql

SELECT DISTINCT name FROM bios821
WHERE name IN (
    SELECT name FROM bios823
)
 * sqlite://
Done.
[16]:
name
Alexandra James
Nancy Castaneda
Jennifer Hunt
Dr. Scott Mendoza MD
Kathy Cochran
Cindy Clark
Robert Wong
Sharon Porter

Using correlated sub-query.

[17]:
%%sql

SELECT DISTINCT name FROM bios821
WHERE EXISTS (
    SELECT * FROM bios823
    WHERE bios821.name = bios823.name
)
 * sqlite://
Done.
[17]:
name
Alexandra James
Nancy Castaneda
Jennifer Hunt
Dr. Scott Mendoza MD
Kathy Cochran
Cindy Clark
Robert Wong
Sharon Porter

2. Which students taking bios823 have grades higher than the average grade of bios821?

  • Do this for all students

  • Do this just for the subset of students taking both classes

[18]:
%%sql

SELECT DISTINCT name, grade FROM bios823
WHERE grade > (
    SELECT AVG(grade) FROM bios821
)
 * sqlite://
Done.
[18]:
name grade
Allison Kirby 90
Robert Wong 91
Steven Chavez 90
Jennifer Paul 75
Allison Fletcher 100
Jennifer Hunt 86
Dr. Amanda Ryan 94
Terri Perez 78
Julie Nguyen 85
Brian Garrison 89
Hannah Wagner 94
Alexandra Wallace 90
Veronica Herrera 78
Sharon Porter 98
Manuel Reyes 96
Vicki Johnson 99
John Williams 96

Using a view.

[19]:
%%sql

-- Persistent view of the students taking both courses, with both grades.
CREATE VIEW view_common AS
    SELECT DISTINCT
        bios821.name,
        bios821.grade as grade_821,
        bios823.grade as grade_823
    FROM
        bios821, bios823
    WHERE
        bios821.name = bios823.name
    -- NOTE(review): this IN sub-query is redundant — the equality predicate
    -- above already restricts names to those present in bios823.
    AND bios821.name IN (
        SELECT name
        FROM bios823
    )
 * sqlite://
Done.
[19]:
[]
[20]:
%%sql

SELECT name, grade_823
FROM view_common
WHERE grade_823 > (
    SELECT AVG(grade_821)
    FROM view_common
)
 * sqlite://
Done.
[20]:
name grade_823
Alexandra James 73
Nancy Castaneda 67
Jennifer Hunt 86
Kathy Cochran 74
Cindy Clark 73
Robert Wong 91
Sharon Porter 98

Using pandas.

[21]:
# Inner join on name keeps only students in both classes; the two grade
# columns are disambiguated as grade_821 / grade_823.
df = bios821.merge(bios823, on='name', suffixes=['_821', '_823'])
[22]:
df
[22]:
name grade_821 grade_823
0 Alexandra James 75 73
1 Nancy Castaneda 50 67
2 Jennifer Hunt 51 86
3 Dr. Scott Mendoza MD 88 64
4 Kathy Cochran 51 74
5 Cindy Clark 56 73
6 Robert Wong 63 91
7 Sharon Porter 78 98
[23]:
df[df.grade_823 > df.grade_821.mean()]
[23]:
name grade_821 grade_823
0 Alexandra James 75 73
1 Nancy Castaneda 50 67
2 Jennifer Hunt 51 86
4 Kathy Cochran 51 74
5 Cindy Clark 56 73
6 Robert Wong 63 91
7 Sharon Porter 78 98

Common table expressions (CTE)

CTEs are temporary tables created for a specific query.

[24]:
%%sql

WITH
    t1(name) AS (SELECT 'Bob'),
    t2(age) AS (SELECT 23)
SELECT * from t1, t2
 * sqlite://
Done.
[24]:
name age
Bob 23

Using CTEs to solve previous problem

[25]:
%%sql

-- Same result as the view version, but with a one-shot CTE instead of a
-- persistent view.
WITH common AS (
    SELECT DISTINCT
        bios821.name,
        bios821.grade as grade_821,
        bios823.grade as grade_823
    FROM
        bios821,
        bios823
    -- NOTE(review): the IN sub-query is redundant — the equality predicate
    -- below already restricts names to those present in bios823.
    WHERE bios821.name IN (
        SELECT name FROM bios823
    ) AND
    bios821.name = bios823.name
)
SELECT name, grade_823
FROM common
WHERE grade_823 > (
    SELECT AVG(grade_821)
    FROM common
)
 * sqlite://
Done.
[25]:
name grade_823
Alexandra James 73
Nancy Castaneda 67
Jennifer Hunt 86
Kathy Cochran 74
Cindy Clark 73
Robert Wong 91
Sharon Porter 98

As a generator

[26]:
%%sql

-- Recursive CTE as a generator: yields the integers 1 through 5.
WITH RECURSIVE count(n) AS (
    SELECT 1
        UNION ALL
    SELECT n+1 FROM count WHERE n < 5
)
SELECT * FROM count
 * sqlite://
Done.
[26]:
n
1
2
3
4
5

Generating dates

[27]:
%%sql

-- Generate monthly homework due dates for 2019.
WITH RECURSIVE make_dates(d) AS (
    SELECT '2019-01-15'
        UNION ALL
    -- Recursion stops after '2019-12-15': the *string* comparison
    -- '2019-12-15' < '2019-12' is false, so no further row is produced.
    SELECT DATE(d, '+1 MONTHS') FROM make_dates WHERE d < '2019-12'
)
SELECT d AS "Homework due" FROM make_dates
 * sqlite://
Done.
[27]:
Homework due
2019-01-15
2019-02-15
2019-03-15
2019-04-15
2019-05-15
2019-06-15
2019-07-15
2019-08-15
2019-09-15
2019-10-15
2019-11-15
2019-12-15

CTEs to do programming in SQL(!)

[28]:
%%sql

-- Factorials 1! through 5! computed with a recursive CTE.
WITH RECURSIVE fact(n, f) AS (
    SELECT 1, 1
        UNION ALL
    SELECT n+1, (n+1) * f
    FROM fact
    WHERE n < 5
)
SELECT * FROM fact
LIMIT 5
 * sqlite://
Done.
[28]:
n f
1 1
2 2
3 6
4 24
5 120
[29]:
%%sql

-- First 10 Fibonacci pairs carried forward as (f1, f2) in a recursive CTE.
WITH RECURSIVE fib(n, f1, f2) AS (
    SELECT 1, 0, 1
        UNION ALL
    SELECT n+1, f2, f1+f2
    FROM fib
    WHERE n < 10
)
SELECT * FROM fib
LIMIT 10
 * sqlite://
Done.
[29]:
n f1 f2
1 0 1
2 1 1
3 1 2
4 2 3
5 3 5
6 5 8
7 8 13
8 13 21
9 21 34
10 34 55

A common use of CTEs is to work with naturally recursive structures (trees or graphs)

Suppose we have a cell subset taxonomy with parent-child relations. We want to find all descendants of a particular cell type. This is hard to do without recursive CTEs in SQL.

[30]:
%%sql sqlite:///

-- Adjacency-list taxonomy of white-blood-cell subtypes:
-- parent_id references the cell_id of the parent type (NULL for the root).
DROP TABLE IF EXISTS cell;

CREATE TABLE cell(
    cell_id integer PRIMARY KEY,
    name VARCHAR(30),
    parent_id integer
);

INSERT INTO cell (
   cell_id,
   name,
   parent_id
)
VALUES
   (1, 'WBC', NULL),
   (2, 'Lymphocyte', 1),
   (3, 'T Cell', 2),
   (4, 'B Cell', 2),
   (5, 'NK Cell', 2),
   (6, 'T helper cell', 3),
   (7, 'T cytotoxic cell', 3),
   (8, 'T regulatory cell', 3),
   (9, 'Naive B cell', 4),
   (10, 'Memory B cell', 4),
   (11, 'Plasma cell', 4),
   (12, 'Granulocyte', 1),
   (13, 'Basophil', 12),
   (14, 'Eosinophil', 12),
   (15, 'Neutrophil', 12)
Done.
Done.
15 rows affected.
[30]:
[]
[31]:
%%sql

SELECT * FROM cell LIMIT 10
   sqlite://
 * sqlite:///
Done.
[31]:
cell_id name parent_id
1 WBC None
2 Lymphocyte 1
3 T Cell 2
4 B Cell 2
5 NK Cell 2
6 T helper cell 3
7 T cytotoxic cell 3
8 T regulatory cell 3
9 Naive B cell 4
10 Memory B cell 4
[32]:
%%sql

    SELECT cell_id, name, parent_id
    FROM cell
    WHERE name='T Cell'
   sqlite://
 * sqlite:///
Done.
[32]:
cell_id name parent_id
3 T Cell 2
[33]:
target = 'Lymphocyte'
[34]:
%%sql

-- Walk the taxonomy downward from :target, collecting all of its descendants.
WITH RECURSIVE lineage AS (
    -- Anchor: the starting cell type (:target is the bound Python variable).
    SELECT
        cell_id,
        name,
        parent_id
    FROM
        cell
    WHERE
        name=:target
UNION ALL
    -- Recursive step: any cell whose parent is already in the lineage.
    SELECT
        c.cell_id,
        c.name,
        c.parent_id
    FROM
        cell c
    INNER JOIN
        lineage l
    ON
        l.cell_id = c.parent_id
)
SELECT * FROM lineage
LIMIT 10
   sqlite://
 * sqlite:///
Done.
[34]:
cell_id name parent_id
2 Lymphocyte 1
4 B Cell 2
5 NK Cell 2
3 T Cell 2
10 Memory B cell 4
9 Naive B cell 4
11 Plasma cell 4
7 T cytotoxic cell 3
6 T helper cell 3
8 T regulatory cell 3

Window Functions

[35]:
# Toy longitudinal data: 10 blood-sugar-level (bsl) readings for four people,
# each tagged with a time point in [0, 10).
np.random.seed(23)
n = 10
df = pd.DataFrame(
    dict(person=np.random.choice(['A', 'B', 'C', 'D'], n,),
                time=np.random.randint(0, 10, n),
                bsl=np.random.randint(50, 400, n)))
[36]:
df.sort_values(['person', 'time'])
[36]:
person time bsl
8 A 0 115
5 A 2 237
2 A 3 129
7 B 5 86
3 B 6 396
4 C 1 107
1 C 9 347
6 D 5 89
9 D 5 221
0 D 7 98
[37]:
%sql DROP TABLE IF EXISTS df
   sqlite://
 * sqlite:///
Done.
[37]:
[]

Magic shortcut to creating a database table from pandas DataFrame.

[38]:
%sql -p df
   sqlite://
 * sqlite:///
[38]:
'Persisted df'

I’ve given pandas equivalents where possible, but sometimes they are rather unnatural.

[39]:
%%sql

SELECT person, time, bsl, row_number()
OVER () as row_num
FROM df;
   sqlite://
 * sqlite:///
Done.
[39]:
person time bsl row_num
D 7 98 1
C 9 347 2
A 3 129 3
B 6 396 4
C 1 107 5
A 2 237 6
D 5 89 7
B 5 86 8
A 0 115 9
D 5 221 10
[40]:
df.assign(row_num =  df.person.expanding(1).count().astype('int'))
[40]:
person time bsl row_num
0 D 7 98 1
1 C 9 347 2
2 A 3 129 3
3 B 6 396 4
4 C 1 107 5
5 A 2 237 6
6 D 5 89 7
7 B 5 86 8
8 A 0 115 9
9 D 5 221 10
[41]:
%%sql

SELECT person, time, bsl,
lag(bsl, 1) OVER () as lag1,
lead(bsl, 2) OVER () as lead2
FROM df;
   sqlite://
 * sqlite:///
Done.
[41]:
person time bsl lag1 lead2
D 7 98 None 129
C 9 347 98 396
A 3 129 347 107
B 6 396 129 237
C 1 107 396 89
A 2 237 107 86
D 5 89 237 115
B 5 86 89 221
A 0 115 86 None
D 5 221 115 None
[42]:
# Pandas analogue of the SQL lag/lead window functions over the raw row order:
# shift(1) looks one row back, shift(-2) looks two rows ahead.
df_ = df.copy()
df_ = df_.assign(lag1=df_.bsl.shift(1), lead2=df_.bsl.shift(-2))
df_
[42]:
person time bsl lag1 lead2
0 D 7 98 NaN 129.0
1 C 9 347 98.0 396.0
2 A 3 129 347.0 107.0
3 B 6 396 129.0 237.0
4 C 1 107 396.0 89.0
5 A 2 237 107.0 86.0
6 D 5 89 237.0 115.0
7 B 5 86 89.0 221.0
8 A 0 115 86.0 NaN
9 D 5 221 115.0 NaN
[43]:
%%sql

SELECT person, time, bsl, row_number()
OVER (ORDER BY person, time)
FROM df;
   sqlite://
 * sqlite:///
Done.
[43]:
person time bsl row_number()
OVER (ORDER BY person, time)
A 0 115 1
A 2 237 2
A 3 129 3
B 5 86 4
B 6 396 5
C 1 107 6
C 9 347 7
D 5 89 8
D 5 221 9
D 7 98 10
[44]:
df_ = df.copy()
df_ = df_.sort_values(['person', 'time'])
df_.assign(row_num =  df_.person.expanding(1).count().astype('int'))
[44]:
person time bsl row_num
8 A 0 115 1
5 A 2 237 2
2 A 3 129 3
7 B 5 86 4
3 B 6 396 5
4 C 1 107 6
1 C 9 347 7
6 D 5 89 8
9 D 5 221 9
0 D 7 98 10
[45]:
%%sql

SELECT person, time, bsl, row_number()
OVER (PARTITION BY person ORDER BY time) as row_number
FROM df;
   sqlite://
 * sqlite:///
Done.
[45]:
person time bsl row_number
A 0 115 1
A 2 237 2
A 3 129 3
B 5 86 1
B 6 396 2
C 1 107 1
C 9 347 2
D 5 89 1
D 5 221 2
D 7 98 3
[46]:
df_ = df.copy()
df_ = df_.sort_values(['person', 'time'])
df_['row_number'] = df_.groupby(['person']).cumcount()+1
df_
[46]:
person time bsl row_number
8 A 0 115 1
5 A 2 237 2
2 A 3 129 3
7 B 5 86 1
3 B 6 396 2
4 C 1 107 1
1 C 9 347 2
6 D 5 89 1
9 D 5 221 2
0 D 7 98 3
[47]:
%%sql

SELECT person, time, bsl, group_concat(bsl, ', ')
OVER (PARTITION BY person ORDER BY time) as window
FROM df;
   sqlite://
 * sqlite:///
Done.
[47]:
person time bsl window
A 0 115 115
A 2 237 115, 237
A 3 129 115, 237, 129
B 5 86 86
B 6 396 86, 396
C 1 107 107
C 9 347 107, 347
D 5 89 89, 221
D 5 221 89, 221
D 7 98 89, 221, 98
[48]:
# Pandas analogue of group_concat(...) OVER (PARTITION BY person ORDER BY time):
# for each row, collect the bsl values from the start of its person-group up to
# and including that row.
df_['window'] = (
    df_.groupby('person').
    apply(lambda x: pd.Series([x.bsl.iloc[:(i+1)].values
                               for i in pd.Series(np.arange(len(x)))]))
).values
# Render each array as a comma-separated string to mirror the SQL output.
df_['window'] = df_.window.apply(lambda x: ','.join(map(str, x)))
df_
[48]:
person time bsl row_number window
8 A 0 115 1 115
5 A 2 237 2 115,237
2 A 3 129 3 115,237,129
7 B 5 86 1 86
3 B 6 396 2 86,396
4 C 1 107 1 107
1 C 9 347 2 107,347
6 D 5 89 1 89
9 D 5 221 2 89,221
0 D 7 98 3 89,221,98
[49]:
%%sql

SELECT person, time, bsl, group_concat(bsl, ', ')
OVER (
    PARTITION BY person
    ORDER BY time
    ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
     ) as window
FROM df;
   sqlite://
 * sqlite:///
Done.
[49]:
person time bsl window
A 0 115 115, 237
A 2 237 115, 237, 129
A 3 129 237, 129
B 5 86 86, 396
B 6 396 86, 396
C 1 107 107, 347
C 9 347 107, 347
D 5 89 89, 221
D 5 221 89, 221, 98
D 7 98 221, 98
[50]:
# Sliding window from one row before through one row after, per person
# (SQL: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING).
# The start index (i - len(x) - 1) is a negative-index trick: for i >= 1 it is
# congruent to i - 1 mod len(x), and for i == 0 it falls off the front of the
# group and iloc clamps it to 0.
df_['window'] = (
    df_.groupby('person').
    apply(lambda x: pd.Series([x.bsl.iloc[(i-len(x)-1):(i+2)].values
                               for i in pd.Series(np.arange(len(x)))]))
).values
df_['window'] = df_.window.apply(lambda x: ','.join(map(str, x)))
df_
[50]:
person time bsl row_number window
8 A 0 115 1 115,237
5 A 2 237 2 115,237,129
2 A 3 129 3 237,129
7 B 5 86 1 86,396
3 B 6 396 2 86,396
4 C 1 107 1 107,347
1 C 9 347 2 107,347
6 D 5 89 1 89,221
9 D 5 221 2 89,221,98
0 D 7 98 3 221,98
[51]:
%%sql

SELECT person, time, bsl,
       row_number() OVER win    AS row_number,
       rank() OVER win          AS rank,
       dense_rank() OVER win    AS dense_rank,
       percent_rank() OVER win  AS percent_rank,
       cume_dist() OVER win     AS cume_dist
FROM df
WINDOW win AS (ORDER BY person);
   sqlite://
 * sqlite:///
Done.
[51]:
person time bsl row_number rank dense_rank percent_rank cume_dist
A 3 129 1 1 1 0.0 0.3
A 2 237 2 1 1 0.0 0.3
A 0 115 3 1 1 0.0 0.3
B 6 396 4 4 2 0.3333333333333333 0.5
B 5 86 5 4 2 0.3333333333333333 0.5
C 9 347 6 6 3 0.5555555555555556 0.7
C 1 107 7 6 3 0.5555555555555556 0.7
D 7 98 8 8 4 0.7777777777777778 1.0
D 5 89 9 8 4 0.7777777777777778 1.0
D 5 221 10 8 4 0.7777777777777778 1.0
[52]:
# Pandas equivalents of the SQL ranking window functions, ordered by person.
df_ = df.copy()
df_ = df_.sort_values(['person'])
# row_number(): running count of rows seen so far.
df_['row_num'] = df_['person'].expanding(1).count().astype('int')
# rank(): ties share the lowest rank of the tie group; gaps follow.
df_['rank'] = df_['person'].rank(method='min').astype('int')
# dense_rank(): ties share a rank, with no gaps.
df_['dense_rank'] = df_['person'].rank(method='dense').astype('int')
# percent_rank(): (rank - 1) / (row count - 1).
df_['percent_rank'] = (df_.person.rank(method='min') - 1) / (df_.person.count()-1)
# cume_dist(): fraction of rows ranked at or below the current value.
df_['cume_dist'] = df_['person'].rank(method='max', pct=True)
df_
[52]:
person time bsl row_num rank dense_rank percent_rank cume_dist
2 A 3 129 1 1 1 0.000000 0.3
5 A 2 237 2 1 1 0.000000 0.3
8 A 0 115 3 1 1 0.000000 0.3
3 B 6 396 4 4 2 0.333333 0.5
7 B 5 86 5 4 2 0.333333 0.5
1 C 9 347 6 6 3 0.555556 0.7
4 C 1 107 7 6 3 0.555556 0.7
0 D 7 98 8 8 4 0.777778 1.0
6 D 5 89 9 8 4 0.777778 1.0
9 D 5 221 10 8 4 0.777778 1.0
[53]:
scores = pd.DataFrame(dict(scores=np.random.randint(0, 10, 10)))
scores
[53]:
scores
0 6
1 7
2 3
3 6
4 9
5 2
6 3
7 0
8 8
9 6
[54]:
%sql -p scores
   sqlite://
 * sqlite:///
[54]:
'Persisted scores'
[55]:
%%sql

SELECT scores, NTILE(4)
OVER (
    -- PARTITION BY scores
    ORDER BY scores
) AS quartile
FROM scores
   sqlite://
 * sqlite:///
Done.
[55]:
scores quartile
0 1
2 1
3 1
3 2
6 2
6 2
6 3
7 3
8 4
9 4

Note: See this for an explanation of the difference between NTILE and qcut

[56]:
# qcut bins by empirical quantiles, so tied scores always land in the same
# bin — this is why its buckets differ from NTILE, which forces groups of
# (near-)equal size.
quartiles = pd.qcut(scores.scores, 4, labels=[1,2,3,4])
scores['quantile'] = quartiles
scores.sort_values('scores')
[56]:
scores quantile
7 0 1
5 2 1
2 3 1
6 3 1
0 6 2
3 6 2
9 6 2
1 7 4
8 8 4
4 9 4

LAST_VALUE can look buggy here, but the behavior is by design: the default window frame is RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, so LAST_VALUE returns the last row of the frame seen so far, not the last row of the partition. Extend the frame (e.g. ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) to get the partition's final value.

[57]:
%%sql

-- Running (cumulative) aggregates per person, ordered by time.
-- The default frame is RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW,
-- so FIRST_VALUE is the partition's first row, but LAST_VALUE is only the
-- last row of the frame *so far* — not the end of the partition.
SELECT person, time, bsl,
       SUM(bsl) OVER win    AS bsl_sum,
       AVG(bsl) OVER win    AS bsl_avg,
       MIN(bsl) OVER win    AS bsl_min,
       MAX(bsl) over win as bsl_max,
       FIRST_VALUE(bsl) OVER win as bsl_start,
       LAST_VALUE(bsl) OVER win as bsl_end
FROM df
WINDOW win AS (PARTITION BY person ORDER BY time);
   sqlite://
 * sqlite:///
Done.
[57]:
person time bsl bsl_sum bsl_avg bsl_min bsl_max bsl_start bsl_end
A 0 115 115 115.0 115 115 115 115
A 2 237 352 176.0 115 237 115 237
A 3 129 481 160.33333333333334 115 237 115 129
B 5 86 86 86.0 86 86 86 86
B 6 396 482 241.0 86 396 86 396
C 1 107 107 107.0 107 107 107 107
C 9 347 454 227.0 107 347 107 347
D 5 89 310 155.0 89 221 89 221
D 5 221 310 155.0 89 221 89 221
D 7 98 408 136.0 89 221 89 98
[58]:
g = df.groupby('person')
g.first()
[58]:
time bsl
person
A 3 129
B 6 396
C 9 347
D 7 98
[59]:
# Pandas equivalents of the cumulative window aggregates, per person.
df_ = df.copy()
df_ = df_.sort_values(['person', 'time'])
df_['bsl_sum'] = df_.groupby(['person'])['bsl'].cumsum()
# Expanding (running) mean within each person-group.
df_['bsl_avg'] = df_.groupby(['person']).expanding().agg({'bsl': 'mean'}).values
df_['bsl_min'] = df_.groupby(['person'])['bsl'].cummin()
df_['bsl_max'] = df_.groupby(['person'])['bsl'].cummax()
df_['bsl_start'] = df_.groupby(['person'])['bsl'].transform('first')
# NOTE(review): 'last' is the true end of each group, so bsl_end differs from
# the SQL LAST_VALUE output above, which only sees the frame up to the current
# row under the default window frame.
df_['bsl_end'] = df_.groupby(['person'])['bsl'].transform('last')
df_
[59]:
person time bsl bsl_sum bsl_avg bsl_min bsl_max bsl_start bsl_end
8 A 0 115 115 115.000000 115 115 115 129
5 A 2 237 352 176.000000 115 237 115 129
2 A 3 129 481 160.333333 115 237 115 129
7 B 5 86 86 86.000000 86 86 86 396
3 B 6 396 482 241.000000 86 396 86 396
4 C 1 107 107 107.000000 107 107 107 347
1 C 9 347 454 227.000000 107 347 107 347
6 D 5 89 89 89.000000 89 89 89 98
9 D 5 221 310 155.000000 89 221 89 98
0 D 7 98 408 136.000000 89 221 89 98
[60]:
%%sql

SELECT person, time, bsl,
       GROUP_CONCAT(CAST(bsl AS TEXT), ', ') OVER win AS vals,
       SUM(bsl) OVER win AS bsl_sum,
       AVG(bsl) OVER win AS bsl_avg
FROM df
WINDOW win AS (
    PARTITION BY person
    ORDER BY time
    ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
)
ORDER BY person, time;
   sqlite://
 * sqlite:///
Done.
[60]:
person time bsl vals bsl_sum bsl_avg
A 0 115 115, 237 352 176.0
A 2 237 115, 237, 129 481 160.33333333333334
A 3 129 237, 129 366 183.0
B 5 86 86, 396 482 241.0
B 6 396 86, 396 482 241.0
C 1 107 107, 347 454 227.0
C 9 347 107, 347 454 227.0
D 5 89 89, 221 310 155.0
D 5 221 89, 221, 98 408 136.0
D 7 98 221, 98 319 159.5
[61]:
# Centered rolling window of size 3 within each person-group
# (SQL: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING).
df_ = df.copy()
df_ = df_.sort_values(['person', 'time'])
df_['bsl_sum'] = (df_.groupby('person').
                  bsl.apply(lambda x: x.rolling(3, min_periods=0, center=True).sum()))
df_['bsl_avg'] = (df_.groupby('person').
                  bsl.apply(lambda x: x.rolling(3, min_periods=0, center=True).mean()))
df_
[61]:
person time bsl bsl_sum bsl_avg
8 A 0 115 352.0 176.000000
5 A 2 237 481.0 160.333333
2 A 3 129 366.0 183.000000
7 B 5 86 482.0 241.000000
3 B 6 396 482.0 241.000000
4 C 1 107 454.0 227.000000
1 C 9 347 454.0 227.000000
6 D 5 89 310.0 155.000000
9 D 5 221 408.0 136.000000
0 D 7 98 319.0 159.500000

For RANGE frames, all rows with the same ORDER BY value are considered peers and enter the frame together.

[62]:
%%sql

SELECT person, time, bsl,
       GROUP_CONCAT(CAST(bsl AS TEXT), ', ') OVER win AS vals,
       SUM(bsl) OVER win AS bsl_sum,
       AVG(bsl) OVER win AS bsl_avg
FROM df
WINDOW win AS (
    ORDER BY person, time
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
)
ORDER BY person, time;
   sqlite://
 * sqlite:///
Done.
[62]:
person time bsl vals bsl_sum bsl_avg
A 0 115 115 115 115.0
A 2 237 115, 237 352 176.0
A 3 129 115, 237, 129 481 160.33333333333334
B 5 86 115, 237, 129, 86 567 141.75
B 6 396 115, 237, 129, 86, 396 963 192.6
C 1 107 115, 237, 129, 86, 396, 107 1070 178.33333333333334
C 9 347 115, 237, 129, 86, 396, 107, 347 1417 202.42857142857142
D 5 89 115, 237, 129, 86, 396, 107, 347, 89 1506 188.25
D 5 221 115, 237, 129, 86, 396, 107, 347, 89, 221 1727 191.88888888888889
D 7 98 115, 237, 129, 86, 396, 107, 347, 89, 221, 98 1825 182.5
[63]:
# Running aggregates over the whole table (no partitioning), ordered by person
# then time — matches ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW.
df_ = df.copy()
df_ = df_.sort_values(['person', 'time'])
df_['bsl_sum'] = df_.bsl.expanding().sum().astype('int')
df_['bsl_avg'] = df_.bsl.expanding().mean()
df_
[63]:
person time bsl bsl_sum bsl_avg
8 A 0 115 115 115.000000
5 A 2 237 352 176.000000
2 A 3 129 481 160.333333
7 B 5 86 567 141.750000
3 B 6 396 963 192.600000
4 C 1 107 1070 178.333333
1 C 9 347 1417 202.428571
6 D 5 89 1506 188.250000
9 D 5 221 1727 191.888889
0 D 7 98 1825 182.500000
[64]:
%%sql

SELECT person, time, bsl,
       GROUP_CONCAT(CAST(bsl AS TEXT), ', ') OVER win AS vals,
       SUM(bsl) OVER win AS bsl_sum,
       AVG(bsl) OVER win AS bsl_avg
FROM df
WINDOW win AS (
    ORDER BY person
    RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
)
ORDER BY person, time;
   sqlite://
 * sqlite:///
Done.
[64]:
person time bsl vals bsl_sum bsl_avg
A 0 115 129, 237, 115 481 160.33333333333334
A 2 237 129, 237, 115 481 160.33333333333334
A 3 129 129, 237, 115 481 160.33333333333334
B 5 86 129, 237, 115, 396, 86 963 192.6
B 6 396 129, 237, 115, 396, 86 963 192.6
C 1 107 129, 237, 115, 396, 86, 347, 107 1417 202.42857142857142
C 9 347 129, 237, 115, 396, 86, 347, 107 1417 202.42857142857142
D 5 89 129, 237, 115, 396, 86, 347, 107, 98, 89, 221 1825 182.5
D 5 221 129, 237, 115, 396, 86, 347, 107, 98, 89, 221 1825 182.5
D 7 98 129, 237, 115, 396, 86, 347, 107, 98, 89, 221 1825 182.5
[65]:
# RANGE-frame analogue: all rows with the same person are peers, so every row
# in a group shares the running aggregate taken at the group's last row.
df_ = df.copy()
df_ = df_.sort_values(['person', 'time'])
df_['bsl_sum'] = df_['bsl'].cumsum().groupby(df_['person']).transform('last')
# BUG FIX: this running mean was previously assigned to 'bsl_sum' as well,
# clobbering the running total computed on the line above (the printed table
# shows means in a column labelled bsl_sum).
df_['bsl_avg'] = df_['bsl'].expanding().mean().groupby(df_['person']).transform('last')
df_
[65]:
person time bsl bsl_sum
8 A 0 115 160.333333
5 A 2 237 160.333333
2 A 3 129 160.333333
7 B 5 86 192.600000
3 B 6 396 192.600000
4 C 1 107 202.428571
1 C 9 347 202.428571
6 D 5 89 182.500000
9 D 5 221 182.500000
0 D 7 98 182.500000