{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# SQL Queries 02" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "%load_ext sql" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "%sql sqlite://" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create toy data set" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from faker import Faker" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "fake = Faker()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "names = list(set([fake.name() for i in range(100)]))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "np.random.seed(123)\n", "\n", "n1 = 30\n", "bios821 = pd.DataFrame(dict(\n", " name=np.random.choice(names, n1, replace=False), \n", " grade=np.random.randint(50, 101, n1)))\n", "\n", "n2 = 30\n", "bios823 = pd.DataFrame(dict(\n", " name=np.random.choice(names, n2, replace=False), \n", " grade=np.random.randint(50, 101, n2)))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n", "Done.\n" ] }, { "data": { "text/plain": [ "[]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "DROP TABLE IF EXISTS bios821;\n", "DROP TABLE IF EXISTs bios823" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n" ] }, { "data": { "text/plain": [ "'Persisted bios821'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql -p bios821" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n" ] }, { "data": { "text/plain": [ "'Persisted bios823'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql -p bios823" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexnamegrade
0Mrs. Michele Ingram65
1Mark Shaffer90
2Alexandra James75
" ], "text/plain": [ "[(0, 'Mrs. Michele Ingram', 65),\n", " (1, 'Mark Shaffer', 90),\n", " (2, 'Alexandra James', 75)]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql SELECT * FROM bios821 LIMIT 3" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexnamegrade
0Loretta Smith59
1Allison Kirby90
2Robert Wong91
" ], "text/plain": [ "[(0, 'Loretta Smith', 59), (1, 'Allison Kirby', 90), (2, 'Robert Wong', 91)]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql SELECT * FROM bios823 LIMIT 3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Subqueries\n", "\n", "There are 3 ways to use a sub-query:\n", "\n", "- with `in`\n", "- with `exists`\n", "- with a comparison operator" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**1**. What students take both bios821 and bios823?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using set operations." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name
Alexandra James
Cindy Clark
Dr. Scott Mendoza MD
Jennifer Hunt
Kathy Cochran
Nancy Castaneda
Robert Wong
Sharon Porter
" ], "text/plain": [ "[('Alexandra James',),\n", " ('Cindy Clark',),\n", " ('Dr. Scott Mendoza MD',),\n", " ('Jennifer Hunt',),\n", " ('Kathy Cochran',),\n", " ('Nancy Castaneda',),\n", " ('Robert Wong',),\n", " ('Sharon Porter',)]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT name FROM bios821\n", "INTERSECT\n", "SELECT name FROM bios823" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using EQUIJOIN." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name
Alexandra James
Nancy Castaneda
Jennifer Hunt
Dr. Scott Mendoza MD
Kathy Cochran
Cindy Clark
Robert Wong
Sharon Porter
" ], "text/plain": [ "[('Alexandra James',),\n", " ('Nancy Castaneda',),\n", " ('Jennifer Hunt',),\n", " ('Dr. Scott Mendoza MD',),\n", " ('Kathy Cochran',),\n", " ('Cindy Clark',),\n", " ('Robert Wong',),\n", " ('Sharon Porter',)]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT DISTINCT bios821.name \n", "FROM bios821, bios823\n", "WHERE bios821.name = bios823.name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using sub-query." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name
Alexandra James
Nancy Castaneda
Jennifer Hunt
Dr. Scott Mendoza MD
Kathy Cochran
Cindy Clark
Robert Wong
Sharon Porter
" ], "text/plain": [ "[('Alexandra James',),\n", " ('Nancy Castaneda',),\n", " ('Jennifer Hunt',),\n", " ('Dr. Scott Mendoza MD',),\n", " ('Kathy Cochran',),\n", " ('Cindy Clark',),\n", " ('Robert Wong',),\n", " ('Sharon Porter',)]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT DISTINCT name FROM bios821\n", "WHERE name IN (\n", " SELECT name FROM bios823\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using correlated sub-query." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name
Alexandra James
Nancy Castaneda
Jennifer Hunt
Dr. Scott Mendoza MD
Kathy Cochran
Cindy Clark
Robert Wong
Sharon Porter
" ], "text/plain": [ "[('Alexandra James',),\n", " ('Nancy Castaneda',),\n", " ('Jennifer Hunt',),\n", " ('Dr. Scott Mendoza MD',),\n", " ('Kathy Cochran',),\n", " ('Cindy Clark',),\n", " ('Robert Wong',),\n", " ('Sharon Porter',)]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT DISTINCT name FROM bios821\n", "WHERE EXISTS (\n", " SELECT * FROM bios823 \n", " WHERE bios821.name = bios823.name\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**2**. Which students taking bios823 have grades higher than the average grade of bios821? \n", "\n", "- Do this for all students\n", "- Do this just for the subet of patinets taking both classes" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namegrade
Allison Kirby90
Robert Wong91
Steven Chavez90
Jennifer Paul75
Allison Fletcher100
Jennifer Hunt86
Dr. Amanda Ryan94
Terri Perez78
Julie Nguyen85
Brian Garrison89
Hannah Wagner94
Alexandra Wallace90
Veronica Herrera78
Sharon Porter98
Manuel Reyes96
Vicki Johnson99
John Williams96
" ], "text/plain": [ "[('Allison Kirby', 90),\n", " ('Robert Wong', 91),\n", " ('Steven Chavez', 90),\n", " ('Jennifer Paul', 75),\n", " ('Allison Fletcher', 100),\n", " ('Jennifer Hunt', 86),\n", " ('Dr. Amanda Ryan', 94),\n", " ('Terri Perez', 78),\n", " ('Julie Nguyen', 85),\n", " ('Brian Garrison', 89),\n", " ('Hannah Wagner', 94),\n", " ('Alexandra Wallace', 90),\n", " ('Veronica Herrera', 78),\n", " ('Sharon Porter', 98),\n", " ('Manuel Reyes', 96),\n", " ('Vicki Johnson', 99),\n", " ('John Williams', 96)]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT DISTINCT name, grade FROM bios823\n", "WHERE grade > (\n", " SELECT AVG(grade) FROM bios821 \n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using a view." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/plain": [ "[]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "CREATE VIEW view_common AS \n", " SELECT DISTINCT\n", " bios821.name, \n", " bios821.grade as grade_821, \n", " bios823.grade as grade_823 \n", " FROM \n", " bios821, bios823\n", " WHERE \n", " bios821.name = bios823.name\n", " AND bios821.name IN (\n", " SELECT name \n", " FROM bios823 \n", " ) " ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namegrade_823
Alexandra James73
Nancy Castaneda67
Jennifer Hunt86
Kathy Cochran74
Cindy Clark73
Robert Wong91
Sharon Porter98
" ], "text/plain": [ "[('Alexandra James', 73),\n", " ('Nancy Castaneda', 67),\n", " ('Jennifer Hunt', 86),\n", " ('Kathy Cochran', 74),\n", " ('Cindy Clark', 73),\n", " ('Robert Wong', 91),\n", " ('Sharon Porter', 98)]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT name, grade_823\n", "FROM view_common\n", "WHERE grade_823 > (\n", " SELECT AVG(grade_821) \n", " FROM view_common\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using `pandas`." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "df = bios821.merge(bios823, on='name', suffixes=['_821', '_823'])" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namegrade_821grade_823
0Alexandra James7573
1Nancy Castaneda5067
2Jennifer Hunt5186
3Dr. Scott Mendoza MD8864
4Kathy Cochran5174
5Cindy Clark5673
6Robert Wong6391
7Sharon Porter7898
\n", "
" ], "text/plain": [ " name grade_821 grade_823\n", "0 Alexandra James 75 73\n", "1 Nancy Castaneda 50 67\n", "2 Jennifer Hunt 51 86\n", "3 Dr. Scott Mendoza MD 88 64\n", "4 Kathy Cochran 51 74\n", "5 Cindy Clark 56 73\n", "6 Robert Wong 63 91\n", "7 Sharon Porter 78 98" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namegrade_821grade_823
0Alexandra James7573
1Nancy Castaneda5067
2Jennifer Hunt5186
4Kathy Cochran5174
5Cindy Clark5673
6Robert Wong6391
7Sharon Porter7898
\n", "
" ], "text/plain": [ " name grade_821 grade_823\n", "0 Alexandra James 75 73\n", "1 Nancy Castaneda 50 67\n", "2 Jennifer Hunt 51 86\n", "4 Kathy Cochran 51 74\n", "5 Cindy Clark 56 73\n", "6 Robert Wong 63 91\n", "7 Sharon Porter 78 98" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.grade_823 > df.grade_821.mean()]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Common table expressions (CTE)\n", "\n", "CTEs are temporary tables created for a specific query." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Basic syntax for CTE" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameage
Bob23
" ], "text/plain": [ "[('Bob', 23)]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "WITH\n", " t1(name) AS (SELECT 'Bob'),\n", " t2(age) AS (SELECT 23)\n", "SELECT * from t1, t2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Using CTEs to solve previous problem" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namegrade_823
Alexandra James73
Nancy Castaneda67
Jennifer Hunt86
Kathy Cochran74
Cindy Clark73
Robert Wong91
Sharon Porter98
" ], "text/plain": [ "[('Alexandra James', 73),\n", " ('Nancy Castaneda', 67),\n", " ('Jennifer Hunt', 86),\n", " ('Kathy Cochran', 74),\n", " ('Cindy Clark', 73),\n", " ('Robert Wong', 91),\n", " ('Sharon Porter', 98)]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "WITH common AS (\n", " SELECT DISTINCT \n", " bios821.name, \n", " bios821.grade as grade_821, \n", " bios823.grade as grade_823 \n", " FROM \n", " bios821,\n", " bios823\n", " WHERE bios821.name IN (\n", " SELECT name FROM bios823\n", " ) AND\n", " bios821.name = bios823.name\n", ")\n", "SELECT name, grade_823\n", "FROM common\n", "WHERE grade_823 > (\n", " SELECT AVG(grade_821) \n", " FROM common\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Recursive CTEs\n", "\n", "Image source: https://cdn.sqlservertutorial.net/wp-content/uploads/SQL-Server-Recursive-CTE-execution-flow.png\n", "![img](https://cdn.sqlservertutorial.net/wp-content/uploads/SQL-Server-Recursive-CTE-execution-flow.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### As a generator" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
n
1
2
3
4
5
" ], "text/plain": [ "[(1,), (2,), (3,), (4,), (5,)]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "WITH RECURSIVE count(n) AS (\n", " SELECT 1\n", " UNION ALL\n", " SELECT n+1 FROM count WHERe n < 5\n", ")\n", "SELECT * FROM count" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Generating dates" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Homework due
2019-01-15
2019-02-15
2019-03-15
2019-04-15
2019-05-15
2019-06-15
2019-07-15
2019-08-15
2019-09-15
2019-10-15
2019-11-15
2019-12-15
" ], "text/plain": [ "[('2019-01-15',),\n", " ('2019-02-15',),\n", " ('2019-03-15',),\n", " ('2019-04-15',),\n", " ('2019-05-15',),\n", " ('2019-06-15',),\n", " ('2019-07-15',),\n", " ('2019-08-15',),\n", " ('2019-09-15',),\n", " ('2019-10-15',),\n", " ('2019-11-15',),\n", " ('2019-12-15',)]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "WITH RECURSIVE make_dates(d) AS (\n", " SELECT '2019-01-15'\n", " UNION ALL\n", " SELECT DATE(d, '+1 MONTHS') FROM make_dates WHERe d < '2019-12'\n", ")\n", "SELECT d AS \"Homework due\" FROM make_dates" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### CTEs to do programming in SQL(!)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nf
11
22
36
424
5120
" ], "text/plain": [ "[(1, 1), (2, 2), (3, 6), (4, 24), (5, 120)]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "WITH RECURSIVE fact(n, f) AS (\n", " SELECT 1, 1\n", " UNION ALL\n", " SELECT n+1, (n+1) * f \n", " FROM fact \n", " WHERe n < 5\n", ")\n", "SELECT * FROM fact\n", "LIMIT 5" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite://\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nf1f2
101
211
312
423
535
658
7813
81321
92134
103455
" ], "text/plain": [ "[(1, 0, 1),\n", " (2, 1, 1),\n", " (3, 1, 2),\n", " (4, 2, 3),\n", " (5, 3, 5),\n", " (6, 5, 8),\n", " (7, 8, 13),\n", " (8, 13, 21),\n", " (9, 21, 34),\n", " (10, 34, 55)]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "WITH RECURSIVE fib(n, f1, f2) AS (\n", " SELECT 1, 0, 1\n", " UNION ALL\n", " SELECT n+1, f2, f1+f2\n", " FROM fib \n", " WHERe n < 10\n", ")\n", "SELECT * FROM fib\n", "LIMIT 10" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### A common use of CTEs is to work with naturally recursive structures (trees or graphs)\n", "\n", "Suppose we have a cell subset taxonomy with parent-child relations. We want to find all `descendants` of a particular cell type. This is hard to do without recursive CTEs in SQL. " ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n", "15 rows affected.\n" ] }, { "data": { "text/plain": [ "[]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql sqlite:///\n", "\n", "DROP TABLE IF EXISTS cell;\n", "\n", "CREATE TABLE cell(\n", " cell_id integer PRIMARY KEY,\n", " name VARCHAR(30),\n", " parent_id integer\n", ");\n", "\n", "INSERT INTO cell (\n", " cell_id,\n", " name,\n", " parent_id\n", ")\n", "VALUES\n", " (1, 'WBC', NULL),\n", " (2, 'Lymphocyte', 1),\n", " (3, 'T Cell', 2),\n", " (4, 'B Cell', 2),\n", " (5, 'NK Cell', 2),\n", " (6, 'T helper cell', 3),\n", " (7, 'T cytotoxic cell', 3),\n", " (8, 'T regulatory cell', 3),\n", " (9, 'Naive B cell', 4),\n", " (10, 'Memory B cell', 4),\n", " (11, 'Plasma cell', 4),\n", " (12, 'Granulocyte', 1),\n", " (13, 'Basophil', 12),\n", " (14, 'Eosinophil', 12),\n", " (15, 'Neutrophil', 12)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cell_idnameparent_id
1WBCNone
2Lymphocyte1
3T Cell2
4B Cell2
5NK Cell2
6T helper cell3
7T cytotoxic cell3
8T regulatory cell3
9Naive B cell4
10Memory B cell4
" ], "text/plain": [ "[(1, 'WBC', None),\n", " (2, 'Lymphocyte', 1),\n", " (3, 'T Cell', 2),\n", " (4, 'B Cell', 2),\n", " (5, 'NK Cell', 2),\n", " (6, 'T helper cell', 3),\n", " (7, 'T cytotoxic cell', 3),\n", " (8, 'T regulatory cell', 3),\n", " (9, 'Naive B cell', 4),\n", " (10, 'Memory B cell', 4)]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT * FROM cell LIMIT 10" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cell_idnameparent_id
3T Cell2
" ], "text/plain": [ "[(3, 'T Cell', 2)]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", " SELECT cell_id, name, parent_id\n", " FROM cell\n", " WHERE name='T Cell'" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "target = 'Lymphocyte'" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cell_idnameparent_id
2Lymphocyte1
4B Cell2
5NK Cell2
3T Cell2
10Memory B cell4
9Naive B cell4
11Plasma cell4
7T cytotoxic cell3
6T helper cell3
8T regulatory cell3
" ], "text/plain": [ "[(2, 'Lymphocyte', 1),\n", " (4, 'B Cell', 2),\n", " (5, 'NK Cell', 2),\n", " (3, 'T Cell', 2),\n", " (10, 'Memory B cell', 4),\n", " (9, 'Naive B cell', 4),\n", " (11, 'Plasma cell', 4),\n", " (7, 'T cytotoxic cell', 3),\n", " (6, 'T helper cell', 3),\n", " (8, 'T regulatory cell', 3)]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "WITH RECURSIVE lineage AS (\n", " SELECT \n", " cell_id, \n", " name, \n", " parent_id\n", " FROM \n", " cell\n", " WHERE \n", " name=:target\n", "UNION ALL\n", " SELECT\n", " c.cell_id, \n", " c.name, \n", " c.parent_id\n", " FROM \n", " cell c\n", " INNER JOIN \n", " lineage l \n", " ON \n", " l.cell_id = c.parent_id\n", ")\n", "SELECT * FROM lineage \n", "LIMIT 10" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Window Functions\n", "\n", "- [Official Refs](https://www.sqlite.org/windowfunctions.html)\n", "![img](https://www.sqlitetutorial.net/wp-content/uploads/2018/11/SQLite-Window-Functions-1.png)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "np.random.seed(23)\n", "n = 10\n", "df = pd.DataFrame(\n", " dict(person=np.random.choice(['A', 'B', 'C', 'D'], n,),\n", " time=np.random.randint(0, 10, n), \n", " bsl=np.random.randint(50, 400, n)))" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebsl
8A0115
5A2237
2A3129
7B586
3B6396
4C1107
1C9347
6D589
9D5221
0D798
\n", "
" ], "text/plain": [ " person time bsl\n", "8 A 0 115\n", "5 A 2 237\n", "2 A 3 129\n", "7 B 5 86\n", "3 B 6 396\n", "4 C 1 107\n", "1 C 9 347\n", "6 D 5 89\n", "9 D 5 221\n", "0 D 7 98" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_values(['person', 'time'])" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/plain": [ "[]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql DROP TABLE IF EXISTS df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Magic shortcut to creating a database table from `pandas` DataFrame." ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n" ] }, { "data": { "text/plain": [ "'Persisted df'" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql -p df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Over creates widows\n", "\n", "I've given `pandas` equivalents where possible, but sometimes they are rather unnatural." ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslrow_num
D7981
C93472
A31293
B63964
C11075
A22376
D5897
B5868
A01159
D522110
" ], "text/plain": [ "[('D', 7, 98, 1),\n", " ('C', 9, 347, 2),\n", " ('A', 3, 129, 3),\n", " ('B', 6, 396, 4),\n", " ('C', 1, 107, 5),\n", " ('A', 2, 237, 6),\n", " ('D', 5, 89, 7),\n", " ('B', 5, 86, 8),\n", " ('A', 0, 115, 9),\n", " ('D', 5, 221, 10)]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT person, time, bsl, row_number() \n", "OVER () as row_num\n", "FROM df;" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslrow_num
0D7981
1C93472
2A31293
3B63964
4C11075
5A22376
6D5897
7B5868
8A01159
9D522110
\n", "
" ], "text/plain": [ " person time bsl row_num\n", "0 D 7 98 1\n", "1 C 9 347 2\n", "2 A 3 129 3\n", "3 B 6 396 4\n", "4 C 1 107 5\n", "5 A 2 237 6\n", "6 D 5 89 7\n", "7 B 5 86 8\n", "8 A 0 115 9\n", "9 D 5 221 10" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.assign(row_num = df.person.expanding(1).count().astype('int'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Lag and Lead" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebsllag1lead2
D798None129
C934798396
A3129347107
B6396129237
C110739689
A223710786
D589237115
B58689221
A011586None
D5221115None
" ], "text/plain": [ "[('D', 7, 98, None, 129),\n", " ('C', 9, 347, 98, 396),\n", " ('A', 3, 129, 347, 107),\n", " ('B', 6, 396, 129, 237),\n", " ('C', 1, 107, 396, 89),\n", " ('A', 2, 237, 107, 86),\n", " ('D', 5, 89, 237, 115),\n", " ('B', 5, 86, 89, 221),\n", " ('A', 0, 115, 86, None),\n", " ('D', 5, 221, 115, None)]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT person, time, bsl, \n", "lag(bsl, 1) OVER () as lag1,\n", "lead(bsl, 2) OVER () as lead2\n", "FROM df;" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebsllag1lead2
0D798NaN129.0
1C934798.0396.0
2A3129347.0107.0
3B6396129.0237.0
4C1107396.089.0
5A2237107.086.0
6D589237.0115.0
7B58689.0221.0
8A011586.0NaN
9D5221115.0NaN
\n", "
" ], "text/plain": [ " person time bsl lag1 lead2\n", "0 D 7 98 NaN 129.0\n", "1 C 9 347 98.0 396.0\n", "2 A 3 129 347.0 107.0\n", "3 B 6 396 129.0 237.0\n", "4 C 1 107 396.0 89.0\n", "5 A 2 237 107.0 86.0\n", "6 D 5 89 237.0 115.0\n", "7 B 5 86 89.0 221.0\n", "8 A 0 115 86.0 NaN\n", "9 D 5 221 115.0 NaN" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ = df.copy()\n", "df_['lag1'] = df_.bsl.shift(1)\n", "df_['lead2'] = df_.bsl.shift(-2)\n", "df_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Order by" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslrow_number()
OVER (ORDER BY person, time)
A01151
A22372
A31293
B5864
B63965
C11076
C93477
D5898
D52219
D79810
" ], "text/plain": [ "[('A', 0, 115, 1),\n", " ('A', 2, 237, 2),\n", " ('A', 3, 129, 3),\n", " ('B', 5, 86, 4),\n", " ('B', 6, 396, 5),\n", " ('C', 1, 107, 6),\n", " ('C', 9, 347, 7),\n", " ('D', 5, 89, 8),\n", " ('D', 5, 221, 9),\n", " ('D', 7, 98, 10)]" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT person, time, bsl, row_number() \n", "OVER (ORDER BY person, time)\n", "FROM df;" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslrow_num
8A01151
5A22372
2A31293
7B5864
3B63965
4C11076
1C93477
6D5898
9D52219
0D79810
\n", "
" ], "text/plain": [ " person time bsl row_num\n", "8 A 0 115 1\n", "5 A 2 237 2\n", "2 A 3 129 3\n", "7 B 5 86 4\n", "3 B 6 396 5\n", "4 C 1 107 6\n", "1 C 9 347 7\n", "6 D 5 89 8\n", "9 D 5 221 9\n", "0 D 7 98 10" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ = df.copy()\n", "df_ = df_.sort_values(['person', 'time'])\n", "df_.assign(row_num = df_.person.expanding(1).count().astype('int'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Partition by" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslrow_number
A01151
A22372
A31293
B5861
B63962
C11071
C93472
D5891
D52212
D7983
" ], "text/plain": [ "[('A', 0, 115, 1),\n", " ('A', 2, 237, 2),\n", " ('A', 3, 129, 3),\n", " ('B', 5, 86, 1),\n", " ('B', 6, 396, 2),\n", " ('C', 1, 107, 1),\n", " ('C', 9, 347, 2),\n", " ('D', 5, 89, 1),\n", " ('D', 5, 221, 2),\n", " ('D', 7, 98, 3)]" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT person, time, bsl, row_number() \n", "OVER (PARTITION BY person ORDER BY time) as row_number\n", "FROM df;" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslrow_number
8A01151
5A22372
2A31293
7B5861
3B63962
4C11071
1C93472
6D5891
9D52212
0D7983
\n", "
" ], "text/plain": [ " person time bsl row_number\n", "8 A 0 115 1\n", "5 A 2 237 2\n", "2 A 3 129 3\n", "7 B 5 86 1\n", "3 B 6 396 2\n", "4 C 1 107 1\n", "1 C 9 347 2\n", "6 D 5 89 1\n", "9 D 5 221 2\n", "0 D 7 98 3" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ = df.copy()\n", "df_ = df_.sort_values(['person', 'time'])\n", "df_['row_number'] = df_.groupby(['person']).cumcount()+1\n", "df_" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslwindow
A0115115
A2237115, 237
A3129115, 237, 129
B58686
B639686, 396
C1107107
C9347107, 347
D58989, 221
D522189, 221
D79889, 221, 98
" ], "text/plain": [ "[('A', 0, 115, '115'),\n", " ('A', 2, 237, '115, 237'),\n", " ('A', 3, 129, '115, 237, 129'),\n", " ('B', 5, 86, '86'),\n", " ('B', 6, 396, '86, 396'),\n", " ('C', 1, 107, '107'),\n", " ('C', 9, 347, '107, 347'),\n", " ('D', 5, 89, '89, 221'),\n", " ('D', 5, 221, '89, 221'),\n", " ('D', 7, 98, '89, 221, 98')]" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT person, time, bsl, group_concat(bsl, ', ')\n", "OVER (PARTITION BY person ORDER BY time) as window\n", "FROM df;" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslrow_numberwindow
8A01151115
5A22372115,237
2A31293115,237,129
7B586186
3B6396286,396
4C11071107
1C93472107,347
6D589189
9D5221289,221
0D798389,221,98
\n", "
" ], "text/plain": [ " person time bsl row_number window\n", "8 A 0 115 1 115\n", "5 A 2 237 2 115,237\n", "2 A 3 129 3 115,237,129\n", "7 B 5 86 1 86\n", "3 B 6 396 2 86,396\n", "4 C 1 107 1 107\n", "1 C 9 347 2 107,347\n", "6 D 5 89 1 89\n", "9 D 5 221 2 89,221\n", "0 D 7 98 3 89,221,98" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_['window'] = (\n", " df_.groupby('person').\n", " apply(lambda x: pd.Series([x.bsl.iloc[:(i+1)].values \n", " for i in pd.Series(np.arange(len(x)))]))\n", ").values\n", "df_['window'] = df_.window.apply(lambda x: ','.join(map(str, x)))\n", "df_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Specifying rows in window" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslwindow
A0115115, 237
A2237115, 237, 129
A3129237, 129
B58686, 396
B639686, 396
C1107107, 347
C9347107, 347
D58989, 221
D522189, 221, 98
D798221, 98
" ], "text/plain": [ "[('A', 0, 115, '115, 237'),\n", " ('A', 2, 237, '115, 237, 129'),\n", " ('A', 3, 129, '237, 129'),\n", " ('B', 5, 86, '86, 396'),\n", " ('B', 6, 396, '86, 396'),\n", " ('C', 1, 107, '107, 347'),\n", " ('C', 9, 347, '107, 347'),\n", " ('D', 5, 89, '89, 221'),\n", " ('D', 5, 221, '89, 221, 98'),\n", " ('D', 7, 98, '221, 98')]" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT person, time, bsl, group_concat(bsl, ', ')\n", "OVER (\n", " PARTITION BY person\n", " ORDER BY time\n", " ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING\n", " ) as window\n", "FROM df;" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslrow_numberwindow
8A01151115,237
5A22372115,237,129
2A31293237,129
7B586186,396
3B6396286,396
4C11071107,347
1C93472107,347
6D589189,221
9D5221289,221,98
0D7983221,98
\n", "
" ], "text/plain": [ " person time bsl row_number window\n", "8 A 0 115 1 115,237\n", "5 A 2 237 2 115,237,129\n", "2 A 3 129 3 237,129\n", "7 B 5 86 1 86,396\n", "3 B 6 396 2 86,396\n", "4 C 1 107 1 107,347\n", "1 C 9 347 2 107,347\n", "6 D 5 89 1 89,221\n", "9 D 5 221 2 89,221,98\n", "0 D 7 98 3 221,98" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_['window'] = (\n", " df_.groupby('person').\n", " apply(lambda x: pd.Series([x.bsl.iloc[(i-len(x)-1):(i+2)].values \n", " for i in pd.Series(np.arange(len(x)))]))\n", ").values\n", "df_['window'] = df_.window.apply(lambda x: ','.join(map(str, x)))\n", "df_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using window functions" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslrow_numberrankdense_rankpercent_rankcume_dist
A31291110.00.3
A22372110.00.3
A01153110.00.3
B63964420.33333333333333330.5
B5865420.33333333333333330.5
C93476630.55555555555555560.7
C11077630.55555555555555560.7
D7988840.77777777777777781.0
D5899840.77777777777777781.0
D522110840.77777777777777781.0
" ], "text/plain": [ "[('A', 3, 129, 1, 1, 1, 0.0, 0.3),\n", " ('A', 2, 237, 2, 1, 1, 0.0, 0.3),\n", " ('A', 0, 115, 3, 1, 1, 0.0, 0.3),\n", " ('B', 6, 396, 4, 4, 2, 0.3333333333333333, 0.5),\n", " ('B', 5, 86, 5, 4, 2, 0.3333333333333333, 0.5),\n", " ('C', 9, 347, 6, 6, 3, 0.5555555555555556, 0.7),\n", " ('C', 1, 107, 7, 6, 3, 0.5555555555555556, 0.7),\n", " ('D', 7, 98, 8, 8, 4, 0.7777777777777778, 1.0),\n", " ('D', 5, 89, 9, 8, 4, 0.7777777777777778, 1.0),\n", " ('D', 5, 221, 10, 8, 4, 0.7777777777777778, 1.0)]" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT person, time, bsl, \n", " row_number() OVER win AS row_number,\n", " rank() OVER win AS rank,\n", " dense_rank() OVER win AS dense_rank,\n", " percent_rank() OVER win AS percent_rank,\n", " cume_dist() OVER win AS cume_dist\n", "FROM df\n", "WINDOW win AS (ORDER BY person);" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslrow_numrankdense_rankpercent_rankcume_dist
2A31291110.0000000.3
5A22372110.0000000.3
8A01153110.0000000.3
3B63964420.3333330.5
7B5865420.3333330.5
1C93476630.5555560.7
4C11077630.5555560.7
0D7988840.7777781.0
6D5899840.7777781.0
9D522110840.7777781.0
\n", "
" ], "text/plain": [ " person time bsl row_num rank dense_rank percent_rank cume_dist\n", "2 A 3 129 1 1 1 0.000000 0.3\n", "5 A 2 237 2 1 1 0.000000 0.3\n", "8 A 0 115 3 1 1 0.000000 0.3\n", "3 B 6 396 4 4 2 0.333333 0.5\n", "7 B 5 86 5 4 2 0.333333 0.5\n", "1 C 9 347 6 6 3 0.555556 0.7\n", "4 C 1 107 7 6 3 0.555556 0.7\n", "0 D 7 98 8 8 4 0.777778 1.0\n", "6 D 5 89 9 8 4 0.777778 1.0\n", "9 D 5 221 10 8 4 0.777778 1.0" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ = df.copy()\n", "df_ = df_.sort_values(['person'])\n", "df_['row_num'] = df_['person'].expanding(1).count().astype('int')\n", "df_['rank'] = df_['person'].rank(method='min').astype('int')\n", "df_['dense_rank'] = df_['person'].rank(method='dense').astype('int')\n", "df_['percent_rank'] = (df_.person.rank(method='min') - 1) / (df_.person.count()-1)\n", "df_['cume_dist'] = df_['person'].rank(method='max', pct=True)\n", "df_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The NTILE window function" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
scores
06
17
23
36
49
52
63
70
88
96
\n", "
" ], "text/plain": [ " scores\n", "0 6\n", "1 7\n", "2 3\n", "3 6\n", "4 9\n", "5 2\n", "6 3\n", "7 0\n", "8 8\n", "9 6" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores = pd.DataFrame(dict(scores=np.random.randint(0, 10, 10)))\n", "scores" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n" ] }, { "data": { "text/plain": [ "'Persisted scores'" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql -p scores" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
scoresquartile
01
21
31
32
62
62
63
73
84
94
" ], "text/plain": [ "[(0, 1),\n", " (2, 1),\n", " (3, 1),\n", " (3, 2),\n", " (6, 2),\n", " (6, 2),\n", " (6, 3),\n", " (7, 3),\n", " (8, 4),\n", " (9, 4)]" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT scores, NTILE(4)\n", "OVER (\n", " -- PARTITION BY scores\n", " ORDER BY scores \n", ") AS quartile\n", "FROM scores" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: See [this](https://stackoverflow.com/questions/20726493/python-pandas-qcut-behavior-with-of-observations-not-divisible-by-of-bins) for an explanation of the difference between NTILE and `qcut`" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
scoresquantile
701
521
231
631
062
362
962
174
884
494
\n", "
" ], "text/plain": [ " scores quantile\n", "7 0 1\n", "5 2 1\n", "2 3 1\n", "6 3 1\n", "0 6 2\n", "3 6 2\n", "9 6 2\n", "1 7 4\n", "8 8 4\n", "4 9 4" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "quartiles = pd.qcut(scores.scores, 4, labels=[1,2,3,4])\n", "scores['quantile'] = quartiles\n", "scores.sort_values('scores')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using aggregate functions\n", "\n", "The `sqlite3` implementation of LAST_VALUE seems to be buggy." ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslbsl_sumbsl_avgbsl_minbsl_maxbsl_startbsl_end
A0115115115.0115115115115
A2237352176.0115237115237
A3129481160.33333333333334115237115129
B5868686.086868686
B6396482241.08639686396
C1107107107.0107107107107
C9347454227.0107347107347
D589310155.08922189221
D5221310155.08922189221
D798408136.0892218998
" ], "text/plain": [ "[('A', 0, 115, 115, 115.0, 115, 115, 115, 115),\n", " ('A', 2, 237, 352, 176.0, 115, 237, 115, 237),\n", " ('A', 3, 129, 481, 160.33333333333334, 115, 237, 115, 129),\n", " ('B', 5, 86, 86, 86.0, 86, 86, 86, 86),\n", " ('B', 6, 396, 482, 241.0, 86, 396, 86, 396),\n", " ('C', 1, 107, 107, 107.0, 107, 107, 107, 107),\n", " ('C', 9, 347, 454, 227.0, 107, 347, 107, 347),\n", " ('D', 5, 89, 310, 155.0, 89, 221, 89, 221),\n", " ('D', 5, 221, 310, 155.0, 89, 221, 89, 221),\n", " ('D', 7, 98, 408, 136.0, 89, 221, 89, 98)]" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT person, time, bsl,\n", " SUM(bsl) OVER win AS bsl_sum,\n", " AVG(bsl) OVER win AS bsl_avg,\n", " MIN(bsl) OVER win AS bsl_min,\n", " MAX(bsl) over win as bsl_max,\n", " FIRST_VALUE(bsl) OVER win as bsl_start,\n", " LAST_VALUE(bsl) OVER win as bsl_end\n", "FROM df\n", "WINDOW win AS (PARTITION BY person ORDER BY time);" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timebsl
person
A3129
B6396
C9347
D798
\n", "
" ], "text/plain": [ " time bsl\n", "person \n", "A 3 129\n", "B 6 396\n", "C 9 347\n", "D 7 98" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g = df.groupby('person')\n", "g.first()" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslbsl_sumbsl_avgbsl_minbsl_maxbsl_startbsl_end
8A0115115115.000000115115115129
5A2237352176.000000115237115129
2A3129481160.333333115237115129
7B5868686.000000868686396
3B6396482241.0000008639686396
4C1107107107.000000107107107347
1C9347454227.000000107347107347
6D5898989.00000089898998
9D5221310155.000000892218998
0D798408136.000000892218998
\n", "
" ], "text/plain": [ " person time bsl bsl_sum bsl_avg bsl_min bsl_max bsl_start bsl_end\n", "8 A 0 115 115 115.000000 115 115 115 129\n", "5 A 2 237 352 176.000000 115 237 115 129\n", "2 A 3 129 481 160.333333 115 237 115 129\n", "7 B 5 86 86 86.000000 86 86 86 396\n", "3 B 6 396 482 241.000000 86 396 86 396\n", "4 C 1 107 107 107.000000 107 107 107 347\n", "1 C 9 347 454 227.000000 107 347 107 347\n", "6 D 5 89 89 89.000000 89 89 89 98\n", "9 D 5 221 310 155.000000 89 221 89 98\n", "0 D 7 98 408 136.000000 89 221 89 98" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ = df.copy()\n", "df_ = df_.sort_values(['person', 'time'])\n", "df_['bsl_sum'] = df_.groupby(['person'])['bsl'].cumsum()\n", "df_['bsl_avg'] = df_.groupby(['person']).expanding().agg({'bsl': 'mean'}).values\n", "df_['bsl_min'] = df_.groupby(['person'])['bsl'].cummin()\n", "df_['bsl_max'] = df_.groupby(['person'])['bsl'].cummax()\n", "df_['bsl_start'] = df_.groupby(['person'])['bsl'].transform('first')\n", "df_['bsl_end'] = df_.groupby(['person'])['bsl'].transform('last')\n", "df_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using rows and range to constrain windows" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslvalsbsl_sumbsl_avg
A0115115, 237352176.0
A2237115, 237, 129481160.33333333333334
A3129237, 129366183.0
B58686, 396482241.0
B639686, 396482241.0
C1107107, 347454227.0
C9347107, 347454227.0
D58989, 221310155.0
D522189, 221, 98408136.0
D798221, 98319159.5
" ], "text/plain": [ "[('A', 0, 115, '115, 237', 352, 176.0),\n", " ('A', 2, 237, '115, 237, 129', 481, 160.33333333333334),\n", " ('A', 3, 129, '237, 129', 366, 183.0),\n", " ('B', 5, 86, '86, 396', 482, 241.0),\n", " ('B', 6, 396, '86, 396', 482, 241.0),\n", " ('C', 1, 107, '107, 347', 454, 227.0),\n", " ('C', 9, 347, '107, 347', 454, 227.0),\n", " ('D', 5, 89, '89, 221', 310, 155.0),\n", " ('D', 5, 221, '89, 221, 98', 408, 136.0),\n", " ('D', 7, 98, '221, 98', 319, 159.5)]" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT person, time, bsl, \n", " GROUP_CONCAT(CAST(bsl AS TEXT), ', ') OVER win AS vals,\n", " SUM(bsl) OVER win AS bsl_sum,\n", " AVG(bsl) OVER win AS bsl_avg\n", "FROM df\n", "WINDOW win AS (\n", " PARTITION BY person\n", " ORDER BY time\n", " ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING\n", ")\n", "ORDER BY person, time;" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslbsl_sumbsl_avg
8A0115352.0176.000000
5A2237481.0160.333333
2A3129366.0183.000000
7B586482.0241.000000
3B6396482.0241.000000
4C1107454.0227.000000
1C9347454.0227.000000
6D589310.0155.000000
9D5221408.0136.000000
0D798319.0159.500000
\n", "
" ], "text/plain": [ " person time bsl bsl_sum bsl_avg\n", "8 A 0 115 352.0 176.000000\n", "5 A 2 237 481.0 160.333333\n", "2 A 3 129 366.0 183.000000\n", "7 B 5 86 482.0 241.000000\n", "3 B 6 396 482.0 241.000000\n", "4 C 1 107 454.0 227.000000\n", "1 C 9 347 454.0 227.000000\n", "6 D 5 89 310.0 155.000000\n", "9 D 5 221 408.0 136.000000\n", "0 D 7 98 319.0 159.500000" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ = df.copy()\n", "df_ = df_.sort_values(['person', 'time'])\n", "df_['bsl_sum'] = (df_.groupby('person').\n", " bsl.apply(lambda x: x.rolling(3, min_periods=0, center=True).sum()))\n", "df_['bsl_avg'] = (df_.groupby('person').\n", " bsl.apply(lambda x: x.rolling(3, min_periods=0, center=True).mean()))\n", "df_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Frames using Rows and Range\n", "\n", "For Range, all rows with the same ORDER BY value are considered peers." ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslvalsbsl_sumbsl_avg
A0115115115115.0
A2237115, 237352176.0
A3129115, 237, 129481160.33333333333334
B586115, 237, 129, 86567141.75
B6396115, 237, 129, 86, 396963192.6
C1107115, 237, 129, 86, 396, 1071070178.33333333333334
C9347115, 237, 129, 86, 396, 107, 3471417202.42857142857142
D589115, 237, 129, 86, 396, 107, 347, 891506188.25
D5221115, 237, 129, 86, 396, 107, 347, 89, 2211727191.88888888888889
D798115, 237, 129, 86, 396, 107, 347, 89, 221, 981825182.5
" ], "text/plain": [ "[('A', 0, 115, '115', 115, 115.0),\n", " ('A', 2, 237, '115, 237', 352, 176.0),\n", " ('A', 3, 129, '115, 237, 129', 481, 160.33333333333334),\n", " ('B', 5, 86, '115, 237, 129, 86', 567, 141.75),\n", " ('B', 6, 396, '115, 237, 129, 86, 396', 963, 192.6),\n", " ('C', 1, 107, '115, 237, 129, 86, 396, 107', 1070, 178.33333333333334),\n", " ('C', 9, 347, '115, 237, 129, 86, 396, 107, 347', 1417, 202.42857142857142),\n", " ('D', 5, 89, '115, 237, 129, 86, 396, 107, 347, 89', 1506, 188.25),\n", " ('D', 5, 221, '115, 237, 129, 86, 396, 107, 347, 89, 221', 1727, 191.88888888888889),\n", " ('D', 7, 98, '115, 237, 129, 86, 396, 107, 347, 89, 221, 98', 1825, 182.5)]" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT person, time, bsl, \n", " GROUP_CONCAT(CAST(bsl AS TEXT), ', ') OVER win AS vals,\n", " SUM(bsl) OVER win AS bsl_sum,\n", " AVG(bsl) OVER win AS bsl_avg\n", "FROM df\n", "WINDOW win AS (\n", " ORDER BY person, time\n", " ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW\n", ")\n", "ORDER BY person, time;" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslbsl_sumbsl_avg
8A0115115115.000000
5A2237352176.000000
2A3129481160.333333
7B586567141.750000
3B6396963192.600000
4C11071070178.333333
1C93471417202.428571
6D5891506188.250000
9D52211727191.888889
0D7981825182.500000
\n", "
" ], "text/plain": [ " person time bsl bsl_sum bsl_avg\n", "8 A 0 115 115 115.000000\n", "5 A 2 237 352 176.000000\n", "2 A 3 129 481 160.333333\n", "7 B 5 86 567 141.750000\n", "3 B 6 396 963 192.600000\n", "4 C 1 107 1070 178.333333\n", "1 C 9 347 1417 202.428571\n", "6 D 5 89 1506 188.250000\n", "9 D 5 221 1727 191.888889\n", "0 D 7 98 1825 182.500000" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ = df.copy()\n", "df_ = df_.sort_values(['person', 'time'])\n", "df_['bsl_sum'] = df_.bsl.expanding().sum().astype('int')\n", "df_['bsl_avg'] = df_.bsl.expanding().mean()\n", "df_" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " sqlite://\n", " * sqlite:///\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslvalsbsl_sumbsl_avg
A0115129, 237, 115481160.33333333333334
A2237129, 237, 115481160.33333333333334
A3129129, 237, 115481160.33333333333334
B586129, 237, 115, 396, 86963192.6
B6396129, 237, 115, 396, 86963192.6
C1107129, 237, 115, 396, 86, 347, 1071417202.42857142857142
C9347129, 237, 115, 396, 86, 347, 1071417202.42857142857142
D589129, 237, 115, 396, 86, 347, 107, 98, 89, 2211825182.5
D5221129, 237, 115, 396, 86, 347, 107, 98, 89, 2211825182.5
D798129, 237, 115, 396, 86, 347, 107, 98, 89, 2211825182.5
" ], "text/plain": [ "[('A', 0, 115, '129, 237, 115', 481, 160.33333333333334),\n", " ('A', 2, 237, '129, 237, 115', 481, 160.33333333333334),\n", " ('A', 3, 129, '129, 237, 115', 481, 160.33333333333334),\n", " ('B', 5, 86, '129, 237, 115, 396, 86', 963, 192.6),\n", " ('B', 6, 396, '129, 237, 115, 396, 86', 963, 192.6),\n", " ('C', 1, 107, '129, 237, 115, 396, 86, 347, 107', 1417, 202.42857142857142),\n", " ('C', 9, 347, '129, 237, 115, 396, 86, 347, 107', 1417, 202.42857142857142),\n", " ('D', 5, 89, '129, 237, 115, 396, 86, 347, 107, 98, 89, 221', 1825, 182.5),\n", " ('D', 5, 221, '129, 237, 115, 396, 86, 347, 107, 98, 89, 221', 1825, 182.5),\n", " ('D', 7, 98, '129, 237, 115, 396, 86, 347, 107, 98, 89, 221', 1825, 182.5)]" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "\n", "SELECT person, time, bsl, \n", " GROUP_CONCAT(CAST(bsl AS TEXT), ', ') OVER win AS vals,\n", " SUM(bsl) OVER win AS bsl_sum,\n", " AVG(bsl) OVER win AS bsl_avg\n", "FROM df\n", "WINDOW win AS (\n", " ORDER BY person\n", " RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW\n", ")\n", "ORDER BY person, time;" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
persontimebslbsl_sum
8A0115160.333333
5A2237160.333333
2A3129160.333333
7B586192.600000
3B6396192.600000
4C1107202.428571
1C9347202.428571
6D589182.500000
9D5221182.500000
0D798182.500000
\n", "
" ], "text/plain": [ " person time bsl bsl_sum\n", "8 A 0 115 160.333333\n", "5 A 2 237 160.333333\n", "2 A 3 129 160.333333\n", "7 B 5 86 192.600000\n", "3 B 6 396 192.600000\n", "4 C 1 107 202.428571\n", "1 C 9 347 202.428571\n", "6 D 5 89 182.500000\n", "9 D 5 221 182.500000\n", "0 D 7 98 182.500000" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ = df.copy()\n", "df_ = df_.sort_values(['person', 'time'])\n", "df_['bsl_sum'] = df_['bsl'].cumsum().groupby(df_['person']).transform('last')\n", "df_['bsl_sum'] = df_['bsl'].expanding().mean().groupby(df_['person']).transform('last')\n", "df_" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 }