{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SQL Queries 02"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"%load_ext sql"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"%sql sqlite://"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create toy data set"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from faker import Faker"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"fake = Faker()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"names = list(set([fake.name() for i in range(100)]))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(123)\n",
"\n",
"n1 = 30\n",
"bios821 = pd.DataFrame(dict(\n",
" name=np.random.choice(names, n1, replace=False), \n",
" grade=np.random.randint(50, 101, n1)))\n",
"\n",
"n2 = 30\n",
"bios823 = pd.DataFrame(dict(\n",
" name=np.random.choice(names, n2, replace=False), \n",
" grade=np.random.randint(50, 101, n2)))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n",
"Done.\n"
]
},
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"DROP TABLE IF EXISTS bios821;\n",
"DROP TABLE IF EXISTs bios823"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n"
]
},
{
"data": {
"text/plain": [
"'Persisted bios821'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql -p bios821"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n"
]
},
{
"data": {
"text/plain": [
"'Persisted bios823'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql -p bios823"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"
\n",
" \n",
" index | \n",
" name | \n",
" grade | \n",
"
\n",
" \n",
" 0 | \n",
" Mrs. Michele Ingram | \n",
" 65 | \n",
"
\n",
" \n",
" 1 | \n",
" Mark Shaffer | \n",
" 90 | \n",
"
\n",
" \n",
" 2 | \n",
" Alexandra James | \n",
" 75 | \n",
"
\n",
"
"
],
"text/plain": [
"[(0, 'Mrs. Michele Ingram', 65),\n",
" (1, 'Mark Shaffer', 90),\n",
" (2, 'Alexandra James', 75)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql SELECT * FROM bios821 LIMIT 3"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" index | \n",
" name | \n",
" grade | \n",
"
\n",
" \n",
" 0 | \n",
" Loretta Smith | \n",
" 59 | \n",
"
\n",
" \n",
" 1 | \n",
" Allison Kirby | \n",
" 90 | \n",
"
\n",
" \n",
" 2 | \n",
" Robert Wong | \n",
" 91 | \n",
"
\n",
"
"
],
"text/plain": [
"[(0, 'Loretta Smith', 59), (1, 'Allison Kirby', 90), (2, 'Robert Wong', 91)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql SELECT * FROM bios823 LIMIT 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Subqueries\n",
"\n",
"There are 3 ways to use a sub-query:\n",
"\n",
"- with `in`\n",
"- with `exists`\n",
"- with a comparison operator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**1**. What students take both bios821 and bios823?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using set operations."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" name | \n",
"
\n",
" \n",
" Alexandra James | \n",
"
\n",
" \n",
" Cindy Clark | \n",
"
\n",
" \n",
" Dr. Scott Mendoza MD | \n",
"
\n",
" \n",
" Jennifer Hunt | \n",
"
\n",
" \n",
" Kathy Cochran | \n",
"
\n",
" \n",
" Nancy Castaneda | \n",
"
\n",
" \n",
" Robert Wong | \n",
"
\n",
" \n",
" Sharon Porter | \n",
"
\n",
"
"
],
"text/plain": [
"[('Alexandra James',),\n",
" ('Cindy Clark',),\n",
" ('Dr. Scott Mendoza MD',),\n",
" ('Jennifer Hunt',),\n",
" ('Kathy Cochran',),\n",
" ('Nancy Castaneda',),\n",
" ('Robert Wong',),\n",
" ('Sharon Porter',)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT name FROM bios821\n",
"INTERSECT\n",
"SELECT name FROM bios823"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using EQUIJOIN."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" name | \n",
"
\n",
" \n",
" Alexandra James | \n",
"
\n",
" \n",
" Nancy Castaneda | \n",
"
\n",
" \n",
" Jennifer Hunt | \n",
"
\n",
" \n",
" Dr. Scott Mendoza MD | \n",
"
\n",
" \n",
" Kathy Cochran | \n",
"
\n",
" \n",
" Cindy Clark | \n",
"
\n",
" \n",
" Robert Wong | \n",
"
\n",
" \n",
" Sharon Porter | \n",
"
\n",
"
"
],
"text/plain": [
"[('Alexandra James',),\n",
" ('Nancy Castaneda',),\n",
" ('Jennifer Hunt',),\n",
" ('Dr. Scott Mendoza MD',),\n",
" ('Kathy Cochran',),\n",
" ('Cindy Clark',),\n",
" ('Robert Wong',),\n",
" ('Sharon Porter',)]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT DISTINCT bios821.name \n",
"FROM bios821, bios823\n",
"WHERE bios821.name = bios823.name"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using sub-query."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" name | \n",
"
\n",
" \n",
" Alexandra James | \n",
"
\n",
" \n",
" Nancy Castaneda | \n",
"
\n",
" \n",
" Jennifer Hunt | \n",
"
\n",
" \n",
" Dr. Scott Mendoza MD | \n",
"
\n",
" \n",
" Kathy Cochran | \n",
"
\n",
" \n",
" Cindy Clark | \n",
"
\n",
" \n",
" Robert Wong | \n",
"
\n",
" \n",
" Sharon Porter | \n",
"
\n",
"
"
],
"text/plain": [
"[('Alexandra James',),\n",
" ('Nancy Castaneda',),\n",
" ('Jennifer Hunt',),\n",
" ('Dr. Scott Mendoza MD',),\n",
" ('Kathy Cochran',),\n",
" ('Cindy Clark',),\n",
" ('Robert Wong',),\n",
" ('Sharon Porter',)]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT DISTINCT name FROM bios821\n",
"WHERE name IN (\n",
" SELECT name FROM bios823\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using correlated sub-query."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" name | \n",
"
\n",
" \n",
" Alexandra James | \n",
"
\n",
" \n",
" Nancy Castaneda | \n",
"
\n",
" \n",
" Jennifer Hunt | \n",
"
\n",
" \n",
" Dr. Scott Mendoza MD | \n",
"
\n",
" \n",
" Kathy Cochran | \n",
"
\n",
" \n",
" Cindy Clark | \n",
"
\n",
" \n",
" Robert Wong | \n",
"
\n",
" \n",
" Sharon Porter | \n",
"
\n",
"
"
],
"text/plain": [
"[('Alexandra James',),\n",
" ('Nancy Castaneda',),\n",
" ('Jennifer Hunt',),\n",
" ('Dr. Scott Mendoza MD',),\n",
" ('Kathy Cochran',),\n",
" ('Cindy Clark',),\n",
" ('Robert Wong',),\n",
" ('Sharon Porter',)]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT DISTINCT name FROM bios821\n",
"WHERE EXISTS (\n",
" SELECT * FROM bios823 \n",
" WHERE bios821.name = bios823.name\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**2**. Which students taking bios823 have grades higher than the average grade of bios821? \n",
"\n",
"- Do this for all students\n",
"- Do this just for the subet of patinets taking both classes"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" name | \n",
" grade | \n",
"
\n",
" \n",
" Allison Kirby | \n",
" 90 | \n",
"
\n",
" \n",
" Robert Wong | \n",
" 91 | \n",
"
\n",
" \n",
" Steven Chavez | \n",
" 90 | \n",
"
\n",
" \n",
" Jennifer Paul | \n",
" 75 | \n",
"
\n",
" \n",
" Allison Fletcher | \n",
" 100 | \n",
"
\n",
" \n",
" Jennifer Hunt | \n",
" 86 | \n",
"
\n",
" \n",
" Dr. Amanda Ryan | \n",
" 94 | \n",
"
\n",
" \n",
" Terri Perez | \n",
" 78 | \n",
"
\n",
" \n",
" Julie Nguyen | \n",
" 85 | \n",
"
\n",
" \n",
" Brian Garrison | \n",
" 89 | \n",
"
\n",
" \n",
" Hannah Wagner | \n",
" 94 | \n",
"
\n",
" \n",
" Alexandra Wallace | \n",
" 90 | \n",
"
\n",
" \n",
" Veronica Herrera | \n",
" 78 | \n",
"
\n",
" \n",
" Sharon Porter | \n",
" 98 | \n",
"
\n",
" \n",
" Manuel Reyes | \n",
" 96 | \n",
"
\n",
" \n",
" Vicki Johnson | \n",
" 99 | \n",
"
\n",
" \n",
" John Williams | \n",
" 96 | \n",
"
\n",
"
"
],
"text/plain": [
"[('Allison Kirby', 90),\n",
" ('Robert Wong', 91),\n",
" ('Steven Chavez', 90),\n",
" ('Jennifer Paul', 75),\n",
" ('Allison Fletcher', 100),\n",
" ('Jennifer Hunt', 86),\n",
" ('Dr. Amanda Ryan', 94),\n",
" ('Terri Perez', 78),\n",
" ('Julie Nguyen', 85),\n",
" ('Brian Garrison', 89),\n",
" ('Hannah Wagner', 94),\n",
" ('Alexandra Wallace', 90),\n",
" ('Veronica Herrera', 78),\n",
" ('Sharon Porter', 98),\n",
" ('Manuel Reyes', 96),\n",
" ('Vicki Johnson', 99),\n",
" ('John Williams', 96)]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT DISTINCT name, grade FROM bios823\n",
"WHERE grade > (\n",
" SELECT AVG(grade) FROM bios821 \n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using a view."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"CREATE VIEW view_common AS \n",
" SELECT DISTINCT\n",
" bios821.name, \n",
" bios821.grade as grade_821, \n",
" bios823.grade as grade_823 \n",
" FROM \n",
" bios821, bios823\n",
" WHERE \n",
" bios821.name = bios823.name\n",
" AND bios821.name IN (\n",
" SELECT name \n",
" FROM bios823 \n",
" ) "
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" name | \n",
" grade_823 | \n",
"
\n",
" \n",
" Alexandra James | \n",
" 73 | \n",
"
\n",
" \n",
" Nancy Castaneda | \n",
" 67 | \n",
"
\n",
" \n",
" Jennifer Hunt | \n",
" 86 | \n",
"
\n",
" \n",
" Kathy Cochran | \n",
" 74 | \n",
"
\n",
" \n",
" Cindy Clark | \n",
" 73 | \n",
"
\n",
" \n",
" Robert Wong | \n",
" 91 | \n",
"
\n",
" \n",
" Sharon Porter | \n",
" 98 | \n",
"
\n",
"
"
],
"text/plain": [
"[('Alexandra James', 73),\n",
" ('Nancy Castaneda', 67),\n",
" ('Jennifer Hunt', 86),\n",
" ('Kathy Cochran', 74),\n",
" ('Cindy Clark', 73),\n",
" ('Robert Wong', 91),\n",
" ('Sharon Porter', 98)]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT name, grade_823\n",
"FROM view_common\n",
"WHERE grade_823 > (\n",
" SELECT AVG(grade_821) \n",
" FROM view_common\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using `pandas`."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"df = bios821.merge(bios823, on='name', suffixes=['_821', '_823'])"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" grade_821 | \n",
" grade_823 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Alexandra James | \n",
" 75 | \n",
" 73 | \n",
"
\n",
" \n",
" 1 | \n",
" Nancy Castaneda | \n",
" 50 | \n",
" 67 | \n",
"
\n",
" \n",
" 2 | \n",
" Jennifer Hunt | \n",
" 51 | \n",
" 86 | \n",
"
\n",
" \n",
" 3 | \n",
" Dr. Scott Mendoza MD | \n",
" 88 | \n",
" 64 | \n",
"
\n",
" \n",
" 4 | \n",
" Kathy Cochran | \n",
" 51 | \n",
" 74 | \n",
"
\n",
" \n",
" 5 | \n",
" Cindy Clark | \n",
" 56 | \n",
" 73 | \n",
"
\n",
" \n",
" 6 | \n",
" Robert Wong | \n",
" 63 | \n",
" 91 | \n",
"
\n",
" \n",
" 7 | \n",
" Sharon Porter | \n",
" 78 | \n",
" 98 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name grade_821 grade_823\n",
"0 Alexandra James 75 73\n",
"1 Nancy Castaneda 50 67\n",
"2 Jennifer Hunt 51 86\n",
"3 Dr. Scott Mendoza MD 88 64\n",
"4 Kathy Cochran 51 74\n",
"5 Cindy Clark 56 73\n",
"6 Robert Wong 63 91\n",
"7 Sharon Porter 78 98"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" grade_821 | \n",
" grade_823 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Alexandra James | \n",
" 75 | \n",
" 73 | \n",
"
\n",
" \n",
" 1 | \n",
" Nancy Castaneda | \n",
" 50 | \n",
" 67 | \n",
"
\n",
" \n",
" 2 | \n",
" Jennifer Hunt | \n",
" 51 | \n",
" 86 | \n",
"
\n",
" \n",
" 4 | \n",
" Kathy Cochran | \n",
" 51 | \n",
" 74 | \n",
"
\n",
" \n",
" 5 | \n",
" Cindy Clark | \n",
" 56 | \n",
" 73 | \n",
"
\n",
" \n",
" 6 | \n",
" Robert Wong | \n",
" 63 | \n",
" 91 | \n",
"
\n",
" \n",
" 7 | \n",
" Sharon Porter | \n",
" 78 | \n",
" 98 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name grade_821 grade_823\n",
"0 Alexandra James 75 73\n",
"1 Nancy Castaneda 50 67\n",
"2 Jennifer Hunt 51 86\n",
"4 Kathy Cochran 51 74\n",
"5 Cindy Clark 56 73\n",
"6 Robert Wong 63 91\n",
"7 Sharon Porter 78 98"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.grade_823 > df.grade_821.mean()]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Common table expressions (CTE)\n",
"\n",
"CTEs are temporary tables created for a specific query."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Basic syntax for CTE"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" name | \n",
" age | \n",
"
\n",
" \n",
" Bob | \n",
" 23 | \n",
"
\n",
"
"
],
"text/plain": [
"[('Bob', 23)]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"WITH\n",
" t1(name) AS (SELECT 'Bob'),\n",
" t2(age) AS (SELECT 23)\n",
"SELECT * from t1, t2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using CTEs to solve previous problem"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" name | \n",
" grade_823 | \n",
"
\n",
" \n",
" Alexandra James | \n",
" 73 | \n",
"
\n",
" \n",
" Nancy Castaneda | \n",
" 67 | \n",
"
\n",
" \n",
" Jennifer Hunt | \n",
" 86 | \n",
"
\n",
" \n",
" Kathy Cochran | \n",
" 74 | \n",
"
\n",
" \n",
" Cindy Clark | \n",
" 73 | \n",
"
\n",
" \n",
" Robert Wong | \n",
" 91 | \n",
"
\n",
" \n",
" Sharon Porter | \n",
" 98 | \n",
"
\n",
"
"
],
"text/plain": [
"[('Alexandra James', 73),\n",
" ('Nancy Castaneda', 67),\n",
" ('Jennifer Hunt', 86),\n",
" ('Kathy Cochran', 74),\n",
" ('Cindy Clark', 73),\n",
" ('Robert Wong', 91),\n",
" ('Sharon Porter', 98)]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"WITH common AS (\n",
" SELECT DISTINCT \n",
" bios821.name, \n",
" bios821.grade as grade_821, \n",
" bios823.grade as grade_823 \n",
" FROM \n",
" bios821,\n",
" bios823\n",
" WHERE bios821.name IN (\n",
" SELECT name FROM bios823\n",
" ) AND\n",
" bios821.name = bios823.name\n",
")\n",
"SELECT name, grade_823\n",
"FROM common\n",
"WHERE grade_823 > (\n",
" SELECT AVG(grade_821) \n",
" FROM common\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Recursive CTEs\n",
"\n",
"Image source: https://cdn.sqlservertutorial.net/wp-content/uploads/SQL-Server-Recursive-CTE-execution-flow.png\n",
""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### As a generator"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" n | \n",
"
\n",
" \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
"
\n",
" \n",
" 4 | \n",
"
\n",
" \n",
" 5 | \n",
"
\n",
"
"
],
"text/plain": [
"[(1,), (2,), (3,), (4,), (5,)]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"WITH RECURSIVE count(n) AS (\n",
" SELECT 1\n",
" UNION ALL\n",
" SELECT n+1 FROM count WHERe n < 5\n",
")\n",
"SELECT * FROM count"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Generating dates"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" Homework due | \n",
"
\n",
" \n",
" 2019-01-15 | \n",
"
\n",
" \n",
" 2019-02-15 | \n",
"
\n",
" \n",
" 2019-03-15 | \n",
"
\n",
" \n",
" 2019-04-15 | \n",
"
\n",
" \n",
" 2019-05-15 | \n",
"
\n",
" \n",
" 2019-06-15 | \n",
"
\n",
" \n",
" 2019-07-15 | \n",
"
\n",
" \n",
" 2019-08-15 | \n",
"
\n",
" \n",
" 2019-09-15 | \n",
"
\n",
" \n",
" 2019-10-15 | \n",
"
\n",
" \n",
" 2019-11-15 | \n",
"
\n",
" \n",
" 2019-12-15 | \n",
"
\n",
"
"
],
"text/plain": [
"[('2019-01-15',),\n",
" ('2019-02-15',),\n",
" ('2019-03-15',),\n",
" ('2019-04-15',),\n",
" ('2019-05-15',),\n",
" ('2019-06-15',),\n",
" ('2019-07-15',),\n",
" ('2019-08-15',),\n",
" ('2019-09-15',),\n",
" ('2019-10-15',),\n",
" ('2019-11-15',),\n",
" ('2019-12-15',)]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"WITH RECURSIVE make_dates(d) AS (\n",
" SELECT '2019-01-15'\n",
" UNION ALL\n",
" SELECT DATE(d, '+1 MONTHS') FROM make_dates WHERe d < '2019-12'\n",
")\n",
"SELECT d AS \"Homework due\" FROM make_dates"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### CTEs to do programming in SQL(!)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" n | \n",
" f | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" 6 | \n",
"
\n",
" \n",
" 4 | \n",
" 24 | \n",
"
\n",
" \n",
" 5 | \n",
" 120 | \n",
"
\n",
"
"
],
"text/plain": [
"[(1, 1), (2, 2), (3, 6), (4, 24), (5, 120)]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"WITH RECURSIVE fact(n, f) AS (\n",
" SELECT 1, 1\n",
" UNION ALL\n",
" SELECT n+1, (n+1) * f \n",
" FROM fact \n",
" WHERe n < 5\n",
")\n",
"SELECT * FROM fact\n",
"LIMIT 5"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite://\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" n | \n",
" f1 | \n",
" f2 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" 5 | \n",
" 3 | \n",
" 5 | \n",
"
\n",
" \n",
" 6 | \n",
" 5 | \n",
" 8 | \n",
"
\n",
" \n",
" 7 | \n",
" 8 | \n",
" 13 | \n",
"
\n",
" \n",
" 8 | \n",
" 13 | \n",
" 21 | \n",
"
\n",
" \n",
" 9 | \n",
" 21 | \n",
" 34 | \n",
"
\n",
" \n",
" 10 | \n",
" 34 | \n",
" 55 | \n",
"
\n",
"
"
],
"text/plain": [
"[(1, 0, 1),\n",
" (2, 1, 1),\n",
" (3, 1, 2),\n",
" (4, 2, 3),\n",
" (5, 3, 5),\n",
" (6, 5, 8),\n",
" (7, 8, 13),\n",
" (8, 13, 21),\n",
" (9, 21, 34),\n",
" (10, 34, 55)]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"WITH RECURSIVE fib(n, f1, f2) AS (\n",
" SELECT 1, 0, 1\n",
" UNION ALL\n",
" SELECT n+1, f2, f1+f2\n",
" FROM fib \n",
" WHERe n < 10\n",
")\n",
"SELECT * FROM fib\n",
"LIMIT 10"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### A common use of CTEs is to work with naturally recursive structures (trees or graphs)\n",
"\n",
"Suppose we have a cell subset taxonomy with parent-child relations. We want to find all `descendants` of a particular cell type. This is hard to do without recursive CTEs in SQL. "
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"15 rows affected.\n"
]
},
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql sqlite:///\n",
"\n",
"DROP TABLE IF EXISTS cell;\n",
"\n",
"CREATE TABLE cell(\n",
" cell_id integer PRIMARY KEY,\n",
" name VARCHAR(30),\n",
" parent_id integer\n",
");\n",
"\n",
"INSERT INTO cell (\n",
" cell_id,\n",
" name,\n",
" parent_id\n",
")\n",
"VALUES\n",
" (1, 'WBC', NULL),\n",
" (2, 'Lymphocyte', 1),\n",
" (3, 'T Cell', 2),\n",
" (4, 'B Cell', 2),\n",
" (5, 'NK Cell', 2),\n",
" (6, 'T helper cell', 3),\n",
" (7, 'T cytotoxic cell', 3),\n",
" (8, 'T regulatory cell', 3),\n",
" (9, 'Naive B cell', 4),\n",
" (10, 'Memory B cell', 4),\n",
" (11, 'Plasma cell', 4),\n",
" (12, 'Granulocyte', 1),\n",
" (13, 'Basophil', 12),\n",
" (14, 'Eosinophil', 12),\n",
" (15, 'Neutrophil', 12)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" cell_id | \n",
" name | \n",
" parent_id | \n",
"
\n",
" \n",
" 1 | \n",
" WBC | \n",
" None | \n",
"
\n",
" \n",
" 2 | \n",
" Lymphocyte | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" T Cell | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" B Cell | \n",
" 2 | \n",
"
\n",
" \n",
" 5 | \n",
" NK Cell | \n",
" 2 | \n",
"
\n",
" \n",
" 6 | \n",
" T helper cell | \n",
" 3 | \n",
"
\n",
" \n",
" 7 | \n",
" T cytotoxic cell | \n",
" 3 | \n",
"
\n",
" \n",
" 8 | \n",
" T regulatory cell | \n",
" 3 | \n",
"
\n",
" \n",
" 9 | \n",
" Naive B cell | \n",
" 4 | \n",
"
\n",
" \n",
" 10 | \n",
" Memory B cell | \n",
" 4 | \n",
"
\n",
"
"
],
"text/plain": [
"[(1, 'WBC', None),\n",
" (2, 'Lymphocyte', 1),\n",
" (3, 'T Cell', 2),\n",
" (4, 'B Cell', 2),\n",
" (5, 'NK Cell', 2),\n",
" (6, 'T helper cell', 3),\n",
" (7, 'T cytotoxic cell', 3),\n",
" (8, 'T regulatory cell', 3),\n",
" (9, 'Naive B cell', 4),\n",
" (10, 'Memory B cell', 4)]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT * FROM cell LIMIT 10"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" cell_id | \n",
" name | \n",
" parent_id | \n",
"
\n",
" \n",
" 3 | \n",
" T Cell | \n",
" 2 | \n",
"
\n",
"
"
],
"text/plain": [
"[(3, 'T Cell', 2)]"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
" SELECT cell_id, name, parent_id\n",
" FROM cell\n",
" WHERE name='T Cell'"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"target = 'Lymphocyte'"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" cell_id | \n",
" name | \n",
" parent_id | \n",
"
\n",
" \n",
" 2 | \n",
" Lymphocyte | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" B Cell | \n",
" 2 | \n",
"
\n",
" \n",
" 5 | \n",
" NK Cell | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" T Cell | \n",
" 2 | \n",
"
\n",
" \n",
" 10 | \n",
" Memory B cell | \n",
" 4 | \n",
"
\n",
" \n",
" 9 | \n",
" Naive B cell | \n",
" 4 | \n",
"
\n",
" \n",
" 11 | \n",
" Plasma cell | \n",
" 4 | \n",
"
\n",
" \n",
" 7 | \n",
" T cytotoxic cell | \n",
" 3 | \n",
"
\n",
" \n",
" 6 | \n",
" T helper cell | \n",
" 3 | \n",
"
\n",
" \n",
" 8 | \n",
" T regulatory cell | \n",
" 3 | \n",
"
\n",
"
"
],
"text/plain": [
"[(2, 'Lymphocyte', 1),\n",
" (4, 'B Cell', 2),\n",
" (5, 'NK Cell', 2),\n",
" (3, 'T Cell', 2),\n",
" (10, 'Memory B cell', 4),\n",
" (9, 'Naive B cell', 4),\n",
" (11, 'Plasma cell', 4),\n",
" (7, 'T cytotoxic cell', 3),\n",
" (6, 'T helper cell', 3),\n",
" (8, 'T regulatory cell', 3)]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"WITH RECURSIVE lineage AS (\n",
" SELECT \n",
" cell_id, \n",
" name, \n",
" parent_id\n",
" FROM \n",
" cell\n",
" WHERE \n",
" name=:target\n",
"UNION ALL\n",
" SELECT\n",
" c.cell_id, \n",
" c.name, \n",
" c.parent_id\n",
" FROM \n",
" cell c\n",
" INNER JOIN \n",
" lineage l \n",
" ON \n",
" l.cell_id = c.parent_id\n",
")\n",
"SELECT * FROM lineage \n",
"LIMIT 10"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Window Functions\n",
"\n",
"- [Official Refs](https://www.sqlite.org/windowfunctions.html)\n",
""
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(23)\n",
"n = 10\n",
"df = pd.DataFrame(\n",
" dict(person=np.random.choice(['A', 'B', 'C', 'D'], n,),\n",
" time=np.random.randint(0, 10, n), \n",
" bsl=np.random.randint(50, 400, n)))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" person | \n",
" time | \n",
" bsl | \n",
"
\n",
" \n",
" \n",
" \n",
" 8 | \n",
" A | \n",
" 0 | \n",
" 115 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" 2 | \n",
" 237 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 3 | \n",
" 129 | \n",
"
\n",
" \n",
" 7 | \n",
" B | \n",
" 5 | \n",
" 86 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" 6 | \n",
" 396 | \n",
"
\n",
" \n",
" 4 | \n",
" C | \n",
" 1 | \n",
" 107 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" 9 | \n",
" 347 | \n",
"
\n",
" \n",
" 6 | \n",
" D | \n",
" 5 | \n",
" 89 | \n",
"
\n",
" \n",
" 9 | \n",
" D | \n",
" 5 | \n",
" 221 | \n",
"
\n",
" \n",
" 0 | \n",
" D | \n",
" 7 | \n",
" 98 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person time bsl\n",
"8 A 0 115\n",
"5 A 2 237\n",
"2 A 3 129\n",
"7 B 5 86\n",
"3 B 6 396\n",
"4 C 1 107\n",
"1 C 9 347\n",
"6 D 5 89\n",
"9 D 5 221\n",
"0 D 7 98"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values(['person', 'time'])"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql DROP TABLE IF EXISTS df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Magic shortcut to creating a database table from `pandas` DataFrame."
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n"
]
},
{
"data": {
"text/plain": [
"'Persisted df'"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql -p df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Over creates widows\n",
"\n",
"I've given `pandas` equivalents where possible, but sometimes they are rather unnatural."
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" person | \n",
" time | \n",
" bsl | \n",
" row_num | \n",
"
\n",
" \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 1 | \n",
"
\n",
" \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 2 | \n",
"
\n",
" \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 3 | \n",
"
\n",
" \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 4 | \n",
"
\n",
" \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 5 | \n",
"
\n",
" \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 6 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 7 | \n",
"
\n",
" \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 8 | \n",
"
\n",
" \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 9 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 10 | \n",
"
\n",
"
"
],
"text/plain": [
"[('D', 7, 98, 1),\n",
" ('C', 9, 347, 2),\n",
" ('A', 3, 129, 3),\n",
" ('B', 6, 396, 4),\n",
" ('C', 1, 107, 5),\n",
" ('A', 2, 237, 6),\n",
" ('D', 5, 89, 7),\n",
" ('B', 5, 86, 8),\n",
" ('A', 0, 115, 9),\n",
" ('D', 5, 221, 10)]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT person, time, bsl, row_number() \n",
"OVER () as row_num\n",
"FROM df;"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" person | \n",
" time | \n",
" bsl | \n",
" row_num | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 3 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 5 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 6 | \n",
"
\n",
" \n",
" 6 | \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 7 | \n",
"
\n",
" \n",
" 7 | \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 8 | \n",
"
\n",
" \n",
" 8 | \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 9 | \n",
"
\n",
" \n",
" 9 | \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 10 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person time bsl row_num\n",
"0 D 7 98 1\n",
"1 C 9 347 2\n",
"2 A 3 129 3\n",
"3 B 6 396 4\n",
"4 C 1 107 5\n",
"5 A 2 237 6\n",
"6 D 5 89 7\n",
"7 B 5 86 8\n",
"8 A 0 115 9\n",
"9 D 5 221 10"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.assign(row_num = df.person.expanding(1).count().astype('int'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lag and Lead"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" person | \n",
" time | \n",
" bsl | \n",
" lag1 | \n",
" lead2 | \n",
"
\n",
" \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" None | \n",
" 129 | \n",
"
\n",
" \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 98 | \n",
" 396 | \n",
"
\n",
" \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 347 | \n",
" 107 | \n",
"
\n",
" \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 129 | \n",
" 237 | \n",
"
\n",
" \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 396 | \n",
" 89 | \n",
"
\n",
" \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 107 | \n",
" 86 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 237 | \n",
" 115 | \n",
"
\n",
" \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 89 | \n",
" 221 | \n",
"
\n",
" \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 86 | \n",
" None | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 115 | \n",
" None | \n",
"
\n",
"
"
],
"text/plain": [
"[('D', 7, 98, None, 129),\n",
" ('C', 9, 347, 98, 396),\n",
" ('A', 3, 129, 347, 107),\n",
" ('B', 6, 396, 129, 237),\n",
" ('C', 1, 107, 396, 89),\n",
" ('A', 2, 237, 107, 86),\n",
" ('D', 5, 89, 237, 115),\n",
" ('B', 5, 86, 89, 221),\n",
" ('A', 0, 115, 86, None),\n",
" ('D', 5, 221, 115, None)]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT person, time, bsl, \n",
"lag(bsl, 1) OVER () as lag1,\n",
"lead(bsl, 2) OVER () as lead2\n",
"FROM df;"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" person | \n",
" time | \n",
" bsl | \n",
" lag1 | \n",
" lead2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" NaN | \n",
" 129.0 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 98.0 | \n",
" 396.0 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 347.0 | \n",
" 107.0 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 129.0 | \n",
" 237.0 | \n",
"
\n",
" \n",
" 4 | \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 396.0 | \n",
" 89.0 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 107.0 | \n",
" 86.0 | \n",
"
\n",
" \n",
" 6 | \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 237.0 | \n",
" 115.0 | \n",
"
\n",
" \n",
" 7 | \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 89.0 | \n",
" 221.0 | \n",
"
\n",
" \n",
" 8 | \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 86.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 9 | \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 115.0 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person time bsl lag1 lead2\n",
"0 D 7 98 NaN 129.0\n",
"1 C 9 347 98.0 396.0\n",
"2 A 3 129 347.0 107.0\n",
"3 B 6 396 129.0 237.0\n",
"4 C 1 107 396.0 89.0\n",
"5 A 2 237 107.0 86.0\n",
"6 D 5 89 237.0 115.0\n",
"7 B 5 86 89.0 221.0\n",
"8 A 0 115 86.0 NaN\n",
"9 D 5 221 115.0 NaN"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ = df.copy()\n",
"df_['lag1'] = df_.bsl.shift(1)\n",
"df_['lead2'] = df_.bsl.shift(-2)\n",
"df_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Order by"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" person | \n",
" time | \n",
" bsl | \n",
" row_number() OVER (ORDER BY person, time) | \n",
"
\n",
" \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 1 | \n",
"
\n",
" \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 2 | \n",
"
\n",
" \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 3 | \n",
"
\n",
" \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 4 | \n",
"
\n",
" \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 5 | \n",
"
\n",
" \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 6 | \n",
"
\n",
" \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 7 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 8 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 9 | \n",
"
\n",
" \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 10 | \n",
"
\n",
"
"
],
"text/plain": [
"[('A', 0, 115, 1),\n",
" ('A', 2, 237, 2),\n",
" ('A', 3, 129, 3),\n",
" ('B', 5, 86, 4),\n",
" ('B', 6, 396, 5),\n",
" ('C', 1, 107, 6),\n",
" ('C', 9, 347, 7),\n",
" ('D', 5, 89, 8),\n",
" ('D', 5, 221, 9),\n",
" ('D', 7, 98, 10)]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT person, time, bsl, row_number() \n",
"OVER (ORDER BY person, time)\n",
"FROM df;"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" person | \n",
" time | \n",
" bsl | \n",
" row_num | \n",
"
\n",
" \n",
" \n",
" \n",
" 8 | \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 3 | \n",
"
\n",
" \n",
" 7 | \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 4 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 5 | \n",
"
\n",
" \n",
" 4 | \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 6 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 7 | \n",
"
\n",
" \n",
" 6 | \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 8 | \n",
"
\n",
" \n",
" 9 | \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 9 | \n",
"
\n",
" \n",
" 0 | \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 10 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person time bsl row_num\n",
"8 A 0 115 1\n",
"5 A 2 237 2\n",
"2 A 3 129 3\n",
"7 B 5 86 4\n",
"3 B 6 396 5\n",
"4 C 1 107 6\n",
"1 C 9 347 7\n",
"6 D 5 89 8\n",
"9 D 5 221 9\n",
"0 D 7 98 10"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ = df.copy()\n",
"df_ = df_.sort_values(['person', 'time'])\n",
"df_.assign(row_num = df_.person.expanding(1).count().astype('int'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Partition by"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" person | \n",
" time | \n",
" bsl | \n",
" row_number | \n",
"
\n",
" \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 1 | \n",
"
\n",
" \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 2 | \n",
"
\n",
" \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 3 | \n",
"
\n",
" \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 1 | \n",
"
\n",
" \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 2 | \n",
"
\n",
" \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 1 | \n",
"
\n",
" \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 2 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 1 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 2 | \n",
"
\n",
" \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 3 | \n",
"
\n",
"
"
],
"text/plain": [
"[('A', 0, 115, 1),\n",
" ('A', 2, 237, 2),\n",
" ('A', 3, 129, 3),\n",
" ('B', 5, 86, 1),\n",
" ('B', 6, 396, 2),\n",
" ('C', 1, 107, 1),\n",
" ('C', 9, 347, 2),\n",
" ('D', 5, 89, 1),\n",
" ('D', 5, 221, 2),\n",
" ('D', 7, 98, 3)]"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT person, time, bsl, row_number() \n",
"OVER (PARTITION BY person ORDER BY time) as row_number\n",
"FROM df;"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" person | \n",
" time | \n",
" bsl | \n",
" row_number | \n",
"
\n",
" \n",
" \n",
" \n",
" 8 | \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 3 | \n",
"
\n",
" \n",
" 7 | \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 2 | \n",
"
\n",
" \n",
" 6 | \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 1 | \n",
"
\n",
" \n",
" 9 | \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 2 | \n",
"
\n",
" \n",
" 0 | \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person time bsl row_number\n",
"8 A 0 115 1\n",
"5 A 2 237 2\n",
"2 A 3 129 3\n",
"7 B 5 86 1\n",
"3 B 6 396 2\n",
"4 C 1 107 1\n",
"1 C 9 347 2\n",
"6 D 5 89 1\n",
"9 D 5 221 2\n",
"0 D 7 98 3"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ = df.copy()\n",
"df_ = df_.sort_values(['person', 'time'])\n",
"df_['row_number'] = df_.groupby(['person']).cumcount()+1\n",
"df_"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" person | \n",
" time | \n",
" bsl | \n",
" window | \n",
"
\n",
" \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 115 | \n",
"
\n",
" \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 115, 237 | \n",
"
\n",
" \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 115, 237, 129 | \n",
"
\n",
" \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 86 | \n",
"
\n",
" \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 86, 396 | \n",
"
\n",
" \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 107 | \n",
"
\n",
" \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 107, 347 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 89, 221 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 89, 221 | \n",
"
\n",
" \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 89, 221, 98 | \n",
"
\n",
"
"
],
"text/plain": [
"[('A', 0, 115, '115'),\n",
" ('A', 2, 237, '115, 237'),\n",
" ('A', 3, 129, '115, 237, 129'),\n",
" ('B', 5, 86, '86'),\n",
" ('B', 6, 396, '86, 396'),\n",
" ('C', 1, 107, '107'),\n",
" ('C', 9, 347, '107, 347'),\n",
" ('D', 5, 89, '89, 221'),\n",
" ('D', 5, 221, '89, 221'),\n",
" ('D', 7, 98, '89, 221, 98')]"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT person, time, bsl, group_concat(bsl, ', ')\n",
"OVER (PARTITION BY person ORDER BY time) as window\n",
"FROM df;"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" person | \n",
" time | \n",
" bsl | \n",
" row_number | \n",
" window | \n",
"
\n",
" \n",
" \n",
" \n",
" 8 | \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 1 | \n",
" 115 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 2 | \n",
" 115,237 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 3 | \n",
" 115,237,129 | \n",
"
\n",
" \n",
" 7 | \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 1 | \n",
" 86 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 2 | \n",
" 86,396 | \n",
"
\n",
" \n",
" 4 | \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 1 | \n",
" 107 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 2 | \n",
" 107,347 | \n",
"
\n",
" \n",
" 6 | \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 1 | \n",
" 89 | \n",
"
\n",
" \n",
" 9 | \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 2 | \n",
" 89,221 | \n",
"
\n",
" \n",
" 0 | \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 3 | \n",
" 89,221,98 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person time bsl row_number window\n",
"8 A 0 115 1 115\n",
"5 A 2 237 2 115,237\n",
"2 A 3 129 3 115,237,129\n",
"7 B 5 86 1 86\n",
"3 B 6 396 2 86,396\n",
"4 C 1 107 1 107\n",
"1 C 9 347 2 107,347\n",
"6 D 5 89 1 89\n",
"9 D 5 221 2 89,221\n",
"0 D 7 98 3 89,221,98"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_['window'] = (\n",
" df_.groupby('person').\n",
" apply(lambda x: pd.Series([x.bsl.iloc[:(i+1)].values \n",
" for i in pd.Series(np.arange(len(x)))]))\n",
").values\n",
"df_['window'] = df_.window.apply(lambda x: ','.join(map(str, x)))\n",
"df_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Specifying rows in window"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" person | \n",
" time | \n",
" bsl | \n",
" window | \n",
"
\n",
" \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 115, 237 | \n",
"
\n",
" \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 115, 237, 129 | \n",
"
\n",
" \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 237, 129 | \n",
"
\n",
" \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 86, 396 | \n",
"
\n",
" \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 86, 396 | \n",
"
\n",
" \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 107, 347 | \n",
"
\n",
" \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 107, 347 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 89, 221 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 89, 221, 98 | \n",
"
\n",
" \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 221, 98 | \n",
"
\n",
"
"
],
"text/plain": [
"[('A', 0, 115, '115, 237'),\n",
" ('A', 2, 237, '115, 237, 129'),\n",
" ('A', 3, 129, '237, 129'),\n",
" ('B', 5, 86, '86, 396'),\n",
" ('B', 6, 396, '86, 396'),\n",
" ('C', 1, 107, '107, 347'),\n",
" ('C', 9, 347, '107, 347'),\n",
" ('D', 5, 89, '89, 221'),\n",
" ('D', 5, 221, '89, 221, 98'),\n",
" ('D', 7, 98, '221, 98')]"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT person, time, bsl, group_concat(bsl, ', ')\n",
"OVER (\n",
" PARTITION BY person\n",
" ORDER BY time\n",
" ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING\n",
" ) as window\n",
"FROM df;"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" person | \n",
" time | \n",
" bsl | \n",
" row_number | \n",
" window | \n",
"
\n",
" \n",
" \n",
" \n",
" 8 | \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 1 | \n",
" 115,237 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 2 | \n",
" 115,237,129 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 3 | \n",
" 237,129 | \n",
"
\n",
" \n",
" 7 | \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 1 | \n",
" 86,396 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 2 | \n",
" 86,396 | \n",
"
\n",
" \n",
" 4 | \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 1 | \n",
" 107,347 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 2 | \n",
" 107,347 | \n",
"
\n",
" \n",
" 6 | \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 1 | \n",
" 89,221 | \n",
"
\n",
" \n",
" 9 | \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 2 | \n",
" 89,221,98 | \n",
"
\n",
" \n",
" 0 | \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 3 | \n",
" 221,98 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person time bsl row_number window\n",
"8 A 0 115 1 115,237\n",
"5 A 2 237 2 115,237,129\n",
"2 A 3 129 3 237,129\n",
"7 B 5 86 1 86,396\n",
"3 B 6 396 2 86,396\n",
"4 C 1 107 1 107,347\n",
"1 C 9 347 2 107,347\n",
"6 D 5 89 1 89,221\n",
"9 D 5 221 2 89,221,98\n",
"0 D 7 98 3 221,98"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_['window'] = (\n",
" df_.groupby('person').\n",
" apply(lambda x: pd.Series([x.bsl.iloc[(i-len(x)-1):(i+2)].values \n",
" for i in pd.Series(np.arange(len(x)))]))\n",
").values\n",
"df_['window'] = df_.window.apply(lambda x: ','.join(map(str, x)))\n",
"df_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using window functions"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" person | \n",
" time | \n",
" bsl | \n",
" row_number | \n",
" rank | \n",
" dense_rank | \n",
" percent_rank | \n",
" cume_dist | \n",
"
\n",
" \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0.0 | \n",
" 0.3 | \n",
"
\n",
" \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 0.0 | \n",
" 0.3 | \n",
"
\n",
" \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 3 | \n",
" 1 | \n",
" 1 | \n",
" 0.0 | \n",
" 0.3 | \n",
"
\n",
" \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 4 | \n",
" 4 | \n",
" 2 | \n",
" 0.3333333333333333 | \n",
" 0.5 | \n",
"
\n",
" \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 5 | \n",
" 4 | \n",
" 2 | \n",
" 0.3333333333333333 | \n",
" 0.5 | \n",
"
\n",
" \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 6 | \n",
" 6 | \n",
" 3 | \n",
" 0.5555555555555556 | \n",
" 0.7 | \n",
"
\n",
" \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 7 | \n",
" 6 | \n",
" 3 | \n",
" 0.5555555555555556 | \n",
" 0.7 | \n",
"
\n",
" \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 8 | \n",
" 8 | \n",
" 4 | \n",
" 0.7777777777777778 | \n",
" 1.0 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 9 | \n",
" 8 | \n",
" 4 | \n",
" 0.7777777777777778 | \n",
" 1.0 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 10 | \n",
" 8 | \n",
" 4 | \n",
" 0.7777777777777778 | \n",
" 1.0 | \n",
"
\n",
"
"
],
"text/plain": [
"[('A', 3, 129, 1, 1, 1, 0.0, 0.3),\n",
" ('A', 2, 237, 2, 1, 1, 0.0, 0.3),\n",
" ('A', 0, 115, 3, 1, 1, 0.0, 0.3),\n",
" ('B', 6, 396, 4, 4, 2, 0.3333333333333333, 0.5),\n",
" ('B', 5, 86, 5, 4, 2, 0.3333333333333333, 0.5),\n",
" ('C', 9, 347, 6, 6, 3, 0.5555555555555556, 0.7),\n",
" ('C', 1, 107, 7, 6, 3, 0.5555555555555556, 0.7),\n",
" ('D', 7, 98, 8, 8, 4, 0.7777777777777778, 1.0),\n",
" ('D', 5, 89, 9, 8, 4, 0.7777777777777778, 1.0),\n",
" ('D', 5, 221, 10, 8, 4, 0.7777777777777778, 1.0)]"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT person, time, bsl, \n",
" row_number() OVER win AS row_number,\n",
" rank() OVER win AS rank,\n",
" dense_rank() OVER win AS dense_rank,\n",
" percent_rank() OVER win AS percent_rank,\n",
" cume_dist() OVER win AS cume_dist\n",
"FROM df\n",
"WINDOW win AS (ORDER BY person);"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" person | \n",
" time | \n",
" bsl | \n",
" row_num | \n",
" rank | \n",
" dense_rank | \n",
" percent_rank | \n",
" cume_dist | \n",
"
\n",
" \n",
" \n",
" \n",
" 2 | \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0.000000 | \n",
" 0.3 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 0.000000 | \n",
" 0.3 | \n",
"
\n",
" \n",
" 8 | \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 3 | \n",
" 1 | \n",
" 1 | \n",
" 0.000000 | \n",
" 0.3 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 4 | \n",
" 4 | \n",
" 2 | \n",
" 0.333333 | \n",
" 0.5 | \n",
"
\n",
" \n",
" 7 | \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 5 | \n",
" 4 | \n",
" 2 | \n",
" 0.333333 | \n",
" 0.5 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 6 | \n",
" 6 | \n",
" 3 | \n",
" 0.555556 | \n",
" 0.7 | \n",
"
\n",
" \n",
" 4 | \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 7 | \n",
" 6 | \n",
" 3 | \n",
" 0.555556 | \n",
" 0.7 | \n",
"
\n",
" \n",
" 0 | \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 8 | \n",
" 8 | \n",
" 4 | \n",
" 0.777778 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 6 | \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 9 | \n",
" 8 | \n",
" 4 | \n",
" 0.777778 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 9 | \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 10 | \n",
" 8 | \n",
" 4 | \n",
" 0.777778 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person time bsl row_num rank dense_rank percent_rank cume_dist\n",
"2 A 3 129 1 1 1 0.000000 0.3\n",
"5 A 2 237 2 1 1 0.000000 0.3\n",
"8 A 0 115 3 1 1 0.000000 0.3\n",
"3 B 6 396 4 4 2 0.333333 0.5\n",
"7 B 5 86 5 4 2 0.333333 0.5\n",
"1 C 9 347 6 6 3 0.555556 0.7\n",
"4 C 1 107 7 6 3 0.555556 0.7\n",
"0 D 7 98 8 8 4 0.777778 1.0\n",
"6 D 5 89 9 8 4 0.777778 1.0\n",
"9 D 5 221 10 8 4 0.777778 1.0"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ = df.copy()\n",
"df_ = df_.sort_values(['person'])\n",
"df_['row_num'] = df_['person'].expanding(1).count().astype('int')\n",
"df_['rank'] = df_['person'].rank(method='min').astype('int')\n",
"df_['dense_rank'] = df_['person'].rank(method='dense').astype('int')\n",
"df_['percent_rank'] = (df_.person.rank(method='min') - 1) / (df_.person.count()-1)\n",
"df_['cume_dist'] = df_['person'].rank(method='max', pct=True)\n",
"df_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### The NTILE window function"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" scores | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6 | \n",
"
\n",
" \n",
" 1 | \n",
" 7 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" 3 | \n",
" 6 | \n",
"
\n",
" \n",
" 4 | \n",
" 9 | \n",
"
\n",
" \n",
" 5 | \n",
" 2 | \n",
"
\n",
" \n",
" 6 | \n",
" 3 | \n",
"
\n",
" \n",
" 7 | \n",
" 0 | \n",
"
\n",
" \n",
" 8 | \n",
" 8 | \n",
"
\n",
" \n",
" 9 | \n",
" 6 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" scores\n",
"0 6\n",
"1 7\n",
"2 3\n",
"3 6\n",
"4 9\n",
"5 2\n",
"6 3\n",
"7 0\n",
"8 8\n",
"9 6"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores = pd.DataFrame(dict(scores=np.random.randint(0, 10, 10)))\n",
"scores"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n"
]
},
{
"data": {
"text/plain": [
"'Persisted scores'"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql -p scores"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" scores | \n",
" quartile | \n",
"
\n",
" \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 2 | \n",
"
\n",
" \n",
" 6 | \n",
" 2 | \n",
"
\n",
" \n",
" 6 | \n",
" 2 | \n",
"
\n",
" \n",
" 6 | \n",
" 3 | \n",
"
\n",
" \n",
" 7 | \n",
" 3 | \n",
"
\n",
" \n",
" 8 | \n",
" 4 | \n",
"
\n",
" \n",
" 9 | \n",
" 4 | \n",
"
\n",
"
"
],
"text/plain": [
"[(0, 1),\n",
" (2, 1),\n",
" (3, 1),\n",
" (3, 2),\n",
" (6, 2),\n",
" (6, 2),\n",
" (6, 3),\n",
" (7, 3),\n",
" (8, 4),\n",
" (9, 4)]"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT scores, NTILE(4)\n",
"OVER (\n",
" -- PARTITION BY scores\n",
" ORDER BY scores \n",
") AS quartile\n",
"FROM scores"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: See [this](https://stackoverflow.com/questions/20726493/python-pandas-qcut-behavior-with-of-observations-not-divisible-by-of-bins) for an explanation of the difference between NTILE and `qcut`"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" scores | \n",
" quantile | \n",
"
\n",
" \n",
" \n",
" \n",
" 7 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 1 | \n",
"
\n",
" \n",
" 6 | \n",
" 3 | \n",
" 1 | \n",
"
\n",
" \n",
" 0 | \n",
" 6 | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" 6 | \n",
" 2 | \n",
"
\n",
" \n",
" 9 | \n",
" 6 | \n",
" 2 | \n",
"
\n",
" \n",
" 1 | \n",
" 7 | \n",
" 4 | \n",
"
\n",
" \n",
" 8 | \n",
" 8 | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" 9 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" scores quantile\n",
"7 0 1\n",
"5 2 1\n",
"2 3 1\n",
"6 3 1\n",
"0 6 2\n",
"3 6 2\n",
"9 6 2\n",
"1 7 4\n",
"8 8 4\n",
"4 9 4"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"quartiles = pd.qcut(scores.scores, 4, labels=[1,2,3,4])\n",
"scores['quantile'] = quartiles\n",
"scores.sort_values('scores')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using aggregate functions\n",
"\n",
"The `sqlite3` implementation of LAST_VALUE seems to be buggy."
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" person | \n",
" time | \n",
" bsl | \n",
" bsl_sum | \n",
" bsl_avg | \n",
" bsl_min | \n",
" bsl_max | \n",
" bsl_start | \n",
" bsl_end | \n",
"
\n",
" \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 115 | \n",
" 115.0 | \n",
" 115 | \n",
" 115 | \n",
" 115 | \n",
" 115 | \n",
"
\n",
" \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 352 | \n",
" 176.0 | \n",
" 115 | \n",
" 237 | \n",
" 115 | \n",
" 237 | \n",
"
\n",
" \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 481 | \n",
" 160.33333333333334 | \n",
" 115 | \n",
" 237 | \n",
" 115 | \n",
" 129 | \n",
"
\n",
" \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 86 | \n",
" 86.0 | \n",
" 86 | \n",
" 86 | \n",
" 86 | \n",
" 86 | \n",
"
\n",
" \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 482 | \n",
" 241.0 | \n",
" 86 | \n",
" 396 | \n",
" 86 | \n",
" 396 | \n",
"
\n",
" \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 107 | \n",
" 107.0 | \n",
" 107 | \n",
" 107 | \n",
" 107 | \n",
" 107 | \n",
"
\n",
" \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 454 | \n",
" 227.0 | \n",
" 107 | \n",
" 347 | \n",
" 107 | \n",
" 347 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 310 | \n",
" 155.0 | \n",
" 89 | \n",
" 221 | \n",
" 89 | \n",
" 221 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 310 | \n",
" 155.0 | \n",
" 89 | \n",
" 221 | \n",
" 89 | \n",
" 221 | \n",
"
\n",
" \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 408 | \n",
" 136.0 | \n",
" 89 | \n",
" 221 | \n",
" 89 | \n",
" 98 | \n",
"
\n",
"
"
],
"text/plain": [
"[('A', 0, 115, 115, 115.0, 115, 115, 115, 115),\n",
" ('A', 2, 237, 352, 176.0, 115, 237, 115, 237),\n",
" ('A', 3, 129, 481, 160.33333333333334, 115, 237, 115, 129),\n",
" ('B', 5, 86, 86, 86.0, 86, 86, 86, 86),\n",
" ('B', 6, 396, 482, 241.0, 86, 396, 86, 396),\n",
" ('C', 1, 107, 107, 107.0, 107, 107, 107, 107),\n",
" ('C', 9, 347, 454, 227.0, 107, 347, 107, 347),\n",
" ('D', 5, 89, 310, 155.0, 89, 221, 89, 221),\n",
" ('D', 5, 221, 310, 155.0, 89, 221, 89, 221),\n",
" ('D', 7, 98, 408, 136.0, 89, 221, 89, 98)]"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT person, time, bsl,\n",
" SUM(bsl) OVER win AS bsl_sum,\n",
" AVG(bsl) OVER win AS bsl_avg,\n",
" MIN(bsl) OVER win AS bsl_min,\n",
" MAX(bsl) over win as bsl_max,\n",
" FIRST_VALUE(bsl) OVER win as bsl_start,\n",
" LAST_VALUE(bsl) OVER win as bsl_end\n",
"FROM df\n",
"WINDOW win AS (PARTITION BY person ORDER BY time);"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" time | \n",
" bsl | \n",
"
\n",
" \n",
" person | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" A | \n",
" 3 | \n",
" 129 | \n",
"
\n",
" \n",
" B | \n",
" 6 | \n",
" 396 | \n",
"
\n",
" \n",
" C | \n",
" 9 | \n",
" 347 | \n",
"
\n",
" \n",
" D | \n",
" 7 | \n",
" 98 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" time bsl\n",
"person \n",
"A 3 129\n",
"B 6 396\n",
"C 9 347\n",
"D 7 98"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g = df.groupby('person')\n",
"g.first()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" person | \n",
" time | \n",
" bsl | \n",
" bsl_sum | \n",
" bsl_avg | \n",
" bsl_min | \n",
" bsl_max | \n",
" bsl_start | \n",
" bsl_end | \n",
"
\n",
" \n",
" \n",
" \n",
" 8 | \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 115 | \n",
" 115.000000 | \n",
" 115 | \n",
" 115 | \n",
" 115 | \n",
" 129 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 352 | \n",
" 176.000000 | \n",
" 115 | \n",
" 237 | \n",
" 115 | \n",
" 129 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 481 | \n",
" 160.333333 | \n",
" 115 | \n",
" 237 | \n",
" 115 | \n",
" 129 | \n",
"
\n",
" \n",
" 7 | \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 86 | \n",
" 86.000000 | \n",
" 86 | \n",
" 86 | \n",
" 86 | \n",
" 396 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 482 | \n",
" 241.000000 | \n",
" 86 | \n",
" 396 | \n",
" 86 | \n",
" 396 | \n",
"
\n",
" \n",
" 4 | \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 107 | \n",
" 107.000000 | \n",
" 107 | \n",
" 107 | \n",
" 107 | \n",
" 347 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 454 | \n",
" 227.000000 | \n",
" 107 | \n",
" 347 | \n",
" 107 | \n",
" 347 | \n",
"
\n",
" \n",
" 6 | \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 89 | \n",
" 89.000000 | \n",
" 89 | \n",
" 89 | \n",
" 89 | \n",
" 98 | \n",
"
\n",
" \n",
" 9 | \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 310 | \n",
" 155.000000 | \n",
" 89 | \n",
" 221 | \n",
" 89 | \n",
" 98 | \n",
"
\n",
" \n",
" 0 | \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 408 | \n",
" 136.000000 | \n",
" 89 | \n",
" 221 | \n",
" 89 | \n",
" 98 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person time bsl bsl_sum bsl_avg bsl_min bsl_max bsl_start bsl_end\n",
"8 A 0 115 115 115.000000 115 115 115 129\n",
"5 A 2 237 352 176.000000 115 237 115 129\n",
"2 A 3 129 481 160.333333 115 237 115 129\n",
"7 B 5 86 86 86.000000 86 86 86 396\n",
"3 B 6 396 482 241.000000 86 396 86 396\n",
"4 C 1 107 107 107.000000 107 107 107 347\n",
"1 C 9 347 454 227.000000 107 347 107 347\n",
"6 D 5 89 89 89.000000 89 89 89 98\n",
"9 D 5 221 310 155.000000 89 221 89 98\n",
"0 D 7 98 408 136.000000 89 221 89 98"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ = df.copy()\n",
"df_ = df_.sort_values(['person', 'time'])\n",
"df_['bsl_sum'] = df_.groupby(['person'])['bsl'].cumsum()\n",
"df_['bsl_avg'] = df_.groupby(['person']).expanding().agg({'bsl': 'mean'}).values\n",
"df_['bsl_min'] = df_.groupby(['person'])['bsl'].cummin()\n",
"df_['bsl_max'] = df_.groupby(['person'])['bsl'].cummax()\n",
"df_['bsl_start'] = df_.groupby(['person'])['bsl'].transform('first')\n",
"df_['bsl_end'] = df_.groupby(['person'])['bsl'].transform('last')\n",
"df_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using rows and range to constrain windows"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" person | \n",
" time | \n",
" bsl | \n",
" vals | \n",
" bsl_sum | \n",
" bsl_avg | \n",
"
\n",
" \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 115, 237 | \n",
" 352 | \n",
" 176.0 | \n",
"
\n",
" \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 115, 237, 129 | \n",
" 481 | \n",
" 160.33333333333334 | \n",
"
\n",
" \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 237, 129 | \n",
" 366 | \n",
" 183.0 | \n",
"
\n",
" \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 86, 396 | \n",
" 482 | \n",
" 241.0 | \n",
"
\n",
" \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 86, 396 | \n",
" 482 | \n",
" 241.0 | \n",
"
\n",
" \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 107, 347 | \n",
" 454 | \n",
" 227.0 | \n",
"
\n",
" \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 107, 347 | \n",
" 454 | \n",
" 227.0 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 89, 221 | \n",
" 310 | \n",
" 155.0 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 89, 221, 98 | \n",
" 408 | \n",
" 136.0 | \n",
"
\n",
" \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 221, 98 | \n",
" 319 | \n",
" 159.5 | \n",
"
\n",
"
"
],
"text/plain": [
"[('A', 0, 115, '115, 237', 352, 176.0),\n",
" ('A', 2, 237, '115, 237, 129', 481, 160.33333333333334),\n",
" ('A', 3, 129, '237, 129', 366, 183.0),\n",
" ('B', 5, 86, '86, 396', 482, 241.0),\n",
" ('B', 6, 396, '86, 396', 482, 241.0),\n",
" ('C', 1, 107, '107, 347', 454, 227.0),\n",
" ('C', 9, 347, '107, 347', 454, 227.0),\n",
" ('D', 5, 89, '89, 221', 310, 155.0),\n",
" ('D', 5, 221, '89, 221, 98', 408, 136.0),\n",
" ('D', 7, 98, '221, 98', 319, 159.5)]"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT person, time, bsl, \n",
" GROUP_CONCAT(CAST(bsl AS TEXT), ', ') OVER win AS vals,\n",
" SUM(bsl) OVER win AS bsl_sum,\n",
" AVG(bsl) OVER win AS bsl_avg\n",
"FROM df\n",
"WINDOW win AS (\n",
" PARTITION BY person\n",
" ORDER BY time\n",
" ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING\n",
")\n",
"ORDER BY person, time;"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" person | \n",
" time | \n",
" bsl | \n",
" bsl_sum | \n",
" bsl_avg | \n",
"
\n",
" \n",
" \n",
" \n",
" 8 | \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 352.0 | \n",
" 176.000000 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 481.0 | \n",
" 160.333333 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 366.0 | \n",
" 183.000000 | \n",
"
\n",
" \n",
" 7 | \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 482.0 | \n",
" 241.000000 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 482.0 | \n",
" 241.000000 | \n",
"
\n",
" \n",
" 4 | \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 454.0 | \n",
" 227.000000 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 454.0 | \n",
" 227.000000 | \n",
"
\n",
" \n",
" 6 | \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 310.0 | \n",
" 155.000000 | \n",
"
\n",
" \n",
" 9 | \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 408.0 | \n",
" 136.000000 | \n",
"
\n",
" \n",
" 0 | \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 319.0 | \n",
" 159.500000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person time bsl bsl_sum bsl_avg\n",
"8 A 0 115 352.0 176.000000\n",
"5 A 2 237 481.0 160.333333\n",
"2 A 3 129 366.0 183.000000\n",
"7 B 5 86 482.0 241.000000\n",
"3 B 6 396 482.0 241.000000\n",
"4 C 1 107 454.0 227.000000\n",
"1 C 9 347 454.0 227.000000\n",
"6 D 5 89 310.0 155.000000\n",
"9 D 5 221 408.0 136.000000\n",
"0 D 7 98 319.0 159.500000"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ = df.copy()\n",
"df_ = df_.sort_values(['person', 'time'])\n",
"df_['bsl_sum'] = (df_.groupby('person').\n",
" bsl.apply(lambda x: x.rolling(3, min_periods=0, center=True).sum()))\n",
"df_['bsl_avg'] = (df_.groupby('person').\n",
" bsl.apply(lambda x: x.rolling(3, min_periods=0, center=True).mean()))\n",
"df_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Frames using Rows and Range\n",
"\n",
"For Range, all rows with the same ORDER BY value are considered peers."
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" person | \n",
" time | \n",
" bsl | \n",
" vals | \n",
" bsl_sum | \n",
" bsl_avg | \n",
"
\n",
" \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 115 | \n",
" 115 | \n",
" 115.0 | \n",
"
\n",
" \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 115, 237 | \n",
" 352 | \n",
" 176.0 | \n",
"
\n",
" \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 115, 237, 129 | \n",
" 481 | \n",
" 160.33333333333334 | \n",
"
\n",
" \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 115, 237, 129, 86 | \n",
" 567 | \n",
" 141.75 | \n",
"
\n",
" \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 115, 237, 129, 86, 396 | \n",
" 963 | \n",
" 192.6 | \n",
"
\n",
" \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 115, 237, 129, 86, 396, 107 | \n",
" 1070 | \n",
" 178.33333333333334 | \n",
"
\n",
" \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 115, 237, 129, 86, 396, 107, 347 | \n",
" 1417 | \n",
" 202.42857142857142 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 115, 237, 129, 86, 396, 107, 347, 89 | \n",
" 1506 | \n",
" 188.25 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 115, 237, 129, 86, 396, 107, 347, 89, 221 | \n",
" 1727 | \n",
" 191.88888888888889 | \n",
"
\n",
" \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 115, 237, 129, 86, 396, 107, 347, 89, 221, 98 | \n",
" 1825 | \n",
" 182.5 | \n",
"
\n",
"
"
],
"text/plain": [
"[('A', 0, 115, '115', 115, 115.0),\n",
" ('A', 2, 237, '115, 237', 352, 176.0),\n",
" ('A', 3, 129, '115, 237, 129', 481, 160.33333333333334),\n",
" ('B', 5, 86, '115, 237, 129, 86', 567, 141.75),\n",
" ('B', 6, 396, '115, 237, 129, 86, 396', 963, 192.6),\n",
" ('C', 1, 107, '115, 237, 129, 86, 396, 107', 1070, 178.33333333333334),\n",
" ('C', 9, 347, '115, 237, 129, 86, 396, 107, 347', 1417, 202.42857142857142),\n",
" ('D', 5, 89, '115, 237, 129, 86, 396, 107, 347, 89', 1506, 188.25),\n",
" ('D', 5, 221, '115, 237, 129, 86, 396, 107, 347, 89, 221', 1727, 191.88888888888889),\n",
" ('D', 7, 98, '115, 237, 129, 86, 396, 107, 347, 89, 221, 98', 1825, 182.5)]"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT person, time, bsl, \n",
" GROUP_CONCAT(CAST(bsl AS TEXT), ', ') OVER win AS vals,\n",
" SUM(bsl) OVER win AS bsl_sum,\n",
" AVG(bsl) OVER win AS bsl_avg\n",
"FROM df\n",
"WINDOW win AS (\n",
" ORDER BY person, time\n",
" ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW\n",
")\n",
"ORDER BY person, time;"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" person | \n",
" time | \n",
" bsl | \n",
" bsl_sum | \n",
" bsl_avg | \n",
"
\n",
" \n",
" \n",
" \n",
" 8 | \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 115 | \n",
" 115.000000 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 352 | \n",
" 176.000000 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 481 | \n",
" 160.333333 | \n",
"
\n",
" \n",
" 7 | \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 567 | \n",
" 141.750000 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 963 | \n",
" 192.600000 | \n",
"
\n",
" \n",
" 4 | \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 1070 | \n",
" 178.333333 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 1417 | \n",
" 202.428571 | \n",
"
\n",
" \n",
" 6 | \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 1506 | \n",
" 188.250000 | \n",
"
\n",
" \n",
" 9 | \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 1727 | \n",
" 191.888889 | \n",
"
\n",
" \n",
" 0 | \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 1825 | \n",
" 182.500000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person time bsl bsl_sum bsl_avg\n",
"8 A 0 115 115 115.000000\n",
"5 A 2 237 352 176.000000\n",
"2 A 3 129 481 160.333333\n",
"7 B 5 86 567 141.750000\n",
"3 B 6 396 963 192.600000\n",
"4 C 1 107 1070 178.333333\n",
"1 C 9 347 1417 202.428571\n",
"6 D 5 89 1506 188.250000\n",
"9 D 5 221 1727 191.888889\n",
"0 D 7 98 1825 182.500000"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ = df.copy()\n",
"df_ = df_.sort_values(['person', 'time'])\n",
"df_['bsl_sum'] = df_.bsl.expanding().sum().astype('int')\n",
"df_['bsl_avg'] = df_.bsl.expanding().mean()\n",
"df_"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sqlite://\n",
" * sqlite:///\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" person | \n",
" time | \n",
" bsl | \n",
" vals | \n",
" bsl_sum | \n",
" bsl_avg | \n",
"
\n",
" \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 129, 237, 115 | \n",
" 481 | \n",
" 160.33333333333334 | \n",
"
\n",
" \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 129, 237, 115 | \n",
" 481 | \n",
" 160.33333333333334 | \n",
"
\n",
" \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 129, 237, 115 | \n",
" 481 | \n",
" 160.33333333333334 | \n",
"
\n",
" \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 129, 237, 115, 396, 86 | \n",
" 963 | \n",
" 192.6 | \n",
"
\n",
" \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 129, 237, 115, 396, 86 | \n",
" 963 | \n",
" 192.6 | \n",
"
\n",
" \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 129, 237, 115, 396, 86, 347, 107 | \n",
" 1417 | \n",
" 202.42857142857142 | \n",
"
\n",
" \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 129, 237, 115, 396, 86, 347, 107 | \n",
" 1417 | \n",
" 202.42857142857142 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 129, 237, 115, 396, 86, 347, 107, 98, 89, 221 | \n",
" 1825 | \n",
" 182.5 | \n",
"
\n",
" \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 129, 237, 115, 396, 86, 347, 107, 98, 89, 221 | \n",
" 1825 | \n",
" 182.5 | \n",
"
\n",
" \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 129, 237, 115, 396, 86, 347, 107, 98, 89, 221 | \n",
" 1825 | \n",
" 182.5 | \n",
"
\n",
"
"
],
"text/plain": [
"[('A', 0, 115, '129, 237, 115', 481, 160.33333333333334),\n",
" ('A', 2, 237, '129, 237, 115', 481, 160.33333333333334),\n",
" ('A', 3, 129, '129, 237, 115', 481, 160.33333333333334),\n",
" ('B', 5, 86, '129, 237, 115, 396, 86', 963, 192.6),\n",
" ('B', 6, 396, '129, 237, 115, 396, 86', 963, 192.6),\n",
" ('C', 1, 107, '129, 237, 115, 396, 86, 347, 107', 1417, 202.42857142857142),\n",
" ('C', 9, 347, '129, 237, 115, 396, 86, 347, 107', 1417, 202.42857142857142),\n",
" ('D', 5, 89, '129, 237, 115, 396, 86, 347, 107, 98, 89, 221', 1825, 182.5),\n",
" ('D', 5, 221, '129, 237, 115, 396, 86, 347, 107, 98, 89, 221', 1825, 182.5),\n",
" ('D', 7, 98, '129, 237, 115, 396, 86, 347, 107, 98, 89, 221', 1825, 182.5)]"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"SELECT person, time, bsl, \n",
" GROUP_CONCAT(CAST(bsl AS TEXT), ', ') OVER win AS vals,\n",
" SUM(bsl) OVER win AS bsl_sum,\n",
" AVG(bsl) OVER win AS bsl_avg\n",
"FROM df\n",
"WINDOW win AS (\n",
" ORDER BY person\n",
" RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW\n",
")\n",
"ORDER BY person, time;"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" person | \n",
" time | \n",
" bsl | \n",
" bsl_sum | \n",
"
\n",
" \n",
" \n",
" \n",
" 8 | \n",
" A | \n",
" 0 | \n",
" 115 | \n",
" 160.333333 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" 2 | \n",
" 237 | \n",
" 160.333333 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 3 | \n",
" 129 | \n",
" 160.333333 | \n",
"
\n",
" \n",
" 7 | \n",
" B | \n",
" 5 | \n",
" 86 | \n",
" 192.600000 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" 6 | \n",
" 396 | \n",
" 192.600000 | \n",
"
\n",
" \n",
" 4 | \n",
" C | \n",
" 1 | \n",
" 107 | \n",
" 202.428571 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" 9 | \n",
" 347 | \n",
" 202.428571 | \n",
"
\n",
" \n",
" 6 | \n",
" D | \n",
" 5 | \n",
" 89 | \n",
" 182.500000 | \n",
"
\n",
" \n",
" 9 | \n",
" D | \n",
" 5 | \n",
" 221 | \n",
" 182.500000 | \n",
"
\n",
" \n",
" 0 | \n",
" D | \n",
" 7 | \n",
" 98 | \n",
" 182.500000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person time bsl bsl_sum\n",
"8 A 0 115 160.333333\n",
"5 A 2 237 160.333333\n",
"2 A 3 129 160.333333\n",
"7 B 5 86 192.600000\n",
"3 B 6 396 192.600000\n",
"4 C 1 107 202.428571\n",
"1 C 9 347 202.428571\n",
"6 D 5 89 182.500000\n",
"9 D 5 221 182.500000\n",
"0 D 7 98 182.500000"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_ = df.copy()\n",
"df_ = df_.sort_values(['person', 'time'])\n",
"df_['bsl_sum'] = df_['bsl'].cumsum().groupby(df_['person']).transform('last')\n",
"df_['bsl_sum'] = df_['bsl'].expanding().mean().groupby(df_['person']).transform('last')\n",
"df_"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}