{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idfirstlastmajorstate_abbrevstate_namecountrycourse_idcourse_name
01annang[(stats, Statistics), (biol, Biology)]NCNorth CarolinaUSABIOS101Frogs in medicine
12barrybonds[(stat, Statistics), (math, Mathematics)]QCQuebecCanadaBIOS101Frogs in medicine
22barrybonds[(stat, Statistics), (math, Mathematics)]QCQuebecCanadaBIOS102Medicine for frogs
\n", "
" ], "text/plain": [ " student_id first last major \\\n", "0 1 ann ang [(stats, Statistics), (biol, Biology)] \n", "1 2 barry bonds [(stat, Statistics), (math, Mathematics)] \n", "2 2 barry bonds [(stat, Statistics), (math, Mathematics)] \n", "\n", " state_abbrev state_name country course_id course_name \n", "0 NC North Carolina USA BIOS101 Frogs in medicine \n", "1 QC Quebec Canada BIOS101 Frogs in medicine \n", "2 QC Quebec Canada BIOS102 Medicine for frogs " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " dict(\n", " student_id = [1,2,2],\n", " first = ['ann', 'barry', 'barry'],\n", " last = ['ang', 'bonds', 'bonds'],\n", " major = [[('stats', 'Statistics'), ('biol', 'Biology')], \n", " [('stat', 'Statistics'),('math', 'Mathematics')],\n", " [('stat', 'Statistics'),('math', 'Mathematics')]],\n", " state_abbrev = ['NC', 'QC', 'QC'],\n", " state_name = ['North Carolina', 'Quebec', 'Quebec'],\n", " country = ['USA', 'Canada', 'Canada'],\n", " course_id = ['BIOS101', 'BIOS101', 'BIOS102'],\n", " course_name = [\"Frogs in medicine\", \n", " \"Frogs in medicine\", \n", " \"Medicine for frogs\"]\n", " )\n", ")\n", "\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Normalization\n", "\n", "- Every table should not have any \n", " - duplication\n", " - dependencies that are not key or domain constraints" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1NF\n", "\n", "- Split composite entries" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idfirstlastmajorstate_abbrevstate_namecountrycourse_idcourse_name
01annang(stats, Statistics)NCNorth CarolinaUSABIOS101Frogs in medicine
01annang(biol, Biology)NCNorth CarolinaUSABIOS101Frogs in medicine
12barrybonds(stat, Statistics)QCQuebecCanadaBIOS101Frogs in medicine
12barrybonds(math, Mathematics)QCQuebecCanadaBIOS101Frogs in medicine
22barrybonds(stat, Statistics)QCQuebecCanadaBIOS102Medicine for frogs
22barrybonds(math, Mathematics)QCQuebecCanadaBIOS102Medicine for frogs
\n", "
" ], "text/plain": [ " student_id first last major state_abbrev state_name \\\n", "0 1 ann ang (stats, Statistics) NC North Carolina \n", "0 1 ann ang (biol, Biology) NC North Carolina \n", "1 2 barry bonds (stat, Statistics) QC Quebec \n", "1 2 barry bonds (math, Mathematics) QC Quebec \n", "2 2 barry bonds (stat, Statistics) QC Quebec \n", "2 2 barry bonds (math, Mathematics) QC Quebec \n", "\n", " country course_id course_name \n", "0 USA BIOS101 Frogs in medicine \n", "0 USA BIOS101 Frogs in medicine \n", "1 Canada BIOS101 Frogs in medicine \n", "1 Canada BIOS101 Frogs in medicine \n", "2 Canada BIOS102 Medicine for frogs \n", "2 Canada BIOS102 Medicine for frogs " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = df.explode('major')\n", "df1" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idfirstlaststate_abbrevstate_namecountrycourse_idcourse_namemajor_idmajor_name
01annangNCNorth CarolinaUSABIOS101Frogs in medicinestatsStatistics
01annangNCNorth CarolinaUSABIOS101Frogs in medicinebiolBiology
12barrybondsQCQuebecCanadaBIOS101Frogs in medicinestatStatistics
12barrybondsQCQuebecCanadaBIOS101Frogs in medicinemathMathematics
22barrybondsQCQuebecCanadaBIOS102Medicine for frogsstatStatistics
22barrybondsQCQuebecCanadaBIOS102Medicine for frogsmathMathematics
\n", "
" ], "text/plain": [ " student_id first last state_abbrev state_name country course_id \\\n", "0 1 ann ang NC North Carolina USA BIOS101 \n", "0 1 ann ang NC North Carolina USA BIOS101 \n", "1 2 barry bonds QC Quebec Canada BIOS101 \n", "1 2 barry bonds QC Quebec Canada BIOS101 \n", "2 2 barry bonds QC Quebec Canada BIOS102 \n", "2 2 barry bonds QC Quebec Canada BIOS102 \n", "\n", " course_name major_id major_name \n", "0 Frogs in medicine stats Statistics \n", "0 Frogs in medicine biol Biology \n", "1 Frogs in medicine stat Statistics \n", "1 Frogs in medicine math Mathematics \n", "2 Medicine for frogs stat Statistics \n", "2 Medicine for frogs math Mathematics " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_cols = ['major_id', 'major_name']\n", "for i, c in enumerate(new_cols):\n", " df1[c] = df1.major.apply(lambda x: x[i])\n", "df1 = df1.drop('major', axis=1)\n", "df1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2NF\n", "\n", "- Break partial dependencies\n", " - Identify candidate PK for each row\n", " - If there is a composite PK, see if other columns have partial dependencies" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idfirstlaststate_abbrevstate_namecountry
01annangNCNorth CarolinaUSA
12barrybondsQCQuebecCanada
\n", "
" ], "text/plain": [ " student_id first last state_abbrev state_name country\n", "0 1 ann ang NC North Carolina USA\n", "1 2 barry bonds QC Quebec Canada" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_student = df1.iloc[:, [0,1,2,3,4,5]].drop_duplicates()\n", "df_student" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
course_idcourse_name
0BIOS101Frogs in medicine
2BIOS102Medicine for frogs
\n", "
" ], "text/plain": [ " course_id course_name\n", "0 BIOS101 Frogs in medicine\n", "2 BIOS102 Medicine for frogs" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_course = df1.iloc[:,6:8].drop_duplicates()\n", "df_course" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
major_idmajor_name
0statsStatistics
0biolBiology
1statStatistics
1mathMathematics
\n", "
" ], "text/plain": [ " major_id major_name\n", "0 stats Statistics\n", "0 biol Biology\n", "1 stat Statistics\n", "1 math Mathematics" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_major = df1.iloc[:,8:10].drop_duplicates()\n", "df_major" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idcourse_id
01BIOS101
12BIOS101
22BIOS102
\n", "
" ], "text/plain": [ " student_id course_id\n", "0 1 BIOS101\n", "1 2 BIOS101\n", "2 2 BIOS102" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_student_course = df1.iloc[:, [0, 6]].drop_duplicates()\n", "df_student_course" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idmajor_id
01stats
01biol
12stat
12math
\n", "
" ], "text/plain": [ " student_id major_id\n", "0 1 stats\n", "0 1 biol\n", "1 2 stat\n", "1 2 math" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_student_major = df1.iloc[:, [0, 8]].drop_duplicates()\n", "df_student_major" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3NF\n", "\n", "- Remove transitive dependencies" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idfirstlaststate_abbrevstate_name
01annangNCNorth Carolina
12barrybondsQCQuebec
\n", "
" ], "text/plain": [ " student_id first last state_abbrev state_name\n", "0 1 ann ang NC North Carolina\n", "1 2 barry bonds QC Quebec" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_student_ = df_student.iloc[:, :5].drop_duplicates()\n", "df_student_" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state_namecountry
0North CarolinaUSA
1QuebecCanada
\n", "
" ], "text/plain": [ " state_name country\n", "0 North Carolina USA\n", "1 Quebec Canada" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_origin = df_student.iloc[:, 4:7].drop_duplicates()\n", "df_origin" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Final tables" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idfirstlaststate_abbrevstate_name
01annangNCNorth Carolina
12barrybondsQCQuebec
\n", "
" ], "text/plain": [ " student_id first last state_abbrev state_name\n", "0 1 ann ang NC North Carolina\n", "1 2 barry bonds QC Quebec" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_student_" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state_namecountry
0North CarolinaUSA
1QuebecCanada
\n", "
" ], "text/plain": [ " state_name country\n", "0 North Carolina USA\n", "1 Quebec Canada" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_origin" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
course_idcourse_name
0BIOS101Frogs in medicine
2BIOS102Medicine for frogs
\n", "
" ], "text/plain": [ " course_id course_name\n", "0 BIOS101 Frogs in medicine\n", "2 BIOS102 Medicine for frogs" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_course" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idcourse_id
01BIOS101
12BIOS101
22BIOS102
\n", "
" ], "text/plain": [ " student_id course_id\n", "0 1 BIOS101\n", "1 2 BIOS101\n", "2 2 BIOS102" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_student_course" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
major_idmajor_name
0statsStatistics
0biolBiology
1statStatistics
1mathMathematics
\n", "
" ], "text/plain": [ " major_id major_name\n", "0 stats Statistics\n", "0 biol Biology\n", "1 stat Statistics\n", "1 math Mathematics" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_major" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idmajor_id
01stats
01biol
12stat
12math
\n", "
" ], "text/plain": [ " student_id major_id\n", "0 1 stats\n", "0 1 biol\n", "1 2 stat\n", "1 2 math" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_student_major" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Denormalization" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idfirstlaststate_abbrevstate_namecountry
01annangNCNorth CarolinaUSA
12barrybondsQCQuebecCanada
\n", "
" ], "text/plain": [ " student_id first last state_abbrev state_name country\n", "0 1 ann ang NC North Carolina USA\n", "1 2 barry bonds QC Quebec Canada" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_student_.merge(df_origin)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
student_idfirstlaststate_abbrevstate_namecountrycourse_idcourse_namemajor_idmajor_name
01annangNCNorth CarolinaUSABIOS101Frogs in medicinestatsStatistics
11annangNCNorth CarolinaUSABIOS101Frogs in medicinebiolBiology
22barrybondsQCQuebecCanadaBIOS101Frogs in medicinestatStatistics
32barrybondsQCQuebecCanadaBIOS102Medicine for frogsstatStatistics
42barrybondsQCQuebecCanadaBIOS101Frogs in medicinemathMathematics
52barrybondsQCQuebecCanadaBIOS102Medicine for frogsmathMathematics
\n", "
" ], "text/plain": [ " student_id first last state_abbrev state_name country course_id \\\n", "0 1 ann ang NC North Carolina USA BIOS101 \n", "1 1 ann ang NC North Carolina USA BIOS101 \n", "2 2 barry bonds QC Quebec Canada BIOS101 \n", "3 2 barry bonds QC Quebec Canada BIOS102 \n", "4 2 barry bonds QC Quebec Canada BIOS101 \n", "5 2 barry bonds QC Quebec Canada BIOS102 \n", "\n", " course_name major_id major_name \n", "0 Frogs in medicine stats Statistics \n", "1 Frogs in medicine biol Biology \n", "2 Frogs in medicine stat Statistics \n", "3 Medicine for frogs stat Statistics \n", "4 Frogs in medicine math Mathematics \n", "5 Medicine for frogs math Mathematics " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(\n", " df_student_.\n", " merge(df_origin).\n", " merge(df_student_course).\n", " merge(df_course).\n", " merge(df_student_major).\n", " merge(df_major)\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }