[1]:
import pandas as pd
[2]:
df = pd.DataFrame(
dict(
student_id = [1,2,2],
first = ['ann', 'barry', 'barry'],
last = ['ang', 'bonds', 'bonds'],
major = [[('stats', 'Statistics'), ('biol', 'Biology')],
[('stat', 'Statistics'),('math', 'Mathematics')],
[('stat', 'Statistics'),('math', 'Mathematics')]],
state_abbrev = ['NC', 'QC', 'QC'],
state_name = ['North Carolina', 'Quebec', 'Quebec'],
country = ['USA', 'Canada', 'Canada'],
course_id = ['BIOS101', 'BIOS101', 'BIOS102'],
course_name = ["Frogs in medicine",
"Frogs in medicine",
"Medicine for frogs"]
)
)
df
[2]:
| student_id | first | last | major | state_abbrev | state_name | country | course_id | course_name | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | ann | ang | [(stats, Statistics), (biol, Biology)] | NC | North Carolina | USA | BIOS101 | Frogs in medicine |
| 1 | 2 | barry | bonds | [(stat, Statistics), (math, Mathematics)] | QC | Quebec | Canada | BIOS101 | Frogs in medicine |
| 2 | 2 | barry | bonds | [(stat, Statistics), (math, Mathematics)] | QC | Quebec | Canada | BIOS102 | Medicine for frogs |
Normalization¶
Every table should not have any
duplication
dependencies that are not key or domain constraints
1NF¶
Split composite entries
[3]:
df1 = df.explode('major')
df1
[3]:
| student_id | first | last | major | state_abbrev | state_name | country | course_id | course_name | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | ann | ang | (stats, Statistics) | NC | North Carolina | USA | BIOS101 | Frogs in medicine |
| 0 | 1 | ann | ang | (biol, Biology) | NC | North Carolina | USA | BIOS101 | Frogs in medicine |
| 1 | 2 | barry | bonds | (stat, Statistics) | QC | Quebec | Canada | BIOS101 | Frogs in medicine |
| 1 | 2 | barry | bonds | (math, Mathematics) | QC | Quebec | Canada | BIOS101 | Frogs in medicine |
| 2 | 2 | barry | bonds | (stat, Statistics) | QC | Quebec | Canada | BIOS102 | Medicine for frogs |
| 2 | 2 | barry | bonds | (math, Mathematics) | QC | Quebec | Canada | BIOS102 | Medicine for frogs |
[4]:
new_cols = ['major_id', 'major_name']
for i, c in enumerate(new_cols):
df1[c] = df1.major.apply(lambda x: x[i])
df1 = df1.drop('major', axis=1)
df1
[4]:
| student_id | first | last | state_abbrev | state_name | country | course_id | course_name | major_id | major_name | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | ann | ang | NC | North Carolina | USA | BIOS101 | Frogs in medicine | stats | Statistics |
| 0 | 1 | ann | ang | NC | North Carolina | USA | BIOS101 | Frogs in medicine | biol | Biology |
| 1 | 2 | barry | bonds | QC | Quebec | Canada | BIOS101 | Frogs in medicine | stat | Statistics |
| 1 | 2 | barry | bonds | QC | Quebec | Canada | BIOS101 | Frogs in medicine | math | Mathematics |
| 2 | 2 | barry | bonds | QC | Quebec | Canada | BIOS102 | Medicine for frogs | stat | Statistics |
| 2 | 2 | barry | bonds | QC | Quebec | Canada | BIOS102 | Medicine for frogs | math | Mathematics |
2NF¶
Break partial dependencies
Identify candidate PK for each row
If there is a composite PK, see if other columns have partial dependencies
[5]:
df_student = df1.iloc[:, [0,1,2,3,4,5]].drop_duplicates()
df_student
[5]:
| student_id | first | last | state_abbrev | state_name | country | |
|---|---|---|---|---|---|---|
| 0 | 1 | ann | ang | NC | North Carolina | USA |
| 1 | 2 | barry | bonds | QC | Quebec | Canada |
[6]:
df_course = df1.iloc[:,6:8].drop_duplicates()
df_course
[6]:
| course_id | course_name | |
|---|---|---|
| 0 | BIOS101 | Frogs in medicine |
| 2 | BIOS102 | Medicine for frogs |
[7]:
df_major = df1.iloc[:,8:10].drop_duplicates()
df_major
[7]:
| major_id | major_name | |
|---|---|---|
| 0 | stats | Statistics |
| 0 | biol | Biology |
| 1 | stat | Statistics |
| 1 | math | Mathematics |
[8]:
df_student_course = df1.iloc[:, [0, 6]].drop_duplicates()
df_student_course
[8]:
| student_id | course_id | |
|---|---|---|
| 0 | 1 | BIOS101 |
| 1 | 2 | BIOS101 |
| 2 | 2 | BIOS102 |
[9]:
df_student_major = df1.iloc[:, [0, 8]].drop_duplicates()
df_student_major
[9]:
| student_id | major_id | |
|---|---|---|
| 0 | 1 | stats |
| 0 | 1 | biol |
| 1 | 2 | stat |
| 1 | 2 | math |
3NF¶
Remove transitive dependencies
[10]:
df_student_ = df_student.iloc[:, :5].drop_duplicates()
df_student_
[10]:
| student_id | first | last | state_abbrev | state_name | |
|---|---|---|---|---|---|
| 0 | 1 | ann | ang | NC | North Carolina |
| 1 | 2 | barry | bonds | QC | Quebec |
[11]:
df_origin = df_student.iloc[:, 4:7].drop_duplicates()
df_origin
[11]:
| state_name | country | |
|---|---|---|
| 0 | North Carolina | USA |
| 1 | Quebec | Canada |
Final tables¶
[12]:
df_student_
[12]:
| student_id | first | last | state_abbrev | state_name | |
|---|---|---|---|---|---|
| 0 | 1 | ann | ang | NC | North Carolina |
| 1 | 2 | barry | bonds | QC | Quebec |
[13]:
df_origin
[13]:
| state_name | country | |
|---|---|---|
| 0 | North Carolina | USA |
| 1 | Quebec | Canada |
[14]:
df_course
[14]:
| course_id | course_name | |
|---|---|---|
| 0 | BIOS101 | Frogs in medicine |
| 2 | BIOS102 | Medicine for frogs |
[15]:
df_student_course
[15]:
| student_id | course_id | |
|---|---|---|
| 0 | 1 | BIOS101 |
| 1 | 2 | BIOS101 |
| 2 | 2 | BIOS102 |
[16]:
df_major
[16]:
| major_id | major_name | |
|---|---|---|
| 0 | stats | Statistics |
| 0 | biol | Biology |
| 1 | stat | Statistics |
| 1 | math | Mathematics |
[17]:
df_student_major
[17]:
| student_id | major_id | |
|---|---|---|
| 0 | 1 | stats |
| 0 | 1 | biol |
| 1 | 2 | stat |
| 1 | 2 | math |
Denormalization¶
[18]:
df_student_.merge(df_origin)
[18]:
| student_id | first | last | state_abbrev | state_name | country | |
|---|---|---|---|---|---|---|
| 0 | 1 | ann | ang | NC | North Carolina | USA |
| 1 | 2 | barry | bonds | QC | Quebec | Canada |
[19]:
(
df_student_.
merge(df_origin).
merge(df_student_course).
merge(df_course).
merge(df_student_major).
merge(df_major)
)
[19]:
| student_id | first | last | state_abbrev | state_name | country | course_id | course_name | major_id | major_name | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | ann | ang | NC | North Carolina | USA | BIOS101 | Frogs in medicine | stats | Statistics |
| 1 | 1 | ann | ang | NC | North Carolina | USA | BIOS101 | Frogs in medicine | biol | Biology |
| 2 | 2 | barry | bonds | QC | Quebec | Canada | BIOS101 | Frogs in medicine | stat | Statistics |
| 3 | 2 | barry | bonds | QC | Quebec | Canada | BIOS102 | Medicine for frogs | stat | Statistics |
| 4 | 2 | barry | bonds | QC | Quebec | Canada | BIOS101 | Frogs in medicine | math | Mathematics |
| 5 | 2 | barry | bonds | QC | Quebec | Canada | BIOS102 | Medicine for frogs | math | Mathematics |
[ ]: