[1]:
import pandas as pd
[2]:
df = pd.DataFrame(
dict(
student_id = [1,2,2],
first = ['ann', 'barry', 'barry'],
last = ['ang', 'bonds', 'bonds'],
major = [[('stats', 'Statistics'), ('biol', 'Biology')],
[('stat', 'Statistics'),('math', 'Mathematics')],
[('stat', 'Statistics'),('math', 'Mathematics')]],
state_abbrev = ['NC', 'QC', 'QC'],
state_name = ['North Carolina', 'Quebec', 'Quebec'],
country = ['USA', 'Canada', 'Canada'],
course_id = ['BIOS101', 'BIOS101', 'BIOS102'],
course_name = ["Frogs in medicine",
"Frogs in medicine",
"Medicine for frogs"]
)
)
df
[2]:
student_id | first | last | major | state_abbrev | state_name | country | course_id | course_name | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | ann | ang | [(stats, Statistics), (biol, Biology)] | NC | North Carolina | USA | BIOS101 | Frogs in medicine |
1 | 2 | barry | bonds | [(stat, Statistics), (math, Mathematics)] | QC | Quebec | Canada | BIOS101 | Frogs in medicine |
2 | 2 | barry | bonds | [(stat, Statistics), (math, Mathematics)] | QC | Quebec | Canada | BIOS102 | Medicine for frogs |
Normalization¶
Every table should not have any
duplication
dependencies that are not key or domain constraints
1NF¶
Split composite entries
[3]:
df1 = df.explode('major')
df1
[3]:
student_id | first | last | major | state_abbrev | state_name | country | course_id | course_name | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | ann | ang | (stats, Statistics) | NC | North Carolina | USA | BIOS101 | Frogs in medicine |
0 | 1 | ann | ang | (biol, Biology) | NC | North Carolina | USA | BIOS101 | Frogs in medicine |
1 | 2 | barry | bonds | (stat, Statistics) | QC | Quebec | Canada | BIOS101 | Frogs in medicine |
1 | 2 | barry | bonds | (math, Mathematics) | QC | Quebec | Canada | BIOS101 | Frogs in medicine |
2 | 2 | barry | bonds | (stat, Statistics) | QC | Quebec | Canada | BIOS102 | Medicine for frogs |
2 | 2 | barry | bonds | (math, Mathematics) | QC | Quebec | Canada | BIOS102 | Medicine for frogs |
[4]:
new_cols = ['major_id', 'major_name']
for i, c in enumerate(new_cols):
df1[c] = df1.major.apply(lambda x: x[i])
df1 = df1.drop('major', axis=1)
df1
[4]:
student_id | first | last | state_abbrev | state_name | country | course_id | course_name | major_id | major_name | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | ann | ang | NC | North Carolina | USA | BIOS101 | Frogs in medicine | stats | Statistics |
0 | 1 | ann | ang | NC | North Carolina | USA | BIOS101 | Frogs in medicine | biol | Biology |
1 | 2 | barry | bonds | QC | Quebec | Canada | BIOS101 | Frogs in medicine | stat | Statistics |
1 | 2 | barry | bonds | QC | Quebec | Canada | BIOS101 | Frogs in medicine | math | Mathematics |
2 | 2 | barry | bonds | QC | Quebec | Canada | BIOS102 | Medicine for frogs | stat | Statistics |
2 | 2 | barry | bonds | QC | Quebec | Canada | BIOS102 | Medicine for frogs | math | Mathematics |
2NF¶
Break partial dependencies
Identify candidate PK for each row
If there is a composite PK, see if other columns have partial dependencies
[5]:
df_student = df1.iloc[:, [0,1,2,3,4,5]].drop_duplicates()
df_student
[5]:
student_id | first | last | state_abbrev | state_name | country | |
---|---|---|---|---|---|---|
0 | 1 | ann | ang | NC | North Carolina | USA |
1 | 2 | barry | bonds | QC | Quebec | Canada |
[6]:
df_course = df1.iloc[:,6:8].drop_duplicates()
df_course
[6]:
course_id | course_name | |
---|---|---|
0 | BIOS101 | Frogs in medicine |
2 | BIOS102 | Medicine for frogs |
[7]:
df_major = df1.iloc[:,8:10].drop_duplicates()
df_major
[7]:
major_id | major_name | |
---|---|---|
0 | stats | Statistics |
0 | biol | Biology |
1 | stat | Statistics |
1 | math | Mathematics |
[8]:
df_student_course = df1.iloc[:, [0, 6]].drop_duplicates()
df_student_course
[8]:
student_id | course_id | |
---|---|---|
0 | 1 | BIOS101 |
1 | 2 | BIOS101 |
2 | 2 | BIOS102 |
[9]:
df_student_major = df1.iloc[:, [0, 8]].drop_duplicates()
df_student_major
[9]:
student_id | major_id | |
---|---|---|
0 | 1 | stats |
0 | 1 | biol |
1 | 2 | stat |
1 | 2 | math |
3NF¶
Remove transitive dependencies
[10]:
df_student_ = df_student.iloc[:, :5].drop_duplicates()
df_student_
[10]:
student_id | first | last | state_abbrev | state_name | |
---|---|---|---|---|---|
0 | 1 | ann | ang | NC | North Carolina |
1 | 2 | barry | bonds | QC | Quebec |
[11]:
df_origin = df_student.iloc[:, 4:7].drop_duplicates()
df_origin
[11]:
state_name | country | |
---|---|---|
0 | North Carolina | USA |
1 | Quebec | Canada |
Final tables¶
[12]:
df_student_
[12]:
student_id | first | last | state_abbrev | state_name | |
---|---|---|---|---|---|
0 | 1 | ann | ang | NC | North Carolina |
1 | 2 | barry | bonds | QC | Quebec |
[13]:
df_origin
[13]:
state_name | country | |
---|---|---|
0 | North Carolina | USA |
1 | Quebec | Canada |
[14]:
df_course
[14]:
course_id | course_name | |
---|---|---|
0 | BIOS101 | Frogs in medicine |
2 | BIOS102 | Medicine for frogs |
[15]:
df_student_course
[15]:
student_id | course_id | |
---|---|---|
0 | 1 | BIOS101 |
1 | 2 | BIOS101 |
2 | 2 | BIOS102 |
[16]:
df_major
[16]:
major_id | major_name | |
---|---|---|
0 | stats | Statistics |
0 | biol | Biology |
1 | stat | Statistics |
1 | math | Mathematics |
[17]:
df_student_major
[17]:
student_id | major_id | |
---|---|---|
0 | 1 | stats |
0 | 1 | biol |
1 | 2 | stat |
1 | 2 | math |
Denormalization¶
[18]:
df_student_.merge(df_origin)
[18]:
student_id | first | last | state_abbrev | state_name | country | |
---|---|---|---|---|---|---|
0 | 1 | ann | ang | NC | North Carolina | USA |
1 | 2 | barry | bonds | QC | Quebec | Canada |
[19]:
(
df_student_.
merge(df_origin).
merge(df_student_course).
merge(df_course).
merge(df_student_major).
merge(df_major)
)
[19]:
student_id | first | last | state_abbrev | state_name | country | course_id | course_name | major_id | major_name | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | ann | ang | NC | North Carolina | USA | BIOS101 | Frogs in medicine | stats | Statistics |
1 | 1 | ann | ang | NC | North Carolina | USA | BIOS101 | Frogs in medicine | biol | Biology |
2 | 2 | barry | bonds | QC | Quebec | Canada | BIOS101 | Frogs in medicine | stat | Statistics |
3 | 2 | barry | bonds | QC | Quebec | Canada | BIOS102 | Medicine for frogs | stat | Statistics |
4 | 2 | barry | bonds | QC | Quebec | Canada | BIOS101 | Frogs in medicine | math | Mathematics |
5 | 2 | barry | bonds | QC | Quebec | Canada | BIOS102 | Medicine for frogs | math | Mathematics |
[ ]: