[1]:

import pandas as pd

[2]:

df = pd.DataFrame(
    dict(
        student_id = [1,2,2],
        first = ['ann', 'barry', 'barry'],
        last = ['ang', 'bonds', 'bonds'],
        major = [[('stats', 'Statistics'), ('biol', 'Biology')],
                 [('stat', 'Statistics'),('math', 'Mathematics')],
                 [('stat', 'Statistics'),('math', 'Mathematics')]],
        state_abbrev = ['NC', 'QC', 'QC'],
        state_name = ['North Carolina', 'Quebec', 'Quebec'],
        country = ['USA', 'Canada', 'Canada'],
        course_id = ['BIOS101', 'BIOS101', 'BIOS102'],
        course_name = ["Frogs in medicine",
                       "Frogs in medicine",
                       "Medicine for frogs"]
    )
)

df

[2]:

	student_id	first	last	major	state_abbrev	state_name	country	course_id	course_name
0	1	ann	ang	[(stats, Statistics), (biol, Biology)]	NC	North Carolina	USA	BIOS101	Frogs in medicine
1	2	barry	bonds	[(stat, Statistics), (math, Mathematics)]	QC	Quebec	Canada	BIOS101	Frogs in medicine
2	2	barry	bonds	[(stat, Statistics), (math, Mathematics)]	QC	Quebec	Canada	BIOS102	Medicine for frogs

Normalization¶

Every table should not have any
- duplication
- dependencies that are not key or domain constraints

1NF¶

Split composite entries

[3]:

df1 = df.explode('major')
df1

[3]:

	student_id	first	last	major	state_abbrev	state_name	country	course_id	course_name
0	1	ann	ang	(stats, Statistics)	NC	North Carolina	USA	BIOS101	Frogs in medicine
0	1	ann	ang	(biol, Biology)	NC	North Carolina	USA	BIOS101	Frogs in medicine
1	2	barry	bonds	(stat, Statistics)	QC	Quebec	Canada	BIOS101	Frogs in medicine
1	2	barry	bonds	(math, Mathematics)	QC	Quebec	Canada	BIOS101	Frogs in medicine
2	2	barry	bonds	(stat, Statistics)	QC	Quebec	Canada	BIOS102	Medicine for frogs
2	2	barry	bonds	(math, Mathematics)	QC	Quebec	Canada	BIOS102	Medicine for frogs

[4]:

new_cols = ['major_id', 'major_name']
for i, c in enumerate(new_cols):
    df1[c] = df1.major.apply(lambda x: x[i])
df1 = df1.drop('major', axis=1)
df1

[4]:

	student_id	first	last	state_abbrev	state_name	country	course_id	course_name	major_id	major_name
0	1	ann	ang	NC	North Carolina	USA	BIOS101	Frogs in medicine	stats	Statistics
0	1	ann	ang	NC	North Carolina	USA	BIOS101	Frogs in medicine	biol	Biology
1	2	barry	bonds	QC	Quebec	Canada	BIOS101	Frogs in medicine	stat	Statistics
1	2	barry	bonds	QC	Quebec	Canada	BIOS101	Frogs in medicine	math	Mathematics
2	2	barry	bonds	QC	Quebec	Canada	BIOS102	Medicine for frogs	stat	Statistics
2	2	barry	bonds	QC	Quebec	Canada	BIOS102	Medicine for frogs	math	Mathematics

2NF¶

Break partial dependencies
- Identify candidate PK for each row
- If there is a composite PK, see if other columns have partial dependencies

[5]:

df_student = df1.iloc[:, [0,1,2,3,4,5]].drop_duplicates()
df_student

[5]:

	student_id	first	last	state_abbrev	state_name	country
0	1	ann	ang	NC	North Carolina	USA
1	2	barry	bonds	QC	Quebec	Canada

[6]:

df_course = df1.iloc[:,6:8].drop_duplicates()
df_course

[6]:

	course_id	course_name
0	BIOS101	Frogs in medicine
2	BIOS102	Medicine for frogs

[7]:

df_major = df1.iloc[:,8:10].drop_duplicates()
df_major

[7]:

	major_id	major_name
0	stats	Statistics
0	biol	Biology
1	stat	Statistics
1	math	Mathematics

[8]:

df_student_course = df1.iloc[:, [0, 6]].drop_duplicates()
df_student_course

[8]:

	student_id	course_id
0	1	BIOS101
1	2	BIOS101
2	2	BIOS102

[9]:

df_student_major = df1.iloc[:, [0, 8]].drop_duplicates()
df_student_major

[9]:

	student_id	major_id
0	1	stats
0	1	biol
1	2	stat
1	2	math

3NF¶

Remove transitive dependencies

[10]:

df_student_ = df_student.iloc[:, :5].drop_duplicates()
df_student_

[10]:

	student_id	first	last	state_abbrev	state_name
0	1	ann	ang	NC	North Carolina
1	2	barry	bonds	QC	Quebec

[11]:

df_origin = df_student.iloc[:, 4:7].drop_duplicates()
df_origin

[11]:

	state_name	country
0	North Carolina	USA
1	Quebec	Canada

Final tables¶

[12]:

df_student_

[12]:

	student_id	first	last	state_abbrev	state_name
0	1	ann	ang	NC	North Carolina
1	2	barry	bonds	QC	Quebec

[13]:

df_origin

[13]:

	state_name	country
0	North Carolina	USA
1	Quebec	Canada

[14]:

df_course

[14]:

	course_id	course_name
0	BIOS101	Frogs in medicine
2	BIOS102	Medicine for frogs

[15]:

df_student_course

[15]:

	student_id	course_id
0	1	BIOS101
1	2	BIOS101
2	2	BIOS102

[16]:

df_major

[16]:

	major_id	major_name
0	stats	Statistics
0	biol	Biology
1	stat	Statistics
1	math	Mathematics

[17]:

df_student_major

[17]:

	student_id	major_id
0	1	stats
0	1	biol
1	2	stat
1	2	math

Denormalization¶

[18]:

df_student_.merge(df_origin)

[18]:

	student_id	first	last	state_abbrev	state_name	country
0	1	ann	ang	NC	North Carolina	USA
1	2	barry	bonds	QC	Quebec	Canada

[19]:

(
    df_student_.
    merge(df_origin).
    merge(df_student_course).
    merge(df_course).
    merge(df_student_major).
    merge(df_major)
)

[19]:

	student_id	first	last	state_abbrev	state_name	country	course_id	course_name	major_id	major_name
0	1	ann	ang	NC	North Carolina	USA	BIOS101	Frogs in medicine	stats	Statistics
1	1	ann	ang	NC	North Carolina	USA	BIOS101	Frogs in medicine	biol	Biology
2	2	barry	bonds	QC	Quebec	Canada	BIOS101	Frogs in medicine	stat	Statistics
3	2	barry	bonds	QC	Quebec	Canada	BIOS102	Medicine for frogs	stat	Statistics
4	2	barry	bonds	QC	Quebec	Canada	BIOS101	Frogs in medicine	math	Mathematics
5	2	barry	bonds	QC	Quebec	Canada	BIOS102	Medicine for frogs	math	Mathematics

[ ]: