[1]:
import pandas as pd
[2]:
df = pd.DataFrame(
    dict(
        student_id = [1,2,2],
        first = ['ann', 'barry', 'barry'],
        last = ['ang', 'bonds', 'bonds'],
        major = [[('stats', 'Statistics'), ('biol', 'Biology')],
                 [('stat', 'Statistics'),('math', 'Mathematics')],
                 [('stat', 'Statistics'),('math', 'Mathematics')]],
        state_abbrev = ['NC', 'QC', 'QC'],
        state_name = ['North Carolina', 'Quebec', 'Quebec'],
        country = ['USA', 'Canada', 'Canada'],
        course_id = ['BIOS101', 'BIOS101', 'BIOS102'],
        course_name = ["Frogs in medicine",
                       "Frogs in medicine",
                       "Medicine for frogs"]
    )
)

df
[2]:
student_id first last major state_abbrev state_name country course_id course_name
0 1 ann ang [(stats, Statistics), (biol, Biology)] NC North Carolina USA BIOS101 Frogs in medicine
1 2 barry bonds [(stat, Statistics), (math, Mathematics)] QC Quebec Canada BIOS101 Frogs in medicine
2 2 barry bonds [(stat, Statistics), (math, Mathematics)] QC Quebec Canada BIOS102 Medicine for frogs

Normalization

  • Every table should not have any

    • duplication

    • dependencies that are not key or domain constraints

1NF

  • Split composite entries

[3]:
df1 = df.explode('major')
df1
[3]:
student_id first last major state_abbrev state_name country course_id course_name
0 1 ann ang (stats, Statistics) NC North Carolina USA BIOS101 Frogs in medicine
0 1 ann ang (biol, Biology) NC North Carolina USA BIOS101 Frogs in medicine
1 2 barry bonds (stat, Statistics) QC Quebec Canada BIOS101 Frogs in medicine
1 2 barry bonds (math, Mathematics) QC Quebec Canada BIOS101 Frogs in medicine
2 2 barry bonds (stat, Statistics) QC Quebec Canada BIOS102 Medicine for frogs
2 2 barry bonds (math, Mathematics) QC Quebec Canada BIOS102 Medicine for frogs
[4]:
new_cols = ['major_id', 'major_name']
for i, c in enumerate(new_cols):
    df1[c] = df1.major.apply(lambda x: x[i])
df1 = df1.drop('major', axis=1)
df1
[4]:
student_id first last state_abbrev state_name country course_id course_name major_id major_name
0 1 ann ang NC North Carolina USA BIOS101 Frogs in medicine stats Statistics
0 1 ann ang NC North Carolina USA BIOS101 Frogs in medicine biol Biology
1 2 barry bonds QC Quebec Canada BIOS101 Frogs in medicine stat Statistics
1 2 barry bonds QC Quebec Canada BIOS101 Frogs in medicine math Mathematics
2 2 barry bonds QC Quebec Canada BIOS102 Medicine for frogs stat Statistics
2 2 barry bonds QC Quebec Canada BIOS102 Medicine for frogs math Mathematics

2NF

  • Break partial dependencies

    • Identify candidate PK for each row

    • If there is a composite PK, see if other columns have partial dependencies

[5]:
df_student = df1.iloc[:, [0,1,2,3,4,5]].drop_duplicates()
df_student
[5]:
student_id first last state_abbrev state_name country
0 1 ann ang NC North Carolina USA
1 2 barry bonds QC Quebec Canada
[6]:
df_course = df1.iloc[:,6:8].drop_duplicates()
df_course
[6]:
course_id course_name
0 BIOS101 Frogs in medicine
2 BIOS102 Medicine for frogs
[7]:
df_major = df1.iloc[:,8:10].drop_duplicates()
df_major
[7]:
major_id major_name
0 stats Statistics
0 biol Biology
1 stat Statistics
1 math Mathematics
[8]:
df_student_course = df1.iloc[:, [0, 6]].drop_duplicates()
df_student_course
[8]:
student_id course_id
0 1 BIOS101
1 2 BIOS101
2 2 BIOS102
[9]:
df_student_major = df1.iloc[:, [0, 8]].drop_duplicates()
df_student_major
[9]:
student_id major_id
0 1 stats
0 1 biol
1 2 stat
1 2 math

3NF

  • Remove transitive dependencies

[10]:
df_student_ = df_student.iloc[:, :5].drop_duplicates()
df_student_
[10]:
student_id first last state_abbrev state_name
0 1 ann ang NC North Carolina
1 2 barry bonds QC Quebec
[11]:
df_origin = df_student.iloc[:, 4:7].drop_duplicates()
df_origin
[11]:
state_name country
0 North Carolina USA
1 Quebec Canada

Final tables

[12]:
df_student_
[12]:
student_id first last state_abbrev state_name
0 1 ann ang NC North Carolina
1 2 barry bonds QC Quebec
[13]:
df_origin
[13]:
state_name country
0 North Carolina USA
1 Quebec Canada
[14]:
df_course
[14]:
course_id course_name
0 BIOS101 Frogs in medicine
2 BIOS102 Medicine for frogs
[15]:
df_student_course
[15]:
student_id course_id
0 1 BIOS101
1 2 BIOS101
2 2 BIOS102
[16]:
df_major
[16]:
major_id major_name
0 stats Statistics
0 biol Biology
1 stat Statistics
1 math Mathematics
[17]:
df_student_major
[17]:
student_id major_id
0 1 stats
0 1 biol
1 2 stat
1 2 math

Denormalization

[18]:
df_student_.merge(df_origin)
[18]:
student_id first last state_abbrev state_name country
0 1 ann ang NC North Carolina USA
1 2 barry bonds QC Quebec Canada
[19]:
(
    df_student_.
    merge(df_origin).
    merge(df_student_course).
    merge(df_course).
    merge(df_student_major).
    merge(df_major)
)

[19]:
student_id first last state_abbrev state_name country course_id course_name major_id major_name
0 1 ann ang NC North Carolina USA BIOS101 Frogs in medicine stats Statistics
1 1 ann ang NC North Carolina USA BIOS101 Frogs in medicine biol Biology
2 2 barry bonds QC Quebec Canada BIOS101 Frogs in medicine stat Statistics
3 2 barry bonds QC Quebec Canada BIOS102 Medicine for frogs stat Statistics
4 2 barry bonds QC Quebec Canada BIOS101 Frogs in medicine math Mathematics
5 2 barry bonds QC Quebec Canada BIOS102 Medicine for frogs math Mathematics
[ ]: