{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>first</th>\n",
       "      <th>last</th>\n",
       "      <th>major</th>\n",
       "      <th>state_abbrev</th>\n",
       "      <th>state_name</th>\n",
       "      <th>country</th>\n",
       "      <th>course_id</th>\n",
       "      <th>course_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>ann</td>\n",
       "      <td>ang</td>\n",
       "      <td>[(stats, Statistics), (biol, Biology)]</td>\n",
       "      <td>NC</td>\n",
       "      <td>North Carolina</td>\n",
       "      <td>USA</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>[(stat, Statistics), (math, Mathematics)]</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>[(stat, Statistics), (math, Mathematics)]</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS102</td>\n",
       "      <td>Medicine for frogs</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   student_id  first   last                                      major  \\\n",
       "0           1    ann    ang     [(stats, Statistics), (biol, Biology)]   \n",
       "1           2  barry  bonds  [(stat, Statistics), (math, Mathematics)]   \n",
       "2           2  barry  bonds  [(stat, Statistics), (math, Mathematics)]   \n",
       "\n",
       "  state_abbrev      state_name country course_id         course_name  \n",
       "0           NC  North Carolina     USA   BIOS101   Frogs in medicine  \n",
       "1           QC          Quebec  Canada   BIOS101   Frogs in medicine  \n",
       "2           QC          Quebec  Canada   BIOS102  Medicine for frogs  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(\n",
    "    dict(\n",
    "        student_id = [1,2,2],\n",
    "        first = ['ann', 'barry', 'barry'],\n",
    "        last = ['ang', 'bonds', 'bonds'],\n",
    "        major = [[('stats', 'Statistics'), ('biol', 'Biology')], \n",
    "                 [('stat', 'Statistics'),('math', 'Mathematics')],\n",
    "                 [('stat', 'Statistics'),('math', 'Mathematics')]],\n",
    "        state_abbrev = ['NC', 'QC', 'QC'],\n",
    "        state_name = ['North Carolina', 'Quebec', 'Quebec'],\n",
    "        country = ['USA', 'Canada', 'Canada'],\n",
    "        course_id = ['BIOS101', 'BIOS101', 'BIOS102'],\n",
    "        course_name = [\"Frogs in medicine\", \n",
    "                       \"Frogs in medicine\", \n",
    "                       \"Medicine for frogs\"]\n",
    "    )\n",
    ")\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Normalization\n",
    "\n",
    "- Every table should not have any \n",
    "    - duplication\n",
    "    - dependencies that are not key or domain constraints"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1NF\n",
    "\n",
    "- Split composite entries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>first</th>\n",
       "      <th>last</th>\n",
       "      <th>major</th>\n",
       "      <th>state_abbrev</th>\n",
       "      <th>state_name</th>\n",
       "      <th>country</th>\n",
       "      <th>course_id</th>\n",
       "      <th>course_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>ann</td>\n",
       "      <td>ang</td>\n",
       "      <td>(stats, Statistics)</td>\n",
       "      <td>NC</td>\n",
       "      <td>North Carolina</td>\n",
       "      <td>USA</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>ann</td>\n",
       "      <td>ang</td>\n",
       "      <td>(biol, Biology)</td>\n",
       "      <td>NC</td>\n",
       "      <td>North Carolina</td>\n",
       "      <td>USA</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>(stat, Statistics)</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>(math, Mathematics)</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>(stat, Statistics)</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS102</td>\n",
       "      <td>Medicine for frogs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>(math, Mathematics)</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS102</td>\n",
       "      <td>Medicine for frogs</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   student_id  first   last                major state_abbrev      state_name  \\\n",
       "0           1    ann    ang  (stats, Statistics)           NC  North Carolina   \n",
       "0           1    ann    ang      (biol, Biology)           NC  North Carolina   \n",
       "1           2  barry  bonds   (stat, Statistics)           QC          Quebec   \n",
       "1           2  barry  bonds  (math, Mathematics)           QC          Quebec   \n",
       "2           2  barry  bonds   (stat, Statistics)           QC          Quebec   \n",
       "2           2  barry  bonds  (math, Mathematics)           QC          Quebec   \n",
       "\n",
       "  country course_id         course_name  \n",
       "0     USA   BIOS101   Frogs in medicine  \n",
       "0     USA   BIOS101   Frogs in medicine  \n",
       "1  Canada   BIOS101   Frogs in medicine  \n",
       "1  Canada   BIOS101   Frogs in medicine  \n",
       "2  Canada   BIOS102  Medicine for frogs  \n",
       "2  Canada   BIOS102  Medicine for frogs  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = df.explode('major')\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>first</th>\n",
       "      <th>last</th>\n",
       "      <th>state_abbrev</th>\n",
       "      <th>state_name</th>\n",
       "      <th>country</th>\n",
       "      <th>course_id</th>\n",
       "      <th>course_name</th>\n",
       "      <th>major_id</th>\n",
       "      <th>major_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>ann</td>\n",
       "      <td>ang</td>\n",
       "      <td>NC</td>\n",
       "      <td>North Carolina</td>\n",
       "      <td>USA</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "      <td>stats</td>\n",
       "      <td>Statistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>ann</td>\n",
       "      <td>ang</td>\n",
       "      <td>NC</td>\n",
       "      <td>North Carolina</td>\n",
       "      <td>USA</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "      <td>biol</td>\n",
       "      <td>Biology</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "      <td>stat</td>\n",
       "      <td>Statistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "      <td>math</td>\n",
       "      <td>Mathematics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS102</td>\n",
       "      <td>Medicine for frogs</td>\n",
       "      <td>stat</td>\n",
       "      <td>Statistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS102</td>\n",
       "      <td>Medicine for frogs</td>\n",
       "      <td>math</td>\n",
       "      <td>Mathematics</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   student_id  first   last state_abbrev      state_name country course_id  \\\n",
       "0           1    ann    ang           NC  North Carolina     USA   BIOS101   \n",
       "0           1    ann    ang           NC  North Carolina     USA   BIOS101   \n",
       "1           2  barry  bonds           QC          Quebec  Canada   BIOS101   \n",
       "1           2  barry  bonds           QC          Quebec  Canada   BIOS101   \n",
       "2           2  barry  bonds           QC          Quebec  Canada   BIOS102   \n",
       "2           2  barry  bonds           QC          Quebec  Canada   BIOS102   \n",
       "\n",
       "          course_name major_id   major_name  \n",
       "0   Frogs in medicine    stats   Statistics  \n",
       "0   Frogs in medicine     biol      Biology  \n",
       "1   Frogs in medicine     stat   Statistics  \n",
       "1   Frogs in medicine     math  Mathematics  \n",
       "2  Medicine for frogs     stat   Statistics  \n",
       "2  Medicine for frogs     math  Mathematics  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_cols = ['major_id', 'major_name']\n",
    "for i, c in enumerate(new_cols):\n",
    "    df1[c] = df1.major.apply(lambda x: x[i])\n",
    "df1 = df1.drop('major', axis=1)\n",
    "df1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2NF\n",
    "\n",
    "- Break partial dependencies\n",
    "    - Identify candidate PK for each row\n",
    "    - If there is a composite PK, see if other columns have partial dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>first</th>\n",
       "      <th>last</th>\n",
       "      <th>state_abbrev</th>\n",
       "      <th>state_name</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>ann</td>\n",
       "      <td>ang</td>\n",
       "      <td>NC</td>\n",
       "      <td>North Carolina</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   student_id  first   last state_abbrev      state_name country\n",
       "0           1    ann    ang           NC  North Carolina     USA\n",
       "1           2  barry  bonds           QC          Quebec  Canada"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_student = df1.iloc[:, [0,1,2,3,4,5]].drop_duplicates()\n",
    "df_student"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>course_id</th>\n",
       "      <th>course_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>BIOS102</td>\n",
       "      <td>Medicine for frogs</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  course_id         course_name\n",
       "0   BIOS101   Frogs in medicine\n",
       "2   BIOS102  Medicine for frogs"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_course = df1.iloc[:,6:8].drop_duplicates()\n",
    "df_course"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>major_id</th>\n",
       "      <th>major_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>stats</td>\n",
       "      <td>Statistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>biol</td>\n",
       "      <td>Biology</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>stat</td>\n",
       "      <td>Statistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>math</td>\n",
       "      <td>Mathematics</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  major_id   major_name\n",
       "0    stats   Statistics\n",
       "0     biol      Biology\n",
       "1     stat   Statistics\n",
       "1     math  Mathematics"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_major = df1.iloc[:,8:10].drop_duplicates()\n",
    "df_major"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>course_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>BIOS101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>BIOS101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>BIOS102</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   student_id course_id\n",
       "0           1   BIOS101\n",
       "1           2   BIOS101\n",
       "2           2   BIOS102"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_student_course = df1.iloc[:, [0, 6]].drop_duplicates()\n",
    "df_student_course"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>major_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>stats</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>biol</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>stat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>math</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   student_id major_id\n",
       "0           1    stats\n",
       "0           1     biol\n",
       "1           2     stat\n",
       "1           2     math"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_student_major = df1.iloc[:, [0, 8]].drop_duplicates()\n",
    "df_student_major"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3NF\n",
    "\n",
    "- Remove transitive dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>first</th>\n",
       "      <th>last</th>\n",
       "      <th>state_abbrev</th>\n",
       "      <th>state_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>ann</td>\n",
       "      <td>ang</td>\n",
       "      <td>NC</td>\n",
       "      <td>North Carolina</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   student_id  first   last state_abbrev      state_name\n",
       "0           1    ann    ang           NC  North Carolina\n",
       "1           2  barry  bonds           QC          Quebec"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_student_ = df_student.iloc[:, :5].drop_duplicates()\n",
    "df_student_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>state_name</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>North Carolina</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       state_name country\n",
       "0  North Carolina     USA\n",
       "1          Quebec  Canada"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_origin = df_student.iloc[:, 4:7].drop_duplicates()\n",
    "df_origin"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Final tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>first</th>\n",
       "      <th>last</th>\n",
       "      <th>state_abbrev</th>\n",
       "      <th>state_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>ann</td>\n",
       "      <td>ang</td>\n",
       "      <td>NC</td>\n",
       "      <td>North Carolina</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   student_id  first   last state_abbrev      state_name\n",
       "0           1    ann    ang           NC  North Carolina\n",
       "1           2  barry  bonds           QC          Quebec"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_student_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>state_name</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>North Carolina</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       state_name country\n",
       "0  North Carolina     USA\n",
       "1          Quebec  Canada"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_origin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>course_id</th>\n",
       "      <th>course_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>BIOS102</td>\n",
       "      <td>Medicine for frogs</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  course_id         course_name\n",
       "0   BIOS101   Frogs in medicine\n",
       "2   BIOS102  Medicine for frogs"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_course"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>course_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>BIOS101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>BIOS101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>BIOS102</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   student_id course_id\n",
       "0           1   BIOS101\n",
       "1           2   BIOS101\n",
       "2           2   BIOS102"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_student_course"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>major_id</th>\n",
       "      <th>major_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>stats</td>\n",
       "      <td>Statistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>biol</td>\n",
       "      <td>Biology</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>stat</td>\n",
       "      <td>Statistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>math</td>\n",
       "      <td>Mathematics</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  major_id   major_name\n",
       "0    stats   Statistics\n",
       "0     biol      Biology\n",
       "1     stat   Statistics\n",
       "1     math  Mathematics"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_major"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>major_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>stats</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>biol</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>stat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>math</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   student_id major_id\n",
       "0           1    stats\n",
       "0           1     biol\n",
       "1           2     stat\n",
       "1           2     math"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_student_major"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Denormalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>first</th>\n",
       "      <th>last</th>\n",
       "      <th>state_abbrev</th>\n",
       "      <th>state_name</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>ann</td>\n",
       "      <td>ang</td>\n",
       "      <td>NC</td>\n",
       "      <td>North Carolina</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   student_id  first   last state_abbrev      state_name country\n",
       "0           1    ann    ang           NC  North Carolina     USA\n",
       "1           2  barry  bonds           QC          Quebec  Canada"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_student_.merge(df_origin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>first</th>\n",
       "      <th>last</th>\n",
       "      <th>state_abbrev</th>\n",
       "      <th>state_name</th>\n",
       "      <th>country</th>\n",
       "      <th>course_id</th>\n",
       "      <th>course_name</th>\n",
       "      <th>major_id</th>\n",
       "      <th>major_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>ann</td>\n",
       "      <td>ang</td>\n",
       "      <td>NC</td>\n",
       "      <td>North Carolina</td>\n",
       "      <td>USA</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "      <td>stats</td>\n",
       "      <td>Statistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>ann</td>\n",
       "      <td>ang</td>\n",
       "      <td>NC</td>\n",
       "      <td>North Carolina</td>\n",
       "      <td>USA</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "      <td>biol</td>\n",
       "      <td>Biology</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "      <td>stat</td>\n",
       "      <td>Statistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS102</td>\n",
       "      <td>Medicine for frogs</td>\n",
       "      <td>stat</td>\n",
       "      <td>Statistics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS101</td>\n",
       "      <td>Frogs in medicine</td>\n",
       "      <td>math</td>\n",
       "      <td>Mathematics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2</td>\n",
       "      <td>barry</td>\n",
       "      <td>bonds</td>\n",
       "      <td>QC</td>\n",
       "      <td>Quebec</td>\n",
       "      <td>Canada</td>\n",
       "      <td>BIOS102</td>\n",
       "      <td>Medicine for frogs</td>\n",
       "      <td>math</td>\n",
       "      <td>Mathematics</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   student_id  first   last state_abbrev      state_name country course_id  \\\n",
       "0           1    ann    ang           NC  North Carolina     USA   BIOS101   \n",
       "1           1    ann    ang           NC  North Carolina     USA   BIOS101   \n",
       "2           2  barry  bonds           QC          Quebec  Canada   BIOS101   \n",
       "3           2  barry  bonds           QC          Quebec  Canada   BIOS102   \n",
       "4           2  barry  bonds           QC          Quebec  Canada   BIOS101   \n",
       "5           2  barry  bonds           QC          Quebec  Canada   BIOS102   \n",
       "\n",
       "          course_name major_id   major_name  \n",
       "0   Frogs in medicine    stats   Statistics  \n",
       "1   Frogs in medicine     biol      Biology  \n",
       "2   Frogs in medicine     stat   Statistics  \n",
       "3  Medicine for frogs     stat   Statistics  \n",
       "4   Frogs in medicine     math  Mathematics  \n",
       "5  Medicine for frogs     math  Mathematics  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(\n",
    "    df_student_.\n",
    "    merge(df_origin).\n",
    "    merge(df_student_course).\n",
    "    merge(df_course).\n",
    "    merge(df_student_major).\n",
    "    merge(df_major)\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}