{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Science: Data processing\n",
    "\n",
    "Typical packages: `pandas`, `plotnine`, `plotly`, `streamlit`\n",
    "\n",
    "**References**\n",
    "\n",
    "- [Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/)\n",
    "- [Python for Data Analysis, 2nd Edition](https://github.com/wesm/pydata-book)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Introduction to `pandas`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Series and Data Frames"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Series objects"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A `Series` is like a vector. All elements must have the same type or are nulls."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.0\n",
       "1    1.0\n",
       "2    2.0\n",
       "3    3.0\n",
       "4    NaN\n",
       "dtype: float64"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series([1,1,2,3] + [None])\n",
    "s"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Unique Counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0    2\n",
       "3.0    1\n",
       "2.0    1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Special types of series"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      the quick\n",
       "1    quick brown\n",
       "2      brown fox\n",
       "3      fox jumps\n",
       "4     jumps over\n",
       "5       over the\n",
       "6       the lazy\n",
       "7       lazy dog\n",
       "dtype: object"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words = 'the quick brown fox jumps over the lazy dog'.split()\n",
    "s1 = pd.Series([' '.join(item) for item in zip(words[:-1], words[1:])])\n",
    "s1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      THE QUICK\n",
       "1    QUICK BROWN\n",
       "2      BROWN FOX\n",
       "3      FOX JUMPS\n",
       "4     JUMPS OVER\n",
       "5       OVER THE\n",
       "6       THE LAZY\n",
       "7       LAZY DOG\n",
       "dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s1.str.upper()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      [the, quick]\n",
       "1    [quick, brown]\n",
       "2      [brown, fox]\n",
       "3      [fox, jumps]\n",
       "4     [jumps, over]\n",
       "5       [over, the]\n",
       "6       [the, lazy]\n",
       "7       [lazy, dog]\n",
       "dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s1.str.split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    quick\n",
       "1    brown\n",
       "2      fox\n",
       "3    jumps\n",
       "4     over\n",
       "5      the\n",
       "6     lazy\n",
       "7      dog\n",
       "dtype: object"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s1.str.split().str[1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       Asian\n",
       "1       Asian\n",
       "2       White\n",
       "3       Black\n",
       "4       White\n",
       "5    Hispanic\n",
       "dtype: object"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2 = pd.Series(['Asian', 'Asian', 'White', 'Black', 'White', 'Hispanic'])\n",
    "s2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       Asian\n",
       "1       Asian\n",
       "2       White\n",
       "3       Black\n",
       "4       White\n",
       "5    Hispanic\n",
       "dtype: category\n",
       "Categories (4, object): ['Asian', 'Black', 'Hispanic', 'White']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2 = s2.astype('category')\n",
    "s2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Asian', 'Black', 'Hispanic', 'White'], dtype='object')"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2.cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0\n",
       "1    0\n",
       "2    3\n",
       "3    1\n",
       "4    3\n",
       "5    2\n",
       "dtype: int8"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2.cat.codes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dates and times\n",
    "\n",
    "Datetimes are often useful as indices to a time series."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pendulum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "d = pendulum.today()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'2020-11-11'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d.to_date_string()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "k = 18\n",
    "s3 = pd.Series(range(k), \n",
    "               index=pd.date_range(d.to_date_string(),\n",
    "                                   periods=k, \n",
    "                                   freq='M'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2020-11-30     0\n",
       "2020-12-31     1\n",
       "2021-01-31     2\n",
       "2021-02-28     3\n",
       "2021-03-31     4\n",
       "2021-04-30     5\n",
       "2021-05-31     6\n",
       "2021-06-30     7\n",
       "2021-07-31     8\n",
       "2021-08-31     9\n",
       "2021-09-30    10\n",
       "2021-10-31    11\n",
       "2021-11-30    12\n",
       "2021-12-31    13\n",
       "2022-01-31    14\n",
       "2022-02-28    15\n",
       "2022-03-31    16\n",
       "2022-04-30    17\n",
       "Freq: M, dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2021-01-31     2\n",
       "2021-02-28     3\n",
       "2021-03-31     4\n",
       "2021-04-30     5\n",
       "2021-05-31     6\n",
       "2021-06-30     7\n",
       "2021-07-31     8\n",
       "2021-08-31     9\n",
       "2021-09-30    10\n",
       "2021-10-31    11\n",
       "2021-11-30    12\n",
       "2021-12-31    13\n",
       "Freq: M, dtype: int64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s3['2021']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2021-01-31    2\n",
       "2021-02-28    3\n",
       "2021-03-31    4\n",
       "2021-04-30    5\n",
       "2021-05-31    6\n",
       "2021-06-30    7\n",
       "Freq: M, dtype: int64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s3['2021-01':'2021-06']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If used as a series, then need `dt` accessor method"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "s4 = s3.index.to_series()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2020-11-30       Monday\n",
       "2020-12-31     Thursday\n",
       "2021-01-31       Sunday\n",
       "2021-02-28       Sunday\n",
       "2021-03-31    Wednesday\n",
       "2021-04-30       Friday\n",
       "2021-05-31       Monday\n",
       "2021-06-30    Wednesday\n",
       "2021-07-31     Saturday\n",
       "2021-08-31      Tuesday\n",
       "2021-09-30     Thursday\n",
       "2021-10-31       Sunday\n",
       "2021-11-30      Tuesday\n",
       "2021-12-31       Friday\n",
       "2022-01-31       Monday\n",
       "2022-02-28       Monday\n",
       "2022-03-31     Thursday\n",
       "2022-04-30     Saturday\n",
       "Freq: M, dtype: object"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s4.dt.day_name()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### DataFrame objects"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A `DataFrame` is like a matrix. Columns in a `DataFrame` are `Series`.\n",
    "\n",
    "- Each column in a DataFrame represents a **variale**\n",
    "- Each row in a DataFrame represents an **observation**\n",
    "- Each cell in a DataFrame represents a **value**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>num</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   num\n",
       "0  1.0\n",
       "1  2.0\n",
       "2  3.0\n",
       "3  NaN"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(dict(num=[1,2,3] + [None]))\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.0\n",
       "1    2.0\n",
       "2    3.0\n",
       "3    NaN\n",
       "Name: num, dtype: float64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.num"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Index\n",
    "\n",
    "Row and column identifiers are of `Index` type.\n",
    "\n",
    "Somewhat confusingly, index is also a a synonym for the row identifiers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RangeIndex(start=0, stop=4, step=1)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Setting a column as the row index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>num</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   num\n",
       "0  1.0\n",
       "1  2.0\n",
       "2  3.0\n",
       "3  NaN"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>num</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1.0</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2.0</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3.0</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NaN</th>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: [1.0, 2.0, 3.0, nan]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = df.set_index('num')\n",
    "df1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Making an index into a column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>num</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   num\n",
       "0  1.0\n",
       "1  2.0\n",
       "2  3.0\n",
       "3  NaN"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Columns\n",
    "\n",
    "This is just a different index object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['num'], dtype='object')"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Getting raw values\n",
    "\n",
    "Sometimes you just want a `numpy` array, and not a `pandas` object."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.],\n",
       "       [ 2.],\n",
       "       [ 3.],\n",
       "       [nan]])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Creating Data Frames"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Manual"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import OrderedDict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>730</td>\n",
       "      <td>88.630097</td>\n",
       "      <td>195.922737</td>\n",
       "      <td>2020-11-11 19:25:24.062141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>112</td>\n",
       "      <td>53.958069</td>\n",
       "      <td>169.022381</td>\n",
       "      <td>2020-11-12 19:25:24.062141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>740</td>\n",
       "      <td>89.709304</td>\n",
       "      <td>180.327636</td>\n",
       "      <td>2020-11-13 19:25:24.062141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>485</td>\n",
       "      <td>52.879337</td>\n",
       "      <td>142.806223</td>\n",
       "      <td>2020-11-14 19:25:24.062141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>905</td>\n",
       "      <td>80.488371</td>\n",
       "      <td>168.384792</td>\n",
       "      <td>2020-11-15 19:25:24.062141</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date\n",
       "0  730  88.630097  195.922737 2020-11-11 19:25:24.062141\n",
       "1  112  53.958069  169.022381 2020-11-12 19:25:24.062141\n",
       "2  740  89.709304  180.327636 2020-11-13 19:25:24.062141\n",
       "3  485  52.879337  142.806223 2020-11-14 19:25:24.062141\n",
       "4  905  80.488371  168.384792 2020-11-15 19:25:24.062141"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n = 5\n",
    "dates = pd.date_range(start='now', periods=n, freq='d')\n",
    "df = pd.DataFrame(OrderedDict(pid=np.random.randint(100, 999, n), \n",
    "                              weight=np.random.normal(70, 20, n),\n",
    "                              height=np.random.normal(170, 15, n),\n",
    "                              date=dates,\n",
    "                             ))\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### From file\n",
    "\n",
    "You can read in data from many different file types - plain text, JSON, spreadsheets, databases etc. Functions to read in data look like `read_X` where X is the data type."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting measures.txt\n"
     ]
    }
   ],
   "source": [
    "%%file measures.txt\n",
    "pid\tweight\theight\tdate\n",
    "328\t72.654347\t203.560866\t2018-11-11 14:16:18.148411\n",
    "756\t34.027679\t189.847316\t2018-11-12 14:16:18.148411\n",
    "185\t28.501914\t158.646074\t2018-11-13 14:16:18.148411\n",
    "507\t17.396343\t180.795993\t2018-11-14 14:16:18.148411\n",
    "919\t64.724301\t173.564725\t2018-11-15 14:16:18.148411"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                        date\n",
       "0  328  72.654347  203.560866  2018-11-11 14:16:18.148411\n",
       "1  756  34.027679  189.847316  2018-11-12 14:16:18.148411\n",
       "2  185  28.501914  158.646074  2018-11-13 14:16:18.148411\n",
       "3  507  17.396343  180.795993  2018-11-14 14:16:18.148411\n",
       "4  919  64.724301  173.564725  2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_table('measures.txt')\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Indexing Data Frames"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Implicit defaults\n",
    "\n",
    "if you provide a slice, it is assumed that you are asking for rows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                        date\n",
       "1  756  34.027679  189.847316  2018-11-12 14:16:18.148411\n",
       "2  185  28.501914  158.646074  2018-11-13 14:16:18.148411"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[1:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you provide a singe value or list, it is assumed that you are asking for columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight\n",
       "0  328  72.654347\n",
       "1  756  34.027679\n",
       "2  185  28.501914\n",
       "3  507  17.396343\n",
       "4  919  64.724301"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[['pid', 'weight']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extracting a column"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Dictionary style access"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    328\n",
       "1    756\n",
       "2    185\n",
       "3    507\n",
       "4    919\n",
       "Name: pid, dtype: int64"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['pid']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Property style access\n",
    "\n",
    "This only works for column names tat are also valid Python identifier (i.e., no spaces or dashes or keywords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    328\n",
       "1    756\n",
       "2    185\n",
       "3    507\n",
       "4    919\n",
       "Name: pid, dtype: int64"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.pid"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Indexing by location\n",
    "\n",
    "This is similar to `numpy` indexing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                        date\n",
       "1  756  34.027679  189.847316  2018-11-12 14:16:18.148411\n",
       "2  185  28.501914  158.646074  2018-11-13 14:16:18.148411"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.iloc[1:3, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>34.027679</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>28.501914</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      weight                        date\n",
       "1  34.027679  2018-11-12 14:16:18.148411\n",
       "2  28.501914  2018-11-13 14:16:18.148411"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.iloc[1:3, 1:4:2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Indexing by name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      weight      height\n",
       "1  34.027679  189.847316\n",
       "2  28.501914  158.646074\n",
       "3  17.396343  180.795993"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[1:3, 'weight':'height']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Warning**: When using `loc`, the row slice indicates row names, not positions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                        date\n",
       "1  328  72.654347  203.560866  2018-11-11 14:16:18.148411\n",
       "2  756  34.027679  189.847316  2018-11-12 14:16:18.148411\n",
       "3  185  28.501914  158.646074  2018-11-13 14:16:18.148411\n",
       "4  507  17.396343  180.795993  2018-11-14 14:16:18.148411\n",
       "5  919  64.724301  173.564725  2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = df.copy()\n",
    "df1.index = df.index + 1\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      weight      height\n",
       "1  72.654347  203.560866\n",
       "2  34.027679  189.847316\n",
       "3  28.501914  158.646074"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.loc[1:3, 'weight':'height']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Structure of a Data Frame"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pid         int64\n",
       "weight    float64\n",
       "height    float64\n",
       "date       object\n",
       "dtype: object"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Converting data types"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Using `astype` on one column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.pid = df.pid.astype('category')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Using `astype` on multiple columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.astype(dict(weight=float, height=float))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Using a conversion function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.date = pd.to_datetime(df.date)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pid             category\n",
       "weight           float64\n",
       "height           float64\n",
       "date      datetime64[ns]\n",
       "dtype: object"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Basic properties"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "20"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5, 4)"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>5.000000</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>43.460917</td>\n",
       "      <td>181.282995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>23.960945</td>\n",
       "      <td>16.895933</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>17.396343</td>\n",
       "      <td>158.646074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>28.501914</td>\n",
       "      <td>173.564725</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>34.027679</td>\n",
       "      <td>180.795993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>64.724301</td>\n",
       "      <td>189.847316</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          weight      height\n",
       "count   5.000000    5.000000\n",
       "mean   43.460917  181.282995\n",
       "std    23.960945   16.895933\n",
       "min    17.396343  158.646074\n",
       "25%    28.501914  173.564725\n",
       "50%    34.027679  180.795993\n",
       "75%    64.724301  189.847316\n",
       "max    72.654347  203.560866"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5 entries, 0 to 4\n",
      "Data columns (total 4 columns):\n",
      " #   Column  Non-Null Count  Dtype         \n",
      "---  ------  --------------  -----         \n",
      " 0   pid     5 non-null      category      \n",
      " 1   weight  5 non-null      float64       \n",
      " 2   height  5 non-null      float64       \n",
      " 3   date    5 non-null      datetime64[ns]\n",
      "dtypes: category(1), datetime64[ns](1), float64(2)\n",
      "memory usage: 453.0 bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Inspection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(n=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.tail(n=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sample(n=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sample(frac=0.5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Selecting, Renaming and Removing Columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Selecting columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid                       date\n",
       "0  328 2018-11-11 14:16:18.148411\n",
       "1  756 2018-11-12 14:16:18.148411\n",
       "2  185 2018-11-13 14:16:18.148411\n",
       "3  507 2018-11-14 14:16:18.148411\n",
       "4  919 2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.filter(items=['pid', 'date'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      weight      height\n",
       "0  72.654347  203.560866\n",
       "1  34.027679  189.847316\n",
       "2  28.501914  158.646074\n",
       "3  17.396343  180.795993\n",
       "4  64.724301  173.564725"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.filter(regex='.*ght')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Note that you can also use regular string methods on the columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid                       date\n",
       "0  328 2018-11-11 14:16:18.148411\n",
       "1  756 2018-11-12 14:16:18.148411\n",
       "2  185 2018-11-13 14:16:18.148411\n",
       "3  507 2018-11-14 14:16:18.148411\n",
       "4  919 2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[:, df.columns.str.contains('d')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Renaming columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>w</th>\n",
       "      <th>h</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid          w           h                       date\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.rename(dict(weight='w', height='h'), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "orig_cols = df.columns "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns = list('abcd')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     a          b           c                          d\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns = orig_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Removing columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      weight      height\n",
       "0  72.654347  203.560866\n",
       "1  34.027679  189.847316\n",
       "2  28.501914  158.646074\n",
       "3  17.396343  180.795993\n",
       "4  64.724301  173.564725"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.drop(['pid', 'date'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      weight      height\n",
       "0  72.654347  203.560866\n",
       "1  34.027679  189.847316\n",
       "2  28.501914  158.646074\n",
       "3  17.396343  180.795993\n",
       "4  64.724301  173.564725"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.drop(columns=['pid', 'date'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      weight      height\n",
       "0  72.654347  203.560866\n",
       "1  34.027679  189.847316\n",
       "2  28.501914  158.646074\n",
       "3  17.396343  180.795993\n",
       "4  64.724301  173.564725"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.drop(columns=df.columns[df.columns.str.contains('d')])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Selecting, Renaming and Removing Rows"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Selecting rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df.weight.between(60,70)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [pid, weight, height, date]\n",
       "Index: []"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[(69 <= df.weight) & (df.weight < 70)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df.date.between(pd.to_datetime('2018-11-13'), \n",
    "                   pd.to_datetime('2018-11-15 23:59:59'))]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Renaming rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>e</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date\n",
       "a  328  72.654347  203.560866 2018-11-11 14:16:18.148411\n",
       "b  756  34.027679  189.847316 2018-11-12 14:16:18.148411\n",
       "c  185  28.501914  158.646074 2018-11-13 14:16:18.148411\n",
       "d  507  17.396343  180.795993 2018-11-14 14:16:18.148411\n",
       "e  919  64.724301  173.564725 2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.rename({i:letter for i,letter in enumerate('abcde')})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.index = ['the', 'quick', 'brown', 'fox', 'jumphs']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>the</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>quick</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>brown</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fox</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>jumphs</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        pid     weight      height                       date\n",
       "the     328  72.654347  203.560866 2018-11-11 14:16:18.148411\n",
       "quick   756  34.027679  189.847316 2018-11-12 14:16:18.148411\n",
       "brown   185  28.501914  158.646074 2018-11-13 14:16:18.148411\n",
       "fox     507  17.396343  180.795993 2018-11-14 14:16:18.148411\n",
       "jumphs  919  64.724301  173.564725 2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dropping rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.drop([1,3], axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Dropping duplicated data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['something'] = [1,1,None,2,None]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        1.0\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        NaN"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[df.something.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        1.0\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        2.0"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.drop_duplicates(subset='something')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Dropping missing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        1.0\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        1.0\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        2.0\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        NaN"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.0\n",
       "1    1.0\n",
       "2    0.0\n",
       "3    2.0\n",
       "4    0.0\n",
       "Name: something, dtype: float64"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.something.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.0\n",
       "1    1.0\n",
       "2    1.0\n",
       "3    2.0\n",
       "4    2.0\n",
       "Name: something, dtype: float64"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.something.ffill()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.0\n",
       "1    1.0\n",
       "2    2.0\n",
       "3    2.0\n",
       "4    NaN\n",
       "Name: something, dtype: float64"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.something.bfill()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.0\n",
       "1    1.0\n",
       "2    1.5\n",
       "3    2.0\n",
       "4    2.0\n",
       "Name: something, dtype: float64"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.something.interpolate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        1.0\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        1.0\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        2.0"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dropna()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transforming and Creating Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>1.0</td>\n",
       "      <td>17.533678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>1.0</td>\n",
       "      <td>9.441118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>5.322067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>21.485449</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something        bmi\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        1.0  17.533678\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        1.0   9.441118\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN  11.324404\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        2.0   5.322067\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        NaN  21.485449"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.assign(bmi=df['weight'] / (df['height']/100)**2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['bmi'] = df['weight'] / (df['height']/100)**2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>1.0</td>\n",
       "      <td>17.533678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>1.0</td>\n",
       "      <td>9.441118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>5.322067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>21.485449</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something        bmi\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        1.0  17.533678\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        1.0   9.441118\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN  11.324404\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        2.0   5.322067\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        NaN  21.485449"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['something'] = [2,2,None,None,3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.533678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>9.441118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.322067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>3.0</td>\n",
       "      <td>21.485449</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something        bmi\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        2.0  17.533678\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        2.0   9.441118\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN  11.324404\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        NaN   5.322067\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        3.0  21.485449"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sorting Data Frames"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sort on indexes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bmi</th>\n",
       "      <th>date</th>\n",
       "      <th>height</th>\n",
       "      <th>pid</th>\n",
       "      <th>something</th>\n",
       "      <th>weight</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>17.533678</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>328</td>\n",
       "      <td>2.0</td>\n",
       "      <td>72.654347</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9.441118</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>756</td>\n",
       "      <td>2.0</td>\n",
       "      <td>34.027679</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>11.324404</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>185</td>\n",
       "      <td>NaN</td>\n",
       "      <td>28.501914</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5.322067</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>507</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17.396343</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>21.485449</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>919</td>\n",
       "      <td>3.0</td>\n",
       "      <td>64.724301</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         bmi                       date      height  pid  something     weight\n",
       "0  17.533678 2018-11-11 14:16:18.148411  203.560866  328        2.0  72.654347\n",
       "1   9.441118 2018-11-12 14:16:18.148411  189.847316  756        2.0  34.027679\n",
       "2  11.324404 2018-11-13 14:16:18.148411  158.646074  185        NaN  28.501914\n",
       "3   5.322067 2018-11-14 14:16:18.148411  180.795993  507        NaN  17.396343\n",
       "4  21.485449 2018-11-15 14:16:18.148411  173.564725  919        3.0  64.724301"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sort_index(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>3.0</td>\n",
       "      <td>21.485449</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.322067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>9.441118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.533678</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something        bmi\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        3.0  21.485449\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        NaN   5.322067\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN  11.324404\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        2.0   9.441118\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        2.0  17.533678"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sort_index(axis=0, ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sort on values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.533678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>9.441118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>3.0</td>\n",
       "      <td>21.485449</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.322067</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something        bmi\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        2.0  17.533678\n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        2.0   9.441118\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        3.0  21.485449\n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN  11.324404\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        NaN   5.322067"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sort_values(by=['something', 'bmi'], ascending=[True, False])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summarizing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Apply an aggregation function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.533678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2.0</td>\n",
       "      <td>9.441118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.322067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>3.0</td>\n",
       "      <td>21.485449</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      weight      height  something        bmi\n",
       "0  72.654347  203.560866        2.0  17.533678\n",
       "1  34.027679  189.847316        2.0   9.441118\n",
       "2  28.501914  158.646074        NaN  11.324404\n",
       "3  17.396343  180.795993        NaN   5.322067\n",
       "4  64.724301  173.564725        3.0  21.485449"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.select_dtypes(include=np.number)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "weight       217.304584\n",
       "height       906.414974\n",
       "something      7.000000\n",
       "bmi           65.106716\n",
       "dtype: float64"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.select_dtypes(include=np.number).agg(np.sum)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>5.0</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>5</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sum</th>\n",
       "      <td>NaN</td>\n",
       "      <td>217.304584</td>\n",
       "      <td>906.414974</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>65.106716</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>NaN</td>\n",
       "      <td>43.460917</td>\n",
       "      <td>181.282995</td>\n",
       "      <td>2018-11-13 14:16:18.148410880</td>\n",
       "      <td>2.333333</td>\n",
       "      <td>13.021343</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       pid      weight      height                           date  something  \\\n",
       "count  5.0    5.000000    5.000000                              5   3.000000   \n",
       "sum    NaN  217.304584  906.414974                            NaN   7.000000   \n",
       "mean   NaN   43.460917  181.282995  2018-11-13 14:16:18.148410880   2.333333   \n",
       "\n",
       "             bmi  \n",
       "count   5.000000  \n",
       "sum    65.106716  \n",
       "mean   13.021343  "
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.agg(['count', np.sum, np.mean])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Split-Apply-Combine\n",
    "\n",
    "We often want to perform subgroup analysis (conditioning by some discrete or categorical variable). This is done with `groupby` followed by an aggregate function. Conceptually, we split the data frame into separate groups, apply the aggregate function to each group separately, then combine the aggregated results back into a single data frame."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['treatment'] = list('ababa')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "      <th>treatment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.533678</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>9.441118</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.322067</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>3.0</td>\n",
       "      <td>21.485449</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something  \\\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        2.0   \n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        2.0   \n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN   \n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        NaN   \n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        3.0   \n",
       "\n",
       "         bmi treatment  \n",
       "0  17.533678         a  \n",
       "1   9.441118         b  \n",
       "2  11.324404         a  \n",
       "3   5.322067         b  \n",
       "4  21.485449         a  "
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped = df.groupby('treatment')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "      <th>treatment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.533678</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>3.0</td>\n",
       "      <td>21.485449</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something  \\\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        2.0   \n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN   \n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        3.0   \n",
       "\n",
       "         bmi treatment  \n",
       "0  17.533678         a  \n",
       "2  11.324404         a  \n",
       "4  21.485449         a  "
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grouped.get_group('a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>treatment</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>55.293521</td>\n",
       "      <td>178.590555</td>\n",
       "      <td>2.5</td>\n",
       "      <td>16.781177</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>25.712011</td>\n",
       "      <td>185.321654</td>\n",
       "      <td>2.0</td>\n",
       "      <td>7.381592</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              weight      height  something        bmi\n",
       "treatment                                             \n",
       "a          55.293521  178.590555        2.5  16.781177\n",
       "b          25.712011  185.321654        2.0   7.381592"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grouped.mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using `agg` with `groupby`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>treatment</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>55.293521</td>\n",
       "      <td>178.590555</td>\n",
       "      <td>2.5</td>\n",
       "      <td>16.781177</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>25.712011</td>\n",
       "      <td>185.321654</td>\n",
       "      <td>2.0</td>\n",
       "      <td>7.381592</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              weight      height  something        bmi\n",
       "treatment                                             \n",
       "a          55.293521  178.590555        2.5  16.781177\n",
       "b          25.712011  185.321654        2.0   7.381592"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grouped.agg('mean')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">weight</th>\n",
       "      <th colspan=\"2\" halign=\"left\">height</th>\n",
       "      <th colspan=\"2\" halign=\"left\">something</th>\n",
       "      <th colspan=\"2\" halign=\"left\">bmi</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>treatment</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>55.293521</td>\n",
       "      <td>23.538565</td>\n",
       "      <td>178.590555</td>\n",
       "      <td>22.875289</td>\n",
       "      <td>2.5</td>\n",
       "      <td>0.707107</td>\n",
       "      <td>16.781177</td>\n",
       "      <td>5.122148</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>25.712011</td>\n",
       "      <td>11.760130</td>\n",
       "      <td>185.321654</td>\n",
       "      <td>6.400252</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.381592</td>\n",
       "      <td>2.912608</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              weight                 height            something            \\\n",
       "                mean        std        mean        std      mean       std   \n",
       "treatment                                                                    \n",
       "a          55.293521  23.538565  178.590555  22.875289       2.5  0.707107   \n",
       "b          25.712011  11.760130  185.321654   6.400252       2.0       NaN   \n",
       "\n",
       "                 bmi            \n",
       "                mean       std  \n",
       "treatment                       \n",
       "a          16.781177  5.122148  \n",
       "b           7.381592  2.912608  "
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grouped.agg(['mean', 'std'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">weight</th>\n",
       "      <th colspan=\"2\" halign=\"left\">height</th>\n",
       "      <th>bmi</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>max</th>\n",
       "      <th>&lt;lambda&gt;</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>treatment</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>55.293521</td>\n",
       "      <td>23.538565</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>897.296525</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>25.712011</td>\n",
       "      <td>11.760130</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>117.459100</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              weight                 height                     bmi\n",
       "                mean        std         min         max    <lambda>\n",
       "treatment                                                          \n",
       "a          55.293521  23.538565  158.646074  203.560866  897.296525\n",
       "b          25.712011  11.760130  180.795993  189.847316  117.459100"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grouped.agg({'weight': ['mean', 'std'], 'height': ['min', 'max'], 'bmi': lambda x: (x**2).sum()})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using `trasnform` wtih `groupby`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>55.293521</td>\n",
       "      <td>178.590555</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>25.712011</td>\n",
       "      <td>185.321654</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>55.293521</td>\n",
       "      <td>178.590555</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>25.712011</td>\n",
       "      <td>185.321654</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>55.293521</td>\n",
       "      <td>178.590555</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      weight      height\n",
       "0  55.293521  178.590555\n",
       "1  25.712011  185.321654\n",
       "2  55.293521  178.590555\n",
       "3  25.712011  185.321654\n",
       "4  55.293521  178.590555"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "g_mean = grouped[['weight', 'height']].transform(np.mean)\n",
    "g_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>23.538565</td>\n",
       "      <td>22.875289</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11.760130</td>\n",
       "      <td>6.400252</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>23.538565</td>\n",
       "      <td>22.875289</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11.760130</td>\n",
       "      <td>6.400252</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>23.538565</td>\n",
       "      <td>22.875289</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      weight     height\n",
       "0  23.538565  22.875289\n",
       "1  11.760130   6.400252\n",
       "2  23.538565  22.875289\n",
       "3  11.760130   6.400252\n",
       "4  23.538565  22.875289"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "g_std = grouped[['weight', 'height']].transform(np.std)\n",
    "g_std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.737548</td>\n",
       "      <td>1.091584</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.707107</td>\n",
       "      <td>0.707107</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-1.138201</td>\n",
       "      <td>-0.871879</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.707107</td>\n",
       "      <td>-0.707107</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.400652</td>\n",
       "      <td>-0.219706</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     weight    height\n",
       "0  0.737548  1.091584\n",
       "1  0.707107  0.707107\n",
       "2 -1.138201 -0.871879\n",
       "3 -0.707107 -0.707107\n",
       "4  0.400652 -0.219706"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(df[['weight', 'height']] - g_mean)/g_std"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Combining Data Frames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "      <th>treatment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.533678</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>9.441118</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.322067</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>3.0</td>\n",
       "      <td>21.485449</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something  \\\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        2.0   \n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        2.0   \n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN   \n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        NaN   \n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        3.0   \n",
       "\n",
       "         bmi treatment  \n",
       "0  17.533678         a  \n",
       "1   9.441118         b  \n",
       "2  11.324404         a  \n",
       "3   5.322067         b  \n",
       "4  21.485449         a  "
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 =  df.iloc[3:].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>bmi</th>\n",
       "      <th>treatment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>5.322067</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>21.485449</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date        bmi treatment\n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411   5.322067         b\n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411  21.485449         a"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.drop('something', axis=1, inplace=True)\n",
    "df1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Adding rows\n",
    "\n",
    "Note that `pandas` aligns by column indexes automatically."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "      <th>treatment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.533678</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>9.441118</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.322067</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>3.0</td>\n",
       "      <td>21.485449</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.322067</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>21.485449</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something  \\\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        2.0   \n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        2.0   \n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN   \n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        NaN   \n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        3.0   \n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        NaN   \n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        NaN   \n",
       "\n",
       "         bmi treatment  \n",
       "0  17.533678         a  \n",
       "1   9.441118         b  \n",
       "2  11.324404         a  \n",
       "3   5.322067         b  \n",
       "4  21.485449         a  \n",
       "3   5.322067         b  \n",
       "4  21.485449         a  "
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.append(df1, sort=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "      <th>treatment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.533678</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>9.441118</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.322067</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>3.0</td>\n",
       "      <td>21.485449</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.322067</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>21.485449</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something  \\\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        2.0   \n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        2.0   \n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN   \n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        NaN   \n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        3.0   \n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        NaN   \n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        NaN   \n",
       "\n",
       "         bmi treatment  \n",
       "0  17.533678         a  \n",
       "1   9.441118         b  \n",
       "2  11.324404         a  \n",
       "3   5.322067         b  \n",
       "4  21.485449         a  \n",
       "3   5.322067         b  \n",
       "4  21.485449         a  "
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([df, df1], sort=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Adding columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    328\n",
       "1    756\n",
       "2    185\n",
       "3    507\n",
       "4    919\n",
       "Name: pid, dtype: category\n",
       "Categories (5, int64): [185, 328, 507, 756, 919]"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.pid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = pd.DataFrame(OrderedDict(pid=[649, 533, 400, 600], age=[23,34,45,56]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    649\n",
       "1    533\n",
       "2    400\n",
       "3    600\n",
       "Name: pid, dtype: int64"
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2.pid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.pid = df.pid.astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "      <th>treatment</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [pid, weight, height, date, something, bmi, treatment, age]\n",
       "Index: []"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(df, df2, on='pid', how='inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "      <th>treatment</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.533678</td>\n",
       "      <td>a</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>9.441118</td>\n",
       "      <td>b</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "      <td>a</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.322067</td>\n",
       "      <td>b</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>3.0</td>\n",
       "      <td>21.485449</td>\n",
       "      <td>a</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something  \\\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        2.0   \n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        2.0   \n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN   \n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        NaN   \n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        3.0   \n",
       "\n",
       "         bmi treatment  age  \n",
       "0  17.533678         a  NaN  \n",
       "1   9.441118         b  NaN  \n",
       "2  11.324404         a  NaN  \n",
       "3   5.322067         b  NaN  \n",
       "4  21.485449         a  NaN  "
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(df, df2, on='pid', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "      <th>treatment</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>649</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>533</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>400</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>600</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>56</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid  weight  height date  something  bmi treatment  age\n",
       "0  649     NaN     NaN  NaT        NaN  NaN       NaN   23\n",
       "1  533     NaN     NaN  NaT        NaN  NaN       NaN   34\n",
       "2  400     NaN     NaN  NaT        NaN  NaN       NaN   45\n",
       "3  600     NaN     NaN  NaT        NaN  NaN       NaN   56"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(df, df2, on='pid', how='right')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>weight</th>\n",
       "      <th>height</th>\n",
       "      <th>date</th>\n",
       "      <th>something</th>\n",
       "      <th>bmi</th>\n",
       "      <th>treatment</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>328</td>\n",
       "      <td>72.654347</td>\n",
       "      <td>203.560866</td>\n",
       "      <td>2018-11-11 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>17.533678</td>\n",
       "      <td>a</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>756</td>\n",
       "      <td>34.027679</td>\n",
       "      <td>189.847316</td>\n",
       "      <td>2018-11-12 14:16:18.148411</td>\n",
       "      <td>2.0</td>\n",
       "      <td>9.441118</td>\n",
       "      <td>b</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>185</td>\n",
       "      <td>28.501914</td>\n",
       "      <td>158.646074</td>\n",
       "      <td>2018-11-13 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.324404</td>\n",
       "      <td>a</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>507</td>\n",
       "      <td>17.396343</td>\n",
       "      <td>180.795993</td>\n",
       "      <td>2018-11-14 14:16:18.148411</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.322067</td>\n",
       "      <td>b</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>919</td>\n",
       "      <td>64.724301</td>\n",
       "      <td>173.564725</td>\n",
       "      <td>2018-11-15 14:16:18.148411</td>\n",
       "      <td>3.0</td>\n",
       "      <td>21.485449</td>\n",
       "      <td>a</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>649</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>533</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>400</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>45.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>600</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaT</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>56.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pid     weight      height                       date  something  \\\n",
       "0  328  72.654347  203.560866 2018-11-11 14:16:18.148411        2.0   \n",
       "1  756  34.027679  189.847316 2018-11-12 14:16:18.148411        2.0   \n",
       "2  185  28.501914  158.646074 2018-11-13 14:16:18.148411        NaN   \n",
       "3  507  17.396343  180.795993 2018-11-14 14:16:18.148411        NaN   \n",
       "4  919  64.724301  173.564725 2018-11-15 14:16:18.148411        3.0   \n",
       "5  649        NaN         NaN                        NaT        NaN   \n",
       "6  533        NaN         NaN                        NaT        NaN   \n",
       "7  400        NaN         NaN                        NaT        NaN   \n",
       "8  600        NaN         NaN                        NaT        NaN   \n",
       "\n",
       "         bmi treatment   age  \n",
       "0  17.533678         a   NaN  \n",
       "1   9.441118         b   NaN  \n",
       "2  11.324404         a   NaN  \n",
       "3   5.322067         b   NaN  \n",
       "4  21.485449         a   NaN  \n",
       "5        NaN       NaN  23.0  \n",
       "6        NaN       NaN  34.0  \n",
       "7        NaN       NaN  45.0  \n",
       "8        NaN       NaN  56.0  "
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(df, df2, on='pid', how='outer')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Merging on the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.DataFrame(dict(x=[1,2,3]), index=list('abc'))\n",
    "df2 = pd.DataFrame(dict(y=[4,5,6]), index=list('abc'))\n",
    "df3 = pd.DataFrame(dict(z=[7,8,9]), index=list('abc'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   x\n",
       "a  1\n",
       "b  2\n",
       "c  3"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   y\n",
       "a  4\n",
       "b  5\n",
       "c  6"
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   z\n",
       "a  7\n",
       "b  8\n",
       "c  9"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   x  y  z\n",
       "a  1  4  7\n",
       "b  2  5  8\n",
       "c  3  6  9"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.join([df2, df3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fixing common DataFrame issues"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Multiple variables in a column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid_treat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>B-2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>D-2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  pid_treat\n",
       "0       A-1\n",
       "1       B-2\n",
       "2       C-1\n",
       "3       D-2"
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(dict(pid_treat = ['A-1', 'B-2', 'C-1', 'D-2']))\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    [A, 1]\n",
       "1    [B, 2]\n",
       "2    [C, 1]\n",
       "3    [D, 2]\n",
       "Name: pid_treat, dtype: object"
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.pid_treat.str.split('-')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>treat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>B</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>D</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  pid treat\n",
       "0   A     1\n",
       "1   B     2\n",
       "2   C     1\n",
       "3   D     2"
      ]
     },
     "execution_count": 128,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.pid_treat.str.split('-').apply(pd.Series, index=['pid', 'treat'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Multiple values in a cell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>vals</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>(1, 2, 3)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b</td>\n",
       "      <td>(4, 5, 6)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>c</td>\n",
       "      <td>(7, 8, 9)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  pid       vals\n",
       "0   a  (1, 2, 3)\n",
       "1   b  (4, 5, 6)\n",
       "2   c  (7, 8, 9)"
      ]
     },
     "execution_count": 129,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(dict(pid=['a', 'b', 'c'], vals = [(1,2,3), (4,5,6), (7,8,9)]))\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>vals</th>\n",
       "      <th>t1</th>\n",
       "      <th>t2</th>\n",
       "      <th>t3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>(1, 2, 3)</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b</td>\n",
       "      <td>(4, 5, 6)</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>c</td>\n",
       "      <td>(7, 8, 9)</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  pid       vals  t1  t2  t3\n",
       "0   a  (1, 2, 3)   1   2   3\n",
       "1   b  (4, 5, 6)   4   5   6\n",
       "2   c  (7, 8, 9)   7   8   9"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[['t1', 't2', 't3']]  = df.vals.apply(pd.Series)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.drop('vals', axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pid</th>\n",
       "      <th>vals</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>c</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>a</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>b</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>c</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>a</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>b</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>c</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  pid  vals\n",
       "0   a     1\n",
       "1   b     4\n",
       "2   c     7\n",
       "3   a     2\n",
       "4   b     5\n",
       "5   c     8\n",
       "6   a     3\n",
       "7   b     6\n",
       "8   c     9"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.melt(df, id_vars='pid', value_name='vals').drop('variable', axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reshaping Data Frames\n",
    "\n",
    "Sometimes we need to make rows into columns or vice versa."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Converting multiple columns into a single column\n",
    "\n",
    "This is often useful if you need to condition on some variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = 'https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv'\n",
    "iris = pd.read_csv(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sepal_length</th>\n",
       "      <th>sepal_width</th>\n",
       "      <th>petal_length</th>\n",
       "      <th>petal_width</th>\n",
       "      <th>species</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   sepal_length  sepal_width  petal_length  petal_width species\n",
       "0           5.1          3.5           1.4          0.2  setosa\n",
       "1           4.9          3.0           1.4          0.2  setosa\n",
       "2           4.7          3.2           1.3          0.2  setosa\n",
       "3           4.6          3.1           1.5          0.2  setosa\n",
       "4           5.0          3.6           1.4          0.2  setosa"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(150, 5)"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_iris = pd.melt(iris, id_vars='species')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>variable</th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>463</th>\n",
       "      <td>setosa</td>\n",
       "      <td>petal_width</td>\n",
       "      <td>0.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>210</th>\n",
       "      <td>versicolor</td>\n",
       "      <td>sepal_width</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70</th>\n",
       "      <td>versicolor</td>\n",
       "      <td>sepal_length</td>\n",
       "      <td>5.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>468</th>\n",
       "      <td>setosa</td>\n",
       "      <td>petal_width</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57</th>\n",
       "      <td>versicolor</td>\n",
       "      <td>sepal_length</td>\n",
       "      <td>4.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>versicolor</td>\n",
       "      <td>sepal_length</td>\n",
       "      <td>5.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>virginica</td>\n",
       "      <td>sepal_length</td>\n",
       "      <td>6.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>385</th>\n",
       "      <td>versicolor</td>\n",
       "      <td>petal_length</td>\n",
       "      <td>4.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>558</th>\n",
       "      <td>virginica</td>\n",
       "      <td>petal_width</td>\n",
       "      <td>1.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>469</th>\n",
       "      <td>setosa</td>\n",
       "      <td>petal_width</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        species      variable  value\n",
       "463      setosa   petal_width    0.1\n",
       "210  versicolor   sepal_width    2.0\n",
       "70   versicolor  sepal_length    5.9\n",
       "468      setosa   petal_width    0.3\n",
       "57   versicolor  sepal_length    4.9\n",
       "98   versicolor  sepal_length    5.1\n",
       "111   virginica  sepal_length    6.4\n",
       "385  versicolor  petal_length    4.5\n",
       "558   virginica   petal_width    1.8\n",
       "469      setosa   petal_width    0.3"
      ]
     },
     "execution_count": 137,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_iris.sample(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pivoting\n",
    "\n",
    "Sometimes we need to convert categorical values in a column into separate columns. This is often done at the same time as performing a summary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>species</th>\n",
       "      <th>setosa</th>\n",
       "      <th>versicolor</th>\n",
       "      <th>virginica</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>variable</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>petal_length</th>\n",
       "      <td>1.464</td>\n",
       "      <td>4.260</td>\n",
       "      <td>5.552</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>petal_width</th>\n",
       "      <td>0.244</td>\n",
       "      <td>1.326</td>\n",
       "      <td>2.026</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sepal_length</th>\n",
       "      <td>5.006</td>\n",
       "      <td>5.936</td>\n",
       "      <td>6.588</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sepal_width</th>\n",
       "      <td>3.418</td>\n",
       "      <td>2.770</td>\n",
       "      <td>2.974</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "species       setosa  versicolor  virginica\n",
       "variable                                   \n",
       "petal_length   1.464       4.260      5.552\n",
       "petal_width    0.244       1.326      2.026\n",
       "sepal_length   5.006       5.936      6.588\n",
       "sepal_width    3.418       2.770      2.974"
      ]
     },
     "execution_count": 138,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_iris.pivot_table(index='variable', columns='species', values='value', aggfunc='mean')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Functional style - `apply`, `applymap` and `map`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`apply` can be used to apply a custom function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [],
   "source": [
    "scores = pd.DataFrame(\n",
    "    np.around(np.clip(np.random.normal(90, 10, (5,3)), 0, 100), 1),\n",
    "    columns = ['math', 'stat', 'biol'],\n",
    "    index = ['anne', 'bob', 'charles', 'dirk', 'edgar']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>math</th>\n",
       "      <th>stat</th>\n",
       "      <th>biol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>anne</th>\n",
       "      <td>93.4</td>\n",
       "      <td>99.5</td>\n",
       "      <td>80.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bob</th>\n",
       "      <td>67.5</td>\n",
       "      <td>82.9</td>\n",
       "      <td>94.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>charles</th>\n",
       "      <td>93.5</td>\n",
       "      <td>91.5</td>\n",
       "      <td>96.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dirk</th>\n",
       "      <td>89.4</td>\n",
       "      <td>93.3</td>\n",
       "      <td>84.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>edgar</th>\n",
       "      <td>88.3</td>\n",
       "      <td>92.4</td>\n",
       "      <td>97.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         math  stat  biol\n",
       "anne     93.4  99.5  80.9\n",
       "bob      67.5  82.9  94.5\n",
       "charles  93.5  91.5  96.4\n",
       "dirk     89.4  93.3  84.9\n",
       "edgar    88.3  92.4  97.0"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_grade_1(score):\n",
    "    return np.where(score > 90, 'A', \n",
    "                    np.where(score > 80, 'B',\n",
    "                            np.where(score > 70, 'C', 'F')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>math</th>\n",
       "      <th>stat</th>\n",
       "      <th>biol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>anne</th>\n",
       "      <td>A</td>\n",
       "      <td>A</td>\n",
       "      <td>B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bob</th>\n",
       "      <td>F</td>\n",
       "      <td>B</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>charles</th>\n",
       "      <td>A</td>\n",
       "      <td>A</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dirk</th>\n",
       "      <td>B</td>\n",
       "      <td>A</td>\n",
       "      <td>B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>edgar</th>\n",
       "      <td>B</td>\n",
       "      <td>A</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        math stat biol\n",
       "anne       A    A    B\n",
       "bob        F    B    A\n",
       "charles    A    A    A\n",
       "dirk       B    A    B\n",
       "edgar      B    A    A"
      ]
     },
     "execution_count": 142,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores.apply(convert_grade_1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `np.where` is a little clumsy - here is an alternative."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_grade_2(score):\n",
    "    if score.name == 'math': # math professors are mean\n",
    "        return np.choose(\n",
    "            pd.cut(score, [-1, 80, 90, 95, 100], labels=False),\n",
    "            ['F', 'C', 'B', 'A']\n",
    "        )    \n",
    "    else:\n",
    "        return np.choose(\n",
    "            pd.cut(score, [-1, 70, 80, 90, 100], labels=False),\n",
    "            ['F', 'C', 'B', 'A']\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>math</th>\n",
       "      <th>stat</th>\n",
       "      <th>biol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>anne</th>\n",
       "      <td>B</td>\n",
       "      <td>A</td>\n",
       "      <td>B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bob</th>\n",
       "      <td>F</td>\n",
       "      <td>B</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>charles</th>\n",
       "      <td>B</td>\n",
       "      <td>A</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dirk</th>\n",
       "      <td>C</td>\n",
       "      <td>A</td>\n",
       "      <td>B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>edgar</th>\n",
       "      <td>C</td>\n",
       "      <td>A</td>\n",
       "      <td>A</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        math stat biol\n",
       "anne       B    A    B\n",
       "bob        F    B    A\n",
       "charles    B    A    A\n",
       "dirk       C    A    B\n",
       "edgar      C    A    A"
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores.apply(convert_grade_2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`apply` can be used to avoid explicit looping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
    "def likely_profession(row):\n",
    "    if (row.biol > row.math) and (row.biol > row.stat):\n",
    "        return 'farmer'\n",
    "    elif (row.math > row.biol) and (row.math > row.stat):\n",
    "        return 'high school teacher'\n",
    "    elif (row.stat > row.math) and (row.stat > row.biol):\n",
    "        return 'actuary'\n",
    "    else:\n",
    "        return 'doctor'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "anne       actuary\n",
       "bob         farmer\n",
       "charles     farmer\n",
       "dirk       actuary\n",
       "edgar       farmer\n",
       "dtype: object"
      ]
     },
     "execution_count": 146,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores.apply(likely_profession, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If all else fails, you can loop over `pandas` data frames.\n",
    "\n",
    "- Be prepared for pitying looks from more snobbish Python coders\n",
    "\n",
    "Loops are frowned upon because they are not efficient, but sometimes pragmatism beats elegance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "idx = anne\n",
      "row = Index(['math', 'stat', 'biol'], dtype='object'): [93.4 99.5 80.9]\n",
      "------------------------------\n",
      "idx = bob\n",
      "row = Index(['math', 'stat', 'biol'], dtype='object'): [67.5 82.9 94.5]\n",
      "------------------------------\n",
      "idx = charles\n",
      "row = Index(['math', 'stat', 'biol'], dtype='object'): [93.5 91.5 96.4]\n",
      "------------------------------\n",
      "idx = dirk\n",
      "row = Index(['math', 'stat', 'biol'], dtype='object'): [89.4 93.3 84.9]\n",
      "------------------------------\n",
      "idx = edgar\n",
      "row = Index(['math', 'stat', 'biol'], dtype='object'): [88.3 92.4 97. ]\n",
      "------------------------------"
     ]
    }
   ],
   "source": [
    "for idx, row in scores.iterrows():\n",
    "    print(f'\\nidx = {idx}\\nrow = {row.index}: {row.values}\\n', \n",
    "          end='-'*30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`apply` can be used for reductions along margins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(np.random.randint(0, 10, (4,5)), columns=list('abcde'), index=list('wxyz'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>e</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>w</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>x</th>\n",
       "      <td>9</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>y</th>\n",
       "      <td>9</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>z</th>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a  b  c  d  e\n",
       "w  3  7  0  3  7\n",
       "x  9  5  5  2  5\n",
       "y  9  4  5  8  3\n",
       "z  0  4  9  1  8"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    21\n",
       "b    20\n",
       "c    19\n",
       "d    14\n",
       "e    23\n",
       "dtype: int64"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.apply(sum, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "w    20\n",
       "x    26\n",
       "y    29\n",
       "z    22\n",
       "dtype: int64"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.apply(sum, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### For element-wise mapping operations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [],
   "source": [
    "char_map = {i: c for i,c in enumerate(string.ascii_uppercase)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>e</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>w</th>\n",
       "      <td>D</td>\n",
       "      <td>H</td>\n",
       "      <td>A</td>\n",
       "      <td>D</td>\n",
       "      <td>H</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>x</th>\n",
       "      <td>J</td>\n",
       "      <td>F</td>\n",
       "      <td>F</td>\n",
       "      <td>C</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>y</th>\n",
       "      <td>J</td>\n",
       "      <td>E</td>\n",
       "      <td>F</td>\n",
       "      <td>I</td>\n",
       "      <td>D</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>z</th>\n",
       "      <td>A</td>\n",
       "      <td>E</td>\n",
       "      <td>J</td>\n",
       "      <td>B</td>\n",
       "      <td>I</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a  b  c  d  e\n",
       "w  D  H  A  D  H\n",
       "x  J  F  F  C  F\n",
       "y  J  E  F  I  D\n",
       "z  A  E  J  B  I"
      ]
     },
     "execution_count": 154,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.applymap(lambda x: char_map[x])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### For mapping a series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>e</th>\n",
       "      <th>b_map</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>w</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>H</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>x</th>\n",
       "      <td>9</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>y</th>\n",
       "      <td>9</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "      <td>E</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>z</th>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "      <td>E</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a  b  c  d  e b_map\n",
       "w  3  7  0  3  7     H\n",
       "x  9  5  5  2  5     F\n",
       "y  9  4  5  8  3     E\n",
       "z  0  4  9  1  8     E"
      ]
     },
     "execution_count": 155,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.assign(b_map = df.b.map(char_map))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chaining commands\n",
    "\n",
    "Sometimes you see this functional style of method chaining that avoids the need for temporary intermediate variables."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">sepal_length</th>\n",
       "      <th colspan=\"2\" halign=\"left\">sepal_width</th>\n",
       "      <th colspan=\"2\" halign=\"left\">both</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>sum</th>\n",
       "      <th>mean</th>\n",
       "      <th>sum</th>\n",
       "      <th>mean</th>\n",
       "      <th>sum</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>species</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>setosa</th>\n",
       "      <td>5.1</td>\n",
       "      <td>50.7</td>\n",
       "      <td>3.5</td>\n",
       "      <td>35.1</td>\n",
       "      <td>6.6</td>\n",
       "      <td>65.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>versicolor</th>\n",
       "      <td>5.8</td>\n",
       "      <td>52.2</td>\n",
       "      <td>2.8</td>\n",
       "      <td>24.8</td>\n",
       "      <td>10.1</td>\n",
       "      <td>90.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>virginica</th>\n",
       "      <td>7.0</td>\n",
       "      <td>77.0</td>\n",
       "      <td>3.1</td>\n",
       "      <td>33.8</td>\n",
       "      <td>12.7</td>\n",
       "      <td>140.2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           sepal_length       sepal_width        both       \n",
       "                   mean   sum        mean   sum  mean    sum\n",
       "species                                                     \n",
       "setosa              5.1  50.7         3.5  35.1   6.6   65.7\n",
       "versicolor          5.8  52.2         2.8  24.8  10.1   90.9\n",
       "virginica           7.0  77.0         3.1  33.8  12.7  140.2"
      ]
     },
     "execution_count": 156,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(\n",
    "    iris.\n",
    "    sample(frac=0.2).\n",
    "    filter(regex='s.*').\n",
    "    assign(both=iris.sepal_length + iris.petal_length).\n",
    "    query('both > 2').\n",
    "    groupby('species').agg(['mean', 'sum']).\n",
    "    pipe(lambda x: np.around(x, 1))\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Moving between R and Python in Jupyter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext rpy2.ipython"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.simplefilter('ignore', FutureWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "iris = %R iris"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sepal.Length</th>\n",
       "      <th>Sepal.Width</th>\n",
       "      <th>Petal.Length</th>\n",
       "      <th>Petal.Width</th>\n",
       "      <th>Species</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species\n",
       "1           5.1          3.5           1.4          0.2  setosa\n",
       "2           4.9          3.0           1.4          0.2  setosa\n",
       "3           4.7          3.2           1.3          0.2  setosa\n",
       "4           4.6          3.1           1.5          0.2  setosa\n",
       "5           5.0          3.6           1.4          0.2  setosa"
      ]
     },
     "execution_count": 160,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [],
   "source": [
    "iris_py = iris.copy()\n",
    "iris_py.Species = iris_py.Species.str.upper()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i iris_py -o iris_r\n",
    "\n",
    "iris_r <- iris_py[1:3,]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sepal.Length</th>\n",
       "      <th>Sepal.Width</th>\n",
       "      <th>Petal.Length</th>\n",
       "      <th>Petal.Width</th>\n",
       "      <th>Species</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>SETOSA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>SETOSA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>SETOSA</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species\n",
       "1           5.1          3.5           1.4          0.2  SETOSA\n",
       "2           4.9          3.0           1.4          0.2  SETOSA\n",
       "3           4.7          3.2           1.3          0.2  SETOSA"
      ]
     },
     "execution_count": 163,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris_r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [],
   "source": [
    "! python3 -m pip install --quiet watermark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext watermark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pendulum 2.1.2\n",
      "pandas   1.1.1\n",
      "numpy    1.18.5\n",
      "CPython 3.8.5\n",
      "IPython 7.17.0\n"
     ]
    }
   ],
   "source": [
    "%watermark -v -iv"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}