{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Strings\n",
    "\n",
    "- Point index\n",
    "- Interval index\n",
    "- Negative index\n",
    "- Stride\n",
    "- Reversing a string\n",
    "- Strings are immutable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = \"hello world\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('h', 'w')"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s[0], s[6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'hello '"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s[0:6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('d', 'r')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s[-1], s[-3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'hlowrd'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s[::2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'dlrow olleh'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s[::-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'str' object does not support item assignment\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    s[0] = 'H'\n",
    "except TypeError as e:\n",
    "    print(e)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The `string` module\n",
    "\n",
    "- String constants\n",
    "- String `capwords`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0123456789'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string.digits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string.ascii_letters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string.punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' \\t\\n\\r\\x0b\\x0c'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string.whitespace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~ \\t\\n\\r\\x0b\\x0c'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string.printable"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Orphan function in strings module"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Hello World'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string.capwords(s)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## String methods"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Methods to change case"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'HELLO WORLD'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.upper()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'hello world'"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ss'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'ß'.casefold()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Hello world'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.capitalize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Hello World'"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.title()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Difference between `title` method and `capwords` function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Hello:World'"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'hello:world'.title()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Hello:world'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string.capwords('hello:world')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### String predicates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.isalnum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.isalpha()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.isascii()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.isidentifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.isprintable()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.startswith('hell')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.endswith('ld')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Searching and counting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'llo' in s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'foo' in s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.find('llo')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.index('llo')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-1"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.find('foo')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "substring not found\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    s.index('foo')\n",
    "except ValueError as e:\n",
    "    print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.count('l')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.count('ll')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Stripping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'hello world'"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'   hello world   '.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'hello world   '"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'   hello world   '.lstrip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'   hello world'"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'   hello world   '.rstrip()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Splitting and joining"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['hello', 'world']"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['he', '', 'o wor', 'd']"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.split('l')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'h-e-l-l-o- -w-o-r-l-d'"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'-'.join(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'hello-world'"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'-'.join(s.split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'hello world'"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'l'.join(s.split('l'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Translation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CTGGTAT'"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'GATTACA'.translate(str.maketrans('ACTG', 'TAGC'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'GTT'"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'GATTACA'.translate(str.maketrans('', '', 'AC'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'gattaca'"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'GATTACA'.translate(str.maketrans(string.ascii_uppercase, string.ascii_lowercase))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### `ord` and `chr`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(65, 97)"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ord('A'), ord('a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('A', 'a')"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chr(65), chr(97)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'b'"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chr(ord('B') + (ord('a') - ord('A')))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Formatting strings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### C sytle formatting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "pi = 3.141592653589793\n",
    "r = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'area = 3.141593 * 2^2'"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'area = %f * %d^2' % (pi, r)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Precision and padding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'area =     3.14 * 002^2'"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'area = %8.2f * %03d^2' % (pi, r)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Right align string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'      area =     3.14 * 002^2'"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'%10s = %8.2f * %03d^2' % ('area', pi, r)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Left align string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'area       =     3.14 * 002^2'"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'%-10s = %8.2f * %03d^2' % ('area', pi, r)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using the `format` method"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'area = 3.141592653589793 * 2^2'"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'area = {} * {}^2'.format(pi, r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'area = 3.141592653589793 * 2^2'"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'area = {a} * {b}^2'.format(a=pi, b=r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'area =    3.142 * 000002^2'"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'area = {pi:8,.4} * {r:06d}^2'.format(pi=pi, r=r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'      area'"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'{:>10}'.format('area')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'area      '"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'{:<10}'.format('area')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'   area   '"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'{:^10}'.format('area')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'===area==='"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'{:=^10}'.format('area')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using f strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'area = 3.141592653589793 * 2^2'"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f'area = {pi} * {r}^2'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'===area==='"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = 'area'\n",
    "f'{x:=^10}'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Templates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "from string import Template"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ann likes Python\n",
      "bob likes R\n",
      "cody likes C++\n"
     ]
    }
   ],
   "source": [
    "t = Template(\"$who likes $what\")\n",
    "items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]\n",
    "for name, lang in items:\n",
    "    print(t.substitute(who=name, what=lang))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ann likes Python\n",
      "bob likes R\n",
      "cody likes C++\n"
     ]
    }
   ],
   "source": [
    "items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]\n",
    "for name, lang in items:\n",
    "    print(\"{} likes {}\".format(name, lang))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ann likes Python\n",
      "bob likes R\n",
      "cody likes C++\n"
     ]
    }
   ],
   "source": [
    "items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]\n",
    "for name, lang in items:\n",
    "    print(f\"{name} likes {lang}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Encodings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Unicode strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hello 猫\n"
     ]
    }
   ],
   "source": [
    "print('hello \\u732b')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hello 猫\n"
     ]
    }
   ],
   "source": [
    "s = '猫'\n",
    "print(f'hello {s}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![image](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAQoAAAC9CAMAAAB8rcOCAAAAvVBMVEX///8AAADnIjXqIjbtIzbwIze2trbt7e329vbBwcHl5eWtra3R0dGhoaHuyCfg4ODa2tqCgoKTk5NlZWWZmZl6enpubm7bIDJfX1/Hx8dLS0tXV1cnJyeyGilERERQUFDOHi+gGCUuLi6JFB9CCg8aGho7Ozt6EhwQEBDBHCz50SlNCxI3CA1nDxhvEBlaDRUVAwUiBQjjvyUrBgq+oB9GOwvTsSN+ahWtkhwzKwg7MgpuXBKUfRgbFgRYSg6QxMgjAAAQmklEQVR4nO1dZ3ujuhKOjWk2HBvce0ni1E22nHL7//9ZFyMNSEIC2Roccp68X3ZjY5VB00fSzc0XvvCFL3zhb4O+E0fjaOT0P3ogH4xu1MoxDHR/FfiO4wdurUO7LgbbFo/NoPInvWiTPX5wrjHKK8AbtoqISn7QHx3Fxw9/i5XhSAiR4F4xuX58L31+ft1R14FIOrETZBKjd1A+Xjst5qPJZtW6Pw5jv1tH+7NsKu/73bQ93a3fsk9ECeCqyXZCrTwSzPjOZmGlODsTS2j6YWp3rHa7bVn29Bt82GMfHQiDKeCIPDYG7kbWISo5QGDeTe12Bst++l5YF8GighAJQryB8eiq+4x0FX8FYMHf2labhWW9029oR55qRRwWQ+YrD2dYBTyWvoEIwSgMaVtruy3CvqXfueyDPIY9MgYvU61D8zHJMCmlxOmFhIYvIVBTIqfFKnlOpj25hZl9ii3KUvhVlEjfggmjuGWUyGkxHhf7HQv9ZoOdGM1ZAR1KJHi8XFTRZb2XUyKhxTd5lzO/2FYmUw1mrAL7KpZxGMYTpW0zvkxqxOTXbypKtNudH8XOFFzZh+/xnZFB3nf+DvpOtJISYyvjEy9xG3ujxO8eD5eLTYHd6Qevakq0rV1hQSgZErTyDJ0UmUUxFr5we0txfALBbtygFw3V9jGPqaUmRdt+4J6dlJi72bLApsQcGpYacI6MGqte37vpOpGGJcRg3SmhRALm0WG53Q/9Ynsi2TtVmPVeKOeUc/GthD1SabGHJ7dVAgn82zLn/gKAwi+zZINKu0MD5YQ40YJY4AcNpU2bPOCR4QRggFXpU15sSol9BXtky0LH0QIOwSEBBdg+0oABh548iqKJXxXskdKCPKphRYJxjuQeEcDb3mg86xsIjXWZ9gBSEJtTVGQSAFejuqfwqvWE8fnEeCSOns6iaLen5DcaA6GtY9reoKLvdX/gyH3YxH+O4tDxg2DQ7bouYyUetBdFZn5rWJFU7ZULuPMATHfGSmOd6G3sBP3S2BpZyt9Vi8JKwPy1Tp/WcL9BpekPuxKgP84KFfZO9uVx0tPxkol/I1cfHdvaPa13bTv7tqM7QRBxiNk1WN94LUo7kBKivb8j397t2x2OQ6pjzGBk4dmbICpitBYFEP54l/CHvWdlTeq9W3b7Nf1L4pkLABcST4XgE1dArBKanfcWh5eETdYv2Z+r4ajUZvDoc3imN4Rea8sqEEk/LVLiriXg9Vb8pBWVMAp9BM9PB6mJ1qAAYsveFfjDfi/MW4qxMqJKtSleOoT2uEVrUABhwFtRf+ROaCVUNgZkAdCGCisRrUEBhAGfCqICJnq7s9q7ImewEAxKrz/vhaN4hO2QQSKotkwTsdNFSsCi+D49ZYcsm6PF+8N+f/uQC9DcO5LGirAEPng11crrMhA5/yKKCptOlAb47Hzetzvb7pxgt9eZYE3lgSNJCxBsUagBKYVakis3QOqCqLBa7Of5ovjW7uQLyLJ3QIzJTXm0ZIOgAHu0rVoqCG7AXRGtCghuk0WRh7rFdJGVWWGVkRLzhQGkqCsTO2amzEyROF0tMnP7Fx3Eruin2IWMgArGKREgBca0ZdgwUy6QgnirmV6VUCL5VqTF3cN+/bRL8LTe3377lX9h6peB3Y0y
bwnSxguJIFgVZLK0ukKRGWA9lW/7VKhaFIlsTYQrZE9MHcr5NUhR8MVAOpw8dyDLgzKZCuplbdudoiuTeHD0AUMtCMq0JheEmN0Phfdt026Tmdl3zAqRwaJBPmWS0aLE0gnOlgBMLNTAcQ5CioIuhXBu66XToRMtPpPTjebYlU9YbZT3SUnRq37yEqhIAW+69fpEK/R26tCn9VT1CKWsoRKhQ9IIt18CYmy+SZQknypu/SiLh9u5ZFEQi8gew/ATtekfcaZeAGEDyTyt7xwpiuKEJQXhkNeOTGymjU0x3ieEbmoyN4k3JiPFlCNFaRIxszzeEw9FToz0a8MyNbCxRjhTF0FC9LKyCp4WRS9eRooEd08yViI+jWFAi8sI4avUnvqdW20mkFUiNcU4j6yGiZBiaThY6MFPh41d3EQoXYznpbCfMmJor4qWPHqOwSA3Qk0N9sJ4LHvplm3RWZamETtClOu2QAvCbKb5U6F0FtvWInEGdb0NNTbLxeYbP8YCZakyNY1KDvhesDWJKx98Ps0H5aLPyUX8zx97KO0Un6ZGmLGdyFFCO5+uDaJDlA4EyIESBgFdk7hj4LELaRXaiHH4hquyxw/sUS9HVbsKTmqJsABRcfrvWvo0XVrGcq7HUKKOHAA14t46EnFgdSAcoVAy6UPkidRmtUlKVfBq6KfGQ2U2guCzxwm08e/7U9CFoULHtp5uX6FvZU0nBIHTlUDfv+DVEA5CSBnmpKhnJ3AumN/W045NMN3tH/isqaLSNwsCWwyzfONIQaM/CLmcbN9nXYUFLAu2fry+3L1KKttbvywZLaw29dsIT1Blwntv1F1DkHNQaFZftYl824+IOwktrDYEcsmk9+QPXmxqV+pUIdtoUlcuJEFQuh2rR8vtf00LWeZptn6e7ES4rFm6CPyBEHKBbEtNvinFSEWHw8jNq2jXnA/e4epy3m9vwWXhNTOtTzC3kyH8j1n1J4VsJ/HYIbZAJlnv1qf4TBrZt9v778VfpOAkrKVf61kBWLo1BXpZuL18Z/5qGM8ZkyjI5/m2f9pNp097dS0K77BQZWtuEsGiMPX1deH1gyDoF/OSfeXEC+AzJrAozCUdqI8aZaYePOUum5c9Z4AILjqVFAvjAUB2rJZtimdCoXJv7URzZFLjbsdTArwS80w6vIoPXxQneLKjCVKHvGM/3b683knCvNRBMa8lAw69lqSohFM48oRmDqzUYi8E/20a1TEPvkFNT5POCOn6YRzFod8tDwZy7IHgMlBK1OORGoPwS5n7TpOlCAkt8JNqq88zAzVC1VsyIQuPYBOB/VdX+ZEpKP+qNupC4hXB+wChib+dGQm0pF2VRtnT8SP0BLqrrqJNc1BTQxoYzdxUjIhTC4+qdYG678UkEFujZc4g4AI1wdJUAXZ9vAmmVcdij7sw7qb5/HGTv69X1s60cuZIYZzp/QT8ccMERulRWqcaxfUrRwljsxvCJY0xuhXI41+nA9amu7VQu3SCoTUAXdRUkYYHdZA4C4wZcggcZdIIp7QUgZwQq34m7gwl/+cQFQSy0zNO3kKAMglopaYyRWQMxFNmaHwe/jSysmCvSeNFBQV7nO8w0/+wXIyExRaDntfF6ZDnKHbYhCCoWqOAxWcSFWpAGYBJRRq0UdsG0ysBtIlBE5AprW2D6ZWwNV/cYGB99iOxQfobWEcQ4K1rV+W1gHDSAiRAmhrL0wVE4gzCs5A1xhvUB8HcUqQt1HgC9JVA36nBxjEEfdwMGB/AAXUun12XIhzLAhZWvVVH18CIXd/eBVoASPFZnDE1ZAfAPh6HUegHenQBUnxqC8sbOFHpEQZap20DKRod7S6B58daZ+FqqIXaj8OqE95I/3jHSgngAoc1Nl1aAq907gIUesEbzHvxeMkJmmPc8wefy/o+52TcohHadaJlqYDZjD+PAJVfJnNw8jfqud3+4HTwp/jTbvlFNBmWV6hixUDOIItxmM1NZ/B6ZffVzfWiCdadKaUY
zCsvWRudzAZaB3uGtXlGmWyZ6qEq575ulUNmttJnV5AcGqQ4R8ioB5C3Um/d1qF6KAIuJsXjZjaORyFzr9lyPJ4sD/eHw/1R3T17k0qdiUXGbNJ1r2B+OlpweDwcNqlNzrYORoqea6rJRaZgb6rTTfADKS7vFVrQWojc5q8aY4CcvtdcfTRIaxD9n59DCv7uq9osEH5PuGb8YHPeIpLAO6dDboj1nWjMizXNbQ1nMbocVFjohEcFo7+23LtwiqTej+jDJortDB4TTJPaKvqEqn+t38DYTKINZ5yeJ5zrUBsphOv4tH6DcWAokFPDlhYKgGoLDV9CCnArjDaE0DY0hKDAILWJTV5W6IlN+I1RxzSBoLP9midFbcqU35arp0xXGKSAjjWWFn+DdW0HwfOMWG5i+cPl4rjIWcrINTrjChru6mbzbYxK6HcjuV5yOQ7nl/pHtAkddcp2WWPMgjXwy1VC4bpwwBH2dp8FWF0ac5vnfdWaWsw5scKOU+72T8lx9tWr8A4W7k3ghFEUh+q4TMYiZ1HCj2bH42Ic6i8ksLIqXYqK2FzZBYcS9IotqOWGN0r8noN4yWoZumwJ7lh3aPN06evkLwPFZfJ6y4qD7GJbxHo98Vh6fRvV1V7ekpeZQ7MVz5HfOYgnCCRZXXwlnB0B0gpH8Xiy5ISpTgPdkfRG9FZrg5dXz+Xff88b3FnIJFhGZW/gJ1JvPIl07MCeQg9teohvjdos//795/Pz889//JP8iW6xA8EvaljFXSPcKB1RBP94/o3g+a9alkXGHxdYVp5K5mJnkdNG/wJKZLRALhI956JJEcr7WZGPxCKv6+dvOX6mnyBXc8DoL2k2XxT3kzBwXX9m0poahBTPDCme009wU2uZlXXJj4k0W0XzTDJAc8jHNqRt/v4srgrcvSdAiQud0sARirWOZu0pkLb6+DMTm7/9C19sGi0KGbK4JaoBRIf515/PJ/z5H/InricHA8cLwIMoNbxSQkAmlf743x+ZgELtIUua4OVwsyQHqgGUm8Q5UPPO2bAxN09nPIer9UV37IBCiVP5xWwUuEHm46BGkrLKv+yTIByPjQnjhYyrs8DR1sVSKtytC5nkJC46lG5haL4gjGazSexjmfWFU7Owg2r5Wo59Z1lTJyiQuFDIodZiB60aMz8mkNxTOEM13AbFDpp67Isv8SiXmAK/2H5zK1plcYYtGpsUqqYbenwagZSdFzjE6AqByG2ja75VYf8NAjEKyrrRW4fmMjIQLA2tOFkOoIl6lIJJMcfeTT/CG3hfHvw+1pYtNwRDCbp2e7zIv9iq5dQ0JzGaqUGYlH5u9gRctc5lEj8Um2Ap08T9Q0w1I2cAuiyfXCDzA3Zl0dXGFVI0bosd8/YLQiF/recrEpaQ+aLqsvRp1i5ttiZH5hT4tFb+bDHH5Mj4iBjn/DVIYrDMq5AHbrg9X4kwuaGj6M1whu2iIWepcdYE6gvK92lKzCmOSRphY3icjkB1y7MsmcL95F3hD9/Cz5XBH3C9AmpB3CvpK5igH+qd8T4Hdq39sfptC6Gz+KM8NKH+DF2Onwi9rFA6ottjHvc9H+IO2VUN/oBbuZXzpuixHnrXXRq+eBbBB0rwbuFchC1maU4pgmIU82M3R/vFS7JWcf1DCiRbxz/+XFBpzGh4SfGwJuTFg8dGnOom3758qGN/uutHsqNrWveN8QxV0cRD5KDlILxgNFN0c98gTyhx3hWVnid6jEPfjCB9fzRR1ygfG7MiAN2Kbf6LSRw6sgvFlPD6gTMab8ub/QhrRgMS/SbDajsbR6Ow5/vBYNDvuq7nEnT7g8Hcd3qjaLxU7udgsW0UZwjwleWO6Nhe2aC7AHO9tWGGodN4OhAMQvGAcUwcok92WFffibSOKzsPi3j+SZaDCDfoRdvSC2z1sUU0Uj4O3SDRCkMtpSDBajJyPtkhZRo4mQphNFmUbuoCHGdRYok0NSOJCNcdBL7jhOEojqJofELyb5zYG05i
b+hvD/vCF77whS98AP4PBZnfS+kt1LcAAAAASUVORK5CYII=)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Byte strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "kitty = '小' + '猫'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hello 小猫\n"
     ]
    }
   ],
   "source": [
    "print(f'hello {kitty}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "b'\\xe5\\xb0\\x8f\\xe7\\x8c\\xab'"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kitty_bytes = kitty.encode('utf8')\n",
    "kitty_bytes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'小猫'"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kitty_bytes.decode('utf8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    kitty_bytes.decode('ascii')\n",
    "except UnicodeDecodeError as e:\n",
    "    print(e)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading and writing text files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting haiku.txt\n"
     ]
    }
   ],
   "source": [
    "%%file haiku.txt\n",
    "古池や蛙飛び込む水の音\n",
    "ふるいけやかわずとびこむみずのおと"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "古池や蛙飛び込む水の音\n",
      "ふるいけやかわずとびこむみずのおと"
     ]
    }
   ],
   "source": [
    "# Name the encoding explicitly: the platform default may not be able to decode this text.\n",
    "with open('haiku.txt', encoding='utf8') as f:\n",
    "    for line in f:\n",
    "        print(line, end='')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name the encoding explicitly: the platform default may not be able to decode this text.\n",
    "with open('haiku.txt', encoding='utf8') as f:\n",
    "    haiku = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'古池や蛙飛び込む水の音\\nふるいけやかわずとびこむみずのおと'"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "haiku"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['古池や蛙飛び込む水の音', 'ふるいけやかわずとびこむみずのおと']"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "haiku.split()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using regular expressions\n",
    "\n",
    "- Read [Regular Expression HOWTO](https://docs.python.org/3/howto/regex.html)\n",
    "- Practice at https://regex101.com\n",
    "- Play [RegEx Golf](https://alf.nu/RegexGolf)\n",
    "\n",
    "![golf](https://www.explainxkcd.com/wiki/images/7/7b/regex_golf.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Matching Characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "beer = '''99 bottles of Beer on the wall, 99 bottles of beeR.\n",
    "Take one down and pass it around, 98 bottles of beer on the wall.'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['beer']"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall('beer', beer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Beer', 'beeR', 'beer']"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall('beer', beer, re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['on', 'on', 'on']"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall('on', beer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Word boundaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['on', 'on']"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'\\bon\\b', beer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[' ', 't', 'h', 'e', ' ', 'w', 'a', 'l', 'l', '.']"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'.', beer)[-10:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Character sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['9', '9', '9', '9', '9', '8']"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'\\d', beer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['9', '9', '9', '9', '9', '8']"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'[0-9]', beer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['B', 'e', 'e', 'r', 'o', 'n', 't', 'h', 'e', 'w', 'a', 'l', 'l', '9']"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'\\w', beer)[11:25]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Repeating Things"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['99', '99', '98']"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'\\d+', beer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['bottles of Beer', 'bottles of beer']"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'b.+r', beer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['bee', 'bee']"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'be+', beer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['b', 'b', 'bee', 'b', 'bee']"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'be*', beer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['bo', 'bo', 'bee', 'bo', 'bee']"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'b[aeiou]+', beer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['bee', 'bee']"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'b[aeiou]{2,}', beer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['bo', 'bo', 'be', 'bo', 'be']"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'b[aeiou]{1}', beer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Finding matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14 18 (14, 18) Beer\n",
      "46 50 (46, 50) beeR\n",
      "100 104 (100, 104) beer\n"
     ]
    }
   ],
   "source": [
    "# Each match object reports where it was found as well as the matched text.\n",
    "for match in re.finditer('beer', beer, re.IGNORECASE):\n",
    "    print(match.start(), match.end(), match.span(), match.group())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Grouping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('99', 'bottles'), ('99', 'bottles'), ('98', 'bottles')]"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'(\\d+)\\s+(\\b\\w+?\\b)', beer, re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Splitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['',\n",
       " ' bottles of Beer on the wall, ',\n",
       " ' bottles of beeR.\\nTake one down and pass it around, ',\n",
       " ' bottles of beer on the wall.']"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.split(r'\\d+', beer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Search and replace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "99 bottles of whiskey on the wall, 99 bottles of whiskey.\n",
      "Take one down and pass it around, 98 bottles of whiskey on the wall.\n"
     ]
    }
   ],
   "source": [
    "print(re.sub('beer', 'whiskey', beer, flags=re.IGNORECASE))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bottles 99 of Beer on the wall, bottles 99 of beeR.\n",
      "Take one down and pass it around, bottles 98 of beer on the wall.\n"
     ]
    }
   ],
   "source": [
    "# `flags` must be passed by keyword: the fourth positional argument of re.sub is `count`.\n",
    "print(re.sub(r'(\\d+)\\s+(\\b\\w+?\\b)', r'\\2 \\1', beer, flags=re.IGNORECASE))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Function versus compiled method"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('99', 'bottles'), ('99', 'bottles'), ('98', 'bottles')]"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pattern = re.compile(r'(\\d+)\\s+(\\b\\w+?\\b)')\n",
    "pattern.findall(beer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Raw strings\n",
    "\n",
    "The backslash `\\` is an escape character in a regular Python string, so we need to escape it to get a literal `\\`. The backslash is also an escape character in the regular expression mini-language, so it must be escaped a second time for the pattern compiler. Escaping at both levels means we need `\\\\\\\\` in a regular string to match a single literal `\\`. A raw string such as `r'foo'` treats `\\` as a literal character rather than an escape character, so the raw pattern `r'\\\\'` is enough."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw string: same value as before, without relying on the invalid escape sequence '\\s'.\n",
    "latex = r'latex uses \\section over and over again like so \\section'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall('\\section', latex)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall('\\\\section', latex)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['\\\\section', '\\\\section']"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall('\\\\\\\\section', latex)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['\\\\section', '\\\\section']"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'\\\\section', latex)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Examples"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Custom version of `capwords`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Hello World'"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string.capwords('hello    world')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "def my_capwords(ss):\n",
    "    \"\"\"Capitalize each whitespace-separated word of `ss` and join the words with single spaces.\n",
    "\n",
    "    Uses `str.capitalize` (as `string.capwords` does) rather than `str.title`,\n",
    "    which would also upper-case letters after apostrophes and digits (\"don't\" -> \"Don'T\").\n",
    "    \"\"\"\n",
    "    return ' '.join(word.capitalize() for word in ss.split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Hello World'"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_capwords('hello    world')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Bag of words\n",
    "\n",
    "Create a table of counts, where rows represent unique words and columns represent different documents. Ignore case and punctuation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "doc1 = \"\"\"The wheels on the bus go,\n",
    "Round and round,\n",
    "Round and round,\n",
    "Round and round.\n",
    "The wheels on the bus go\n",
    "Round and round,\n",
    "All through the town.\"\"\"\n",
    "\n",
    "doc2 = \"\"\"The doors on the bus go,\n",
    "Open and shut,\n",
    "Open and shut,\n",
    "Open and shut.\n",
    "The doors on the bus go\n",
    "Open and shut,\n",
    "All through the town.\"\"\"\n",
    "\n",
    "doc3 = \"\"\"The Driver on the bus says,\n",
    "\"Move on back!\n",
    "Move on back!\n",
    "Move on back!\"\n",
    "The Driver on the bus says,\n",
    "\"Move on back!\"\n",
    "All through the town.\"\"\"\n",
    "\n",
    "doc4 = \"\"\"The babies on the bus go,\n",
    "\"Wah, wah, wah!\n",
    "Wah, wah, wah!\n",
    "Wah, wah, wah!\"\n",
    "The babies on the bus go,\n",
    "\"Wah, wah, wah!\"\n",
    "All through the town.\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = [doc1, doc2, doc3, doc4]\n",
    "# One translation table, reused for every document, that deletes every character in string.punctuation.\n",
    "drop_punctuation = str.maketrans('', '', string.punctuation)\n",
    "doc_words = [text.strip().lower().translate(drop_punctuation).split() for text in docs]\n",
    "# Flatten the per-document word lists, then keep the unique words as the vocabulary.\n",
    "words = [token for tokens in doc_words for token in tokens]\n",
    "vocab = set(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "table = np.zeros((len(vocab), len(docs)), dtype='int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows follow the iteration order of vocab; columns follow the order of doc_words.\n",
    "for col, tokens in enumerate(doc_words):\n",
    "    for row, term in enumerate(vocab):\n",
    "        table[row, col] = tokens.count(term)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>doc1</th>\n",
       "      <th>doc2</th>\n",
       "      <th>doc3</th>\n",
       "      <th>doc4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>round</th>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>back</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>the</th>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bus</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>town</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>shut</th>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>all</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>and</th>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>move</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>babies</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>driver</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>wheels</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>through</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>doors</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>says</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>open</th>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>on</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>go</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>wah</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         doc1  doc2  doc3  doc4\n",
       "round       8     0     0     0\n",
       "back        0     0     4     0\n",
       "the         5     5     5     5\n",
       "bus         2     2     2     2\n",
       "town        1     1     1     1\n",
       "shut        0     4     0     0\n",
       "all         1     1     1     1\n",
       "and         4     4     0     0\n",
       "move        0     0     4     0\n",
       "babies      0     0     0     2\n",
       "driver      0     0     2     0\n",
       "wheels      2     0     0     0\n",
       "through     1     1     1     1\n",
       "doors       0     2     0     0\n",
       "says        0     0     2     0\n",
       "open        0     4     0     0\n",
       "on          2     2     6     2\n",
       "go          2     2     0     2\n",
       "wah         0     0     0    12"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# list(vocab) iterates in the same order that filled the rows of table; recent pandas versions reject a set as an index.\n",
    "pd.DataFrame(table, columns='doc1 doc2 doc3 doc4'.split(), index=list(vocab))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}