{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Strings\n", "\n", "- Point index\n", "- Interval index\n", "- Negative index\n", "- Stride\n", "- Reversing a string\n", "- Strings are immutable" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "s = \"hello world\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('h', 'w')" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s[0], s[6]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello '" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s[0:6]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('d', 'r')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s[-1], s[-3]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hlowrd'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s[::2]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'dlrow olleh'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s[::-1]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'str' object does not support item assignment\n" ] } ], "source": [ "try:\n", " s[0] = 'H'\n", "except TypeError as e:\n", " print(e)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## The `string` module\n", "\n", "- String constants\n", "- String `capwords`" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, 
"outputs": [], "source": [ "import string" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'0123456789'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.digits" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.ascii_letters" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.punctuation" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' \\t\\n\\r\\x0b\\x0c'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.whitespace" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~ \\t\\n\\r\\x0b\\x0c'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.printable" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Orphan function in strings module" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello World'" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.capwords(s)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## String methods" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Methods to change case" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { 
"data": { "text/plain": [ "'HELLO WORLD'" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.upper()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello world'" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.lower()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'ss'" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'ß'.casefold()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello world'" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.capitalize()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello World'" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.title()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Difference between `title` method and `capwords` function" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello:World'" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'hello:world'.title()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello:world'" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.capwords('hello:world')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### String predicates" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.isalnum()" ] }, { 
"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.isalpha()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.isascii()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.isidentifier()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.isprintable()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.startswith('hell')" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.endswith('ld')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Searching and counting" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'llo' in s" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'foo' in s" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, 
"execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.find('llo')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.index('llo')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-1" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.find('foo')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "substring not found\n" ] } ], "source": [ "try:\n", " s.index('foo')\n", "except ValueError as e:\n", " print(e)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.count('l')" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.count('ll')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Stripping" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello world'" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "' hello world '.strip()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello world '" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "' hello world '.lstrip()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' hello world'" ] }, "execution_count": 39, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "' hello world '.rstrip()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Splitting and joining" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['hello', 'world']" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.split()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['he', '', 'o wor', 'd']" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.split('l')" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'h-e-l-l-o- -w-o-r-l-d'" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'-'.join(s)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello-world'" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'-'.join(s.split())" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello world'" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'l'.join(s.split('l'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Translation" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'CTGGTAT'" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'GATTACA'.translate(str.maketrans('ACTG', 'TAGC'))" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'GTT'" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'GATTACA'.translate(str.maketrans('', '', 'AC'))" ] }, { "cell_type": "code", "execution_count": 
47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'gattaca'" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'GATTACA'.translate(str.maketrans(string.ascii_uppercase, string.ascii_lowercase))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `ord` and `chr`" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(65, 97)" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ord('A'), ord('a')" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('A', 'a')" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chr(65), chr(97)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'b'" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chr(ord('B') + (ord('a') - ord('A')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Formatting strings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### C-style formatting" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "pi = 3.141592653589793\n", "r = 2" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.141593 * 2^2'" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'area = %f * %d^2' % (pi, r)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Precision and padding" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.14 * 002^2'" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'area = %8.2f * %03d^2' % (pi, r)" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "Right align string" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' area = 3.14 * 002^2'" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'%10s = %8.2f * %03d^2' % ('area', pi, r)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Left align string" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.14 * 002^2'" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'%-10s = %8.2f * %03d^2' % ('area', pi, r)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using the `format` method" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.141592653589793 * 2^2'" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'area = {} * {}^2'.format(pi, r)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.141592653589793 * 2^2'" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'area = {a} * {b}^2'.format(a=pi, b=r)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.142 * 000002^2'" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'area = {pi:8,.4} * {r:06d}^2'.format(pi=pi, r=r)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' area'" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'{:>10}'.format('area')" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area '" ] }, "execution_count": 60, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "'{:<10}'.format('area')" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' area '" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'{:^10}'.format('area')" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'===area==='" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'{:=^10}'.format('area')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using f strings" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.141592653589793 * 2^2'" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f'area = {pi} * {r}^2'" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'===area==='" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x = 'area'\n", "f'{x:=^10}'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Templates" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "from string import Template" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ann likes Python\n", "bob likes R\n", "cody likes C++\n" ] } ], "source": [ "t = Template(\"$who likes $what\")\n", "items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]\n", "for name, lang in items:\n", " print(t.substitute(who=name, what=lang))" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ann likes Python\n", "bob likes R\n", "cody likes C++\n" ] } ], "source": [ "items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]\n", 
"for name, lang in items:\n", " print(\"{} likes {}\".format(name, lang))" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ann likes Python\n", "bob likes R\n", "cody likes C++\n" ] } ], "source": [ "items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]\n", "for name, lang in items:\n", " print(f\"{name} likes {lang}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Encodings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Unicode strings" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hello 猫\n" ] } ], "source": [ "print('hello \\u732b')" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hello 猫\n" ] } ], "source": [ "s = '猫'\n", "print(f'hello {s}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"![image](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAQoAAAC9CAMAAAB8rcOCAAAAvVBMVEX///8AAADnIjXqIjbtIzbwIze2trbt7e329vbBwcHl5eWtra3R0dGhoaHuyCfg4ODa2tqCgoKTk5NlZWWZmZl6enpubm7bIDJfX1/Hx8dLS0tXV1cnJyeyGilERERQUFDOHi+gGCUuLi6JFB9CCg8aGho7Ozt6EhwQEBDBHCz50SlNCxI3CA1nDxhvEBlaDRUVAwUiBQjjvyUrBgq+oB9GOwvTsSN+ahWtkhwzKwg7MgpuXBKUfRgbFgRYSg6QxMgjAAAQmklEQVR4nO1dZ3ujuhKOjWk2HBvce0ni1E22nHL7//9ZFyMNSEIC2Roccp68X3ZjY5VB00fSzc0XvvCFL3zhb4O+E0fjaOT0P3ogH4xu1MoxDHR/FfiO4wdurUO7LgbbFo/NoPInvWiTPX5wrjHKK8AbtoqISn7QHx3Fxw9/i5XhSAiR4F4xuX58L31+ft1R14FIOrETZBKjd1A+Xjst5qPJZtW6Pw5jv1tH+7NsKu/73bQ93a3fsk9ECeCqyXZCrTwSzPjOZmGlODsTS2j6YWp3rHa7bVn29Bt82GMfHQiDKeCIPDYG7kbWISo5QGDeTe12Bst++l5YF8GighAJQryB8eiq+4x0FX8FYMHf2labhWW9029oR55qRRwWQ+YrD2dYBTyWvoEIwSgMaVtruy3CvqXfueyDPIY9MgYvU61D8zHJMCmlxOmFhIYvIVBTIqfFKnlOpj25hZl9ii3KUvhVlEjfggmjuGWUyGkxHhf7HQv9ZoOdGM1ZAR1KJHi8XFTRZb2XUyKhxTd5lzO/2FYmUw1mrAL7KpZxGMYTpW0zvkxqxOTXbypKtNudH8XOFFzZh+/xnZFB3nf+DvpOtJISYyvjEy9xG3ujxO8eD5eLTYHd6Qevakq0rV1hQSgZErTyDJ0UmUUxFr5we0txfALBbtygFw3V9jGPqaUmRdt+4J6dlJi72bLApsQcGpYacI6MGqte37vpOpGGJcRg3SmhRALm0WG53Q/9Ynsi2TtVmPVeKOeUc/GthD1SabGHJ7dVAgn82zLn/gKAwi+zZINKu0MD5YQ40YJY4AcNpU2bPOCR4QRggFXpU15sSol9BXtky0LH0QIOwSEBBdg+0oABh548iqKJXxXskdKCPKphRYJxjuQeEcDb3mg86xsIjXWZ9gBSEJtTVGQSAFejuqfwqvWE8fnEeCSOns6iaLen5DcaA6GtY9reoKLvdX/gyH3YxH+O4tDxg2DQ7bouYyUetBdFZn5rWJFU7ZULuPMATHfGSmOd6G3sBP3S2BpZyt9Vi8JKwPy1Tp/WcL9BpekPuxKgP84KFfZO9uVx0tPxkol/I1cfHdvaPa13bTv7tqM7QRBxiNk1WN94LUo7kBKivb8j397t2x2OQ6pjzGBk4dmbICpitBYFEP54l/CHvWdlTeq9W3b7Nf1L4pkLABcST4XgE1dArBKanfcWh5eETdYv2Z+r4ajUZvDoc3imN4Rea8sqEEk/LVLiriXg9Vb8pBWVMAp9BM9PB6mJ1qAAYsveFfjDfi/MW4qxMqJKtSleOoT2uEVrUABhwFtRf+ROaCVUNgZkAdCGCisRrUEBhAGfCqICJnq7s9q7ImewEAxKrz/vhaN4hO2QQSKotkwTsdNFSsCi+D49ZYcsm6PF+8N+f/uQC9DcO5LGirAEPng11crrMhA5/yKKCptOlAb47Hzetzvb7pxgt9eZYE3lgSNJCxBsUagBKYVakis3QOqCqLBa7Of5ovjW7uQLyLJ3QIzJTXm0ZIOgAHu0rVoqCG7AXRGtCghuk0WRh7rFdJGVWWGVkRLzhQGkqCsTO2amzEyROF0tMnP7Fx3Eruin2IWMgArGKREgBca0ZdgwUy6QgnirmV6VUCL5VqTF3cN+/bRL8LTe3377lX9h6peB3Y0ybwnS
xguJIFgVZLK0ukKRGWA9lW/7VKhaFIlsTYQrZE9MHcr5NUhR8MVAOpw8dyDLgzKZCuplbdudoiuTeHD0AUMtCMq0JheEmN0Phfdt026Tmdl3zAqRwaJBPmWS0aLE0gnOlgBMLNTAcQ5CioIuhXBu66XToRMtPpPTjebYlU9YbZT3SUnRq37yEqhIAW+69fpEK/R26tCn9VT1CKWsoRKhQ9IIt18CYmy+SZQknypu/SiLh9u5ZFEQi8gew/ATtekfcaZeAGEDyTyt7xwpiuKEJQXhkNeOTGymjU0x3ieEbmoyN4k3JiPFlCNFaRIxszzeEw9FToz0a8MyNbCxRjhTF0FC9LKyCp4WRS9eRooEd08yViI+jWFAi8sI4avUnvqdW20mkFUiNcU4j6yGiZBiaThY6MFPh41d3EQoXYznpbCfMmJor4qWPHqOwSA3Qk0N9sJ4LHvplm3RWZamETtClOu2QAvCbKb5U6F0FtvWInEGdb0NNTbLxeYbP8YCZakyNY1KDvhesDWJKx98Ps0H5aLPyUX8zx97KO0Un6ZGmLGdyFFCO5+uDaJDlA4EyIESBgFdk7hj4LELaRXaiHH4hquyxw/sUS9HVbsKTmqJsABRcfrvWvo0XVrGcq7HUKKOHAA14t46EnFgdSAcoVAy6UPkidRmtUlKVfBq6KfGQ2U2guCzxwm08e/7U9CFoULHtp5uX6FvZU0nBIHTlUDfv+DVEA5CSBnmpKhnJ3AumN/W045NMN3tH/isqaLSNwsCWwyzfONIQaM/CLmcbN9nXYUFLAu2fry+3L1KKttbvywZLaw29dsIT1Blwntv1F1DkHNQaFZftYl824+IOwktrDYEcsmk9+QPXmxqV+pUIdtoUlcuJEFQuh2rR8vtf00LWeZptn6e7ES4rFm6CPyBEHKBbEtNvinFSEWHw8jNq2jXnA/e4epy3m9vwWXhNTOtTzC3kyH8j1n1J4VsJ/HYIbZAJlnv1qf4TBrZt9v778VfpOAkrKVf61kBWLo1BXpZuL18Z/5qGM8ZkyjI5/m2f9pNp097dS0K77BQZWtuEsGiMPX1deH1gyDoF/OSfeXEC+AzJrAozCUdqI8aZaYePOUum5c9Z4AILjqVFAvjAUB2rJZtimdCoXJv7URzZFLjbsdTArwS80w6vIoPXxQneLKjCVKHvGM/3b683knCvNRBMa8lAw69lqSohFM48oRmDqzUYi8E/20a1TEPvkFNT5POCOn6YRzFod8tDwZy7IHgMlBK1OORGoPwS5n7TpOlCAkt8JNqq88zAzVC1VsyIQuPYBOB/VdX+ZEpKP+qNupC4hXB+wChib+dGQm0pF2VRtnT8SP0BLqrrqJNc1BTQxoYzdxUjIhTC4+qdYG678UkEFujZc4g4AI1wdJUAXZ9vAmmVcdij7sw7qb5/HGTv69X1s60cuZIYZzp/QT8ccMERulRWqcaxfUrRwljsxvCJY0xuhXI41+nA9amu7VQu3SCoTUAXdRUkYYHdZA4C4wZcggcZdIIp7QUgZwQq34m7gwl/+cQFQSy0zNO3kKAMglopaYyRWQMxFNmaHwe/jSysmCvSeNFBQV7nO8w0/+wXIyExRaDntfF6ZDnKHbYhCCoWqOAxWcSFWpAGYBJRRq0UdsG0ysBtIlBE5AprW2D6ZWwNV/cYGB99iOxQfobWEcQ4K1rV+W1gHDSAiRAmhrL0wVE4gzCs5A1xhvUB8HcUqQt1HgC9JVA36nBxjEEfdwMGB/AAXUun12XIhzLAhZWvVVH18CIXd/eBVoASPFZnDE1ZAfAPh6HUegHenQBUnxqC8sbOFHpEQZap20DKRod7S6B58daZ+FqqIXaj8OqE95I/3jHSgngAoc1Nl1aAq907gIUesEbzHvxeMkJmmPc8wefy/o+52TcohHadaJlqYDZjD+PAJVfJnNw8jfqud3+4HTwp/jTbvlFNBmWV6hixUDOIItxmM1NZ/B6ZffVzfWiCdadKaUYzCsv
WRudzAZaB3uGtXlGmWyZ6qEq575ulUNmttJnV5AcGqQ4R8ioB5C3Um/d1qF6KAIuJsXjZjaORyFzr9lyPJ4sD/eHw/1R3T17k0qdiUXGbNJ1r2B+OlpweDwcNqlNzrYORoqea6rJRaZgb6rTTfADKS7vFVrQWojc5q8aY4CcvtdcfTRIaxD9n59DCv7uq9osEH5PuGb8YHPeIpLAO6dDboj1nWjMizXNbQ1nMbocVFjohEcFo7+23LtwiqTej+jDJortDB4TTJPaKvqEqn+t38DYTKINZ5yeJ5zrUBsphOv4tH6DcWAokFPDlhYKgGoLDV9CCnArjDaE0DY0hKDAILWJTV5W6IlN+I1RxzSBoLP9midFbcqU35arp0xXGKSAjjWWFn+DdW0HwfOMWG5i+cPl4rjIWcrINTrjChru6mbzbYxK6HcjuV5yOQ7nl/pHtAkddcp2WWPMgjXwy1VC4bpwwBH2dp8FWF0ac5vnfdWaWsw5scKOU+72T8lx9tWr8A4W7k3ghFEUh+q4TMYiZ1HCj2bH42Ic6i8ksLIqXYqK2FzZBYcS9IotqOWGN0r8noN4yWoZumwJ7lh3aPN06evkLwPFZfJ6y4qD7GJbxHo98Vh6fRvV1V7ekpeZQ7MVz5HfOYgnCCRZXXwlnB0B0gpH8Xiy5ISpTgPdkfRG9FZrg5dXz+Xff88b3FnIJFhGZW/gJ1JvPIl07MCeQg9teohvjdos//795/Pz889//JP8iW6xA8EvaljFXSPcKB1RBP94/o3g+a9alkXGHxdYVp5K5mJnkdNG/wJKZLRALhI956JJEcr7WZGPxCKv6+dvOX6mnyBXc8DoL2k2XxT3kzBwXX9m0poahBTPDCme009wU2uZlXXJj4k0W0XzTDJAc8jHNqRt/v4srgrcvSdAiQud0sARirWOZu0pkLb6+DMTm7/9C19sGi0KGbK4JaoBRIf515/PJ/z5H/InricHA8cLwIMoNbxSQkAmlf743x+ZgELtIUua4OVwsyQHqgGUm8Q5UPPO2bAxN09nPIer9UV37IBCiVP5xWwUuEHm46BGkrLKv+yTIByPjQnjhYyrs8DR1sVSKtytC5nkJC46lG5haL4gjGazSexjmfWFU7Owg2r5Wo59Z1lTJyiQuFDIodZiB60aMz8mkNxTOEM13AbFDpp67Isv8SiXmAK/2H5zK1plcYYtGpsUqqYbenwagZSdFzjE6AqByG2ja75VYf8NAjEKyrrRW4fmMjIQLA2tOFkOoIl6lIJJMcfeTT/CG3hfHvw+1pYtNwRDCbp2e7zIv9iq5dQ0JzGaqUGYlH5u9gRctc5lEj8Um2Ap08T9Q0w1I2cAuiyfXCDzA3Zl0dXGFVI0bosd8/YLQiF/recrEpaQ+aLqsvRp1i5ttiZH5hT4tFb+bDHH5Mj4iBjn/DVIYrDMq5AHbrg9X4kwuaGj6M1whu2iIWepcdYE6gvK92lKzCmOSRphY3icjkB1y7MsmcL95F3hD9/Cz5XBH3C9AmpB3CvpK5igH+qd8T4Hdq39sfptC6Gz+KM8NKH+DF2Onwi9rFA6ottjHvc9H+IO2VUN/oBbuZXzpuixHnrXXRq+eBbBB0rwbuFchC1maU4pgmIU82M3R/vFS7JWcf1DCiRbxz/+XFBpzGh4SfGwJuTFg8dGnOom3758qGN/uutHsqNrWveN8QxV0cRD5KDlILxgNFN0c98gTyhx3hWVnid6jEPfjCB9fzRR1ygfG7MiAN2Kbf6LSRw6sgvFlPD6gTMab8ub/QhrRgMS/SbDajsbR6Ow5/vBYNDvuq7nEnT7g8Hcd3qjaLxU7udgsW0UZwjwleWO6Nhe2aC7AHO9tWGGodN4OhAMQvGAcUwcok92WFffibSOKzsPi3j+SZaDCDfoRdvSC2z1sUU0Uj4O3SDRCkMtpSDBajJyPtkhZRo4mQphNFmUbuoCHGdRYok0NSOJCNcdBL7jhOEojqJofELyb5zYG05ib+hv
D/vCF77whS98AP4PBZnfS+kt1LcAAAAASUVORK5CYII=)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Byte strings" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "kitty = '小' + '猫'" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hello 小猫\n" ] } ], "source": [ "print(f'hello {kitty}')" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "b'\\xe5\\xb0\\x8f\\xe7\\x8c\\xab'" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kitty_bytes = kitty.encode('utf8')\n", "kitty_bytes" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'小猫'" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kitty_bytes.decode('utf8')" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)\n" ] } ], "source": [ "try:\n", " kitty_bytes.decode('ascii')\n", "except UnicodeDecodeError as e:\n", " print(e)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading and writing text files" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting haiku.txt\n" ] } ], "source": [ "%%file haiku.txt\n", "古池や蛙飛び込む水の音\n", "ふるいけやかわずとびこむみずのおと" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "古池や蛙飛び込む水の音\n", "ふるいけやかわずとびこむみずのおと" ] } ], "source": [ "with open('haiku.txt', encoding='utf-8') as f:\n", " for line in f:\n", " print(line, end='')" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "with 
open('haiku.txt', encoding='utf-8') as f:\n", " haiku = f.read()" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'古池や蛙飛び込む水の音\\nふるいけやかわずとびこむみずのおと'" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "haiku" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['古池や蛙飛び込む水の音', 'ふるいけやかわずとびこむみずのおと']" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "haiku.split()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using regular expressions\n", "\n", "- Read [Regular Expression HOWTO](https://docs.python.org/3/howto/regex.html)\n", "- Practice at https://regex101.com\n", "- Play [RegEx Golf](https://alf.nu/RegexGolf)\n", "\n", "![golf](https://www.explainxkcd.com/wiki/images/7/7b/regex_golf.png)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "import re" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Matching Characters" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "beer = '''99 bottles of Beer on the wall, 99 bottles of beeR.\n", "Take one down and pass it around, 98 bottles of beer on the wall.'''" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['beer']" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall('beer', beer)" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Beer', 'beeR', 'beer']" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall('beer', beer, re.IGNORECASE)" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['on', 'on', 'on']" ] }, "execution_count": 85, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall('on', beer)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Word boundaries" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['on', 'on']" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'\\bon\\b', beer)" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[' ', 't', 'h', 'e', ' ', 'w', 'a', 'l', 'l', '.']" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'.', beer)[-10:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Character sets" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['9', '9', '9', '9', '9', '8']" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'\\d', beer)" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['9', '9', '9', '9', '9', '8']" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'[0-9]', beer)" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['B', 'e', 'e', 'r', 'o', 'n', 't', 'h', 'e', 'w', 'a', 'l', 'l', '9']" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'\\w', beer)[11:25]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Repeating Things" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['99', '99', '98']" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'\\d+', beer)" ] }, { "cell_type": "code", "execution_count": 
92, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bottles of Beer', 'bottles of beer']" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'b.+r', beer)" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bee', 'bee']" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'be+', beer)" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['b', 'b', 'bee', 'b', 'bee']" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'be*', beer)" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bo', 'bo', 'bee', 'bo', 'bee']" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'b[aeiou]+', beer)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bee', 'bee']" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'b[aeiou]{2,}', beer)" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bo', 'bo', 'be', 'bo', 'be']" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'b[aeiou]{1}', beer)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Finding matches" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "14 18 (14, 18) Beer\n", "46 50 (46, 50) beeR\n", "100 104 (100, 104) beer\n" ] } ], "source": [ "for m in re.finditer('beer', beer, re.IGNORECASE):\n", " print(m.start(), m.end(), m.span(), m.group()) " ] }, { "cell_type": "markdown", "metadata": {}, "source": 
# %% [markdown]
# ### Grouping

# %%
# Each parenthesised group becomes one element of the result tuples:
# group 1 is a run of digits, group 2 is the word that follows it.
re.findall(r'(\d+)\s+(\b\w+?\b)', beer, flags=re.IGNORECASE)

# %% [markdown]
# ### Splitting

# %%
# Split the text wherever a run of digits occurs.
re.split(r'\d+', beer)

# %% [markdown]
# ### Search and replace

# %%
# Replace every spelling of "beer", regardless of case.
print(re.sub('beer', 'whiskey', beer, flags=re.IGNORECASE))

# %%
# Swap every "<number> <word>" pair using backreferences.
# The flags MUST be passed by keyword here: the fourth positional parameter
# of re.sub is `count`, not `flags`.  Passing re.IGNORECASE (== 2)
# positionally silently limited the substitution to the first two matches,
# leaving the final "98 bottles" unswapped.
print(re.sub(r'(\d+)\s+(\b\w+?\b)', r'\2 \1', beer, flags=re.IGNORECASE))
# %%
# The sample text contains two real backslashes.  A raw literal stores them
# explicitly instead of relying on Python leaving the unrecognised escape
# `\s` untouched, which raises an invalid-escape-sequence warning in a
# normal string literal.  The resulting string value is unchanged.
latex = r'latex uses \section over and over again like so \section'

# %%
# Naive attempt, kept on purpose as the demonstration of the pitfall:
# the regex engine receives `\section`, reads `\s` as "any whitespace"
# followed by "ection", and finds nothing.
re.findall('\section', latex)

# %%
# Escaping once at the Python level still hands `\section` to the regex
# engine, so the result is the same empty list.
re.findall('\\section', latex)

# %%
# Escaping at both levels: Python turns the four backslashes into two, and
# the regex engine turns those two into one literal backslash.
re.findall('\\\\section', latex)

# %%
# A raw string removes the Python level, so only the regex-level escape remains.
re.findall(r'\\section', latex)
def my_capwords(ss):
    """Capitalize every whitespace-separated word of ``ss``, like ``string.capwords``.

    The text is split on runs of whitespace, so leading, trailing and
    repeated whitespace collapse to single spaces in the result.

    ``str.capitalize`` is used rather than ``str.title``: ``title`` restarts
    capitalization after every non-letter, turning "don't" into "Don'T" and
    "3rd" into "3Rd", whereas ``string.capwords`` upper-cases only the first
    character of each word and lower-cases the rest.

    Parameters
    ----------
    ss : str
        Text to convert.

    Returns
    -------
    str
        The words of ``ss``, each capitalized, joined by single spaces.
    """
    return ' '.join(word.capitalize() for word in ss.split())
# %%
import numpy as np
import pandas as pd

# %%
# Unique words across all documents, sorted so that the row order of the
# table is identical on every run.  Iterating over a plain set of strings
# gives an order that can change between kernel restarts because string
# hashing is randomized per process.
vocab = sorted({word for tokens in doc_words for word in tokens})

# %%
# One row per vocabulary word, one column per document.
table = np.zeros((len(vocab), len(docs)), dtype='int')

# %%
# Fill each cell with the number of times the word occurs in the document.
for i, word in enumerate(vocab):
    for j, tokens in enumerate(doc_words):
        table[i, j] = tokens.count(word)

# %%
# Column labels are derived from the number of documents rather than being
# hard-coded for exactly four, so adding a document needs no edit here.
columns = ['doc%d' % k for k in range(1, len(docs) + 1)]
pd.DataFrame(table, columns=columns, index=vocab)