{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Strings\n", "\n", "- Point index\n", "- Interval index\n", "- Negative index\n", "- Stride\n", "- Reversing a string\n", "- Strings are immutable" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "s = \"hello world\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('h', 'w')" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s[0], s[6]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello '" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s[0:6]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('d', 'r')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s[-1], s[-3]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hlowrd'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s[::2]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'dlrow olleh'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s[::-1]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'str' object does not support item assignment\n" ] } ], "source": [ "try:\n", " s[0] = 'H'\n", "except TypeError as e:\n", " print(e)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## The `string` module\n", "\n", "- String constants\n", "- String `capwords`" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, 
"outputs": [], "source": [ "import string" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'0123456789'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.digits" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.ascii_letters" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.punctuation" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' \\t\\n\\r\\x0b\\x0c'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.whitespace" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~ \\t\\n\\r\\x0b\\x0c'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.printable" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Orphan function in strings module" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello World'" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.capwords(s)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## String methods" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Methods to change case" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { 
"data": { "text/plain": [ "'HELLO WORLD'" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.upper()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello world'" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.lower()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'ss'" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'ß'.casefold()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello world'" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.capitalize()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello World'" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.title()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Difference between `title` method and `capwords` function" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello:World'" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'hello:world'.title()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello:world'" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.capwords('hello:world')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### String predicates" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.isalnum()" ] }, { 
"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.isalpha()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.isascii()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.isidentifier()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.isprintable()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.startswith('hell')" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.endswith('ld')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Searching and counting" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'llo' in s" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'foo' in s" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, 
"execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.find('llo')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.index('llo')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-1" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.find('foo')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "substring not found\n" ] } ], "source": [ "try:\n", " s.index('foo')\n", "except ValueError as e:\n", " print(e)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.count('l')" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.count('ll')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Stripping" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello world'" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "' hello world '.strip()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello world '" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "' hello world '.lstrip()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' hello world'" ] }, "execution_count": 39, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "' hello world '.rstrip()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Splitting and joining" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['hello', 'world']" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.split()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['he', '', 'o wor', 'd']" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.split('l')" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'h-e-l-l-o- -w-o-r-l-d'" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'-'.join(s)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello-world'" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'-'.join(s.split())" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello world'" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'l'.join(s.split('l'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Translation" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'CTGGTAT'" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'GATTACA'.translate(str.maketrans('ACTG', 'TAGC'))" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'GTT'" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'GATTACA'.translate(str.maketrans('', '', 'AC'))" ] }, { "cell_type": "code", "execution_count": 
47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'gattaca'" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'GATTACA'.translate(str.maketrans(string.ascii_uppercase, string.ascii_lowercase))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `ord` and `chr`" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(65, 97)" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ord('A'), ord('a')" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('A', 'a')" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chr(65), chr(97)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'b'" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chr(ord('B') + (ord('a') - ord('A')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Formatting strings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### C style formatting" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "pi = 3.141592653589793\n", "r = 2" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.141593 * 2^2'" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'area = %f * %d^2' % (pi, r)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Precision and padding" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.14 * 002^2'" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'area = %8.2f * %03d^2' % (pi, r)" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "Right align string" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' area = 3.14 * 002^2'" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'%10s = %8.2f * %03d^2' % ('area', pi, r)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Left align string" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.14 * 002^2'" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'%-10s = %8.2f * %03d^2' % ('area', pi, r)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using the `format` method" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.141592653589793 * 2^2'" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'area = {} * {}^2'.format(pi, r)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.141592653589793 * 2^2'" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'area = {a} * {b}^2'.format(a=pi, b=r)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.142 * 000002^2'" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'area = {pi:8,.4} * {r:06d}^2'.format(pi=pi, r=r)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' area'" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'{:>10}'.format('area')" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area '" ] }, "execution_count": 60, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "'{:<10}'.format('area')" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' area '" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'{:^10}'.format('area')" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'===area==='" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'{:=^10}'.format('area')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using f strings" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'area = 3.141592653589793 * 2^2'" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f'area = {pi} * {r}^2'" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'===area==='" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x = 'area'\n", "f'{x:=^10}'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Templates" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "from string import Template" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ann likes Python\n", "bob likes R\n", "cody likes C++\n" ] } ], "source": [ "t = Template(\"$who likes $what\")\n", "items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]\n", "for name, lang in items:\n", " print(t.substitute(who=name, what=lang))" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ann likes Python\n", "bob likes R\n", "cody likes C++\n" ] } ], "source": [ "items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]\n", 
"for name, lang in items:\n", " print(\"{} likes {}\".format(name, lang))" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ann likes Python\n", "bob likes R\n", "cody likes C++\n" ] } ], "source": [ "items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]\n", "for name, lang in items:\n", " print(f\"{name} likes {lang}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Encodings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Unicode strings" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hello 猫\n" ] } ], "source": [ "print('hello \\u732b')" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hello 猫\n" ] } ], "source": [ "s = '猫'\n", "print(f'hello {s}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Byte strings" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "kitty = '小' + '猫'" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "hello 小猫\n" ] } ], "source": [ "print(f'hello {kitty}')" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "b'\\xe5\\xb0\\x8f\\xe7\\x8c\\xab'" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kitty_bytes = kitty.encode('utf8')\n", "kitty_bytes" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'小猫'" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kitty_bytes.decode('utf8')" ] }, { "cell_type": "code", "execution_count": 75, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)\n" ] } ], "source": [ "try:\n", " kitty_bytes.decode('ascii')\n", "except UnicodeDecodeError as e:\n", " print(e)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading and writing text files" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting haiku.txt\n" ] } ], "source": [ "%%file haiku.txt\n", "古池や蛙飛び込む水の音\n", "ふるいけやかわずとびこむみずのおと" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "古池や蛙飛び込む水の音\n", "ふるいけやかわずとびこむみずのおと" ] } ], "source": [ "with open('haiku.txt') as f:\n", " for line in f:\n", " print(line, end='')" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "with open('haiku.txt') as f:\n", " haiku = f.read()" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'古池や蛙飛び込む水の音\\nふるいけやかわずとびこむみずのおと'" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "haiku" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['古池や蛙飛び込む水の音', 'ふるいけやかわずとびこむみずのおと']" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "haiku.split()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using regular expressions\n", "\n", "- Read [Regular Expression HOWTO](https://docs.python.org/3/howto/regex.html)\n", "- Practice at https://regex101.com\n", "- Play [RegEx Golf](https://alf.nu/RegexGolf)\n", "\n", "" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "import re" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Matching 
Characters" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "beer = '''99 bottles of Beer on the wall, 99 bottles of beeR.\n", "Take one down and pass it around, 98 bottles of beer on the wall.'''" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['beer']" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall('beer', beer)" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Beer', 'beeR', 'beer']" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall('beer', beer, re.IGNORECASE)" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['on', 'on', 'on']" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall('on', beer)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Word boundaries" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['on', 'on']" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'\\bon\\b', beer)" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[' ', 't', 'h', 'e', ' ', 'w', 'a', 'l', 'l', '.']" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'.', beer)[-10:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Character sets" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['9', '9', '9', '9', '9', '8']" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'\\d', beer)" ] }, { "cell_type": "code", 
"execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['9', '9', '9', '9', '9', '8']" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'[0-9]', beer)" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['B', 'e', 'e', 'r', 'o', 'n', 't', 'h', 'e', 'w', 'a', 'l', 'l', '9']" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'\\w', beer)[11:25]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Repeating Things" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['99', '99', '98']" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'\\d+', beer)" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bottles of Beer', 'bottles of beer']" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'b.+r', beer)" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bee', 'bee']" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'be+', beer)" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['b', 'b', 'bee', 'b', 'bee']" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'be*', beer)" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bo', 'bo', 'bee', 'bo', 'bee']" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'b[aeiou]+', beer)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ 
{ "data": { "text/plain": [ "['bee', 'bee']" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'b[aeiou]{2,}', beer)" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bo', 'bo', 'be', 'bo', 'be']" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'b[aeiou]{1}', beer)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Finding matches" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "14 18 (14, 18) Beer\n", "46 50 (46, 50) beeR\n", "100 104 (100, 104) beer\n" ] } ], "source": [ "for m in re.finditer('beer', beer, re.IGNORECASE):\n", " print(m.start(), m.end(), m.span(), m.group()) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Grouping" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('99', 'bottles'), ('99', 'bottles'), ('98', 'bottles')]" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'(\\d+)\\s+(\\b\\w+?\\b)', beer, re.IGNORECASE)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Splitting" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['',\n", " ' bottles of Beer on the wall, ',\n", " ' bottles of beeR.\\nTake one down and pass it around, ',\n", " ' bottles of beer on the wall.']" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.split(r'\\d+', beer)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Search and replace" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "99 bottles of whiskey on the wall, 99 bottles of 
whiskey.\n", "Take one down and pass it around, 98 bottles of whiskey on the wall.\n" ] } ], "source": [ "print(re.sub('beer', 'whiskey', beer, flags=re.IGNORECASE))" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "bottles 99 of Beer on the wall, bottles 99 of beeR.\n", "Take one down and pass it around, bottles 98 of beer on the wall.\n" ] } ], "source": [ "print(re.sub(r'(\\d+)\\s+(\\b\\w+?\\b)', r'\\2 \\1', beer, flags=re.IGNORECASE))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Function versus compiled method" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('99', 'bottles'), ('99', 'bottles'), ('98', 'bottles')]" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pattern = re.compile(r'(\\d+)\\s+(\\b\\w+?\\b)')\n", "pattern.findall(beer)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Raw strings\n", "\n", "The backslash `\\` is an escape character in a regular Python string. So we need to escape it to match a literal `\\`. However, `\\` is an escape character in the regular expression mini-language when compiling the regular expression pattern. So we need to escape at two levels - hence we need `\\\\\\\\` to match a literal `\\`. The raw string `r'foo'` treats `\\` as a literal character rather than an escape character. 
" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "latex = 'latex uses \section over and over again like so \section'" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall('\section', latex)" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall('\\section', latex)" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['\\section', '\\section']" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall('\\\\section', latex)" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['\\section', '\\section']" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.findall(r'\\section', latex)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Examples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Custom version of `capwords`" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello World'" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string.capwords('hello world')" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [], "source": [ "def my_capwords(ss):\n", " \"\"\"Capitalize each whitespace-separated word of ss and join with single spaces, like string.capwords.\"\"\"\n", " return ' '.join(word.capitalize() for word in ss.split())" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Hello World'" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "my_capwords('hello world')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Bag of words\n", "\n", "Create a table of counts, where rows represent unique words and columns represent different documents. Ignore case and capitalization." ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [], "source": [ "doc1 = \"\"\"The wheels on the bus go,\n", "Round and round,\n", "Round and round,\n", "Round and round.\n", "The wheels on the bus go\n", "Round and round,\n", "All through the town.\"\"\"\n", "\n", "doc2 = \"\"\"The doors on the bus go,\n", "Open and shut,\n", "Open and shut,\n", "Open and shut.\n", "The doors on the bus go\n", "Open and shut,\n", "All through the town.\"\"\"\n", "\n", "doc3 = \"\"\"The Driver on the bus says,\n", "\"Move on back!\n", "Move on back!\n", "Move on back!\"\n", "The Driver on the bus says,\n", "\"Move on back!\"\n", "All through the town.\"\"\"\n", "\n", "doc4 = \"\"\"The babies on the bus go,\n", "\"Wah, wah, wah!\n", "Wah, wah, wah!\n", "Wah, wah, wah!\"\n", "The babies on the bus go,\n", "\"Wah, wah, wah!\"\n", "All through the town.\"\"\"" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [], "source": [ "docs = [doc1, doc2, doc3, doc4]\n", "doc_words = [doc.strip().lower().translate(str.maketrans('', '', string.punctuation)).split()\n", " for doc in docs]\n", "words = [word for words in doc_words for word in words]\n", "vocab = set(words)" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [], "source": [ "table = np.zeros((len(vocab), len(docs)), dtype='int')" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [], "source": [ "for i, word in enumerate(vocab):\n", " for j, doc in enumerate(doc_words):\n", " table[i, j] = doc.count(word)" ] }, { 
"cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | doc1 | \n", "doc2 | \n", "doc3 | \n", "doc4 | \n", "
---|---|---|---|---|
round | \n", "8 | \n", "0 | \n", "0 | \n", "0 | \n", "
back | \n", "0 | \n", "0 | \n", "4 | \n", "0 | \n", "
the | \n", "5 | \n", "5 | \n", "5 | \n", "5 | \n", "
bus | \n", "2 | \n", "2 | \n", "2 | \n", "2 | \n", "
town | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
shut | \n", "0 | \n", "4 | \n", "0 | \n", "0 | \n", "
all | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
and | \n", "4 | \n", "4 | \n", "0 | \n", "0 | \n", "
move | \n", "0 | \n", "0 | \n", "4 | \n", "0 | \n", "
babies | \n", "0 | \n", "0 | \n", "0 | \n", "2 | \n", "
driver | \n", "0 | \n", "0 | \n", "2 | \n", "0 | \n", "
wheels | \n", "2 | \n", "0 | \n", "0 | \n", "0 | \n", "
through | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
doors | \n", "0 | \n", "2 | \n", "0 | \n", "0 | \n", "
says | \n", "0 | \n", "0 | \n", "2 | \n", "0 | \n", "
open | \n", "0 | \n", "4 | \n", "0 | \n", "0 | \n", "
on | \n", "2 | \n", "2 | \n", "6 | \n", "2 | \n", "
go | \n", "2 | \n", "2 | \n", "0 | \n", "2 | \n", "
wah | \n", "0 | \n", "0 | \n", "0 | \n", "12 | \n", "