Working with Text

In [1]:
%matplotlib inline

Libraries for I/O

In [68]:
import os
import glob

Libraries for numerics

In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats

Libraries for plotting

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

Libraries for string manipulation

In [4]:
import string
import re

Libraries for functional programming

In [5]:
from functools import reduce, partial
import itertools as it
import operator as op
import toolz as tz
import toolz.curried as c

String methods

In [6]:
s = "  Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37\n"

Removing leading and trailing whitespace

In [7]:
s.strip()
Out[7]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'
In [8]:
s.lstrip()
Out[8]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37\n'
In [9]:
s.rstrip()
Out[9]:
'  Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'
In [10]:
s = s.strip()

Changing case

In [11]:
s.lower()
Out[11]:
'avoid taking unnecessary gambles. lucky numbers: 12, 15, 23, 28, 37'
In [12]:
s.upper()
Out[12]:
'AVOID TAKING UNNECESSARY GAMBLES. LUCKY NUMBERS: 12, 15, 23, 28, 37'
In [13]:
s.title()
Out[13]:
'Avoid Taking Unnecessary Gambles. Lucky Numbers: 12, 15, 23, 28, 37'

Checking conditions

In [14]:
s.startswith('Avoid')
Out[14]:
True
In [15]:
s.endswith('37')
Out[15]:
True
In [16]:
s.isalpha()
Out[16]:
False
In [17]:
s.isnumeric()
Out[17]:
False
In [18]:
s.isspace()
Out[18]:
False
In [19]:
s.isprintable()
Out[19]:
True

Counting and indexing

In [20]:
s.count('a')
Out[20]:
3
In [21]:
s.count('gambles')
Out[21]:
1
In [22]:
s.find('gambles')
Out[22]:
25
In [23]:
s[27:]
Out[23]:
'mbles. Lucky numbers: 12, 15, 23, 28, 37'
In [24]:
s.find('foobar')
Out[24]:
-1
In [25]:
s.index('gambles')
Out[25]:
25
In [26]:
# Unlike find(), which returns -1 for a missing substring, index() raises
# ValueError; catch it and show the message instead of a traceback.
try:
    s.index('foobar')
except ValueError as e:
    print(e)
substring not found

Splitting and joining

In [27]:
s.split()
Out[27]:
['Avoid',
 'taking',
 'unnecessary',
 'gambles.',
 'Lucky',
 'numbers:',
 '12,',
 '15,',
 '23,',
 '28,',
 '37']
In [28]:
s.split(':')
Out[28]:
['Avoid taking unnecessary gambles. Lucky numbers', ' 12, 15, 23, 28, 37']
In [29]:
'-'.join(s.split())
Out[29]:
'Avoid-taking-unnecessary-gambles.-Lucky-numbers:-12,-15,-23,-28,-37'

Replacing

In [30]:
s.replace('gambles', 'risk')
Out[30]:
'Avoid taking unnecessary risk. Lucky numbers: 12, 15, 23, 28, 37'

Translating

In [31]:
# Three-argument str.maketrans(x, y, z): each character of x is mapped to the
# character at the same position in y, and every character in z is mapped to
# None (i.e. deleted). Here: lowercase -> uppercase, ASCII punctuation removed.
table = str.maketrans(string.ascii_lowercase, string.ascii_uppercase, string.punctuation)
s.translate(table)
Out[31]:
'AVOID TAKING UNNECESSARY GAMBLES LUCKY NUMBERS 12 15 23 28 37'
In [32]:
table = str.maketrans('', '', string.punctuation)
s.translate(table)
Out[32]:
'Avoid taking unnecessary gambles Lucky numbers 12 15 23 28 37'

Exercise: Caesar Cipher

A Caesar cipher with offset \(k\) converts each letter into the letter \(k\) positions further down the alphabet, wrapping around if this goes past z. Non-letters (numbers, spaces, punctuation) are left intact. For instance, with offset=3, abcXYZ is coded as defABC. Write a function encode(k, s) where k is the offset and s is the string to be coded. Write a decode(k, s) function that decodes text produced by encode. Test both out on the fortune.

In [33]:
def encode(k, s):
    """Caesar-encode `s` by rotating every ASCII letter `k` places.

    Parameters
    ----------
    k : int
        Offset. May be negative or larger than the alphabet; it is reduced
        modulo 26, so 29 behaves like 3 and -3 behaves like 23.
    s : str
        Text to encode.

    Returns
    -------
    str
        `s` with lowercase and uppercase ASCII letters rotated by `k`
        (case is preserved); all other characters are left unchanged.
    """
    lower, upper = string.ascii_lowercase, string.ascii_uppercase
    # Normalise the offset: without this, slicing with |k| > 26 produces an
    # empty prefix/suffix and the table silently becomes the identity map.
    k %= len(lower)
    table = str.maketrans(
        lower + upper,
        lower[k:] + lower[:k] + upper[k:] + upper[:k])
    return s.translate(table)
In [34]:
encode(3, 'abcXYZ')
Out[34]:
'defABC'
In [35]:
def decode(k, s):
    """Undo `encode(k, ...)` by shifting the letters of `s` back `k` places."""
    inverse_offset = -k
    return encode(inverse_offset, s)
In [36]:
code = encode(3, s)
In [37]:
code
Out[37]:
'Dyrlg wdnlqj xqqhfhvvdub jdpeohv. Oxfnb qxpehuv: 12, 15, 23, 28, 37'
In [38]:
decode(3, code)
Out[38]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'

Counting words

To count words, we typically do the following preprocessing:

  • Convert to lower (or upper) case
  • Remove punctuation
  • Split on blank space
  • Count each word in list
In [39]:
s
Out[39]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'

Preprocessing

In [40]:
words = s.lower().translate(str.maketrans('','',string.punctuation)).split()

Using a Counter (bag)

In [41]:
from collections import Counter
In [42]:
Counter(words)
Out[42]:
Counter({'12': 1,
         '15': 1,
         '23': 1,
         '28': 1,
         '37': 1,
         'avoid': 1,
         'gambles': 1,
         'lucky': 1,
         'numbers': 1,
         'taking': 1,
         'unnecessary': 1})

Using a dictionary

In [43]:
# Plain-dict tally: get(word, 0) supplies 0 the first time a word is seen,
# so no explicit "is the key already present?" check is needed.
counter = {}
for word in words:
    counter[word] = counter.get(word, 0) + 1
In [44]:
counter
Out[44]:
{'12': 1,
 '15': 1,
 '23': 1,
 '28': 1,
 '37': 1,
 'avoid': 1,
 'gambles': 1,
 'lucky': 1,
 'numbers': 1,
 'taking': 1,
 'unnecessary': 1}

Using a defaultdict

In [45]:
from collections import defaultdict
In [46]:
d = defaultdict(int)
In [47]:
# d is a defaultdict(int): looking up a missing word first inserts int() == 0,
# so += 1 works even on the first occurrence.
for word in words:
    d[word] += 1
In [48]:
d
Out[48]:
defaultdict(int,
            {'12': 1,
             '15': 1,
             '23': 1,
             '28': 1,
             '37': 1,
             'avoid': 1,
             'gambles': 1,
             'lucky': 1,
             'numbers': 1,
             'taking': 1,
             'unnecessary': 1})

Using a functional pipe

In [49]:
# Same preprocessing as above, written as a left-to-right pipeline:
# lowercase -> strip punctuation -> split on whitespace -> count words.
tz.pipe(
    s,
    str.lower,
    lambda text: text.translate(str.maketrans('', '', string.punctuation)),
    str.split,
    tz.frequencies
)
Out[49]:
{'12': 1,
 '15': 1,
 '23': 1,
 '28': 1,
 '37': 1,
 'avoid': 1,
 'gambles': 1,
 'lucky': 1,
 'numbers': 1,
 'taking': 1,
 'unnecessary': 1}

Modification for collection of strings

In [50]:
ss = [s, s, s]
In [51]:
ss
Out[51]:
['Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37',
 'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37',
 'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37']
In [52]:
# Per-string steps are lifted with curried map; mapcat flattens the
# per-string word lists into one stream before counting.
tz.pipe(
    ss,
    c.map(str.lower),
    c.map(lambda text: text.translate(str.maketrans('', '', string.punctuation))),
    c.mapcat(str.split),
    tz.frequencies
)
Out[52]:
{'12': 3,
 '15': 3,
 '23': 3,
 '28': 3,
 '37': 3,
 'avoid': 3,
 'gambles': 3,
 'lucky': 3,
 'numbers': 3,
 'taking': 3,
 'unnecessary': 3}

String to vector

To analyze text, we typically need to convert it to a vector format. There are several ways to do so. Here we show the most straightforward method, known as one-hot encoding.

One hot character encoding

We first encode the string ‘abcabc’ as the vector [0,1,2,0,1,2]. For one-hot encoding, we next convert this to the one-hot encoded matrix

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])
In [53]:
# Assign each distinct character of s an integer code in order of first
# appearance (0, 1, 2, ...).
index = {}
idx = 0
for ch in s:
    if ch in index:
        continue  # already has a code
    index[ch] = idx
    idx += 1
In [54]:
index
Out[54]:
{' ': 5,
 ',': 25,
 '.': 20,
 '1': 23,
 '2': 24,
 '3': 27,
 '5': 26,
 '7': 29,
 '8': 28,
 ':': 22,
 'A': 0,
 'L': 21,
 'a': 7,
 'b': 18,
 'c': 13,
 'd': 4,
 'e': 12,
 'g': 10,
 'i': 3,
 'k': 8,
 'l': 19,
 'm': 17,
 'n': 9,
 'o': 2,
 'r': 15,
 's': 14,
 't': 6,
 'u': 11,
 'v': 1,
 'y': 16}

Categorical encoding

In [55]:
nchars = len(index)
In [56]:
vs = np.array([index[ch] for ch in s])
In [57]:
vs
Out[57]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  3,  9, 10,  5, 11,  9,  9, 12,
       13, 12, 14, 14,  7, 15, 16,  5, 10,  7, 17, 18, 19, 12, 14, 20,  5,
       21, 11, 13,  8, 16,  5,  9, 11, 17, 18, 12, 15, 14, 22,  5, 23, 24,
       25,  5, 23, 26, 25,  5, 24, 27, 25,  5, 24, 28, 25,  5, 27, 29])

One-hot encoding

In [58]:
# One row per character of the text, one column per distinct character.
n, p = len(vs), len(index)
i = np.arange(n)
m = np.zeros((n, p), dtype='int')
# Integer-array indexing: for every row r, set column vs[r] to 1.
m[i, vs] = 1
m
Out[58]:
array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

Reverse index lookup

In [59]:
# Invert the mapping: integer code -> character.
reverse_index = {code: ch for ch, code in index.items()}
In [60]:
''.join(reverse_index[v] for v in vs)
Out[60]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'

One hot encoding for words.

In [61]:
words = ' '.join([s,s]).lower().translate(str.maketrans('', '', string.punctuation)).split()
In [62]:
# Assign each distinct word an integer code in order of first appearance.
index = {}
for word in words:
    # setdefault only inserts when the word is new; at that moment
    # len(index) is exactly the next unused code.
    index.setdefault(word, len(index))
pos = len(index)

Categorical encoding

In [63]:
ws = np.array([index[word] for word in words])
In [64]:
ws
Out[64]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,  5,
        6,  7,  8,  9, 10])

One-hot encoding

In [65]:
# One row per word of the text, one column per distinct word.
n, p = len(ws), len(index)
i = np.arange(n)
m = np.zeros((n, p), dtype='int')
# Integer-array indexing: for every row r, set column ws[r] to 1.
m[i, ws] = 1
m
Out[65]:
array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

Reverse lookup

In [66]:
# Invert the mapping: integer code -> word.
reverse_index = {code: word for word, code in index.items()}
In [67]:
' '.join(reverse_index[w] for w in ws)
Out[67]:
'avoid taking unnecessary gambles lucky numbers 12 15 23 28 37 avoid taking unnecessary gambles lucky numbers 12 15 23 28 37'

Regular expressions

In [68]:
s
Out[68]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'

Literal match

In [69]:
re.findall(r'gambles', s)
Out[69]:
['gambles']

Wildcard . and quantifiers {m,n}, +, *

In [70]:
re.findall(r'gam.les', s)
Out[70]:
['gambles']
In [71]:
re.findall(r'g.*s', s)
Out[71]:
['g unnecessary gambles. Lucky numbers']

Non-greedy quantifier.

In [72]:
re.findall(r'g.*?s', s)
Out[72]:
['g unneces', 'gambles']

Special characters

In [73]:
re.findall(r'\bg.*?s\b', s)
Out[73]:
['gambles']
In [74]:
re.findall(r'\b\w+?\b', s)
Out[74]:
['Avoid',
 'taking',
 'unnecessary',
 'gambles',
 'Lucky',
 'numbers',
 '12',
 '15',
 '23',
 '28',
 '37']
In [75]:
re.findall(r'\b\d+?\b', s)
Out[75]:
['12', '15', '23', '28', '37']
In [76]:
re.findall(r'\b[a-zA-Z]+?\b', s)
Out[76]:
['Avoid', 'taking', 'unnecessary', 'gambles', 'Lucky', 'numbers']

Begin and end anchors

In [77]:
re.findall(r'\w+', s)
Out[77]:
['Avoid',
 'taking',
 'unnecessary',
 'gambles',
 'Lucky',
 'numbers',
 '12',
 '15',
 '23',
 '28',
 '37']
In [78]:
re.findall(r'^\w+', s)
Out[78]:
['Avoid']
In [79]:
re.findall(r'\w+$', s)
Out[79]:
['37']

Capture groups

In [80]:
pat = r'\b(\d)(\d)?\b'
In [81]:
re.findall(pat, s)
Out[81]:
[('1', '2'), ('1', '5'), ('2', '3'), ('2', '8'), ('3', '7')]

Using search and match objects

In [82]:
re.search(pat, s)
Out[82]:
<_sre.SRE_Match object; span=(49, 51), match='12'>
In [83]:
m = re.search(pat, s)
In [84]:
m.string
Out[84]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'
In [85]:
m.group()
Out[85]:
'12'
In [86]:
m.groups()
Out[86]:
('1', '2')

Replacement using capture groups

In [87]:
rep = r'\2\1'
re.sub(pat, rep, s)
Out[87]:
'Avoid taking unnecessary gambles. Lucky numbers: 21, 51, 32, 82, 73'

Using compiled patterns

In [88]:
pat = re.compile(r'\b[a-zA-Z]+?\b')
pat.findall(s)
Out[88]:
['Avoid', 'taking', 'unnecessary', 'gambles', 'Lucky', 'numbers']