Working with Text¶
In [1]:
%matplotlib inline
Libraries for I/O
In [68]:
import os
import glob
Libraries for numerics
In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
Libraries for plotting
In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
Libraries for string manipulation
In [4]:
import string
import re
Libraries for functional programming
In [5]:
from functools import reduce, partial
import itertools as it
import operator as op
import toolz as tz
import toolz.curried as c
String methods¶
In [6]:
s = " Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37\n"
Removing leading and trailing whitespace¶
In [7]:
s.strip()
Out[7]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'
In [8]:
s.lstrip()
Out[8]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37\n'
In [9]:
s.rstrip()
Out[9]:
' Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'
In [10]:
s = s.strip()
Changing case¶
In [11]:
s.lower()
Out[11]:
'avoid taking unnecessary gambles. lucky numbers: 12, 15, 23, 28, 37'
In [12]:
s.upper()
Out[12]:
'AVOID TAKING UNNECESSARY GAMBLES. LUCKY NUMBERS: 12, 15, 23, 28, 37'
In [13]:
s.title()
Out[13]:
'Avoid Taking Unnecessary Gambles. Lucky Numbers: 12, 15, 23, 28, 37'
Checking conditions¶
In [14]:
s.startswith('Avoid')
Out[14]:
True
In [15]:
s.endswith('37')
Out[15]:
True
In [16]:
s.isalpha()
Out[16]:
False
In [17]:
s.isnumeric()
Out[17]:
False
In [18]:
s.isspace()
Out[18]:
False
In [19]:
s.isprintable()
Out[19]:
True
Counting and indexing¶
In [20]:
s.count('a')
Out[20]:
3
In [21]:
s.count('gambles')
Out[21]:
1
In [22]:
s.find('gambles')
Out[22]:
25
In [23]:
s[27:]
Out[23]:
'mbles. Lucky numbers: 12, 15, 23, 28, 37'
In [24]:
s.find('foobar')
Out[24]:
-1
In [25]:
s.index('gambles')
Out[25]:
25
In [26]:
try:
s.index('foobar')
except ValueError as e:
print(e)
substring not found
Splitting and joining¶
In [27]:
s.split()
Out[27]:
['Avoid',
'taking',
'unnecessary',
'gambles.',
'Lucky',
'numbers:',
'12,',
'15,',
'23,',
'28,',
'37']
In [28]:
s.split(':')
Out[28]:
['Avoid taking unnecessary gambles. Lucky numbers', ' 12, 15, 23, 28, 37']
In [29]:
'-'.join(s.split())
Out[29]:
'Avoid-taking-unnecessary-gambles.-Lucky-numbers:-12,-15,-23,-28,-37'
Replacing¶
In [30]:
s.replace('gambles', 'risk')
Out[30]:
'Avoid taking unnecessary risk. Lucky numbers: 12, 15, 23, 28, 37'
Translating¶
In [31]:
table = str.maketrans(string.ascii_lowercase, string.ascii_uppercase, string.punctuation)
s.translate(table)
Out[31]:
'AVOID TAKING UNNECESSARY GAMBLES LUCKY NUMBERS 12 15 23 28 37'
In [32]:
table = str.maketrans('', '', string.punctuation)
s.translate(table)
Out[32]:
'Avoid taking unnecessary gambles Lucky numbers 12 15 23 28 37'
Exercise: Caesar Cipher
A Caesar cipher with offset \(k\) converts a letter into the letter \(k\) positions further down the alphabet, looping around if this goes past `z`. Non-letter characters (numbers, spaces, punctuation) are left intact. For instance, with offset=3, we get `abcXYZ` being coded as `defABC`. Write a function `encode(k, s)` where `k` is the offset and `s` the string to be coded. Write a `decode(k, s)` function that decodes encrypted ciphers. Test it out on the fortune.
In [33]:
def encode(k, s):
    """Caesar-shift the ASCII letters of ``s`` by ``k`` positions.

    Each ASCII letter is replaced by the letter ``k`` places further along
    its own alphabet (lower and upper case are rotated separately), wrapping
    around past ``z``/``Z``. All other characters are left unchanged.

    Parameters
    ----------
    k : int
        Shift offset. May be negative (shift backwards) or have a magnitude
        larger than the alphabet; it is reduced modulo 26 before use.
    s : str
        Text to encode.

    Returns
    -------
    str
        The shifted text, the same length as ``s``.
    """
    lower = string.ascii_lowercase
    upper = string.ascii_uppercase
    # Normalise the offset first: using k directly as a slice index only
    # rotates correctly for -26 <= k <= 26 and silently produces the identity
    # mapping for anything outside that range.
    k %= len(lower)
    table = str.maketrans(
        lower + upper,
        lower[k:] + lower[:k] + upper[k:] + upper[:k])
    return s.translate(table)
In [34]:
encode(3, 'abcXYZ')
Out[34]:
'defABC'
In [35]:
def decode(k, s):
    """Reverse a Caesar shift of offset ``k`` applied to ``s``.

    Decoding is simply encoding with the opposite offset, so this
    delegates to ``encode`` with ``-k``.
    """
    opposite_offset = -k
    return encode(opposite_offset, s)
In [36]:
code = encode(3, s)
In [37]:
code
Out[37]:
'Dyrlg wdnlqj xqqhfhvvdub jdpeohv. Oxfnb qxpehuv: 12, 15, 23, 28, 37'
In [38]:
decode(3, code)
Out[38]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'
Counting words¶
To count words, we typically do the following preprocessing:
- Convert to lower (or upper) case
- Remove punctuation
- Split on blank space
- Count each word in list
In [39]:
s
Out[39]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'
Preprocessing¶
In [40]:
words = s.lower().translate(str.maketrans('','',string.punctuation)).split()
Using a Counter (bag)¶
In [41]:
from collections import Counter
In [42]:
Counter(words)
Out[42]:
Counter({'12': 1,
'15': 1,
'23': 1,
'28': 1,
'37': 1,
'avoid': 1,
'gambles': 1,
'lucky': 1,
'numbers': 1,
'taking': 1,
'unnecessary': 1})
Using a dictionary¶
In [43]:
# Tally occurrences by hand with a plain dict: the first sighting of a word
# creates its entry, every later sighting increments it.
counter = {}
for word in words:
    if word in counter:
        counter[word] += 1
    else:
        counter[word] = 1
In [44]:
counter
Out[44]:
{'12': 1,
'15': 1,
'23': 1,
'28': 1,
'37': 1,
'avoid': 1,
'gambles': 1,
'lucky': 1,
'numbers': 1,
'taking': 1,
'unnecessary': 1}
Using a defaultdict¶
In [45]:
from collections import defaultdict
In [46]:
d = defaultdict(int)
In [47]:
for word in words:
d[word] += 1
In [48]:
d
Out[48]:
defaultdict(int,
{'12': 1,
'15': 1,
'23': 1,
'28': 1,
'37': 1,
'avoid': 1,
'gambles': 1,
'lucky': 1,
'numbers': 1,
'taking': 1,
'unnecessary': 1})
Using a functional pipe¶
In [49]:
# Word-frequency pipeline for one string:
# lower-case -> strip punctuation -> split on whitespace -> count.
tz.pipe(
    s,
    str.lower,
    lambda text: text.translate(str.maketrans('', '', string.punctuation)),
    str.split,
    tz.frequencies,
)
Out[49]:
{'12': 1,
'15': 1,
'23': 1,
'28': 1,
'37': 1,
'avoid': 1,
'gambles': 1,
'lucky': 1,
'numbers': 1,
'taking': 1,
'unnecessary': 1}
Modification for collection of strings¶
In [50]:
ss = [s, s, s]
In [51]:
ss
Out[51]:
['Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37',
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37',
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37']
In [52]:
# Same pipeline lifted to a collection of strings: each stage is mapped over
# the strings, and mapcat flattens the per-string word lists before counting.
tz.pipe(
    ss,
    c.map(str.lower),
    c.map(lambda line: line.translate(str.maketrans('', '', string.punctuation))),
    c.mapcat(str.split),
    tz.frequencies,
)
Out[52]:
{'12': 3,
'15': 3,
'23': 3,
'28': 3,
'37': 3,
'avoid': 3,
'gambles': 3,
'lucky': 3,
'numbers': 3,
'taking': 3,
'unnecessary': 3}
String to vector¶
To analyze text, we typically need to convert it to a vector format. There are several ways to do so. Here we show the most obvious method known as one-hot encoding.
One hot character encoding¶
We first encode the string ‘abcabc’ as the vector [0,1,2,0,1,2]. For one-hot encoding, we next convert this to the one-hot encoded matrix
array([[1, 0, 0],
[0, 1, 0],
[0, 0, 1],
[1, 0, 0],
[0, 1, 0],
[0, 0, 1]])
In [53]:
# Give every distinct character of `s` an integer code, numbered in order of
# first appearance (0, 1, 2, ...).
index = {}
idx = 0
for ch in s:
    if ch in index:
        # Already coded on an earlier occurrence.
        continue
    index[ch] = idx
    idx += 1
In [54]:
index
Out[54]:
{' ': 5,
',': 25,
'.': 20,
'1': 23,
'2': 24,
'3': 27,
'5': 26,
'7': 29,
'8': 28,
':': 22,
'A': 0,
'L': 21,
'a': 7,
'b': 18,
'c': 13,
'd': 4,
'e': 12,
'g': 10,
'i': 3,
'k': 8,
'l': 19,
'm': 17,
'n': 9,
'o': 2,
'r': 15,
's': 14,
't': 6,
'u': 11,
'v': 1,
'y': 16}
Categorical encoding¶
In [55]:
nchars = len(index)
In [56]:
vs = np.array([index[ch] for ch in s])
In [57]:
vs
Out[57]:
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 3, 9, 10, 5, 11, 9, 9, 12,
13, 12, 14, 14, 7, 15, 16, 5, 10, 7, 17, 18, 19, 12, 14, 20, 5,
21, 11, 13, 8, 16, 5, 9, 11, 17, 18, 12, 15, 14, 22, 5, 23, 24,
25, 5, 23, 26, 25, 5, 24, 27, 25, 5, 24, 28, 25, 5, 27, 29])
One-hot encoding¶
In [58]:
# One-hot matrix: one row per entry of `vs`, one column per key of `index`,
# with a single 1 in each row at the column given by that entry's code.
n, p = len(vs), len(index)
i = np.arange(n)
m = np.zeros(shape=(n, p), dtype='int')
m[i, vs] = 1  # fancy indexing sets m[row, vs[row]] for every row at once
m
Out[58]:
array([[1, 0, 0, ..., 0, 0, 0],
[0, 1, 0, ..., 0, 0, 0],
[0, 0, 1, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 1, 0, 0],
[0, 0, 0, ..., 0, 0, 1]])
Reverse index lookup¶
In [59]:
reverse_index = dict(zip(index.values(), index.keys()))
In [60]:
''.join(reverse_index[v] for v in vs)
Out[60]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'
One hot encoding for words.¶
In [61]:
words = ' '.join([s,s]).lower().translate(str.maketrans('', '', string.punctuation)).split()
In [62]:
# Give every distinct word an integer code, numbered in order of first
# appearance; repeated words keep the code from their first occurrence.
index = {}
pos = 0
for word in words:
    if word in index:
        continue
    index[word] = pos
    pos += 1
Categorical encoding¶
In [63]:
ws = np.array([index[word] for word in words])
In [64]:
ws
Out[64]:
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10])
One-hot encoding¶
In [65]:
# One-hot matrix: one row per entry of `ws`, one column per key of `index`,
# with a single 1 in each row at the column given by that entry's code.
n, p = len(ws), len(index)
i = np.arange(n)
m = np.zeros(shape=(n, p), dtype='int')
m[i, ws] = 1  # fancy indexing sets m[row, ws[row]] for every row at once
m
Out[65]:
array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])
Reverse lookup¶
In [66]:
reverse_index = dict(zip(index.values(), index.keys()))
In [67]:
' '.join(reverse_index[w] for w in ws)
Out[67]:
'avoid taking unnecessary gambles lucky numbers 12 15 23 28 37 avoid taking unnecessary gambles lucky numbers 12 15 23 28 37'
Regular expressions¶
In [68]:
s
Out[68]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'
Wildcard `.` and quantifiers `{m,n}`, `+`, `*`¶
In [70]:
re.findall(r'gam.les', s)
Out[70]:
['gambles']
In [71]:
re.findall(r'g.*s', s)
Out[71]:
['g unnecessary gambles. Lucky numbers']
Special characters¶
In [73]:
re.findall(r'\bg.*?s\b', s)
Out[73]:
['gambles']
In [74]:
re.findall(r'\b\w+?\b', s)
Out[74]:
['Avoid',
'taking',
'unnecessary',
'gambles',
'Lucky',
'numbers',
'12',
'15',
'23',
'28',
'37']
In [75]:
re.findall(r'\b\d+?\b', s)
Out[75]:
['12', '15', '23', '28', '37']
In [76]:
re.findall(r'\b[a-zA-Z]+?\b', s)
Out[76]:
['Avoid', 'taking', 'unnecessary', 'gambles', 'Lucky', 'numbers']
Begin and end anchors¶
In [77]:
re.findall(r'\w+', s)
Out[77]:
['Avoid',
'taking',
'unnecessary',
'gambles',
'Lucky',
'numbers',
'12',
'15',
'23',
'28',
'37']
In [78]:
re.findall(r'^\w+', s)
Out[78]:
['Avoid']
In [79]:
re.findall(r'\w+$', s)
Out[79]:
['37']
Capture groups¶
In [80]:
pat = r'\b(\d)(\d)?\b'
In [81]:
re.findall(pat, s)
Out[81]:
[('1', '2'), ('1', '5'), ('2', '3'), ('2', '8'), ('3', '7')]
Using search and match objects¶
In [82]:
re.search(pat, s)
Out[82]:
<_sre.SRE_Match object; span=(49, 51), match='12'>
In [83]:
m = re.search(pat, s)
In [84]:
m.string
Out[84]:
'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'
In [85]:
m.group()
Out[85]:
'12'
In [86]:
m.groups()
Out[86]:
('1', '2')
Replacement using capture groups¶
In [87]:
rep = r'\2\1'
re.sub(pat, rep, s)
Out[87]:
'Avoid taking unnecessary gambles. Lucky numbers: 21, 51, 32, 82, 73'
Using compiled patterns¶
In [88]:
pat = re.compile(r'\b[a-zA-Z]+?\b')
pat.findall(s)
Out[88]:
['Avoid', 'taking', 'unnecessary', 'gambles', 'Lucky', 'numbers']