Text

Strings

  • Point index

  • Interval index

  • Negative index

  • Stride

  • Reversing a string

  • Strings are immutable

In [1]:
s = "hello world"
In [2]:
s[0], s[6]
Out[2]:
('h', 'w')
In [3]:
s[0:6]
Out[3]:
'hello '
In [4]:
s[-1], s[-3]
Out[4]:
('d', 'r')
In [5]:
s[::2]
Out[5]:
'hlowrd'
In [6]:
s[::-1]
Out[6]:
'dlrow olleh'
In [7]:
try:
    s[0] = 'H'
except TypeError as e:
    print(e)
'str' object does not support item assignment

The string module

  • String constants

  • String capwords

In [8]:
import string
In [9]:
string.digits
Out[9]:
'0123456789'
In [10]:
string.ascii_letters
Out[10]:
'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
In [11]:
string.punctuation
Out[11]:
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
In [12]:
string.whitespace
Out[12]:
' \t\n\r\x0b\x0c'
In [13]:
string.printable
Out[13]:
'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

Orphan function in strings module

In [14]:
string.capwords(s)
Out[14]:
'Hello World'

String methods

Methods to change case

In [15]:
s.upper()
Out[15]:
'HELLO WORLD'
In [16]:
s.lower()
Out[16]:
'hello world'
In [17]:
'ß'.casefold()
Out[17]:
'ss'
In [18]:
s.capitalize()
Out[18]:
'Hello world'
In [19]:
s.title()
Out[19]:
'Hello World'

Difference between title method and capwords function

In [20]:
'hello:world'.title()
Out[20]:
'Hello:World'
In [21]:
string.capwords('hello:world')
Out[21]:
'Hello:world'

String predicates

In [22]:
s.isalnum()
Out[22]:
False
In [23]:
s.isalpha()
Out[23]:
False
In [24]:
s.isascii()
Out[24]:
True
In [25]:
s.isidentifier()
Out[25]:
False
In [26]:
s.isprintable()
Out[26]:
True
In [27]:
s.startswith('hell')
Out[27]:
True
In [28]:
s.endswith('ld')
Out[28]:
True

Searching and counting

In [29]:
'llo' in s
Out[29]:
True
In [30]:
'foo' in s
Out[30]:
False
In [31]:
s.find('llo')
Out[31]:
2
In [32]:
s.index('llo')
Out[32]:
2
In [33]:
s.find('foo')
Out[33]:
-1
In [34]:
try:
    s.index('foo')
except ValueError as e:
    print(e)
substring not found
In [35]:
s.count('l')
Out[35]:
3
In [36]:
s.count('ll')
Out[36]:
1

Stripping

In [37]:
'   hello world   '.strip()
Out[37]:
'hello world'
In [38]:
'   hello world   '.lstrip()
Out[38]:
'hello world   '
In [39]:
'   hello world   '.rstrip()
Out[39]:
'   hello world'

Splitting and joining

In [40]:
s.split()
Out[40]:
['hello', 'world']
In [41]:
s.split('l')
Out[41]:
['he', '', 'o wor', 'd']
In [42]:
'-'.join(s)
Out[42]:
'h-e-l-l-o- -w-o-r-l-d'
In [43]:
'-'.join(s.split())
Out[43]:
'hello-world'
In [44]:
'l'.join(s.split('l'))
Out[44]:
'hello world'

Translation

In [45]:
'GATTACA'.translate(str.maketrans('ACTG', 'TAGC'))
Out[45]:
'CTGGTAT'
In [46]:
'GATTACA'.translate(str.maketrans('', '', 'AC'))
Out[46]:
'GTT'
In [47]:
'GATTACA'.translate(str.maketrans(string.ascii_uppercase, string.ascii_lowercase))
Out[47]:
'gattaca'

ord and chr

In [48]:
ord('A'), ord('a')
Out[48]:
(65, 97)
In [49]:
chr(65), chr(97)
Out[49]:
('A', 'a')
In [50]:
chr(ord('B') + (ord('a') - ord('A')))
Out[50]:
'b'

Formatting strings

C sytle formatting

In [51]:
pi = 3.141592653589793
r = 2
In [52]:
'area = %f * %d^2' % (pi, r)
Out[52]:
'area = 3.141593 * 2^2'

Precision and padding

In [53]:
'area = %8.2f * %03d^2' % (pi, r)
Out[53]:
'area =     3.14 * 002^2'

Right align string

In [54]:
'%10s = %8.2f * %03d^2' % ('area', pi, r)
Out[54]:
'      area =     3.14 * 002^2'

Left align string

In [55]:
'%-10s = %8.2f * %03d^2' % ('area', pi, r)
Out[55]:
'area       =     3.14 * 002^2'

Using the format method

In [56]:
'area = {} * {}^2'.format(pi, r)
Out[56]:
'area = 3.141592653589793 * 2^2'
In [57]:
'area = {a} * {b}^2'.format(a=pi, b=r)
Out[57]:
'area = 3.141592653589793 * 2^2'
In [58]:
'area = {pi:8,.4} * {r:06d}^2'.format(pi=pi, r=r)
Out[58]:
'area =    3.142 * 000002^2'
In [59]:
'{:>10}'.format('area')
Out[59]:
'      area'
In [60]:
'{:<10}'.format('area')
Out[60]:
'area      '
In [61]:
'{:^10}'.format('area')
Out[61]:
'   area   '
In [62]:
'{:=^10}'.format('area')
Out[62]:
'===area==='

Using f strings

In [63]:
f'area = {pi} * {r}^2'
Out[63]:
'area = 3.141592653589793 * 2^2'
In [64]:
x = 'area'
f'{x:=^10}'
Out[64]:
'===area==='

Templates

In [65]:
from string import Template
In [66]:
t = Template("$who likes $what")
items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]
for name, lang in items:
    print(t.substitute(who=name, what=lang))
ann likes Python
bob likes R
cody likes C++
In [67]:
items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]
for name, lang in items:
    print("{} likes {}".format(name, lang))
ann likes Python
bob likes R
cody likes C++
In [68]:
items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]
for name, lang in items:
    print(f"{name} likes {lang}")
ann likes Python
bob likes R
cody likes C++

Encodings

Unicode strings

In [69]:
print('hello \u732b')
hello 猫
In [70]:
s = '猫'
print(f'hello {s}')
hello 猫
image

image

Byte strings

In [71]:
kitty = '小' + '猫'
In [72]:
print(f'hello {kitty}')
hello 小猫
In [73]:
kitty_bytes = kitty.encode('utf8')
kitty_bytes
Out[73]:
b'\xe5\xb0\x8f\xe7\x8c\xab'
In [74]:
kitty_bytes.decode('utf8')
Out[74]:
'小猫'
In [75]:
try:
    kitty_bytes.decode('ascii')
except UnicodeDecodeError as e:
    print(e)
'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)

Reading and writing text files

In [76]:
%%file haiku.txt
古池や蛙飛び込む水の音
ふるいけやかわずとびこむみずのおと
Overwriting haiku.txt
In [77]:
with open('haiku.txt') as f:
    for line in f:
        print(line, end='')
古池や蛙飛び込む水の音
ふるいけやかわずとびこむみずのおと
In [78]:
with open('haiku.txt') as f:
    haiku = f.read()
In [79]:
haiku
Out[79]:
'古池や蛙飛び込む水の音\nふるいけやかわずとびこむみずのおと'
In [80]:
haiku.split()
Out[80]:
['古池や蛙飛び込む水の音', 'ふるいけやかわずとびこむみずのおと']

Using regular expressions

golf

golf

In [81]:
import re

Matching Characters

In [82]:
beer = '''99 bottles of Beer on the wall, 99 bottles of beeR.
Take one down and pass it around, 98 bottles of beer on the wall.'''
In [83]:
re.findall('beer', beer)
Out[83]:
['beer']
In [84]:
re.findall('beer', beer, re.IGNORECASE)
Out[84]:
['Beer', 'beeR', 'beer']
In [85]:
re.findall('on', beer)
Out[85]:
['on', 'on', 'on']

Word boundaries

In [86]:
re.findall(r'\bon\b', beer)
Out[86]:
['on', 'on']
In [87]:
re.findall(r'.', beer)[-10:]
Out[87]:
[' ', 't', 'h', 'e', ' ', 'w', 'a', 'l', 'l', '.']

Character sets

In [88]:
re.findall(r'\d', beer)
Out[88]:
['9', '9', '9', '9', '9', '8']
In [89]:
re.findall(r'[0-9]', beer)
Out[89]:
['9', '9', '9', '9', '9', '8']
In [90]:
re.findall(r'\w', beer)[11:25]
Out[90]:
['B', 'e', 'e', 'r', 'o', 'n', 't', 'h', 'e', 'w', 'a', 'l', 'l', '9']

Repeating Things

In [91]:
re.findall(r'\d+', beer)
Out[91]:
['99', '99', '98']
In [92]:
re.findall(r'b.+r', beer)
Out[92]:
['bottles of Beer', 'bottles of beer']
In [93]:
re.findall(r'be+', beer)
Out[93]:
['bee', 'bee']
In [94]:
re.findall(r'be*', beer)
Out[94]:
['b', 'b', 'bee', 'b', 'bee']
In [95]:
re.findall(r'b[aeiou]+', beer)
Out[95]:
['bo', 'bo', 'bee', 'bo', 'bee']
In [96]:
re.findall(r'b[aeiou]{2,}', beer)
Out[96]:
['bee', 'bee']
In [97]:
re.findall(r'b[aeiou]{1}', beer)
Out[97]:
['bo', 'bo', 'be', 'bo', 'be']

Finding matches

In [98]:
for m in re.finditer('beer', beer, re.IGNORECASE):
    print(m.start(), m.end(), m.span(),  m.group())
14 18 (14, 18) Beer
46 50 (46, 50) beeR
100 104 (100, 104) beer

Grouping

In [99]:
re.findall(r'(\d+)\s+(\b\w+?\b)', beer, re.IGNORECASE)
Out[99]:
[('99', 'bottles'), ('99', 'bottles'), ('98', 'bottles')]

Splitting

In [100]:
re.split(r'\d+', beer)
Out[100]:
['',
 ' bottles of Beer on the wall, ',
 ' bottles of beeR.\nTake one down and pass it around, ',
 ' bottles of beer on the wall.']

Search and replace

In [101]:
print(re.sub('beer', 'whiskey', beer, flags=re.IGNORECASE))
99 bottles of whiskey on the wall, 99 bottles of whiskey.
Take one down and pass it around, 98 bottles of whiskey on the wall.
In [102]:
print(re.sub(r'(\d+)\s+(\b\w+?\b)', r'\2 \1', beer, re.IGNORECASE))
bottles 99 of Beer on the wall, bottles 99 of beeR.
Take one down and pass it around, 98 bottles of beer on the wall.

Function versus compiled method

In [103]:
pattern = re.compile(r'(\d+)\s+(\b\w+?\b)')
pattern.findall(beer)
Out[103]:
[('99', 'bottles'), ('99', 'bottles'), ('98', 'bottles')]

Raw strings

The backslash \ is an escape character in a regular Python string. So we need to escape it to match a literal \. However, \ is an escape character in the regular expression mini-language when compiling the regular expression pattern. So we need to escape at two levels - hence we need \\\\ to match a literal \. The raw string rfoo treats \ as a literal character rather than an escape character.

In [104]:
latex = 'latex uses \section over and over again like so \section'
In [105]:
re.findall('\section', latex)
Out[105]:
[]
In [106]:
re.findall('\\section', latex)
Out[106]:
[]
In [107]:
re.findall('\\\\section', latex)
Out[107]:
['\\section', '\\section']
In [108]:
re.findall(r'\\section', latex)
Out[108]:
['\\section', '\\section']

Examples

Custom version of capwords

In [109]:
string.capwords('hello    world')
Out[109]:
'Hello World'
In [110]:
def my_capwords(ss):
    return ' '.join([s.title() for s in ss.split()])
In [111]:
my_capwords('hello    world')
Out[111]:
'Hello World'

Bag of words

Create a table of counts, where rows represent unique words and columns represent different documents. Ignore case and capitalization.

In [112]:
doc1 = """The wheels on the bus go,
Round and round,
Round and round,
Round and round.
The wheels on the bus go
Round and round,
All through the town."""

doc2 = """The doors on the bus go,
Open and shut,
Open and shut,
Open and shut.
The doors on the bus go
Open and shut,
All through the town."""

doc3 = """The Driver on the bus says,
"Move on back!
Move on back!
Move on back!"
The Driver on the bus says,
"Move on back!"
All through the town."""

doc4 = """The babies on the bus go,
"Wah, wah, wah!
Wah, wah, wah!
Wah, wah, wah!"
The babies on the bus go,
"Wah, wah, wah!"
All through the town."""
In [113]:
docs = [doc1, doc2, doc3, doc4]
doc_words = [doc.strip().lower().translate(str.maketrans('', '', string.punctuation)).split()
             for doc in docs]
words = [word for words in doc_words for word in words]
vocab = set(words)
In [114]:
import numpy as np
import pandas as pd
In [115]:
table = np.zeros((len(vocab), len(docs)), dtype='int')
In [116]:
for i, word in enumerate(vocab):
    for j, doc in enumerate(doc_words):
        table[i, j] = doc.count(word)
In [117]:
pd.DataFrame(table, columns='doc1 doc2 doc3 doc4'.split(), index=vocab)
Out[117]:
doc1 doc2 doc3 doc4
round 8 0 0 0
back 0 0 4 0
the 5 5 5 5
bus 2 2 2 2
town 1 1 1 1
shut 0 4 0 0
all 1 1 1 1
and 4 4 0 0
move 0 0 4 0
babies 0 0 0 2
driver 0 0 2 0
wheels 2 0 0 0
through 1 1 1 1
doors 0 2 0 0
says 0 0 2 0
open 0 4 0 0
on 2 2 6 2
go 2 2 0 2
wah 0 0 0 12
In [ ]: