Text¶
Strings¶
Point index
Interval index
Negative index
Stride
Reversing a string
Strings are immutable
In [1]:
s = "hello world"
In [2]:
s[0], s[6]
Out[2]:
('h', 'w')
In [3]:
s[0:6]
Out[3]:
'hello '
In [4]:
s[-1], s[-3]
Out[4]:
('d', 'r')
In [5]:
s[::2]
Out[5]:
'hlowrd'
In [6]:
s[::-1]
Out[6]:
'dlrow olleh'
In [7]:
try:
s[0] = 'H'
except TypeError as e:
print(e)
'str' object does not support item assignment
The string
module¶
String constants
String
capwords
In [8]:
import string
In [9]:
string.digits
Out[9]:
'0123456789'
In [10]:
string.ascii_letters
Out[10]:
'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
In [11]:
string.punctuation
Out[11]:
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
In [12]:
string.whitespace
Out[12]:
' \t\n\r\x0b\x0c'
In [13]:
string.printable
Out[13]:
'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
String methods¶
Methods to change case¶
In [15]:
s.upper()
Out[15]:
'HELLO WORLD'
In [16]:
s.lower()
Out[16]:
'hello world'
In [17]:
'ß'.casefold()
Out[17]:
'ss'
In [18]:
s.capitalize()
Out[18]:
'Hello world'
In [19]:
s.title()
Out[19]:
'Hello World'
Difference between title
method and capwords
function
In [20]:
'hello:world'.title()
Out[20]:
'Hello:World'
In [21]:
string.capwords('hello:world')
Out[21]:
'Hello:world'
String predicates¶
In [22]:
s.isalnum()
Out[22]:
False
In [23]:
s.isalpha()
Out[23]:
False
In [24]:
s.isascii()
Out[24]:
True
In [25]:
s.isidentifier()
Out[25]:
False
In [26]:
s.isprintable()
Out[26]:
True
In [27]:
s.startswith('hell')
Out[27]:
True
In [28]:
s.endswith('ld')
Out[28]:
True
Searching and counting¶
In [29]:
'llo' in s
Out[29]:
True
In [30]:
'foo' in s
Out[30]:
False
In [31]:
s.find('llo')
Out[31]:
2
In [32]:
s.index('llo')
Out[32]:
2
In [33]:
s.find('foo')
Out[33]:
-1
In [34]:
try:
s.index('foo')
except ValueError as e:
print(e)
substring not found
In [35]:
s.count('l')
Out[35]:
3
In [36]:
s.count('ll')
Out[36]:
1
Stripping¶
In [37]:
' hello world '.strip()
Out[37]:
'hello world'
In [38]:
' hello world '.lstrip()
Out[38]:
'hello world '
In [39]:
' hello world '.rstrip()
Out[39]:
' hello world'
Splitting and joining¶
In [40]:
s.split()
Out[40]:
['hello', 'world']
In [41]:
s.split('l')
Out[41]:
['he', '', 'o wor', 'd']
In [42]:
'-'.join(s)
Out[42]:
'h-e-l-l-o- -w-o-r-l-d'
In [43]:
'-'.join(s.split())
Out[43]:
'hello-world'
In [44]:
'l'.join(s.split('l'))
Out[44]:
'hello world'
Translation¶
In [45]:
'GATTACA'.translate(str.maketrans('ACTG', 'TAGC'))
Out[45]:
'CTGGTAT'
In [46]:
'GATTACA'.translate(str.maketrans('', '', 'AC'))
Out[46]:
'GTT'
In [47]:
'GATTACA'.translate(str.maketrans(string.ascii_uppercase, string.ascii_lowercase))
Out[47]:
'gattaca'
ord
and chr
¶
In [48]:
ord('A'), ord('a')
Out[48]:
(65, 97)
In [49]:
chr(65), chr(97)
Out[49]:
('A', 'a')
In [50]:
chr(ord('B') + (ord('a') - ord('A')))
Out[50]:
'b'
Formatting strings¶
C sytle formatting¶
In [51]:
pi = 3.141592653589793
r = 2
In [52]:
'area = %f * %d^2' % (pi, r)
Out[52]:
'area = 3.141593 * 2^2'
Precision and padding
In [53]:
'area = %8.2f * %03d^2' % (pi, r)
Out[53]:
'area = 3.14 * 002^2'
Right align string
In [54]:
'%10s = %8.2f * %03d^2' % ('area', pi, r)
Out[54]:
' area = 3.14 * 002^2'
Left align string
In [55]:
'%-10s = %8.2f * %03d^2' % ('area', pi, r)
Out[55]:
'area = 3.14 * 002^2'
Using the format
method¶
In [56]:
'area = {} * {}^2'.format(pi, r)
Out[56]:
'area = 3.141592653589793 * 2^2'
In [57]:
'area = {a} * {b}^2'.format(a=pi, b=r)
Out[57]:
'area = 3.141592653589793 * 2^2'
In [58]:
'area = {pi:8,.4} * {r:06d}^2'.format(pi=pi, r=r)
Out[58]:
'area = 3.142 * 000002^2'
In [59]:
'{:>10}'.format('area')
Out[59]:
' area'
In [60]:
'{:<10}'.format('area')
Out[60]:
'area '
In [61]:
'{:^10}'.format('area')
Out[61]:
' area '
In [62]:
'{:=^10}'.format('area')
Out[62]:
'===area==='
Using f strings¶
In [63]:
f'area = {pi} * {r}^2'
Out[63]:
'area = 3.141592653589793 * 2^2'
In [64]:
x = 'area'
f'{x:=^10}'
Out[64]:
'===area==='
Templates¶
In [65]:
from string import Template
In [66]:
t = Template("$who likes $what")
items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]
for name, lang in items:
print(t.substitute(who=name, what=lang))
ann likes Python
bob likes R
cody likes C++
In [67]:
items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]
for name, lang in items:
print("{} likes {}".format(name, lang))
ann likes Python
bob likes R
cody likes C++
In [68]:
items = [('ann', 'Python'), ('bob', 'R'), ('cody', 'C++')]
for name, lang in items:
print(f"{name} likes {lang}")
ann likes Python
bob likes R
cody likes C++
Encodings¶
Unicode strings¶
In [69]:
print('hello \u732b')
hello 猫
In [70]:
s = '猫'
print(f'hello {s}')
hello 猫
Byte strings¶
In [71]:
kitty = '小' + '猫'
In [72]:
print(f'hello {kitty}')
hello 小猫
In [73]:
kitty_bytes = kitty.encode('utf8')
kitty_bytes
Out[73]:
b'\xe5\xb0\x8f\xe7\x8c\xab'
In [74]:
kitty_bytes.decode('utf8')
Out[74]:
'小猫'
In [75]:
try:
kitty_bytes.decode('ascii')
except UnicodeDecodeError as e:
print(e)
'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)
Reading and writing text files¶
In [76]:
%%file haiku.txt
古池や蛙飛び込む水の音
ふるいけやかわずとびこむみずのおと
Overwriting haiku.txt
In [77]:
with open('haiku.txt') as f:
for line in f:
print(line, end='')
古池や蛙飛び込む水の音
ふるいけやかわずとびこむみずのおと
In [78]:
with open('haiku.txt') as f:
haiku = f.read()
In [79]:
haiku
Out[79]:
'古池や蛙飛び込む水の音\nふるいけやかわずとびこむみずのおと'
In [80]:
haiku.split()
Out[80]:
['古池や蛙飛び込む水の音', 'ふるいけやかわずとびこむみずのおと']
Using regular expressions¶
Practice at https://regex101.com
Play RegEx Golf
In [81]:
import re
Matching Characters¶
In [82]:
beer = '''99 bottles of Beer on the wall, 99 bottles of beeR.
Take one down and pass it around, 98 bottles of beer on the wall.'''
In [83]:
re.findall('beer', beer)
Out[83]:
['beer']
In [84]:
re.findall('beer', beer, re.IGNORECASE)
Out[84]:
['Beer', 'beeR', 'beer']
In [85]:
re.findall('on', beer)
Out[85]:
['on', 'on', 'on']
Word boundaries¶
In [86]:
re.findall(r'\bon\b', beer)
Out[86]:
['on', 'on']
In [87]:
re.findall(r'.', beer)[-10:]
Out[87]:
[' ', 't', 'h', 'e', ' ', 'w', 'a', 'l', 'l', '.']
Character sets¶
In [88]:
re.findall(r'\d', beer)
Out[88]:
['9', '9', '9', '9', '9', '8']
In [89]:
re.findall(r'[0-9]', beer)
Out[89]:
['9', '9', '9', '9', '9', '8']
In [90]:
re.findall(r'\w', beer)[11:25]
Out[90]:
['B', 'e', 'e', 'r', 'o', 'n', 't', 'h', 'e', 'w', 'a', 'l', 'l', '9']
Repeating Things¶
In [91]:
re.findall(r'\d+', beer)
Out[91]:
['99', '99', '98']
In [92]:
re.findall(r'b.+r', beer)
Out[92]:
['bottles of Beer', 'bottles of beer']
In [93]:
re.findall(r'be+', beer)
Out[93]:
['bee', 'bee']
In [94]:
re.findall(r'be*', beer)
Out[94]:
['b', 'b', 'bee', 'b', 'bee']
In [95]:
re.findall(r'b[aeiou]+', beer)
Out[95]:
['bo', 'bo', 'bee', 'bo', 'bee']
In [96]:
re.findall(r'b[aeiou]{2,}', beer)
Out[96]:
['bee', 'bee']
In [97]:
re.findall(r'b[aeiou]{1}', beer)
Out[97]:
['bo', 'bo', 'be', 'bo', 'be']
Finding matches¶
In [98]:
for m in re.finditer('beer', beer, re.IGNORECASE):
print(m.start(), m.end(), m.span(), m.group())
14 18 (14, 18) Beer
46 50 (46, 50) beeR
100 104 (100, 104) beer
Grouping¶
In [99]:
re.findall(r'(\d+)\s+(\b\w+?\b)', beer, re.IGNORECASE)
Out[99]:
[('99', 'bottles'), ('99', 'bottles'), ('98', 'bottles')]
Splitting¶
In [100]:
re.split(r'\d+', beer)
Out[100]:
['',
' bottles of Beer on the wall, ',
' bottles of beeR.\nTake one down and pass it around, ',
' bottles of beer on the wall.']
Search and replace¶
In [101]:
print(re.sub('beer', 'whiskey', beer, flags=re.IGNORECASE))
99 bottles of whiskey on the wall, 99 bottles of whiskey.
Take one down and pass it around, 98 bottles of whiskey on the wall.
In [102]:
print(re.sub(r'(\d+)\s+(\b\w+?\b)', r'\2 \1', beer, re.IGNORECASE))
bottles 99 of Beer on the wall, bottles 99 of beeR.
Take one down and pass it around, 98 bottles of beer on the wall.
Function versus compiled method¶
In [103]:
pattern = re.compile(r'(\d+)\s+(\b\w+?\b)')
pattern.findall(beer)
Out[103]:
[('99', 'bottles'), ('99', 'bottles'), ('98', 'bottles')]
Raw strings¶
The backslash \
is an escape character in a regular Python string.
So we need to escape it to match a literal \
. However, \
is an
escape character in the regular expression mini-language when compiling
the regular expression pattern. So we need to escape at two levels -
hence we need \\\\
to match a literal \
. The raw string
rfoo
treats \
as a literal character rather than an escape
character.
In [104]:
latex = 'latex uses \section over and over again like so \section'
In [105]:
re.findall('\section', latex)
Out[105]:
[]
In [106]:
re.findall('\\section', latex)
Out[106]:
[]
In [107]:
re.findall('\\\\section', latex)
Out[107]:
['\\section', '\\section']
In [108]:
re.findall(r'\\section', latex)
Out[108]:
['\\section', '\\section']
Examples¶
Custom version of capwords
¶
In [109]:
string.capwords('hello world')
Out[109]:
'Hello World'
In [110]:
def my_capwords(ss):
return ' '.join([s.title() for s in ss.split()])
In [111]:
my_capwords('hello world')
Out[111]:
'Hello World'
Bag of words¶
Create a table of counts, where rows represent unique words and columns represent different documents. Ignore case and capitalization.
In [112]:
doc1 = """The wheels on the bus go,
Round and round,
Round and round,
Round and round.
The wheels on the bus go
Round and round,
All through the town."""
doc2 = """The doors on the bus go,
Open and shut,
Open and shut,
Open and shut.
The doors on the bus go
Open and shut,
All through the town."""
doc3 = """The Driver on the bus says,
"Move on back!
Move on back!
Move on back!"
The Driver on the bus says,
"Move on back!"
All through the town."""
doc4 = """The babies on the bus go,
"Wah, wah, wah!
Wah, wah, wah!
Wah, wah, wah!"
The babies on the bus go,
"Wah, wah, wah!"
All through the town."""
In [113]:
docs = [doc1, doc2, doc3, doc4]
doc_words = [doc.strip().lower().translate(str.maketrans('', '', string.punctuation)).split()
for doc in docs]
words = [word for words in doc_words for word in words]
vocab = set(words)
In [114]:
import numpy as np
import pandas as pd
In [115]:
table = np.zeros((len(vocab), len(docs)), dtype='int')
In [116]:
for i, word in enumerate(vocab):
for j, doc in enumerate(doc_words):
table[i, j] = doc.count(word)
In [117]:
pd.DataFrame(table, columns='doc1 doc2 doc3 doc4'.split(), index=vocab)
Out[117]:
doc1 | doc2 | doc3 | doc4 | |
---|---|---|---|---|
round | 8 | 0 | 0 | 0 |
back | 0 | 0 | 4 | 0 |
the | 5 | 5 | 5 | 5 |
bus | 2 | 2 | 2 | 2 |
town | 1 | 1 | 1 | 1 |
shut | 0 | 4 | 0 | 0 |
all | 1 | 1 | 1 | 1 |
and | 4 | 4 | 0 | 0 |
move | 0 | 0 | 4 | 0 |
babies | 0 | 0 | 0 | 2 |
driver | 0 | 0 | 2 | 0 |
wheels | 2 | 0 | 0 | 0 |
through | 1 | 1 | 1 | 1 |
doors | 0 | 2 | 0 | 0 |
says | 0 | 0 | 2 | 0 |
open | 0 | 4 | 0 | 0 |
on | 2 | 2 | 6 | 2 |
go | 2 | 2 | 0 | 2 |
wah | 0 | 0 | 0 | 12 |
In [ ]: