%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
%precision 4
import os, sys, glob
import regex as re
import string
Common applications where there is a need to process text include:
You may need to refer to the following:
- the string module - e.g. string.punctuation, string.ascii_lowercase
- string methods - e.g. lower(), upper(), split(), replace(), find(), count()
- re module functions, especially compile(), match(), search(), sub()
As usual, make liberal use of IPython help (e.g. string.punctuation?) to get information on a specific function or class.
We will illustrate the use of string methods, regular expressions and natural language parsing, as well as some Python built-in data structures (e.g. multiset (Counter) and set) that can be used to clean or analyze text data. This is meant only as a walk-through of some of the tools available; refer to the documentation for details:
Perhaps the most basic thing we can do with textual data is to first tokenize (split into words) the document, then count the number of times each word (or pair of words, or ...) occurs. We will use the book (How to be Happy Though Married) as an example (from Project Gutenberg).
import requests

# Plain-text edition of "How to be Happy Though Married" on Project Gutenberg.
url = "http://www.gutenberg.org/cache/epub/35534/pg35534.txt"

# Use a timeout so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of treating an error page as the book.
response = requests.get(url, timeout=30)
response.raise_for_status()
raw = response.text

# peek at the first 1000 characters of the downloaded text
raw[:1000]
u'ufeffProject Gutenberg's How to be Happy Though Married, by Edward John HardyrnrnThis eBook is for the use of anyone anywhere at no cost and withrnalmost no restrictions whatsoever. You may copy it, give it away orrnre-use it under the terms of the Project Gutenberg License includedrnwith this eBook or online at www.gutenberg.orgrnrnrnTitle: How to be Happy Though Marriedrn Being a Handbook to MarriagernrnAuthor: Edward John HardyrnrnRelease Date: March 9, 2011 [EBook #35534]rnrnLanguage: Englishrnrnrn*** START OF THIS PROJECT GUTENBERG EBOOK HOW TO BE HAPPY THOUGH MARRIED ***rnrnrnrnrnProduced by Colin Bell, Christine P. Travers and the OnlinernDistributed Proofreading Team at http://www.pgdp.net (Thisrnfile was produced from images generously made availablernby The Internet Archive)rnrnrnrnrnrnrn[Transcriber's note: The author's spelling has been maintained.rnrn+ signs around words indicate the use of a different font in the book.rnrnIn the word "Puranic", the "a" is overlined i'
The actual content of Project Gutenberg books is delimited by the
phrases
"*** START OF THIS PROJECT GUTENBERG EBOOK THE KING JAMES BIBLE ***"
and "End of the Project Gutenberg EBook"
respectively. Since the
actual book title will vary from book to book, we will use a regular
expression to search for
*** START OF THIS PROJECT GUTENBERG EBOOK <STUFF> ***
. For the end
of the book, we could use a simple string search, but will use a regular
expression too for consistency. Note that we need the index just past the
last character of the start marker and the index of the first character of
the end marker respectively as limits to extract only the text of the
downloaded book.
# Locate the Project Gutenberg boilerplate markers that bracket the book.
start_match = re.search(r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*", raw)
stop_match = re.search(r"End of the Project Gutenberg EBook", raw)
if start_match is None or stop_match is None:
    # Without this check a missing marker surfaces as an opaque
    # "'NoneType' object has no attribute 'end'" AttributeError.
    raise ValueError("Project Gutenberg START/End markers not found in downloaded text")

# Keep everything after the start marker and before the end marker.
start = start_match.end()
stop = stop_match.start()
text = raw[start:stop]
text[:1000]
u'rnrnrnrnrnProduced by Colin Bell, Christine P. Travers and the OnlinernDistributed Proofreading Team at http://www.pgdp.net (Thisrnfile was produced from images generously made availablernby The Internet Archive)rnrnrnrnrnrnrn[Transcriber's note: The author's spelling has been maintained.rnrn+ signs around words indicate the use of a different font in the book.rnrnIn the word "Puranic", the "a" is overlined in the book.]rnrnrnrnrn_HOW TO BE HAPPY THOUGH MARRIED._rnrnrnrnrnPRESS NOTICES ON THE FIRST EDITION.rnrn "_If wholesome advice you can brook,rn When single too long you have tarried;rn If comfort you'd gain from a book,rn When very much wedded and harried;rn No doubt you should speedily look,rn In 'How to be Happy though Married!'_"--PUNCH.rnrnrn"We strongly recommend this book as one of the best of wedding presents.rnIt is a complete handbook to an earthly Paradise, and its author may bernregarded as the Murray of Matrimony and the Baedeker of Bliss."--_PallrnMall Gaze'
# A naive but workable approach would be to first strip all punctuation,
# convert to lower case, then split on white space.
# \p{P} matches any Unicode punctuation character; `re` here is the `regex`
# module imported at the top of the file.
# The pattern is written as u"\\p{P}+" (the same characters as the raw
# literal \p{P}+) because the `ur` string prefix is a syntax error on Python 3.
words1 = re.sub(u"\\p{P}+", "", text.lower()).split()
# print(...) with a single argument behaves the same on Python 2 and 3.
print(words1[:100])
len(words1)
[u'produced', u'by', u'colin', u'bell', u'christine', u'p', u'travers', u'and', u'the', u'online', u'distributed', u'proofreading', u'team', u'at', u'httpwwwpgdpnet', u'this', u'file', u'was', u'produced', u'from', u'images', u'generously', u'made', u'available', u'by', u'the', u'internet', u'archive', u'transcribers', u'note', u'the', u'authors', u'spelling', u'has', u'been', u'maintained', u'+', u'signs', u'around', u'words', u'indicate', u'the', u'use', u'of', u'a', u'different', u'font', u'in', u'the', u'book', u'in', u'the', u'word', u'puranic', u'the', u'a', u'is', u'overlined', u'in', u'the', u'book', u'how', u'to', u'be', u'happy', u'though', u'married', u'press', u'notices', u'on', u'the', u'first', u'edition', u'if', u'wholesome', u'advice', u'you', u'can', u'brook', u'when', u'single', u'too', u'long', u'you', u'have', u'tarried', u'if', u'comfort', u'youd', u'gain', u'from', u'a', u'book', u'when', u'very', u'much', u'wedded', u'and', u'harried', u'no']
86545
# If you need to be more careful, use the nltk tokenizer.
import nltk
from multiprocessing import Pool
from itertools import chain

# Split the lower-cased text into sentences with the pre-trained Punkt model.
punkt = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = punkt.tokenize(text.lower())

# since the tokenizer works on a per sentence level, we can parallelize
p = Pool()
try:
    # Each worker tokenizes whole sentences; flatten the per-sentence lists.
    words2 = list(chain.from_iterable(p.map(nltk.tokenize.word_tokenize, sentences)))
finally:
    # Always shut the workers down, even if tokenization raised, and wait
    # for them to exit so no worker processes are left behind.
    p.close()
    p.join()

# Now remove words that consist of only punctuation characters
words2 = [word for word in words2 if not all(char in string.punctuation for char in word)]
# Remove short fragments that begin with an apostrophe (at most 2 characters,
# e.g. 's or 'd) left over when the tokenizer splits contractions/possessives
words2 = [word for word in words2 if not (word.startswith("'") and len(word) <= 2)]
# print(...) with a single argument behaves the same on Python 2 and 3.
print(words2[:100])
len(words2)
[u'produced', u'by', u'colin', u'bell', u'christine', u'p.', u'travers', u'and', u'the', u'online', u'distributed', u'proofreading', u'team', u'at', u'http', u'//www.pgdp.net', u'this', u'file', u'was', u'produced', u'from', u'images', u'generously', u'made', u'available', u'by', u'the', u'internet', u'archive', u'transcriber', u'note', u'the', u'author', u'spelling', u'has', u'been', u'maintained', u'signs', u'around', u'words', u'indicate', u'the', u'use', u'of', u'a', u'different', u'font', u'in', u'the', u'book', u'in', u'the', u'word', u'puranic', u'the', u'a', u'is', u'overlined', u'in', u'the', u'book', u'_how', u'to', u'be', u'happy', u'though', u'married._', u'press', u'notices', u'on', u'the', u'first', u'edition', u'_if', u'wholesome', u'advice', u'you', u'can', u'brook', u'when', u'single', u'too', u'long', u'you', u'have', u'tarried', u'if', u'comfort', u'you', u'gain', u'from', u'a', u'book', u'when', u'very', u'much', u'wedded', u'and', u'harried', u'no']
87158
from collections import Counter

# Tally how many times each token occurs, then show the ten most frequent.
c = Counter(words2)
c.most_common(10)
[(u'the', 4356),
(u'of', 3322),
(u'and', 2699),
(u'to', 2601),
(u'a', 2335),
(u'in', 1524),
(u'is', 1209),
(u'that', 1059),
(u'it', 848),
(u'be', 819)]
# The raw counts aren't very helpful since there are many "stop" words that
# don't mean much. Once those are removed, just the top 10 words give a good
# idea of what the book is about!
stopwords = nltk.corpus.stopwords.words('english')

# Work on a copy so the original counts in `c` stay available.
new_c = c.copy()
# Drop each stop word directly (one dictionary operation per stop word)
# rather than scanning the stop-word list once for every distinct token.
for word in stopwords:
    new_c.pop(word, None)  # default avoids KeyError for stop words not in the text
new_c.most_common(n=10)
[(u'wife', 353),
(u'one', 352),
(u'life', 271),
(u'man', 241),
(u'would', 237),
(u'said', 227),
(u'may', 219),
(u'husband', 208),
(u'good', 205),
(u'children', 194)]
# Tokens produced by the naive approach (words1) that the nltk-based
# approach (words2) never produced.
vocab1 = set(words1)
vocab2 = set(words2)
w12 = list(vocab1 - vocab2)
w12[:10]
[u'wedmore',
u'servantgirl',
u'childs',
u'folklore',
u'mores',
u'loveletters',
u'itliterary',
u'motheror',
u'modium',
u'worldthen']
# Tokens produced by the nltk-based approach (words2) that the naive
# approach (words1) never produced.
vocab2 = set(words2)
vocab1 = set(words1)
w21 = list(vocab2 - vocab1)
w21[:10]
[u'_john',
u"daughter's",
u'_illustrated',
u'party.',
u'seventy-seven',
u'34.',
u'co-operation',
u'mercury._',
u'proudie',
u'_publishers']
%load_ext version_information
%version_information requests, regex, nltk
The version_information extension is already loaded. To reload it, use:
%reload_ext version_information
Software | Version |
---|---|
Python | 2.7.5 (default, Mar 9 2014, 22:15:05) [GCC 4.2.1 Compatible Apple LLVM 5.0 (clang-500.0.68)] |
IPython | 2.1.0 |
OS | posix [darwin] |
requests | 2.3.0 |
regex | 2.4.46 |
nltk | 2.0.4 |
Sat Aug 02 13:20:24 2014 EDT |