MongoDB¶
MongoDB is a document database. It stores JSON objects.
Note that MongoDB also provides a GUI interface via MongDB Compasss that might be useful when you are getting familiar with MongoDB. However, we will focus only on pymongo
.
Concepts¶
What a document database is
Why document databases
Collections ~ tables
Documents ~ rows
Joins are possible but more common to embed nested objects
Using
find
Simple summaries
Using the
aggregate
method and setting up pipelinesGeospatial queries
Creating indexes to speed up queries
[1]:
from pymongo import MongoClient, GEOSPHERE
from bson.objectid import ObjectId
from bson.son import SON
[2]:
import requests
from bson import json_util
[3]:
import collections
from pathlib import Path
[4]:
import os
[5]:
from pprint import pprint
Set up¶
This connects to the MongoDB daemon
Use this on vm-manage
client = MongoClient('mongodb:27017')
[8]:
client = MongoClient()
This specifies the database. It does not matter if it does not exist.
[9]:
client.drop_database('starwars')
[10]:
db = client.starwars
This specifies a collection
[11]:
people = db.people
Check what collections are in the database. Note that the people
collection is only created when the first value is inserted.
[12]:
db.list_collection_names()
[12]:
[]
Get Data¶
[13]:
base_url = 'http://swapi.dev/api/'
[14]:
resp = requests.get(os.path.join(base_url, 'people/1'))
data = resp.json()
[15]:
data
[15]:
{'name': 'Luke Skywalker',
'height': '172',
'mass': '77',
'hair_color': 'blond',
'skin_color': 'fair',
'eye_color': 'blue',
'birth_year': '19BBY',
'gender': 'male',
'homeworld': 'http://swapi.dev/api/planets/1/',
'films': ['http://swapi.dev/api/films/1/',
'http://swapi.dev/api/films/2/',
'http://swapi.dev/api/films/3/',
'http://swapi.dev/api/films/6/'],
'species': [],
'vehicles': ['http://swapi.dev/api/vehicles/14/',
'http://swapi.dev/api/vehicles/30/'],
'starships': ['http://swapi.dev/api/starships/12/',
'http://swapi.dev/api/starships/22/'],
'created': '2014-12-09T13:50:51.644000Z',
'edited': '2014-12-20T21:17:56.891000Z',
'url': 'http://swapi.dev/api/people/1/'}
We will fetch details of the homeworld and starships as a nested document.
[16]:
def get_nested(d):
d['homeworld'] = requests.get(d['homeworld']).json()
urls = d['starships']
starships = [requests.get(url).json() for url in urls]
d['starships'] = starships
return d
We need to convert numbers from strings returned by the REST API
[17]:
def convert_str(x):
try:
return int(x)
except:
return x
def to_num(data):
for key in data:
val = data[key]
if isinstance(val, str):
data[key] = convert_str(val)
elif isinstance(val, dict):
for k, v in val.items():
if isinstance(v, str):
val[k] = convert_str(v)
elif isinstance(val, list):
for i, item in enumerate(val):
if isinstance(item, str):
data[key][i] = convert_str(item)
elif isinstance(item, dict):
for k, v in item.items():
if isinstance(v, str):
data[key][i][k] = convert_str(v)
return data
[18]:
data = to_num(get_nested(data))
[19]:
data
[19]:
{'name': 'Luke Skywalker',
'height': 172,
'mass': 77,
'hair_color': 'blond',
'skin_color': 'fair',
'eye_color': 'blue',
'birth_year': '19BBY',
'gender': 'male',
'homeworld': {'name': 'Tatooine',
'rotation_period': 23,
'orbital_period': 304,
'diameter': 10465,
'climate': 'arid',
'gravity': '1 standard',
'terrain': 'desert',
'surface_water': 1,
'population': 200000,
'residents': ['http://swapi.dev/api/people/1/',
'http://swapi.dev/api/people/2/',
'http://swapi.dev/api/people/4/',
'http://swapi.dev/api/people/6/',
'http://swapi.dev/api/people/7/',
'http://swapi.dev/api/people/8/',
'http://swapi.dev/api/people/9/',
'http://swapi.dev/api/people/11/',
'http://swapi.dev/api/people/43/',
'http://swapi.dev/api/people/62/'],
'films': ['http://swapi.dev/api/films/1/',
'http://swapi.dev/api/films/3/',
'http://swapi.dev/api/films/4/',
'http://swapi.dev/api/films/5/',
'http://swapi.dev/api/films/6/'],
'created': '2014-12-09T13:50:49.641000Z',
'edited': '2014-12-20T20:58:18.411000Z',
'url': 'http://swapi.dev/api/planets/1/'},
'films': ['http://swapi.dev/api/films/1/',
'http://swapi.dev/api/films/2/',
'http://swapi.dev/api/films/3/',
'http://swapi.dev/api/films/6/'],
'species': [],
'vehicles': ['http://swapi.dev/api/vehicles/14/',
'http://swapi.dev/api/vehicles/30/'],
'starships': [{'name': 'X-wing',
'model': 'T-65 X-wing',
'manufacturer': 'Incom Corporation',
'cost_in_credits': 149999,
'length': '12.5',
'max_atmosphering_speed': 1050,
'crew': 1,
'passengers': 0,
'cargo_capacity': 110,
'consumables': '1 week',
'hyperdrive_rating': '1.0',
'MGLT': 100,
'starship_class': 'Starfighter',
'pilots': ['http://swapi.dev/api/people/1/',
'http://swapi.dev/api/people/9/',
'http://swapi.dev/api/people/18/',
'http://swapi.dev/api/people/19/'],
'films': ['http://swapi.dev/api/films/1/',
'http://swapi.dev/api/films/2/',
'http://swapi.dev/api/films/3/'],
'created': '2014-12-12T11:19:05.340000Z',
'edited': '2014-12-20T21:23:49.886000Z',
'url': 'http://swapi.dev/api/starships/12/'},
{'name': 'Imperial shuttle',
'model': 'Lambda-class T-4a shuttle',
'manufacturer': 'Sienar Fleet Systems',
'cost_in_credits': 240000,
'length': 20,
'max_atmosphering_speed': 850,
'crew': 6,
'passengers': 20,
'cargo_capacity': 80000,
'consumables': '2 months',
'hyperdrive_rating': '1.0',
'MGLT': 50,
'starship_class': 'Armed government transport',
'pilots': ['http://swapi.dev/api/people/1/',
'http://swapi.dev/api/people/13/',
'http://swapi.dev/api/people/14/'],
'films': ['http://swapi.dev/api/films/2/', 'http://swapi.dev/api/films/3/'],
'created': '2014-12-15T13:04:47.235000Z',
'edited': '2014-12-20T21:23:49.900000Z',
'url': 'http://swapi.dev/api/starships/22/'}],
'created': '2014-12-09T13:50:51.644000Z',
'edited': '2014-12-20T21:17:56.891000Z',
'url': 'http://swapi.dev/api/people/1/'}
Insertion¶
Single inserts¶
[20]:
result = people.insert_one(data)
[21]:
db.list_collection_names()
[21]:
['people']
Bulk inserts¶
We load some previously retrieved values from file to avoid hitting the SWAPI server repeatedly.
[22]:
import pickle
with open('sw.pickle', 'rb') as f:
xs = pickle.load(f)
[23]:
result = people.insert_many(xs)
[24]:
result.inserted_ids
[24]:
[ObjectId('5fac82d339937fab8d3c0873'),
ObjectId('5fac82d339937fab8d3c0874'),
ObjectId('5fac82d339937fab8d3c0875'),
ObjectId('5fac82d339937fab8d3c0876'),
ObjectId('5fac82d339937fab8d3c0877'),
ObjectId('5fac82d339937fab8d3c0878'),
ObjectId('5fac82d339937fab8d3c0879'),
ObjectId('5fac82d339937fab8d3c087a'),
ObjectId('5fac82d339937fab8d3c087b')]
Queries¶
[25]:
people.find_one(
# search criteria
{'name': 'Luke Skywalker'},
# values to return
{'name': True,
'hair_color': True,
'skin_color': True,
'eye_color': True
}
)
[25]:
{'_id': ObjectId('5fac82d339937fab8d3c0872'),
'name': 'Luke Skywalker',
'hair_color': 'blond',
'skin_color': 'fair',
'eye_color': 'blue'}
[26]:
for p in people.find(
# search criteria
{},
# values to return
{'name': True,
'hair_color': True,
'skin_color': True,
'eye_color': True
}
):
print(p)
{'_id': ObjectId('5fac82d339937fab8d3c0872'), 'name': 'Luke Skywalker', 'hair_color': 'blond', 'skin_color': 'fair', 'eye_color': 'blue'}
{'_id': ObjectId('5fac82d339937fab8d3c0873'), 'name': 'C-3PO', 'hair_color': 'n/a', 'skin_color': 'gold', 'eye_color': 'yellow'}
{'_id': ObjectId('5fac82d339937fab8d3c0874'), 'name': 'R2-D2', 'hair_color': 'n/a', 'skin_color': 'white, blue', 'eye_color': 'red'}
{'_id': ObjectId('5fac82d339937fab8d3c0875'), 'name': 'Darth Vader', 'hair_color': 'none', 'skin_color': 'white', 'eye_color': 'yellow'}
{'_id': ObjectId('5fac82d339937fab8d3c0876'), 'name': 'Leia Organa', 'hair_color': 'brown', 'skin_color': 'light', 'eye_color': 'brown'}
{'_id': ObjectId('5fac82d339937fab8d3c0877'), 'name': 'Owen Lars', 'hair_color': 'brown, grey', 'skin_color': 'light', 'eye_color': 'blue'}
{'_id': ObjectId('5fac82d339937fab8d3c0878'), 'name': 'Beru Whitesun lars', 'hair_color': 'brown', 'skin_color': 'light', 'eye_color': 'blue'}
{'_id': ObjectId('5fac82d339937fab8d3c0879'), 'name': 'R5-D4', 'hair_color': 'n/a', 'skin_color': 'white, red', 'eye_color': 'red'}
{'_id': ObjectId('5fac82d339937fab8d3c087a'), 'name': 'Biggs Darklighter', 'hair_color': 'black', 'skin_color': 'light', 'eye_color': 'brown'}
{'_id': ObjectId('5fac82d339937fab8d3c087b'), 'name': 'Obi-Wan Kenobi', 'hair_color': 'auburn, white', 'skin_color': 'fair', 'eye_color': 'blue-gray'}
Using object ID¶
Note that ObjectID is NOT a string. You must convert a string to ObjectID before use.
From the official docs, the ObjectID consists of
a 4-byte value representing the seconds since the Unix epoch,
a 5-byte random value, and
a 3-byte counter, starting with a random value.
In particular, note that sorting by ObjectIDs generated across different machines will give an approximate time ordering.
[27]:
result.inserted_ids[0]
[27]:
ObjectId('5fac82d339937fab8d3c0873')
[28]:
people.find_one(
result.inserted_ids[0],
{'name': True, 'hair_color': True, 'skin_color': True, 'eye_color': True}
)
[28]:
{'_id': ObjectId('5fac82d339937fab8d3c0873'),
'name': 'C-3PO',
'hair_color': 'n/a',
'skin_color': 'gold',
'eye_color': 'yellow'}
Bulk queries¶
The general find
method returns a cursor, where each entry is a dictionary.
[29]:
for person in people.find(
{'gender': 'male'}
):
print(person['name'])
Luke Skywalker
Darth Vader
Owen Lars
Biggs Darklighter
Obi-Wan Kenobi
You can also explicitly define the projection.
[30]:
for x in people.find(
{'gender': 'male'},
{
'_id': False,
'name': True,
'gender': True
}
):
pprint(x)
{'gender': 'male', 'name': 'Luke Skywalker'}
{'gender': 'male', 'name': 'Darth Vader'}
{'gender': 'male', 'name': 'Owen Lars'}
{'gender': 'male', 'name': 'Biggs Darklighter'}
{'gender': 'male', 'name': 'Obi-Wan Kenobi'}
Using regex search¶
[31]:
for x in people.find(
{
'name': {'$regex': '^L'},
},
{
'name': True,
'gender': True,
'_id': False
}
):
pprint(x)
{'gender': 'male', 'name': 'Luke Skywalker'}
{'gender': 'female', 'name': 'Leia Organa'}
The above example uses the JavaScript regular expression syntax. You can also use Python regular expressions with ppymongo
.
[32]:
import re
name_pat = re.compile(r'^l', re.IGNORECASE)
[33]:
for x in people.find(
{
'name': name_pat,
},
{
'name': True,
'gender': True,
'_id': False
}
):
pprint(x)
{'gender': 'male', 'name': 'Luke Skywalker'}
{'gender': 'female', 'name': 'Leia Organa'}
Using relational operators¶
[34]:
for x in people.find(
{
'mass': {'$lt': 100},
},
{
'name': True,
'mass': True,
'_id': False
}
):
pprint(x)
{'mass': 77, 'name': 'Luke Skywalker'}
{'mass': 75, 'name': 'C-3PO'}
{'mass': 32, 'name': 'R2-D2'}
{'mass': 49, 'name': 'Leia Organa'}
{'mass': 75, 'name': 'Beru Whitesun lars'}
{'mass': 32, 'name': 'R5-D4'}
{'mass': 84, 'name': 'Biggs Darklighter'}
{'mass': 77, 'name': 'Obi-Wan Kenobi'}
[35]:
mass_range = {'$lt': 100, '$gt': 50}
[36]:
for x in people.find(
{
'mass': mass_range,
},
{
'name': True,
'mass': True,
'_id': False
}
):
pprint(x)
{'mass': 77, 'name': 'Luke Skywalker'}
{'mass': 75, 'name': 'C-3PO'}
{'mass': 75, 'name': 'Beru Whitesun lars'}
{'mass': 84, 'name': 'Biggs Darklighter'}
{'mass': 77, 'name': 'Obi-Wan Kenobi'}
Nested search¶
Nowadays, many relational databases allow you to store data as JSON columns. However, document databases allow the convenience of nested searches.
[37]:
for x in people.find(
{
'homeworld.name': 'Tatooine',
},
{
'name': True,
'species.name': True,
'_id': False
}
):
pprint(x)
{'name': 'Luke Skywalker', 'species': []}
{'name': 'C-3PO', 'species': []}
{'name': 'Darth Vader', 'species': []}
{'name': 'Owen Lars', 'species': []}
{'name': 'Beru Whitesun lars', 'species': []}
{'name': 'R5-D4', 'species': []}
{'name': 'Biggs Darklighter', 'species': []}
Matching multiple criteria¶
This is quite subtle. By default, when matching on multiple criteria, the search is across items. Here Obi-Wan Kenobi
is returned because each of the 3 conditions is matched by one or more of his starships, even though none of his starships match all 3 criteria.
[38]:
for x in people.find(
{
'starships.cost_in_credits': {'$lt': 250000},
'starships.max_atmosphering_speed': {'$gt': 500},
'starships.passengers': {'$gt': 0}
},
{
'name': True,
'starship.name': True,
'starships.max_atmosphering_speed': True,
'starships.passengers': True,
'starships.cost_in_credits': True,
'_id': False
}
):
pprint(x)
{'name': 'Luke Skywalker',
'starships': [{'cost_in_credits': 149999,
'max_atmosphering_speed': 1050,
'passengers': 0},
{'cost_in_credits': 240000,
'max_atmosphering_speed': 850,
'passengers': 20}]}
{'name': 'Obi-Wan Kenobi',
'starships': [{'cost_in_credits': 180000,
'max_atmosphering_speed': 1150,
'passengers': 0},
{'cost_in_credits': 125000000,
'max_atmosphering_speed': 1050,
'passengers': 48247},
{'cost_in_credits': 'unknown',
'max_atmosphering_speed': 1050,
'passengers': 3},
{'cost_in_credits': 320000,
'max_atmosphering_speed': 1500,
'passengers': 0},
{'cost_in_credits': 168000,
'max_atmosphering_speed': 1100,
'passengers': 0}]}
[39]:
for x in people.find(
{'name': 'Obi-Wan Kenobi'},
{
'starships.name': True,
'starships.cost_in_credits': True,
'starships.max_atmosphering_speed': True,
'starships.passengers': True,
'_id': False
}
):
pprint(x)
{'starships': [{'cost_in_credits': 180000,
'max_atmosphering_speed': 1150,
'name': 'Jedi starfighter',
'passengers': 0},
{'cost_in_credits': 125000000,
'max_atmosphering_speed': 1050,
'name': 'Trade Federation cruiser',
'passengers': 48247},
{'cost_in_credits': 'unknown',
'max_atmosphering_speed': 1050,
'name': 'Naboo star skiff',
'passengers': 3},
{'cost_in_credits': 320000,
'max_atmosphering_speed': 1500,
'name': 'Jedi Interceptor',
'passengers': 0},
{'cost_in_credits': 168000,
'max_atmosphering_speed': 1100,
'name': 'Belbullab-22 starfighter',
'passengers': 0}]}
Matching multiple criteria simultaneously¶
To find someone with a starship that matches all 3 conditions, we need to use the elemMatch
operator.
[40]:
for x in people.find(
{
'starships': {
'$elemMatch': {
'cost_in_credits': {'$lt': 250000},
'max_atmosphering_speed': {'$gt': 500},
'passengers': {'$gt': 1}
}
}
},
{
'name': True,
'starships.name': True,
'starships.max_atmosphering_speed': True,
'starships.passengers': True,
'starships.cost_in_credits': True,
'_id': False
}
):
pprint(x)
{'name': 'Luke Skywalker',
'starships': [{'cost_in_credits': 149999,
'max_atmosphering_speed': 1050,
'name': 'X-wing',
'passengers': 0},
{'cost_in_credits': 240000,
'max_atmosphering_speed': 850,
'name': 'Imperial shuttle',
'passengers': 20}]}
Aggregate Queries¶
[41]:
people.count_documents({'homeworld.name': 'Tatooine'})
[41]:
7
[42]:
people.distinct('homeworld.name')
[42]:
['Alderaan', 'Naboo', 'Stewjon', 'Tatooine']
Using aggregate¶
The aggregate
function runs a pipeline of commands, and uses the $group
operator to summarize results. Within the aggregate method, you assemble a pipeline of operations that is executed atomically.
Filter and count
[43]:
cmds = [
{'$match': {'homeworld.name': 'Tatooine'}},
{'$group': {'_id': '$homeworld.name',
'count': {'$sum': 1}}},
]
[44]:
for p in people.aggregate(cmds):
pprint(p)
{'_id': 'Tatooine', 'count': 7}
Filter and find total mass
[45]:
cmds = [
{'$match': {'homeworld.name': 'Tatooine'}},
{'$group': {'_id': '$homeworld.name',
'total_mass': {'$sum': '$mass'}}},
]
[46]:
for p in people.aggregate(cmds):
pprint(p)
{'_id': 'Tatooine', 'total_mass': 599}
Total mass of all members of a planet
[47]:
cmds = [
{'$group': {'_id': '$homeworld.name',
'total_mass': {'$sum': '$mass'}}},
]
[48]:
for p in people.aggregate(cmds):
pprint(p)
{'_id': 'Tatooine', 'total_mass': 599}
{'_id': 'Stewjon', 'total_mass': 77}
{'_id': 'Naboo', 'total_mass': 32}
{'_id': 'Alderaan', 'total_mass': 49}
Filter, project, group by, sorting.
[49]:
cmds = [
{
'$match': {
'mass': {
'$lt': 100
}
},
},
{
'$group': {
'_id': '$homeworld.name',
'total_mass': {'$sum': '$mass'},
'avg_mass': {'$avg': '$mass'}
},
},
{
'$sort': {
'avg_mass': -1
}
}
]
[50]:
for p in people.aggregate(cmds):
pprint(p)
{'_id': 'Stewjon', 'avg_mass': 77.0, 'total_mass': 77}
{'_id': 'Tatooine', 'avg_mass': 68.6, 'total_mass': 343}
{'_id': 'Alderaan', 'avg_mass': 49.0, 'total_mass': 49}
{'_id': 'Naboo', 'avg_mass': 32.0, 'total_mass': 32}
SQL equivalent (approximate)¶
SELECT species.name, AVG(mass) AS avg_mass, SUM(mass) AS total_mass
WHERE mass < 100
FROM people
JOIN species
ON people.species_id = species.species_id
GROUP BY species.name
ORDER BY avg_mass
Using MapReduce¶
With MapReduce
you get the full power of JavaScript, but it is more complex and often less efficient. You should use aggregate
in preference to map_reduce
in most cases.
In the map stage, you create a (key, value) pair
In the reduce stage, you perform a reduction (e.g. sum) of the values associated with each key
Motivating Python example¶
[51]:
from functools import reduce
[52]:
eye_color = ['blue', 'blue', 'green', 'brown', 'grey', 'green', 'blue']
[53]:
res = [(x, 1) for x in eye_color]
res
[53]:
[('blue', 1),
('blue', 1),
('green', 1),
('brown', 1),
('grey', 1),
('green', 1),
('blue', 1)]
[54]:
d = {}
for k, v in res:
d[k] = d.get(k, 0) + v
d
[54]:
{'blue': 3, 'green': 2, 'brown': 1, 'grey': 1}
Map-reduce example in Mongo¶
[55]:
from bson.code import Code
Count the number by eye_color
[56]:
mapper = Code('''
function() {
emit(this.eye_color, 1);
}
''')
reducer = Code('''
function (key, values) {
var total = 0;
for (var i = 0; i < values.length; i++) {
total += values[i];
}
return total;
}
''')
result = people.map_reduce(
mapper,
reducer,
'result1'
)
[57]:
for doc in result.find():
pprint(doc)
{'_id': 'blue', 'value': 3.0}
{'_id': 'blue-gray', 'value': 1.0}
{'_id': 'brown', 'value': 2.0}
{'_id': 'red', 'value': 2.0}
{'_id': 'yellow', 'value': 2.0}
The output is also stored in the result1
collection we specified.
[58]:
list(db.result1.find())
[58]:
[{'_id': 'blue', 'value': 3.0},
{'_id': 'blue-gray', 'value': 1.0},
{'_id': 'brown', 'value': 2.0},
{'_id': 'red', 'value': 2.0},
{'_id': 'yellow', 'value': 2.0}]
Using JavaScript Array functions to simplify code.
[59]:
mapper = Code('''
function() {
emit(this.eye_color, 1);
}
''')
reducer = Code('''
function (key, values) {
return Array.sum(values);
}
''')
result = people.map_reduce(
mapper,
reducer,
'result2'
)
[60]:
for doc in result.find():
pprint(doc)
{'_id': 'blue', 'value': 3.0}
{'_id': 'blue-gray', 'value': 1.0}
{'_id': 'brown', 'value': 2.0}
{'_id': 'red', 'value': 2.0}
{'_id': 'yellow', 'value': 2.0}
Find avergae mass by gender.
[61]:
mapper = Code('''
function() {
emit(this.gender, this.mass);
}
''')
reducer = Code('''
function (key, values) {
return Array.avg(values);
}
''')
result = people.map_reduce(
mapper,
reducer,
'result3'
)
[62]:
for doc in result.find():
pprint(doc)
{'_id': 'female', 'value': 62.0}
{'_id': 'male', 'value': 98.8}
{'_id': 'n/a', 'value': 46.333333333333336}
Count number of members in each species
[63]:
mapper = Code('''
function() {
this.species.map(function(z) {
emit(z.name, 1);
})
}
''')
reducer = Code('''
function (key, values) {
return Array.sum(values);
}
''')
result = people.map_reduce(
mapper,
reducer,
'result3'
)
[64]:
for doc in result.find():
pprint(doc)
{'_id': None, 'value': 3.0}
Using the aggregate
method¶
See if you can convert the above MapReduce queries to aggregate
method calls. An example is provided.
[65]:
cmds = [
{
'$group': {
'_id': '$eye_color',
'count': {'$sum': 1},
},
},
{
'$sort': {
'_id': 1
}
}
]
[66]:
for p in people.aggregate(cmds):
pprint(p)
{'_id': 'blue', 'count': 3}
{'_id': 'blue-gray', 'count': 1}
{'_id': 'brown', 'count': 2}
{'_id': 'red', 'count': 2}
{'_id': 'yellow', 'count': 2}
Geospatial queries¶
You specify queries using GeoJSON Objects
Point
LineString
Polygon
MultiPoint
MultiLineString
MultiPolygon
GeometryCollection
[67]:
crime = db.crime
[68]:
import json
[69]:
path = 'data/crime-mapping.geojson'
with open(path) as f:
datastore = json.load(f)
[70]:
results = crime.insert_many(datastore['features'])
[71]:
crime.find_one({})
[71]:
{'_id': ObjectId('5fac82d439937fab8d3c087c'),
'geometry': {'type': 'Point', 'coordinates': [-78.78200313, 35.760212065]},
'type': 'Feature',
'properties': {'ucr': '2650',
'domestic': 'N',
'period': ['Everything', 'Last Year'],
'street': 'KILDAIRE FARM RD',
'radio': 'Everything,Last Year',
'time_to': -62135553600,
'crime_type': 'ALL OTHER - ESCAPE FROM CUSTODY OR RESIST ARREST',
'district': 'D3',
'phxrecordstatus': None,
'lon': -78.78200313,
'timeframe': ['Last Year'],
'crimeday': 'THURSDAY',
'phxstatus': None,
'location_category': 'TOWN OWNED',
'violentproperty': 'All Other',
'residential_subdivision': 'SHOPPES OF KILDAIRE',
'offensecategory': 'All Other Offenses',
'chrgcnt': None,
'time_from': -62135553600,
'map_reference': 'P027',
'date_to': '11/30/2017',
'lat': 35.760212065,
'phxcommunity': 'No',
'crime_category': 'ALL OTHER',
'activity_date': None,
'beat_number': '112',
'record': 3145,
'incident_number': '17010528',
'apartment_complex': None,
'date_from': '2017-11-30'}}
[72]:
crime.find_one({},
{
'geometry': 1,
'_id': 0,
}
)
[72]:
{'geometry': {'type': 'Point', 'coordinates': [-78.78200313, 35.760212065]}}
[73]:
crime.create_index([('geometry', GEOSPHERE)])
[73]:
'geometry_2dsphere'
List 5 crimes near the location
[74]:
loc = SON([('type', 'Point'), ('coordinates', [-78.78200313, 35.760212065])])
for doc in crime.find(
{
'geometry' : SON([('$near', {'$geometry' : loc})])
},
{
'_id': 0,
'properties.crime_type': 1,
'properties.date_from': 1
}
).limit(5):
pprint(doc)
{'properties': {'crime_type': 'ALL OTHER - ESCAPE FROM CUSTODY OR RESIST '
'ARREST',
'date_from': '2017-11-30'}}
{'properties': {'crime_type': 'LARCENY - AUTO PARTS OR ACCESSORIES',
'date_from': '2018-03-20'}}
{'properties': {'crime_type': 'COUNTERFEITING - USING',
'date_from': '2018-08-05'}}
{'properties': {'crime_type': 'DRUGS - DRUG VIOLATIONS '
'(POSS./SELL/MAN./DEL./TRNSPRT/CULT.)',
'date_from': '2017-11-30'}}
{'properties': {'crime_type': 'VANDALISM - DAMAGE TO PROPERTY',
'date_from': '2018-03-26'}}
List crimes committed nearby (within 200 m)
[75]:
loc = SON([('type', 'Point'), ('coordinates', [-78.78200313, 35.760212065])])
for doc in crime.find(
{
'geometry' : SON([('$geoNear', {'$geometry' : loc, '$minDistance': 1e-6, '$maxDistance': 200})]),
},
{
'_id': 0,
'geometry.coordinates': 1,
'properties.crime_type': 1,
'properties.date_from': 1
}
):
pprint(doc)
{'geometry': {'coordinates': [-78.78102423, 35.7607323]},
'properties': {'crime_type': 'ASSAULT - SIMPLE - ALL OTHER',
'date_from': '2018-02-14'}}
{'geometry': {'coordinates': [-78.78131931, 35.761138061]},
'properties': {'crime_type': 'VANDALISM - GRAFFITI',
'date_from': '2018-07-20'}}
{'geometry': {'coordinates': [-78.7827814, 35.759087052]},
'properties': {'crime_type': 'VANDALISM - GRAFFITI',
'date_from': '2018-07-29'}}
Indexes¶
Just as with relational databases, you can add indexes to speed up search. Note that while reads become faster, writes become slower. There is always a trade-off.
[76]:
people.find({}).explain
[76]:
<bound method Cursor.explain of <pymongo.cursor.Cursor object at 0x111dfe670>>
[77]:
people.find({'name': 'Luke Skywalker'}).explain()
[77]:
{'queryPlanner': {'plannerVersion': 1,
'namespace': 'starwars.people',
'indexFilterSet': False,
'parsedQuery': {'name': {'$eq': 'Luke Skywalker'}},
'winningPlan': {'stage': 'COLLSCAN',
'filter': {'name': {'$eq': 'Luke Skywalker'}},
'direction': 'forward'},
'rejectedPlans': []},
'executionStats': {'executionSuccess': True,
'nReturned': 1,
'executionTimeMillis': 0,
'totalKeysExamined': 0,
'totalDocsExamined': 10,
'executionStages': {'stage': 'COLLSCAN',
'filter': {'name': {'$eq': 'Luke Skywalker'}},
'nReturned': 1,
'executionTimeMillisEstimate': 0,
'works': 12,
'advanced': 1,
'needTime': 10,
'needYield': 0,
'saveState': 0,
'restoreState': 0,
'isEOF': 1,
'direction': 'forward',
'docsExamined': 10},
'allPlansExecution': []},
'serverInfo': {'host': 'Cliburns-MacBook-Pro.local',
'port': 27017,
'version': '4.2.3',
'gitVersion': '6874650b362138df74be53d366bbefc321ea32d4'},
'ok': 1.0}
[78]:
people.create_index('name')
[78]:
'name_1'
[79]:
people.find({'name': 'Luke Skywalker'}).explain()
[79]:
{'queryPlanner': {'plannerVersion': 1,
'namespace': 'starwars.people',
'indexFilterSet': False,
'parsedQuery': {'name': {'$eq': 'Luke Skywalker'}},
'winningPlan': {'stage': 'FETCH',
'inputStage': {'stage': 'IXSCAN',
'keyPattern': {'name': 1},
'indexName': 'name_1',
'isMultiKey': False,
'multiKeyPaths': {'name': []},
'isUnique': False,
'isSparse': False,
'isPartial': False,
'indexVersion': 2,
'direction': 'forward',
'indexBounds': {'name': ['["Luke Skywalker", "Luke Skywalker"]']}}},
'rejectedPlans': []},
'executionStats': {'executionSuccess': True,
'nReturned': 1,
'executionTimeMillis': 1,
'totalKeysExamined': 1,
'totalDocsExamined': 1,
'executionStages': {'stage': 'FETCH',
'nReturned': 1,
'executionTimeMillisEstimate': 0,
'works': 2,
'advanced': 1,
'needTime': 0,
'needYield': 0,
'saveState': 0,
'restoreState': 0,
'isEOF': 1,
'docsExamined': 1,
'alreadyHasObj': 0,
'inputStage': {'stage': 'IXSCAN',
'nReturned': 1,
'executionTimeMillisEstimate': 0,
'works': 2,
'advanced': 1,
'needTime': 0,
'needYield': 0,
'saveState': 0,
'restoreState': 0,
'isEOF': 1,
'keyPattern': {'name': 1},
'indexName': 'name_1',
'isMultiKey': False,
'multiKeyPaths': {'name': []},
'isUnique': False,
'isSparse': False,
'isPartial': False,
'indexVersion': 2,
'direction': 'forward',
'indexBounds': {'name': ['["Luke Skywalker", "Luke Skywalker"]']},
'keysExamined': 1,
'seeks': 1,
'dupsTested': 0,
'dupsDropped': 0}},
'allPlansExecution': []},
'serverInfo': {'host': 'Cliburns-MacBook-Pro.local',
'port': 27017,
'version': '4.2.3',
'gitVersion': '6874650b362138df74be53d366bbefc321ea32d4'},
'ok': 1.0}