MongoDB

MongoDB is a document database. It stores JSON objects.

Note that MongoDB also provides a GUI interface via MongDB Compasss that might be useful when you are getting familiar with MongoDB. However, we will focus only on pymongo.

Concepts

  • What a document database is

  • Why document databases

  • Collections ~ tables

  • Documents ~ rows

  • Joins are possible but more common to embed nested objects

  • Basic data manipulation: CRUD

  • Using find

  • Simple summaries

  • Using the aggregate method and setting up pipelines

  • Geospatial queries

  • Creating indexes to speed up queries

[1]:
from pymongo import MongoClient, GEOSPHERE
from bson.objectid import ObjectId
from bson.son import SON
[2]:
import requests
from bson import json_util
[3]:
import collections
from pathlib import Path
[4]:
import os
[5]:
from pprint import pprint

Set up

This connects to the MongoDB daemon

Use this on vm-manage

client = MongoClient('mongodb:27017')
[8]:
client = MongoClient()

This specifies the database. It does not matter if it does not exist.

[9]:
client.drop_database('starwars')
[10]:
db = client.starwars

This specifies a collection

[11]:
people = db.people

Check what collections are in the database. Note that the people collection is only created when the first value is inserted.

[12]:
db.list_collection_names()
[12]:
[]

Get Data

[13]:
base_url = 'http://swapi.dev/api/'
[14]:
resp = requests.get(os.path.join(base_url, 'people/1'))
data = resp.json()
[15]:
data
[15]:
{'name': 'Luke Skywalker',
 'height': '172',
 'mass': '77',
 'hair_color': 'blond',
 'skin_color': 'fair',
 'eye_color': 'blue',
 'birth_year': '19BBY',
 'gender': 'male',
 'homeworld': 'http://swapi.dev/api/planets/1/',
 'films': ['http://swapi.dev/api/films/1/',
  'http://swapi.dev/api/films/2/',
  'http://swapi.dev/api/films/3/',
  'http://swapi.dev/api/films/6/'],
 'species': [],
 'vehicles': ['http://swapi.dev/api/vehicles/14/',
  'http://swapi.dev/api/vehicles/30/'],
 'starships': ['http://swapi.dev/api/starships/12/',
  'http://swapi.dev/api/starships/22/'],
 'created': '2014-12-09T13:50:51.644000Z',
 'edited': '2014-12-20T21:17:56.891000Z',
 'url': 'http://swapi.dev/api/people/1/'}

We will fetch details of the homeworld and starships as a nested document.

[16]:
def get_nested(d):
    d['homeworld']  = requests.get(d['homeworld']).json()
    urls = d['starships']
    starships = [requests.get(url).json() for url in urls]
    d['starships']  = starships
    return d

We need to convert numbers from strings returned by the REST API

[17]:
def convert_str(x):
    try:
        return int(x)
    except:
        return x

def to_num(data):
    for key in data:
        val = data[key]
        if isinstance(val, str):
            data[key] = convert_str(val)
        elif isinstance(val, dict):
            for k, v in val.items():
                if isinstance(v, str):
                    val[k] = convert_str(v)
        elif isinstance(val, list):
            for i, item in enumerate(val):
                if isinstance(item, str):
                    data[key][i] = convert_str(item)
                elif isinstance(item, dict):
                    for k, v in item.items():
                        if isinstance(v, str):
                            data[key][i][k] = convert_str(v)
    return data
[18]:
data = to_num(get_nested(data))
[19]:
data
[19]:
{'name': 'Luke Skywalker',
 'height': 172,
 'mass': 77,
 'hair_color': 'blond',
 'skin_color': 'fair',
 'eye_color': 'blue',
 'birth_year': '19BBY',
 'gender': 'male',
 'homeworld': {'name': 'Tatooine',
  'rotation_period': 23,
  'orbital_period': 304,
  'diameter': 10465,
  'climate': 'arid',
  'gravity': '1 standard',
  'terrain': 'desert',
  'surface_water': 1,
  'population': 200000,
  'residents': ['http://swapi.dev/api/people/1/',
   'http://swapi.dev/api/people/2/',
   'http://swapi.dev/api/people/4/',
   'http://swapi.dev/api/people/6/',
   'http://swapi.dev/api/people/7/',
   'http://swapi.dev/api/people/8/',
   'http://swapi.dev/api/people/9/',
   'http://swapi.dev/api/people/11/',
   'http://swapi.dev/api/people/43/',
   'http://swapi.dev/api/people/62/'],
  'films': ['http://swapi.dev/api/films/1/',
   'http://swapi.dev/api/films/3/',
   'http://swapi.dev/api/films/4/',
   'http://swapi.dev/api/films/5/',
   'http://swapi.dev/api/films/6/'],
  'created': '2014-12-09T13:50:49.641000Z',
  'edited': '2014-12-20T20:58:18.411000Z',
  'url': 'http://swapi.dev/api/planets/1/'},
 'films': ['http://swapi.dev/api/films/1/',
  'http://swapi.dev/api/films/2/',
  'http://swapi.dev/api/films/3/',
  'http://swapi.dev/api/films/6/'],
 'species': [],
 'vehicles': ['http://swapi.dev/api/vehicles/14/',
  'http://swapi.dev/api/vehicles/30/'],
 'starships': [{'name': 'X-wing',
   'model': 'T-65 X-wing',
   'manufacturer': 'Incom Corporation',
   'cost_in_credits': 149999,
   'length': '12.5',
   'max_atmosphering_speed': 1050,
   'crew': 1,
   'passengers': 0,
   'cargo_capacity': 110,
   'consumables': '1 week',
   'hyperdrive_rating': '1.0',
   'MGLT': 100,
   'starship_class': 'Starfighter',
   'pilots': ['http://swapi.dev/api/people/1/',
    'http://swapi.dev/api/people/9/',
    'http://swapi.dev/api/people/18/',
    'http://swapi.dev/api/people/19/'],
   'films': ['http://swapi.dev/api/films/1/',
    'http://swapi.dev/api/films/2/',
    'http://swapi.dev/api/films/3/'],
   'created': '2014-12-12T11:19:05.340000Z',
   'edited': '2014-12-20T21:23:49.886000Z',
   'url': 'http://swapi.dev/api/starships/12/'},
  {'name': 'Imperial shuttle',
   'model': 'Lambda-class T-4a shuttle',
   'manufacturer': 'Sienar Fleet Systems',
   'cost_in_credits': 240000,
   'length': 20,
   'max_atmosphering_speed': 850,
   'crew': 6,
   'passengers': 20,
   'cargo_capacity': 80000,
   'consumables': '2 months',
   'hyperdrive_rating': '1.0',
   'MGLT': 50,
   'starship_class': 'Armed government transport',
   'pilots': ['http://swapi.dev/api/people/1/',
    'http://swapi.dev/api/people/13/',
    'http://swapi.dev/api/people/14/'],
   'films': ['http://swapi.dev/api/films/2/', 'http://swapi.dev/api/films/3/'],
   'created': '2014-12-15T13:04:47.235000Z',
   'edited': '2014-12-20T21:23:49.900000Z',
   'url': 'http://swapi.dev/api/starships/22/'}],
 'created': '2014-12-09T13:50:51.644000Z',
 'edited': '2014-12-20T21:17:56.891000Z',
 'url': 'http://swapi.dev/api/people/1/'}

Insertion

Single inserts

[20]:
result = people.insert_one(data)
[21]:
db.list_collection_names()
[21]:
['people']

Bulk inserts

We load some previously retrieved values from file to avoid hitting the SWAPI server repeatedly.

[22]:
import pickle

with open('sw.pickle', 'rb') as f:
    xs = pickle.load(f)
[23]:
result = people.insert_many(xs)
[24]:
result.inserted_ids
[24]:
[ObjectId('5fac82d339937fab8d3c0873'),
 ObjectId('5fac82d339937fab8d3c0874'),
 ObjectId('5fac82d339937fab8d3c0875'),
 ObjectId('5fac82d339937fab8d3c0876'),
 ObjectId('5fac82d339937fab8d3c0877'),
 ObjectId('5fac82d339937fab8d3c0878'),
 ObjectId('5fac82d339937fab8d3c0879'),
 ObjectId('5fac82d339937fab8d3c087a'),
 ObjectId('5fac82d339937fab8d3c087b')]

Queries

[25]:
people.find_one(
    # search criteria
    {'name': 'Luke Skywalker'},
    # values to return
    {'name': True,
     'hair_color': True,
     'skin_color': True,
     'eye_color': True
    }
)
[25]:
{'_id': ObjectId('5fac82d339937fab8d3c0872'),
 'name': 'Luke Skywalker',
 'hair_color': 'blond',
 'skin_color': 'fair',
 'eye_color': 'blue'}
[26]:
for p in people.find(
    # search criteria
    {},
    # values to return
    {'name': True,
     'hair_color': True,
     'skin_color': True,
     'eye_color': True
    }
):
    print(p)
{'_id': ObjectId('5fac82d339937fab8d3c0872'), 'name': 'Luke Skywalker', 'hair_color': 'blond', 'skin_color': 'fair', 'eye_color': 'blue'}
{'_id': ObjectId('5fac82d339937fab8d3c0873'), 'name': 'C-3PO', 'hair_color': 'n/a', 'skin_color': 'gold', 'eye_color': 'yellow'}
{'_id': ObjectId('5fac82d339937fab8d3c0874'), 'name': 'R2-D2', 'hair_color': 'n/a', 'skin_color': 'white, blue', 'eye_color': 'red'}
{'_id': ObjectId('5fac82d339937fab8d3c0875'), 'name': 'Darth Vader', 'hair_color': 'none', 'skin_color': 'white', 'eye_color': 'yellow'}
{'_id': ObjectId('5fac82d339937fab8d3c0876'), 'name': 'Leia Organa', 'hair_color': 'brown', 'skin_color': 'light', 'eye_color': 'brown'}
{'_id': ObjectId('5fac82d339937fab8d3c0877'), 'name': 'Owen Lars', 'hair_color': 'brown, grey', 'skin_color': 'light', 'eye_color': 'blue'}
{'_id': ObjectId('5fac82d339937fab8d3c0878'), 'name': 'Beru Whitesun lars', 'hair_color': 'brown', 'skin_color': 'light', 'eye_color': 'blue'}
{'_id': ObjectId('5fac82d339937fab8d3c0879'), 'name': 'R5-D4', 'hair_color': 'n/a', 'skin_color': 'white, red', 'eye_color': 'red'}
{'_id': ObjectId('5fac82d339937fab8d3c087a'), 'name': 'Biggs Darklighter', 'hair_color': 'black', 'skin_color': 'light', 'eye_color': 'brown'}
{'_id': ObjectId('5fac82d339937fab8d3c087b'), 'name': 'Obi-Wan Kenobi', 'hair_color': 'auburn, white', 'skin_color': 'fair', 'eye_color': 'blue-gray'}

Using object ID

Note that ObjectID is NOT a string. You must convert a string to ObjectID before use.

From the official docs, the ObjectID consists of

  • a 4-byte value representing the seconds since the Unix epoch,

  • a 5-byte random value, and

  • a 3-byte counter, starting with a random value.

In particular, note that sorting by ObjectIDs generated across different machines will give an approximate time ordering.

[27]:
result.inserted_ids[0]
[27]:
ObjectId('5fac82d339937fab8d3c0873')
[28]:
people.find_one(
    result.inserted_ids[0],
    {'name': True, 'hair_color': True, 'skin_color': True, 'eye_color': True}
)
[28]:
{'_id': ObjectId('5fac82d339937fab8d3c0873'),
 'name': 'C-3PO',
 'hair_color': 'n/a',
 'skin_color': 'gold',
 'eye_color': 'yellow'}

Bulk queries

The general find method returns a cursor, where each entry is a dictionary.

[29]:
for person in people.find(
    {'gender': 'male'}
):
    print(person['name'])
Luke Skywalker
Darth Vader
Owen Lars
Biggs Darklighter
Obi-Wan Kenobi

You can also explicitly define the projection.

[30]:
for x in people.find(
    {'gender': 'male'},
    {
        '_id': False,
        'name': True,
        'gender': True
    }
):
    pprint(x)
{'gender': 'male', 'name': 'Luke Skywalker'}
{'gender': 'male', 'name': 'Darth Vader'}
{'gender': 'male', 'name': 'Owen Lars'}
{'gender': 'male', 'name': 'Biggs Darklighter'}
{'gender': 'male', 'name': 'Obi-Wan Kenobi'}

Using relational operators

[34]:
for x in people.find(
    {
        'mass': {'$lt': 100},
    },
    {
        'name': True,
        'mass': True,
        '_id': False
    }
):
    pprint(x)
{'mass': 77, 'name': 'Luke Skywalker'}
{'mass': 75, 'name': 'C-3PO'}
{'mass': 32, 'name': 'R2-D2'}
{'mass': 49, 'name': 'Leia Organa'}
{'mass': 75, 'name': 'Beru Whitesun lars'}
{'mass': 32, 'name': 'R5-D4'}
{'mass': 84, 'name': 'Biggs Darklighter'}
{'mass': 77, 'name': 'Obi-Wan Kenobi'}
[35]:
mass_range = {'$lt': 100, '$gt': 50}
[36]:
for x in people.find(
    {
        'mass': mass_range,
    },
    {
        'name': True,
        'mass': True,
        '_id': False
    }
):
    pprint(x)
{'mass': 77, 'name': 'Luke Skywalker'}
{'mass': 75, 'name': 'C-3PO'}
{'mass': 75, 'name': 'Beru Whitesun lars'}
{'mass': 84, 'name': 'Biggs Darklighter'}
{'mass': 77, 'name': 'Obi-Wan Kenobi'}

Matching multiple criteria

This is quite subtle. By default, when matching on multiple criteria, the search is across items. Here Obi-Wan Kenobi is returned because each of the 3 conditions is matched by one or more of his starships, even though none of his starships match all 3 criteria.

[38]:
for x in people.find(
    {
        'starships.cost_in_credits': {'$lt': 250000},
        'starships.max_atmosphering_speed': {'$gt': 500},
        'starships.passengers': {'$gt': 0}
    },
    {
        'name': True,
        'starship.name': True,
        'starships.max_atmosphering_speed': True,
        'starships.passengers': True,
        'starships.cost_in_credits': True,
        '_id': False
    }
):
    pprint(x)
{'name': 'Luke Skywalker',
 'starships': [{'cost_in_credits': 149999,
                'max_atmosphering_speed': 1050,
                'passengers': 0},
               {'cost_in_credits': 240000,
                'max_atmosphering_speed': 850,
                'passengers': 20}]}
{'name': 'Obi-Wan Kenobi',
 'starships': [{'cost_in_credits': 180000,
                'max_atmosphering_speed': 1150,
                'passengers': 0},
               {'cost_in_credits': 125000000,
                'max_atmosphering_speed': 1050,
                'passengers': 48247},
               {'cost_in_credits': 'unknown',
                'max_atmosphering_speed': 1050,
                'passengers': 3},
               {'cost_in_credits': 320000,
                'max_atmosphering_speed': 1500,
                'passengers': 0},
               {'cost_in_credits': 168000,
                'max_atmosphering_speed': 1100,
                'passengers': 0}]}
[39]:
for x in people.find(
    {'name': 'Obi-Wan Kenobi'},
    {
        'starships.name': True,
        'starships.cost_in_credits': True,
        'starships.max_atmosphering_speed': True,
        'starships.passengers': True,
        '_id': False
    }
):
    pprint(x)
{'starships': [{'cost_in_credits': 180000,
                'max_atmosphering_speed': 1150,
                'name': 'Jedi starfighter',
                'passengers': 0},
               {'cost_in_credits': 125000000,
                'max_atmosphering_speed': 1050,
                'name': 'Trade Federation cruiser',
                'passengers': 48247},
               {'cost_in_credits': 'unknown',
                'max_atmosphering_speed': 1050,
                'name': 'Naboo star skiff',
                'passengers': 3},
               {'cost_in_credits': 320000,
                'max_atmosphering_speed': 1500,
                'name': 'Jedi Interceptor',
                'passengers': 0},
               {'cost_in_credits': 168000,
                'max_atmosphering_speed': 1100,
                'name': 'Belbullab-22 starfighter',
                'passengers': 0}]}

Matching multiple criteria simultaneously

To find someone with a starship that matches all 3 conditions, we need to use the elemMatch operator.

[40]:
for x in people.find(
    {
        'starships': {
            '$elemMatch': {
                'cost_in_credits': {'$lt': 250000},
                'max_atmosphering_speed': {'$gt': 500},
                'passengers': {'$gt': 1}
            }
        }
    },
    {
        'name': True,
        'starships.name': True,
        'starships.max_atmosphering_speed': True,
        'starships.passengers': True,
        'starships.cost_in_credits': True,
        '_id': False
    }
):
    pprint(x)
{'name': 'Luke Skywalker',
 'starships': [{'cost_in_credits': 149999,
                'max_atmosphering_speed': 1050,
                'name': 'X-wing',
                'passengers': 0},
               {'cost_in_credits': 240000,
                'max_atmosphering_speed': 850,
                'name': 'Imperial shuttle',
                'passengers': 20}]}

Aggregate Queries

[41]:
people.count_documents({'homeworld.name': 'Tatooine'})
[41]:
7
[42]:
people.distinct('homeworld.name')
[42]:
['Alderaan', 'Naboo', 'Stewjon', 'Tatooine']

Using aggregate

The aggregate function runs a pipeline of commands, and uses the $group operator to summarize results. Within the aggregate method, you assemble a pipeline of operations that is executed atomically.

Filter and count

[43]:
cmds = [
     {'$match': {'homeworld.name': 'Tatooine'}},
     {'$group': {'_id': '$homeworld.name',
                 'count': {'$sum': 1}}},
]
[44]:
for p in people.aggregate(cmds):
    pprint(p)
{'_id': 'Tatooine', 'count': 7}

Filter and find total mass

[45]:
cmds = [
     {'$match': {'homeworld.name': 'Tatooine'}},
     {'$group': {'_id': '$homeworld.name',
                 'total_mass': {'$sum': '$mass'}}},
]
[46]:
for p in people.aggregate(cmds):
    pprint(p)
{'_id': 'Tatooine', 'total_mass': 599}

Total mass of all members of a planet

[47]:
cmds = [
     {'$group': {'_id': '$homeworld.name',
                 'total_mass': {'$sum': '$mass'}}},
]
[48]:
for p in people.aggregate(cmds):
    pprint(p)
{'_id': 'Tatooine', 'total_mass': 599}
{'_id': 'Stewjon', 'total_mass': 77}
{'_id': 'Naboo', 'total_mass': 32}
{'_id': 'Alderaan', 'total_mass': 49}

Filter, project, group by, sorting.

[49]:
cmds = [
     {
         '$match': {
             'mass': {
                 '$lt': 100
                     }
         },
     },
     {
         '$group': {
             '_id': '$homeworld.name',
             'total_mass': {'$sum': '$mass'},
             'avg_mass': {'$avg': '$mass'}
         },
     },
     {
        '$sort': {
            'avg_mass': -1
        }
     }
]
[50]:
for p in people.aggregate(cmds):
    pprint(p)
{'_id': 'Stewjon', 'avg_mass': 77.0, 'total_mass': 77}
{'_id': 'Tatooine', 'avg_mass': 68.6, 'total_mass': 343}
{'_id': 'Alderaan', 'avg_mass': 49.0, 'total_mass': 49}
{'_id': 'Naboo', 'avg_mass': 32.0, 'total_mass': 32}

SQL equivalent (approximate)

SELECT species.name, AVG(mass) AS avg_mass, SUM(mass) AS total_mass
WHERE mass < 100
FROM people
JOIN species
ON people.species_id = species.species_id
GROUP BY species.name
ORDER BY avg_mass

Using MapReduce

With MapReduce you get the full power of JavaScript, but it is more complex and often less efficient. You should use aggregate in preference to map_reduce in most cases.

  • In the map stage, you create a (key, value) pair

  • In the reduce stage, you perform a reduction (e.g. sum) of the values associated with each key

Motivating Python example

[51]:
from functools import reduce
[52]:
eye_color = ['blue', 'blue', 'green', 'brown', 'grey', 'green', 'blue']
[53]:
res = [(x, 1) for x in eye_color]
res
[53]:
[('blue', 1),
 ('blue', 1),
 ('green', 1),
 ('brown', 1),
 ('grey', 1),
 ('green', 1),
 ('blue', 1)]
[54]:
d = {}
for k, v in res:
    d[k] = d.get(k, 0) + v
d
[54]:
{'blue': 3, 'green': 2, 'brown': 1, 'grey': 1}

Map-reduce example in Mongo

[55]:
from bson.code import Code

Count the number by eye_color

[56]:
mapper = Code('''
function() {
    emit(this.eye_color, 1);
}
''')

reducer = Code('''
function (key, values) {
    var total = 0;
    for (var i = 0; i < values.length; i++) {
        total += values[i];
    }
    return total;
}
''')

result = people.map_reduce(
    mapper,
    reducer,
    'result1'
)
[57]:
for doc in result.find():
    pprint(doc)
{'_id': 'blue', 'value': 3.0}
{'_id': 'blue-gray', 'value': 1.0}
{'_id': 'brown', 'value': 2.0}
{'_id': 'red', 'value': 2.0}
{'_id': 'yellow', 'value': 2.0}

The output is also stored in the result1 collection we specified.

[58]:
list(db.result1.find())
[58]:
[{'_id': 'blue', 'value': 3.0},
 {'_id': 'blue-gray', 'value': 1.0},
 {'_id': 'brown', 'value': 2.0},
 {'_id': 'red', 'value': 2.0},
 {'_id': 'yellow', 'value': 2.0}]

Using JavaScript Array functions to simplify code.

[59]:
mapper = Code('''
function() {
    emit(this.eye_color, 1);
}
''')

reducer = Code('''
function (key, values) {
    return Array.sum(values);
}
''')

result = people.map_reduce(
    mapper,
    reducer,
    'result2'
)
[60]:
for doc in result.find():
    pprint(doc)
{'_id': 'blue', 'value': 3.0}
{'_id': 'blue-gray', 'value': 1.0}
{'_id': 'brown', 'value': 2.0}
{'_id': 'red', 'value': 2.0}
{'_id': 'yellow', 'value': 2.0}

Find avergae mass by gender.

[61]:
mapper = Code('''
function() {
    emit(this.gender, this.mass);
}
''')

reducer = Code('''
function (key, values) {
    return Array.avg(values);
}
''')

result = people.map_reduce(
    mapper,
    reducer,
    'result3'
)
[62]:
for doc in result.find():
    pprint(doc)
{'_id': 'female', 'value': 62.0}
{'_id': 'male', 'value': 98.8}
{'_id': 'n/a', 'value': 46.333333333333336}

Count number of members in each species

[63]:
mapper = Code('''
function() {
    this.species.map(function(z) {
      emit(z.name, 1);
    })
}
''')

reducer = Code('''
function (key, values) {
    return Array.sum(values);
}
''')

result = people.map_reduce(
    mapper,
    reducer,
    'result3'
)
[64]:
for doc in result.find():
    pprint(doc)
{'_id': None, 'value': 3.0}

Using the aggregate method

See if you can convert the above MapReduce queries to aggregate method calls. An example is provided.

[65]:
cmds = [
    {
         '$group': {
             '_id': '$eye_color',
             'count': {'$sum': 1},
         },
     },
     {
        '$sort': {
            '_id': 1
        }
     }
]
[66]:
for p in people.aggregate(cmds):
    pprint(p)
{'_id': 'blue', 'count': 3}
{'_id': 'blue-gray', 'count': 1}
{'_id': 'brown', 'count': 2}
{'_id': 'red', 'count': 2}
{'_id': 'yellow', 'count': 2}

Geospatial queries

You specify queries using GeoJSON Objects

  • Point

  • LineString

  • Polygon

  • MultiPoint

  • MultiLineString

  • MultiPolygon

  • GeometryCollection

[67]:
crime = db.crime
[68]:
import json
[69]:
path = 'data/crime-mapping.geojson'

with open(path) as f:
    datastore = json.load(f)
[70]:
results = crime.insert_many(datastore['features'])
[71]:
crime.find_one({})
[71]:
{'_id': ObjectId('5fac82d439937fab8d3c087c'),
 'geometry': {'type': 'Point', 'coordinates': [-78.78200313, 35.760212065]},
 'type': 'Feature',
 'properties': {'ucr': '2650',
  'domestic': 'N',
  'period': ['Everything', 'Last Year'],
  'street': 'KILDAIRE FARM RD',
  'radio': 'Everything,Last Year',
  'time_to': -62135553600,
  'crime_type': 'ALL OTHER - ESCAPE FROM CUSTODY OR RESIST ARREST',
  'district': 'D3',
  'phxrecordstatus': None,
  'lon': -78.78200313,
  'timeframe': ['Last Year'],
  'crimeday': 'THURSDAY',
  'phxstatus': None,
  'location_category': 'TOWN OWNED',
  'violentproperty': 'All Other',
  'residential_subdivision': 'SHOPPES OF KILDAIRE',
  'offensecategory': 'All Other Offenses',
  'chrgcnt': None,
  'time_from': -62135553600,
  'map_reference': 'P027',
  'date_to': '11/30/2017',
  'lat': 35.760212065,
  'phxcommunity': 'No',
  'crime_category': 'ALL OTHER',
  'activity_date': None,
  'beat_number': '112',
  'record': 3145,
  'incident_number': '17010528',
  'apartment_complex': None,
  'date_from': '2017-11-30'}}
[72]:
crime.find_one({},
              {
                  'geometry': 1,
                  '_id': 0,
              }
              )
[72]:
{'geometry': {'type': 'Point', 'coordinates': [-78.78200313, 35.760212065]}}
[73]:
crime.create_index([('geometry', GEOSPHERE)])
[73]:
'geometry_2dsphere'

List 5 crimes near the location

[74]:
loc = SON([('type', 'Point'), ('coordinates', [-78.78200313, 35.760212065])])

for doc in crime.find(
    {
        'geometry' : SON([('$near', {'$geometry' : loc})])
    },
    {
        '_id': 0,
        'properties.crime_type': 1,
        'properties.date_from': 1
    }
).limit(5):
    pprint(doc)
{'properties': {'crime_type': 'ALL OTHER - ESCAPE FROM CUSTODY OR RESIST '
                              'ARREST',
                'date_from': '2017-11-30'}}
{'properties': {'crime_type': 'LARCENY - AUTO PARTS OR ACCESSORIES',
                'date_from': '2018-03-20'}}
{'properties': {'crime_type': 'COUNTERFEITING - USING',
                'date_from': '2018-08-05'}}
{'properties': {'crime_type': 'DRUGS - DRUG VIOLATIONS '
                              '(POSS./SELL/MAN./DEL./TRNSPRT/CULT.)',
                'date_from': '2017-11-30'}}
{'properties': {'crime_type': 'VANDALISM - DAMAGE TO PROPERTY',
                'date_from': '2018-03-26'}}

List crimes committed nearby (within 200 m)

[75]:
loc = SON([('type', 'Point'), ('coordinates', [-78.78200313, 35.760212065])])

for doc in crime.find(
    {
        'geometry' : SON([('$geoNear', {'$geometry' : loc, '$minDistance': 1e-6, '$maxDistance': 200})]),
    },
    {
        '_id': 0,
        'geometry.coordinates': 1,
        'properties.crime_type': 1,
        'properties.date_from': 1
    }
):
    pprint(doc)
{'geometry': {'coordinates': [-78.78102423, 35.7607323]},
 'properties': {'crime_type': 'ASSAULT - SIMPLE - ALL OTHER',
                'date_from': '2018-02-14'}}
{'geometry': {'coordinates': [-78.78131931, 35.761138061]},
 'properties': {'crime_type': 'VANDALISM - GRAFFITI',
                'date_from': '2018-07-20'}}
{'geometry': {'coordinates': [-78.7827814, 35.759087052]},
 'properties': {'crime_type': 'VANDALISM - GRAFFITI',
                'date_from': '2018-07-29'}}

Indexes

Just as with relational databases, you can add indexes to speed up search. Note that while reads become faster, writes become slower. There is always a trade-off.

[76]:
people.find({}).explain
[76]:
<bound method Cursor.explain of <pymongo.cursor.Cursor object at 0x111dfe670>>
[77]:
people.find({'name': 'Luke Skywalker'}).explain()
[77]:
{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'starwars.people',
  'indexFilterSet': False,
  'parsedQuery': {'name': {'$eq': 'Luke Skywalker'}},
  'winningPlan': {'stage': 'COLLSCAN',
   'filter': {'name': {'$eq': 'Luke Skywalker'}},
   'direction': 'forward'},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 1,
  'executionTimeMillis': 0,
  'totalKeysExamined': 0,
  'totalDocsExamined': 10,
  'executionStages': {'stage': 'COLLSCAN',
   'filter': {'name': {'$eq': 'Luke Skywalker'}},
   'nReturned': 1,
   'executionTimeMillisEstimate': 0,
   'works': 12,
   'advanced': 1,
   'needTime': 10,
   'needYield': 0,
   'saveState': 0,
   'restoreState': 0,
   'isEOF': 1,
   'direction': 'forward',
   'docsExamined': 10},
  'allPlansExecution': []},
 'serverInfo': {'host': 'Cliburns-MacBook-Pro.local',
  'port': 27017,
  'version': '4.2.3',
  'gitVersion': '6874650b362138df74be53d366bbefc321ea32d4'},
 'ok': 1.0}
[78]:
people.create_index('name')
[78]:
'name_1'
[79]:
people.find({'name': 'Luke Skywalker'}).explain()
[79]:
{'queryPlanner': {'plannerVersion': 1,
  'namespace': 'starwars.people',
  'indexFilterSet': False,
  'parsedQuery': {'name': {'$eq': 'Luke Skywalker'}},
  'winningPlan': {'stage': 'FETCH',
   'inputStage': {'stage': 'IXSCAN',
    'keyPattern': {'name': 1},
    'indexName': 'name_1',
    'isMultiKey': False,
    'multiKeyPaths': {'name': []},
    'isUnique': False,
    'isSparse': False,
    'isPartial': False,
    'indexVersion': 2,
    'direction': 'forward',
    'indexBounds': {'name': ['["Luke Skywalker", "Luke Skywalker"]']}}},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 1,
  'executionTimeMillis': 1,
  'totalKeysExamined': 1,
  'totalDocsExamined': 1,
  'executionStages': {'stage': 'FETCH',
   'nReturned': 1,
   'executionTimeMillisEstimate': 0,
   'works': 2,
   'advanced': 1,
   'needTime': 0,
   'needYield': 0,
   'saveState': 0,
   'restoreState': 0,
   'isEOF': 1,
   'docsExamined': 1,
   'alreadyHasObj': 0,
   'inputStage': {'stage': 'IXSCAN',
    'nReturned': 1,
    'executionTimeMillisEstimate': 0,
    'works': 2,
    'advanced': 1,
    'needTime': 0,
    'needYield': 0,
    'saveState': 0,
    'restoreState': 0,
    'isEOF': 1,
    'keyPattern': {'name': 1},
    'indexName': 'name_1',
    'isMultiKey': False,
    'multiKeyPaths': {'name': []},
    'isUnique': False,
    'isSparse': False,
    'isPartial': False,
    'indexVersion': 2,
    'direction': 'forward',
    'indexBounds': {'name': ['["Luke Skywalker", "Luke Skywalker"]']},
    'keysExamined': 1,
    'seeks': 1,
    'dupsTested': 0,
    'dupsDropped': 0}},
  'allPlansExecution': []},
 'serverInfo': {'host': 'Cliburns-MacBook-Pro.local',
  'port': 27017,
  'version': '4.2.3',
  'gitVersion': '6874650b362138df74be53d366bbefc321ea32d4'},
 'ok': 1.0}