In [1]:
import numpy as np
import pandas as pd
from timeit import timeit

Enhancing performance

We first look at native code compilation. Here we show 3 common methods for doing this using numba JIT compilation, cython AOT compilation, and direct wrapping of C++ code using pybind11. In general, numba is the simplest to use, while you have the most flexibility with pybind11. Which approach gives the best performance generally requires some experimentation.

Then we review common methods for concurrent execution of embarrassingly parallel code using multiprocessing, concurrent.futures and joblib. Comparison of performance using processes and threads is made, with a brief explanation of the Global Interpreter Lock (GIL).

More details for each of the libraries used to improve performance is provided in the course notebooks.

Python

In [2]:
def cdist(xs, ys):
    """Returns pairwise distance between row vectors in xs and ys.

    xs has shape (m, p)
    ys has shape (n, p)

    Return value has shape (m, n)
    """

    m, p = xs.shape
    n, p = ys.shape

    res = np.empty((m, n))
    for i in range(m):
        for j in range(n):
            res[i, j] = np.sqrt(np.sum((ys[j] - xs[i])**2))
    return res

Sanity check

In [3]:
xs = np.arange(6).reshape(-1,2).astype('float')
ys = np.arange(4).reshape(-1, 2).astype('float')
zs = cdist(xs, ys)
In [4]:
zs
Out[4]:
array([[0.        , 2.82842712],
       [2.82842712, 0.        ],
       [5.65685425, 2.82842712]])
In [5]:
%timeit -r 3 -n 10 cdist(xs, ys)
60.7 µs ± 1.39 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)
In [6]:
m = 1000
n = 1000
p = 100

X = np.random.random((m, p))
Y = np.random.random((n, p))
In [7]:
%%time

Z = cdist(X, Y)
CPU times: user 9.46 s, sys: 20 ms, total: 9.48 s
Wall time: 9.48 s
In [8]:
t0 = timeit(lambda : cdist(X, Y), number=1)

Using numba

In [9]:
from numba import jit, njit
In [10]:
@njit
def cdist_numba(xs, ys):
    """Returns pairwise distance between row vectors in xs and ys.

    xs has shape (m, p)
    ys has shape (n, p)

    Return value has shape (m, n)
    """

    m, p = xs.shape
    n, p = ys.shape

    res = np.empty((m, n))
    for i in range(m):
        for j in range(n):
            res[i, j] = np.sqrt(np.sum((ys[j] - xs[i])**2))
    return res

Check

In [11]:
assert(np.allclose(cdist(xs, ys), cdist_numba(xs, ys)))
In [12]:
%%time

Z = cdist_numba(X, Y)
CPU times: user 280 ms, sys: 8 ms, total: 288 ms
Wall time: 287 ms
In [13]:
t_numba = timeit(lambda : cdist_numba(X, Y), number=1)

Unrolling

We can help numba by unrolling the code.

In [14]:
@njit
def cdist_numba1(xs, ys):
    """Returns pairwise distance between row vectors in xs and ys.

    xs has shape (m, p)
    ys has shape (n, p)

    Return value has shape (m, n)
    """

    m, p = xs.shape
    n, p = ys.shape

    res = np.empty((m, n))
    for i in range(m):
        for j in range(n):
            s = 0
            for k in range(p):
                s += (ys[j,k] - xs[i,k])**2
            res[i, j] = np.sqrt(s)
    return res

Check

In [15]:
assert(np.allclose(cdist(xs, ys), cdist_numba1(xs, ys)))
In [16]:
%%time

Z = cdist_numba1(X, Y)
CPU times: user 112 ms, sys: 0 ns, total: 112 ms
Wall time: 111 ms
In [17]:
t_numba1 = timeit(lambda : cdist_numba1(X, Y), number=1)

Using cython

In [18]:
%load_ext cython
In [19]:
%%cython -a

import numpy as np

def cdist_cython(xs, ys):
    """Returns pairwise distance between row vectors in xs and ys.

    xs has shape (m, p)
    ys has shape (n, p)

    Return value has shape (m, n)
    """

    m, p = xs.shape
    n, p = ys.shape

    res = np.empty((m, n))
    for i in range(m):
        for j in range(n):
            res[i, j] = np.sqrt(np.sum((ys[j] - xs[i])**2))
    return res
Out[19]:
Cython: _cython_magic_e0260da0fe5bce3ea805711e620fd9f5.pyx

Generated by Cython 0.28.5

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

 01: 
+02: import numpy as np
  __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
 03: 
+04: def cdist_cython(xs, ys):
/* Python wrapper */
static PyObject *__pyx_pw_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_1cdist_cython(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
static char __pyx_doc_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_cdist_cython[] = "Returns pairwise distance between row vectors in xs and ys.\n    \n    xs has shape (m, p)\n    ys has shape (n, p)\n    \n    Return value has shape (m, n)    \n    ";
static PyMethodDef __pyx_mdef_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_1cdist_cython = {"cdist_cython", (PyCFunction)__pyx_pw_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_1cdist_cython, METH_VARARGS|METH_KEYWORDS, __pyx_doc_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_cdist_cython};
static PyObject *__pyx_pw_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_1cdist_cython(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
  PyObject *__pyx_v_xs = 0;
  PyObject *__pyx_v_ys = 0;
  PyObject *__pyx_r = 0;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("cdist_cython (wrapper)", 0);
  {
    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_xs,&__pyx_n_s_ys,0};
    PyObject* values[2] = {0,0};
    if (unlikely(__pyx_kwds)) {
      Py_ssize_t kw_args;
      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
      switch (pos_args) {
        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
        CYTHON_FALLTHROUGH;
        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
        CYTHON_FALLTHROUGH;
        case  0: break;
        default: goto __pyx_L5_argtuple_error;
      }
      kw_args = PyDict_Size(__pyx_kwds);
      switch (pos_args) {
        case  0:
        if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_xs)) != 0)) kw_args--;
        else goto __pyx_L5_argtuple_error;
        CYTHON_FALLTHROUGH;
        case  1:
        if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ys)) != 0)) kw_args--;
        else {
          __Pyx_RaiseArgtupleInvalid("cdist_cython", 1, 2, 2, 1); __PYX_ERR(0, 4, __pyx_L3_error)
        }
      }
      if (unlikely(kw_args > 0)) {
        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "cdist_cython") < 0)) __PYX_ERR(0, 4, __pyx_L3_error)
      }
    } else if (PyTuple_GET_SIZE(__pyx_args) != 2) {
      goto __pyx_L5_argtuple_error;
    } else {
      values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
      values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
    }
    __pyx_v_xs = values[0];
    __pyx_v_ys = values[1];
  }
  goto __pyx_L4_argument_unpacking_done;
  __pyx_L5_argtuple_error:;
  __Pyx_RaiseArgtupleInvalid("cdist_cython", 1, 2, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 4, __pyx_L3_error)
  __pyx_L3_error:;
  __Pyx_AddTraceback("_cython_magic_e0260da0fe5bce3ea805711e620fd9f5.cdist_cython", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __Pyx_RefNannyFinishContext();
  return NULL;
  __pyx_L4_argument_unpacking_done:;
  __pyx_r = __pyx_pf_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_cdist_cython(__pyx_self, __pyx_v_xs, __pyx_v_ys);

  /* function exit code */
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}

static PyObject *__pyx_pf_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_cdist_cython(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_xs, PyObject *__pyx_v_ys) {
  PyObject *__pyx_v_m = NULL;
  CYTHON_UNUSED PyObject *__pyx_v_p = NULL;
  PyObject *__pyx_v_n = NULL;
  PyObject *__pyx_v_res = NULL;
  PyObject *__pyx_v_i = NULL;
  PyObject *__pyx_v_j = NULL;
  PyObject *__pyx_r = NULL;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("cdist_cython", 0);
/* … */
  /* function exit code */
  __pyx_L1_error:;
  __Pyx_XDECREF(__pyx_t_1);
  __Pyx_XDECREF(__pyx_t_2);
  __Pyx_XDECREF(__pyx_t_3);
  __Pyx_XDECREF(__pyx_t_4);
  __Pyx_XDECREF(__pyx_t_6);
  __Pyx_XDECREF(__pyx_t_11);
  __Pyx_XDECREF(__pyx_t_12);
  __Pyx_XDECREF(__pyx_t_13);
  __Pyx_XDECREF(__pyx_t_14);
  __Pyx_AddTraceback("_cython_magic_e0260da0fe5bce3ea805711e620fd9f5.cdist_cython", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __pyx_r = NULL;
  __pyx_L0:;
  __Pyx_XDECREF(__pyx_v_m);
  __Pyx_XDECREF(__pyx_v_p);
  __Pyx_XDECREF(__pyx_v_n);
  __Pyx_XDECREF(__pyx_v_res);
  __Pyx_XDECREF(__pyx_v_i);
  __Pyx_XDECREF(__pyx_v_j);
  __Pyx_XGIVEREF(__pyx_r);
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}
/* … */
  __pyx_tuple_ = PyTuple_Pack(8, __pyx_n_s_xs, __pyx_n_s_ys, __pyx_n_s_m, __pyx_n_s_p, __pyx_n_s_n, __pyx_n_s_res, __pyx_n_s_i, __pyx_n_s_j); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 4, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_tuple_);
  __Pyx_GIVEREF(__pyx_tuple_);
/* … */
  __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_1cdist_cython, NULL, __pyx_n_s_cython_magic_e0260da0fe5bce3ea8); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 4, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_cdist_cython, __pyx_t_1) < 0) __PYX_ERR(0, 4, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
 05:     """Returns pairwise distance between row vectors in xs and ys.
 06:     
 07:     xs has shape (m, p)
 08:     ys has shape (n, p)
 09:     
 10:     Return value has shape (m, n)    
 11:     """
 12: 
+13:     m, p = xs.shape
  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_xs, __pyx_n_s_shape); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 13, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if ((likely(PyTuple_CheckExact(__pyx_t_1))) || (PyList_CheckExact(__pyx_t_1))) {
    PyObject* sequence = __pyx_t_1;
    Py_ssize_t size = __Pyx_PySequence_SIZE(sequence);
    if (unlikely(size != 2)) {
      if (size > 2) __Pyx_RaiseTooManyValuesError(2);
      else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size);
      __PYX_ERR(0, 13, __pyx_L1_error)
    }
    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
    if (likely(PyTuple_CheckExact(sequence))) {
      __pyx_t_2 = PyTuple_GET_ITEM(sequence, 0);
      __pyx_t_3 = PyTuple_GET_ITEM(sequence, 1);
    } else {
      __pyx_t_2 = PyList_GET_ITEM(sequence, 0);
      __pyx_t_3 = PyList_GET_ITEM(sequence, 1);
    }
    __Pyx_INCREF(__pyx_t_2);
    __Pyx_INCREF(__pyx_t_3);
    #else
    __pyx_t_2 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 13, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
    __pyx_t_3 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 13, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_3);
    #endif
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  } else {
    Py_ssize_t index = -1;
    __pyx_t_4 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 13, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_4);
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
    __pyx_t_5 = Py_TYPE(__pyx_t_4)->tp_iternext;
    index = 0; __pyx_t_2 = __pyx_t_5(__pyx_t_4); if (unlikely(!__pyx_t_2)) goto __pyx_L3_unpacking_failed;
    __Pyx_GOTREF(__pyx_t_2);
    index = 1; __pyx_t_3 = __pyx_t_5(__pyx_t_4); if (unlikely(!__pyx_t_3)) goto __pyx_L3_unpacking_failed;
    __Pyx_GOTREF(__pyx_t_3);
    if (__Pyx_IternextUnpackEndCheck(__pyx_t_5(__pyx_t_4), 2) < 0) __PYX_ERR(0, 13, __pyx_L1_error)
    __pyx_t_5 = NULL;
    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
    goto __pyx_L4_unpacking_done;
    __pyx_L3_unpacking_failed:;
    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
    __pyx_t_5 = NULL;
    if (__Pyx_IterFinish() == 0) __Pyx_RaiseNeedMoreValuesError(index);
    __PYX_ERR(0, 13, __pyx_L1_error)
    __pyx_L4_unpacking_done:;
  }
  __pyx_v_m = __pyx_t_2;
  __pyx_t_2 = 0;
  __pyx_v_p = __pyx_t_3;
  __pyx_t_3 = 0;
+14:     n, p = ys.shape
  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_ys, __pyx_n_s_shape); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 14, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if ((likely(PyTuple_CheckExact(__pyx_t_1))) || (PyList_CheckExact(__pyx_t_1))) {
    PyObject* sequence = __pyx_t_1;
    Py_ssize_t size = __Pyx_PySequence_SIZE(sequence);
    if (unlikely(size != 2)) {
      if (size > 2) __Pyx_RaiseTooManyValuesError(2);
      else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size);
      __PYX_ERR(0, 14, __pyx_L1_error)
    }
    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
    if (likely(PyTuple_CheckExact(sequence))) {
      __pyx_t_3 = PyTuple_GET_ITEM(sequence, 0);
      __pyx_t_2 = PyTuple_GET_ITEM(sequence, 1);
    } else {
      __pyx_t_3 = PyList_GET_ITEM(sequence, 0);
      __pyx_t_2 = PyList_GET_ITEM(sequence, 1);
    }
    __Pyx_INCREF(__pyx_t_3);
    __Pyx_INCREF(__pyx_t_2);
    #else
    __pyx_t_3 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 14, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_3);
    __pyx_t_2 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 14, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
    #endif
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  } else {
    Py_ssize_t index = -1;
    __pyx_t_4 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 14, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_4);
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
    __pyx_t_5 = Py_TYPE(__pyx_t_4)->tp_iternext;
    index = 0; __pyx_t_3 = __pyx_t_5(__pyx_t_4); if (unlikely(!__pyx_t_3)) goto __pyx_L5_unpacking_failed;
    __Pyx_GOTREF(__pyx_t_3);
    index = 1; __pyx_t_2 = __pyx_t_5(__pyx_t_4); if (unlikely(!__pyx_t_2)) goto __pyx_L5_unpacking_failed;
    __Pyx_GOTREF(__pyx_t_2);
    if (__Pyx_IternextUnpackEndCheck(__pyx_t_5(__pyx_t_4), 2) < 0) __PYX_ERR(0, 14, __pyx_L1_error)
    __pyx_t_5 = NULL;
    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
    goto __pyx_L6_unpacking_done;
    __pyx_L5_unpacking_failed:;
    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
    __pyx_t_5 = NULL;
    if (__Pyx_IterFinish() == 0) __Pyx_RaiseNeedMoreValuesError(index);
    __PYX_ERR(0, 14, __pyx_L1_error)
    __pyx_L6_unpacking_done:;
  }
  __pyx_v_n = __pyx_t_3;
  __pyx_t_3 = 0;
  __Pyx_DECREF_SET(__pyx_v_p, __pyx_t_2);
  __pyx_t_2 = 0;
 15: 
+16:     res = np.empty((m, n))
  __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_empty); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 16, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __Pyx_INCREF(__pyx_v_m);
  __Pyx_GIVEREF(__pyx_v_m);
  PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_v_m);
  __Pyx_INCREF(__pyx_v_n);
  __Pyx_GIVEREF(__pyx_v_n);
  PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_v_n);
  __pyx_t_4 = NULL;
  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) {
    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3);
    if (likely(__pyx_t_4)) {
      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3);
      __Pyx_INCREF(__pyx_t_4);
      __Pyx_INCREF(function);
      __Pyx_DECREF_SET(__pyx_t_3, function);
    }
  }
  if (!__pyx_t_4) {
    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error)
    __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
    __Pyx_GOTREF(__pyx_t_1);
  } else {
    #if CYTHON_FAST_PYCALL
    if (PyFunction_Check(__pyx_t_3)) {
      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_2};
      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error)
      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
      __Pyx_GOTREF(__pyx_t_1);
      __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
    } else
    #endif
    #if CYTHON_FAST_PYCCALL
    if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) {
      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_2};
      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error)
      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
      __Pyx_GOTREF(__pyx_t_1);
      __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
    } else
    #endif
    {
      __pyx_t_6 = PyTuple_New(1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 16, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_6);
      __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); __pyx_t_4 = NULL;
      __Pyx_GIVEREF(__pyx_t_2);
      PyTuple_SET_ITEM(__pyx_t_6, 0+1, __pyx_t_2);
      __pyx_t_2 = 0;
      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_1);
      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
    }
  }
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __pyx_v_res = __pyx_t_1;
  __pyx_t_1 = 0;
+17:     for i in range(m):
  __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_range, __pyx_v_m); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) {
    __pyx_t_3 = __pyx_t_1; __Pyx_INCREF(__pyx_t_3); __pyx_t_7 = 0;
    __pyx_t_8 = NULL;
  } else {
    __pyx_t_7 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 17, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_3);
    __pyx_t_8 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 17, __pyx_L1_error)
  }
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  for (;;) {
    if (likely(!__pyx_t_8)) {
      if (likely(PyList_CheckExact(__pyx_t_3))) {
        if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_3)) break;
        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
        __pyx_t_1 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_1); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 17, __pyx_L1_error)
        #else
        __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 17, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_1);
        #endif
      } else {
        if (__pyx_t_7 >= PyTuple_GET_SIZE(__pyx_t_3)) break;
        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
        __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_1); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 17, __pyx_L1_error)
        #else
        __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 17, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_1);
        #endif
      }
    } else {
      __pyx_t_1 = __pyx_t_8(__pyx_t_3);
      if (unlikely(!__pyx_t_1)) {
        PyObject* exc_type = PyErr_Occurred();
        if (exc_type) {
          if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear();
          else __PYX_ERR(0, 17, __pyx_L1_error)
        }
        break;
      }
      __Pyx_GOTREF(__pyx_t_1);
    }
    __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_1);
    __pyx_t_1 = 0;
/* … */
  }
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+18:         for j in range(n):
    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_range, __pyx_v_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_1);
    if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) {
      __pyx_t_6 = __pyx_t_1; __Pyx_INCREF(__pyx_t_6); __pyx_t_9 = 0;
      __pyx_t_10 = NULL;
    } else {
      __pyx_t_9 = -1; __pyx_t_6 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 18, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_6);
      __pyx_t_10 = Py_TYPE(__pyx_t_6)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 18, __pyx_L1_error)
    }
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
    for (;;) {
      if (likely(!__pyx_t_10)) {
        if (likely(PyList_CheckExact(__pyx_t_6))) {
          if (__pyx_t_9 >= PyList_GET_SIZE(__pyx_t_6)) break;
          #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
          __pyx_t_1 = PyList_GET_ITEM(__pyx_t_6, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 18, __pyx_L1_error)
          #else
          __pyx_t_1 = PySequence_ITEM(__pyx_t_6, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error)
          __Pyx_GOTREF(__pyx_t_1);
          #endif
        } else {
          if (__pyx_t_9 >= PyTuple_GET_SIZE(__pyx_t_6)) break;
          #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
          __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_6, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 18, __pyx_L1_error)
          #else
          __pyx_t_1 = PySequence_ITEM(__pyx_t_6, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error)
          __Pyx_GOTREF(__pyx_t_1);
          #endif
        }
      } else {
        __pyx_t_1 = __pyx_t_10(__pyx_t_6);
        if (unlikely(!__pyx_t_1)) {
          PyObject* exc_type = PyErr_Occurred();
          if (exc_type) {
            if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear();
            else __PYX_ERR(0, 18, __pyx_L1_error)
          }
          break;
        }
        __Pyx_GOTREF(__pyx_t_1);
      }
      __Pyx_XDECREF_SET(__pyx_v_j, __pyx_t_1);
      __pyx_t_1 = 0;
/* … */
    }
    __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+19:             res[i, j] = np.sqrt(np.sum((ys[j] - xs[i])**2))
      __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 19, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_2);
      __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_sqrt); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 19, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_4);
      __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
      __pyx_t_11 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 19, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_11);
      __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_t_11, __pyx_n_s_sum); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 19, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_12);
      __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0;
      __pyx_t_11 = __Pyx_PyObject_GetItem(__pyx_v_ys, __pyx_v_j); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 19, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_11);
      __pyx_t_13 = __Pyx_PyObject_GetItem(__pyx_v_xs, __pyx_v_i); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 19, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_13);
      __pyx_t_14 = PyNumber_Subtract(__pyx_t_11, __pyx_t_13); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 19, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_14);
      __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0;
      __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0;
      __pyx_t_13 = PyNumber_Power(__pyx_t_14, __pyx_int_2, Py_None); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 19, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_13);
      __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0;
      __pyx_t_14 = NULL;
      if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_12))) {
        __pyx_t_14 = PyMethod_GET_SELF(__pyx_t_12);
        if (likely(__pyx_t_14)) {
          PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_12);
          __Pyx_INCREF(__pyx_t_14);
          __Pyx_INCREF(function);
          __Pyx_DECREF_SET(__pyx_t_12, function);
        }
      }
      if (!__pyx_t_14) {
        __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_12, __pyx_t_13); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 19, __pyx_L1_error)
        __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0;
        __Pyx_GOTREF(__pyx_t_2);
      } else {
        #if CYTHON_FAST_PYCALL
        if (PyFunction_Check(__pyx_t_12)) {
          PyObject *__pyx_temp[2] = {__pyx_t_14, __pyx_t_13};
          __pyx_t_2 = __Pyx_PyFunction_FastCall(__pyx_t_12, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 19, __pyx_L1_error)
          __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0;
          __Pyx_GOTREF(__pyx_t_2);
          __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0;
        } else
        #endif
        #if CYTHON_FAST_PYCCALL
        if (__Pyx_PyFastCFunction_Check(__pyx_t_12)) {
          PyObject *__pyx_temp[2] = {__pyx_t_14, __pyx_t_13};
          __pyx_t_2 = __Pyx_PyCFunction_FastCall(__pyx_t_12, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 19, __pyx_L1_error)
          __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0;
          __Pyx_GOTREF(__pyx_t_2);
          __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0;
        } else
        #endif
        {
          __pyx_t_11 = PyTuple_New(1+1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 19, __pyx_L1_error)
          __Pyx_GOTREF(__pyx_t_11);
          __Pyx_GIVEREF(__pyx_t_14); PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_14); __pyx_t_14 = NULL;
          __Pyx_GIVEREF(__pyx_t_13);
          PyTuple_SET_ITEM(__pyx_t_11, 0+1, __pyx_t_13);
          __pyx_t_13 = 0;
          __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_12, __pyx_t_11, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 19, __pyx_L1_error)
          __Pyx_GOTREF(__pyx_t_2);
          __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0;
        }
      }
      __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;
      __pyx_t_12 = NULL;
      if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_4))) {
        __pyx_t_12 = PyMethod_GET_SELF(__pyx_t_4);
        if (likely(__pyx_t_12)) {
          PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4);
          __Pyx_INCREF(__pyx_t_12);
          __Pyx_INCREF(function);
          __Pyx_DECREF_SET(__pyx_t_4, function);
        }
      }
      if (!__pyx_t_12) {
        __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error)
        __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
        __Pyx_GOTREF(__pyx_t_1);
      } else {
        #if CYTHON_FAST_PYCALL
        if (PyFunction_Check(__pyx_t_4)) {
          PyObject *__pyx_temp[2] = {__pyx_t_12, __pyx_t_2};
          __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error)
          __Pyx_XDECREF(__pyx_t_12); __pyx_t_12 = 0;
          __Pyx_GOTREF(__pyx_t_1);
          __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
        } else
        #endif
        #if CYTHON_FAST_PYCCALL
        if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) {
          PyObject *__pyx_temp[2] = {__pyx_t_12, __pyx_t_2};
          __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error)
          __Pyx_XDECREF(__pyx_t_12); __pyx_t_12 = 0;
          __Pyx_GOTREF(__pyx_t_1);
          __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
        } else
        #endif
        {
          __pyx_t_11 = PyTuple_New(1+1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 19, __pyx_L1_error)
          __Pyx_GOTREF(__pyx_t_11);
          __Pyx_GIVEREF(__pyx_t_12); PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_12); __pyx_t_12 = NULL;
          __Pyx_GIVEREF(__pyx_t_2);
          PyTuple_SET_ITEM(__pyx_t_11, 0+1, __pyx_t_2);
          __pyx_t_2 = 0;
          __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_11, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error)
          __Pyx_GOTREF(__pyx_t_1);
          __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0;
        }
      }
      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
      __pyx_t_4 = PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 19, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_4);
      __Pyx_INCREF(__pyx_v_i);
      __Pyx_GIVEREF(__pyx_v_i);
      PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_v_i);
      __Pyx_INCREF(__pyx_v_j);
      __Pyx_GIVEREF(__pyx_v_j);
      PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_v_j);
      if (unlikely(PyObject_SetItem(__pyx_v_res, __pyx_t_4, __pyx_t_1) < 0)) __PYX_ERR(0, 19, __pyx_L1_error)
      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
      __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+20:     return res
  __Pyx_XDECREF(__pyx_r);
  __Pyx_INCREF(__pyx_v_res);
  __pyx_r = __pyx_v_res;
  goto __pyx_L0;

Check

In [20]:
assert(np.allclose(cdist(xs, ys), cdist_cython(xs, ys)))
In [21]:
%%time

Z = cdist_cython(X, Y)
CPU times: user 9.88 s, sys: 8 ms, total: 9.88 s
Wall time: 9.88 s
In [22]:
t_cython = timeit(lambda : cdist_cython(X, Y), number=1)
In [23]:
%%cython -a

import cython
import numpy as np
from libc.math cimport sqrt, pow

@cython.boundscheck(False)
@cython.wraparound(False)
def cdist_cython1(double[:, :] xs, double[:, :] ys):
    """Returns pairwise distance between row vectors in xs and ys.

    xs has shape (m, p)
    ys has shape (n, p)

    Return value has shape (m, n)
    """

    cdef int m, n, p

    m = xs.shape[0]
    n = ys.shape[0]
    p = xs.shape[1]

    cdef double[:, :] res = np.empty((m, n))

    cdef int i, j

    cdef double s
    for i in range(m):
        for j in range(n):
            s = 0.0
            for k in range(p):
                s += pow(ys[j,k] - xs[i,k], 2)
            res[i, j] = sqrt(s)
    return res
Out[23]:
Cython: _cython_magic_c23979ecdfeb0fdf6011e4df6437b314.pyx

Generated by Cython 0.28.5

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

 01: 
+02: import cython
  __pyx_t_1 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+03: import numpy as np
  __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 3, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 3, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
 04: from libc.math cimport sqrt, pow
 05: 
 06: @cython.boundscheck(False)
 07: @cython.wraparound(False)
+08: def cdist_cython1(double[:, :] xs, double[:, :] ys):
/* Python wrapper */
static PyObject *__pyx_pw_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_1cdist_cython1(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
static char __pyx_doc_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_cdist_cython1[] = "Returns pairwise distance between row vectors in xs and ys.\n    \n    xs has shape (m, p)\n    ys has shape (n, p)\n    \n    Return value has shape (m, n)    \n    ";
static PyMethodDef __pyx_mdef_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_1cdist_cython1 = {"cdist_cython1", (PyCFunction)__pyx_pw_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_1cdist_cython1, METH_VARARGS|METH_KEYWORDS, __pyx_doc_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_cdist_cython1};
static PyObject *__pyx_pw_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_1cdist_cython1(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
  __Pyx_memviewslice __pyx_v_xs = { 0, 0, { 0 }, { 0 }, { 0 } };
  __Pyx_memviewslice __pyx_v_ys = { 0, 0, { 0 }, { 0 }, { 0 } };
  PyObject *__pyx_r = 0;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("cdist_cython1 (wrapper)", 0);
  {
    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_xs,&__pyx_n_s_ys,0};
    PyObject* values[2] = {0,0};
    if (unlikely(__pyx_kwds)) {
      Py_ssize_t kw_args;
      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
      switch (pos_args) {
        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
        CYTHON_FALLTHROUGH;
        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
        CYTHON_FALLTHROUGH;
        case  0: break;
        default: goto __pyx_L5_argtuple_error;
      }
      kw_args = PyDict_Size(__pyx_kwds);
      switch (pos_args) {
        case  0:
        if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_xs)) != 0)) kw_args--;
        else goto __pyx_L5_argtuple_error;
        CYTHON_FALLTHROUGH;
        case  1:
        if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ys)) != 0)) kw_args--;
        else {
          __Pyx_RaiseArgtupleInvalid("cdist_cython1", 1, 2, 2, 1); __PYX_ERR(0, 8, __pyx_L3_error)
        }
      }
      if (unlikely(kw_args > 0)) {
        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "cdist_cython1") < 0)) __PYX_ERR(0, 8, __pyx_L3_error)
      }
    } else if (PyTuple_GET_SIZE(__pyx_args) != 2) {
      goto __pyx_L5_argtuple_error;
    } else {
      values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
      values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
    }
    __pyx_v_xs = __Pyx_PyObject_to_MemoryviewSlice_dsds_double(values[0], PyBUF_WRITABLE); if (unlikely(!__pyx_v_xs.memview)) __PYX_ERR(0, 8, __pyx_L3_error)
    __pyx_v_ys = __Pyx_PyObject_to_MemoryviewSlice_dsds_double(values[1], PyBUF_WRITABLE); if (unlikely(!__pyx_v_ys.memview)) __PYX_ERR(0, 8, __pyx_L3_error)
  }
  goto __pyx_L4_argument_unpacking_done;
  __pyx_L5_argtuple_error:;
  __Pyx_RaiseArgtupleInvalid("cdist_cython1", 1, 2, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 8, __pyx_L3_error)
  __pyx_L3_error:;
  __Pyx_AddTraceback("_cython_magic_c23979ecdfeb0fdf6011e4df6437b314.cdist_cython1", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __Pyx_RefNannyFinishContext();
  return NULL;
  __pyx_L4_argument_unpacking_done:;
  __pyx_r = __pyx_pf_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_cdist_cython1(__pyx_self, __pyx_v_xs, __pyx_v_ys);

  /* function exit code */
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}

static PyObject *__pyx_pf_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_cdist_cython1(CYTHON_UNUSED PyObject *__pyx_self, __Pyx_memviewslice __pyx_v_xs, __Pyx_memviewslice __pyx_v_ys) {
  int __pyx_v_m;
  int __pyx_v_n;
  int __pyx_v_p;
  __Pyx_memviewslice __pyx_v_res = { 0, 0, { 0 }, { 0 }, { 0 } };
  int __pyx_v_i;
  int __pyx_v_j;
  double __pyx_v_s;
  int __pyx_v_k;
  PyObject *__pyx_r = NULL;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("cdist_cython1", 0);
/* … */
  /* function exit code */
  __pyx_L1_error:;
  __Pyx_XDECREF(__pyx_t_1);
  __Pyx_XDECREF(__pyx_t_2);
  __Pyx_XDECREF(__pyx_t_3);
  __Pyx_XDECREF(__pyx_t_4);
  __Pyx_XDECREF(__pyx_t_5);
  __PYX_XDEC_MEMVIEW(&__pyx_t_6, 1);
  __Pyx_AddTraceback("_cython_magic_c23979ecdfeb0fdf6011e4df6437b314.cdist_cython1", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __pyx_r = NULL;
  __pyx_L0:;
  __PYX_XDEC_MEMVIEW(&__pyx_v_res, 1);
  __PYX_XDEC_MEMVIEW(&__pyx_v_xs, 1);
  __PYX_XDEC_MEMVIEW(&__pyx_v_ys, 1);
  __Pyx_XGIVEREF(__pyx_r);
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}
/* … */
  __pyx_tuple__22 = PyTuple_Pack(10, __pyx_n_s_xs, __pyx_n_s_ys, __pyx_n_s_m, __pyx_n_s_n, __pyx_n_s_p, __pyx_n_s_res, __pyx_n_s_i, __pyx_n_s_j, __pyx_n_s_s, __pyx_n_s_k); if (unlikely(!__pyx_tuple__22)) __PYX_ERR(0, 8, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_tuple__22);
  __Pyx_GIVEREF(__pyx_tuple__22);
/* … */
  __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_1cdist_cython1, NULL, __pyx_n_s_cython_magic_c23979ecdfeb0fdf60); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_cdist_cython1, __pyx_t_1) < 0) __PYX_ERR(0, 8, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_codeobj__23 = (PyObject*)__Pyx_PyCode_New(2, 0, 10, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__22, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_jovyan_cache_ipython_cytho, __pyx_n_s_cdist_cython1, 8, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__23)) __PYX_ERR(0, 8, __pyx_L1_error)
 09:     """Returns pairwise distance between row vectors in xs and ys.
 10:     
 11:     xs has shape (m, p)
 12:     ys has shape (n, p)
 13:     
 14:     Return value has shape (m, n)    
 15:     """
 16: 
 17:     cdef int m, n, p
 18: 
+19:     m = xs.shape[0]
  __pyx_v_m = (__pyx_v_xs.shape[0]);
+20:     n = ys.shape[0]
  __pyx_v_n = (__pyx_v_ys.shape[0]);
+21:     p = xs.shape[1]
  __pyx_v_p = (__pyx_v_xs.shape[1]);
 22: 
+23:     cdef double[:, :] res = np.empty((m, n))
  __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 23, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_empty); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 23, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_m); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 23, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_n); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 23, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_4);
  __pyx_t_5 = PyTuple_New(2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 23, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_5);
  __Pyx_GIVEREF(__pyx_t_2);
  PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_2);
  __Pyx_GIVEREF(__pyx_t_4);
  PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_4);
  __pyx_t_2 = 0;
  __pyx_t_4 = 0;
  __pyx_t_4 = NULL;
  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) {
    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3);
    if (likely(__pyx_t_4)) {
      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3);
      __Pyx_INCREF(__pyx_t_4);
      __Pyx_INCREF(function);
      __Pyx_DECREF_SET(__pyx_t_3, function);
    }
  }
  if (!__pyx_t_4) {
    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error)
    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
    __Pyx_GOTREF(__pyx_t_1);
  } else {
    #if CYTHON_FAST_PYCALL
    if (PyFunction_Check(__pyx_t_3)) {
      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_5};
      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error)
      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
      __Pyx_GOTREF(__pyx_t_1);
      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
    } else
    #endif
    #if CYTHON_FAST_PYCCALL
    if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) {
      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_5};
      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error)
      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
      __Pyx_GOTREF(__pyx_t_1);
      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
    } else
    #endif
    {
      __pyx_t_2 = PyTuple_New(1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 23, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_2);
      __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_4); __pyx_t_4 = NULL;
      __Pyx_GIVEREF(__pyx_t_5);
      PyTuple_SET_ITEM(__pyx_t_2, 0+1, __pyx_t_5);
      __pyx_t_5 = 0;
      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_1);
      __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
    }
  }
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __pyx_t_6 = __Pyx_PyObject_to_MemoryviewSlice_dsds_double(__pyx_t_1, PyBUF_WRITABLE); if (unlikely(!__pyx_t_6.memview)) __PYX_ERR(0, 23, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_v_res = __pyx_t_6;
  __pyx_t_6.memview = NULL;
  __pyx_t_6.data = NULL;
 24: 
 25:     cdef int i, j
 26: 
 27:     cdef double s
+28:     for i in range(m):
  __pyx_t_7 = __pyx_v_m;
  __pyx_t_8 = __pyx_t_7;
  for (__pyx_t_9 = 0; __pyx_t_9 < __pyx_t_8; __pyx_t_9+=1) {
    __pyx_v_i = __pyx_t_9;
+29:         for j in range(n):
    __pyx_t_10 = __pyx_v_n;
    __pyx_t_11 = __pyx_t_10;
    for (__pyx_t_12 = 0; __pyx_t_12 < __pyx_t_11; __pyx_t_12+=1) {
      __pyx_v_j = __pyx_t_12;
+30:             s = 0.0
      __pyx_v_s = 0.0;
+31:             for k in range(p):
      __pyx_t_13 = __pyx_v_p;
      __pyx_t_14 = __pyx_t_13;
      for (__pyx_t_15 = 0; __pyx_t_15 < __pyx_t_14; __pyx_t_15+=1) {
        __pyx_v_k = __pyx_t_15;
+32:                 s += pow(ys[j,k] - xs[i,k], 2)
        __pyx_t_16 = __pyx_v_j;
        __pyx_t_17 = __pyx_v_k;
        __pyx_t_18 = __pyx_v_i;
        __pyx_t_19 = __pyx_v_k;
        __pyx_v_s = (__pyx_v_s + pow(((*((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_ys.data + __pyx_t_16 * __pyx_v_ys.strides[0]) ) + __pyx_t_17 * __pyx_v_ys.strides[1]) ))) - (*((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_xs.data + __pyx_t_18 * __pyx_v_xs.strides[0]) ) + __pyx_t_19 * __pyx_v_xs.strides[1]) )))), 2.0));
      }
+33:             res[i, j] = sqrt(s)
      __pyx_t_20 = __pyx_v_i;
      __pyx_t_21 = __pyx_v_j;
      *((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_res.data + __pyx_t_20 * __pyx_v_res.strides[0]) ) + __pyx_t_21 * __pyx_v_res.strides[1]) )) = sqrt(__pyx_v_s);
    }
  }
+34:     return res
  __Pyx_XDECREF(__pyx_r);
  __pyx_t_1 = __pyx_memoryview_fromslice(__pyx_v_res, 2, (PyObject *(*)(char *)) __pyx_memview_get_double, (int (*)(char *, PyObject *)) __pyx_memview_set_double, 0);; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 34, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_r = __pyx_t_1;
  __pyx_t_1 = 0;
  goto __pyx_L0;

Check

In [24]:
assert(np.allclose(cdist(xs, ys), cdist_cython(xs, ys)))
In [25]:
%%time

Z = cdist_cython1(X, Y)
CPU times: user 112 ms, sys: 8 ms, total: 120 ms
Wall time: 120 ms
In [26]:
t_cython1 = timeit(lambda : cdist_cython1(X, Y), number=1)

Using pybind11

In [29]:
%%file funcs.cpp
<%
cfg['compiler_args'] = ['-std=c++11']
cfg['include_dirs'] = ['/usr/include/eigen3']
setup_pybind11(cfg)
%>

#include <pybind11/pybind11.h>
#include <pybind11/eigen.h>

#include <cmath>
#include <Eigen/LU>

namespace py = pybind11;

using Eigen::MatrixXd;

MatrixXd cdist(MatrixXd xs, MatrixXd ys) {
    int m = xs.rows();
    int n = ys.rows();
    int p = ys.cols();

    MatrixXd res(m, n);

    double s;
    for (int i=0; i<m; i++) {
        for (int j=0; j<n; j++) {
            s = 0;
            for (int k=0; k<p; k++) {
                s += pow(ys(j,k) - xs(i,k), 2);
            }
            res(i,j) = sqrt(s);
        }
    }

    return res;
}

PYBIND11_MODULE(funcs, m) {
    m.doc() = "auto-compiled c++ extension";
    m.def("cdist", &cdist);
}
Overwriting funcs.cpp

Check. Note that the cppimport.imp only needs to be done once to build and wrap the C++ module. Once the module is built, it can subsequently be used like any other module.

In [30]:
import cppimport

funcs = cppimport.imp("funcs")
funcs.cdist(xs, ys)
assert(np.allclose(cdist(xs, ys), cdist_cython1(xs, ys)))
In [31]:
%%time

Z = funcs.cdist(X, Y)
CPU times: user 156 ms, sys: 4 ms, total: 160 ms
Wall time: 157 ms
In [32]:
t_pybind11 = timeit(lambda : funcs.cdist(X, Y), number=1)

Tabulation

In [33]:
perf = pd.DataFrame(dict(
    methods = ['python', 'numba', 'numba1',  'cython', 'cython1', 'pybind11'],
    times = [t0, t_numba, t_numba1, t_cython, t_cython1, t_pybind11],
))
In [34]:
perf['speed-up'] = np.around(perf['times'][0]/perf['times'], 1)
perf
Out[34]:
methods times speed-up
0 python 9.340464 1.0
1 numba 0.277997 33.6
2 numba1 0.111793 83.6
3 cython 9.911234 0.9
4 cython1 0.119332 78.3
5 pybind11 0.154224 60.6

Using multiple cores

The standard implementation of Python uses a Global Interpreter Lock (GIL). This means that only one thread can be run at any one time, and multiple threads work by time-slicing. Hence multi-threaded code with lots of latency can result in speed-ups, but multi-threaded code which is computationally intensive will not see any speed-up. For numerically intensive code, parallel code needs to be run in separate processes to see speed-ups.

First we see how to split the computation into pieces using a loop.

In [35]:
xs
Out[35]:
array([[0., 1.],
       [2., 3.],
       [4., 5.]])
In [36]:
ys
Out[36]:
array([[0., 1.],
       [2., 3.]])
In [37]:
cdist(xs, ys)
Out[37]:
array([[0.        , 2.82842712],
       [2.82842712, 0.        ],
       [5.65685425, 2.82842712]])
In [38]:
res = np.concatenate([cdist(x, ys) for x in np.split(xs, 3, 0)])
res
Out[38]:
array([[0.        , 2.82842712],
       [2.82842712, 0.        ],
       [5.65685425, 2.82842712]])
In [39]:
%%time

Z = cdist(X, Y)
CPU times: user 9.98 s, sys: 8 ms, total: 9.99 s
Wall time: 9.99 s

Using multiprocessing

In [40]:
from multiprocessing import Pool
In [41]:
%%time

with Pool(processes=4) as p:
    Z1 = p.starmap(cdist, [(X_, Y) for X_ in np.split(X, 100, 0)])
    Z1 = np.concatenate(Z1)
CPU times: user 32 ms, sys: 44 ms, total: 76 ms
Wall time: 2.94 s

Check

In [42]:
np.testing.assert_allclose(Z, Z1)

Using `concurrent.futures

In [43]:
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
In [44]:
def cdist_(args):
    return cdist(*args)
In [45]:
%%time

with ProcessPoolExecutor(max_workers=4) as pool:
    Z2 = list(pool.map(cdist_, [(X_, Y) for X_ in np.split(X, 100, 0)]))
    Z2 = np.concatenate(Z2)
CPU times: user 136 ms, sys: 88 ms, total: 224 ms
Wall time: 2.7 s

Check

In [46]:
np.testing.assert_allclose(Z, Z2)

Using joblib

joblib provides parallel processing using a comprehension syntax.

In [47]:
from joblib import Parallel, delayed
In [48]:
%%time

Z3 = Parallel(n_jobs=4)(delayed(cdist)(X_, Y) for X_ in np.split(X, 100, 0))
Z3 = np.concatenate(Z3)
CPU times: user 144 ms, sys: 60 ms, total: 204 ms
Wall time: 2.68 s

Check

In [49]:
np.testing.assert_allclose(Z, Z3)

Using threads

Note that there is no gain with using multiple threads for computationally intensive tasks because of the GIL.

In [50]:
%%time

with ThreadPoolExecutor(max_workers=4) as pool:
    Z4 = list(pool.map(cdist_, [(X_, Y) for X_ in np.split(X, 100, 0)]))
    Z4 = np.concatenate(Z4)
CPU times: user 10.3 s, sys: 184 ms, total: 10.4 s
Wall time: 10.2 s

Check

In [51]:
np.testing.assert_allclose(Z, Z4)