In [1]:
import numpy as np
import pandas as pd
from timeit import timeit
Enhancing performance¶
We first look at native code compilation. Here we show 3 common methods
for doing this using numba
JIT compilation, cython
AOT
compilation, and direct wrapping of C++ code using pybind11
. In
general, numba
is the simplest to use, while you have the most
flexibility with pybind11
. Which approach gives the best performance
generally requires some experimentation.
Then we review common methods for concurrent execution of embarrassingly
parallel code using multiprocessing
, concurrent.futures
and
joblib
. Comparison of performance using processes and threads is
made, with a brief explanation of the Global Interpreter Lock (GIL).
More details for each of the libraries used to improve performance is provided in the course notebooks.
Python¶
In [2]:
def cdist(xs, ys):
"""Returns pairwise distance between row vectors in xs and ys.
xs has shape (m, p)
ys has shape (n, p)
Return value has shape (m, n)
"""
m, p = xs.shape
n, p = ys.shape
res = np.empty((m, n))
for i in range(m):
for j in range(n):
res[i, j] = np.sqrt(np.sum((ys[j] - xs[i])**2))
return res
Sanity check¶
In [3]:
xs = np.arange(6).reshape(-1,2).astype('float')
ys = np.arange(4).reshape(-1, 2).astype('float')
zs = cdist(xs, ys)
In [4]:
zs
Out[4]:
array([[0. , 2.82842712],
[2.82842712, 0. ],
[5.65685425, 2.82842712]])
In [5]:
%timeit -r 3 -n 10 cdist(xs, ys)
60.7 µs ± 1.39 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)
In [6]:
m = 1000
n = 1000
p = 100
X = np.random.random((m, p))
Y = np.random.random((n, p))
In [7]:
%%time
Z = cdist(X, Y)
CPU times: user 9.46 s, sys: 20 ms, total: 9.48 s
Wall time: 9.48 s
In [8]:
t0 = timeit(lambda : cdist(X, Y), number=1)
Using numba
¶
In [9]:
from numba import jit, njit
In [10]:
@njit
def cdist_numba(xs, ys):
"""Returns pairwise distance between row vectors in xs and ys.
xs has shape (m, p)
ys has shape (n, p)
Return value has shape (m, n)
"""
m, p = xs.shape
n, p = ys.shape
res = np.empty((m, n))
for i in range(m):
for j in range(n):
res[i, j] = np.sqrt(np.sum((ys[j] - xs[i])**2))
return res
Check
In [11]:
assert(np.allclose(cdist(xs, ys), cdist_numba(xs, ys)))
In [12]:
%%time
Z = cdist_numba(X, Y)
CPU times: user 280 ms, sys: 8 ms, total: 288 ms
Wall time: 287 ms
In [13]:
t_numba = timeit(lambda : cdist_numba(X, Y), number=1)
Unrolling¶
We can help numba
by unrolling the code.
In [14]:
@njit
def cdist_numba1(xs, ys):
"""Returns pairwise distance between row vectors in xs and ys.
xs has shape (m, p)
ys has shape (n, p)
Return value has shape (m, n)
"""
m, p = xs.shape
n, p = ys.shape
res = np.empty((m, n))
for i in range(m):
for j in range(n):
s = 0
for k in range(p):
s += (ys[j,k] - xs[i,k])**2
res[i, j] = np.sqrt(s)
return res
Check
In [15]:
assert(np.allclose(cdist(xs, ys), cdist_numba1(xs, ys)))
In [16]:
%%time
Z = cdist_numba1(X, Y)
CPU times: user 112 ms, sys: 0 ns, total: 112 ms
Wall time: 111 ms
In [17]:
t_numba1 = timeit(lambda : cdist_numba1(X, Y), number=1)
Using cython
¶
In [18]:
%load_ext cython
In [19]:
%%cython -a
import numpy as np
def cdist_cython(xs, ys):
"""Returns pairwise distance between row vectors in xs and ys.
xs has shape (m, p)
ys has shape (n, p)
Return value has shape (m, n)
"""
m, p = xs.shape
n, p = ys.shape
res = np.empty((m, n))
for i in range(m):
for j in range(n):
res[i, j] = np.sqrt(np.sum((ys[j] - xs[i])**2))
return res
Out[19]:
Generated by Cython 0.28.5
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
01:
+02: import numpy as np
__pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
03:
+04: def cdist_cython(xs, ys):
/* Python wrapper */ static PyObject *__pyx_pw_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_1cdist_cython(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static char __pyx_doc_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_cdist_cython[] = "Returns pairwise distance between row vectors in xs and ys.\n \n xs has shape (m, p)\n ys has shape (n, p)\n \n Return value has shape (m, n) \n "; static PyMethodDef __pyx_mdef_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_1cdist_cython = {"cdist_cython", (PyCFunction)__pyx_pw_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_1cdist_cython, METH_VARARGS|METH_KEYWORDS, __pyx_doc_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_cdist_cython}; static PyObject *__pyx_pw_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_1cdist_cython(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_xs = 0; PyObject *__pyx_v_ys = 0; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("cdist_cython (wrapper)", 0); { static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_xs,&__pyx_n_s_ys,0}; PyObject* values[2] = {0,0}; if (unlikely(__pyx_kwds)) { Py_ssize_t kw_args; const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); switch (pos_args) { case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); CYTHON_FALLTHROUGH; case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); CYTHON_FALLTHROUGH; case 0: break; default: goto __pyx_L5_argtuple_error; } kw_args = PyDict_Size(__pyx_kwds); switch (pos_args) { case 0: if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_xs)) != 0)) kw_args--; else goto __pyx_L5_argtuple_error; CYTHON_FALLTHROUGH; case 1: if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ys)) != 0)) kw_args--; else { __Pyx_RaiseArgtupleInvalid("cdist_cython", 1, 2, 2, 1); __PYX_ERR(0, 4, __pyx_L3_error) } } if (unlikely(kw_args > 0)) { if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "cdist_cython") < 0)) __PYX_ERR(0, 4, __pyx_L3_error) } } else if (PyTuple_GET_SIZE(__pyx_args) != 2) { goto __pyx_L5_argtuple_error; } else { values[0] = PyTuple_GET_ITEM(__pyx_args, 0); values[1] = PyTuple_GET_ITEM(__pyx_args, 1); } __pyx_v_xs = values[0]; __pyx_v_ys = values[1]; } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; __Pyx_RaiseArgtupleInvalid("cdist_cython", 1, 2, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 4, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("_cython_magic_e0260da0fe5bce3ea805711e620fd9f5.cdist_cython", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; __pyx_r = __pyx_pf_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_cdist_cython(__pyx_self, __pyx_v_xs, __pyx_v_ys); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } static PyObject *__pyx_pf_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_cdist_cython(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_xs, PyObject *__pyx_v_ys) { PyObject *__pyx_v_m = NULL; CYTHON_UNUSED PyObject *__pyx_v_p = NULL; PyObject *__pyx_v_n = NULL; PyObject *__pyx_v_res = NULL; PyObject *__pyx_v_i = NULL; PyObject *__pyx_v_j = NULL; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("cdist_cython", 0); /* … */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_2); __Pyx_XDECREF(__pyx_t_3); __Pyx_XDECREF(__pyx_t_4); __Pyx_XDECREF(__pyx_t_6); __Pyx_XDECREF(__pyx_t_11); __Pyx_XDECREF(__pyx_t_12); __Pyx_XDECREF(__pyx_t_13); __Pyx_XDECREF(__pyx_t_14); __Pyx_AddTraceback("_cython_magic_e0260da0fe5bce3ea805711e620fd9f5.cdist_cython", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; __Pyx_XDECREF(__pyx_v_m); __Pyx_XDECREF(__pyx_v_p); __Pyx_XDECREF(__pyx_v_n); __Pyx_XDECREF(__pyx_v_res); __Pyx_XDECREF(__pyx_v_i); __Pyx_XDECREF(__pyx_v_j); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } /* … */ __pyx_tuple_ = PyTuple_Pack(8, __pyx_n_s_xs, __pyx_n_s_ys, __pyx_n_s_m, __pyx_n_s_p, __pyx_n_s_n, __pyx_n_s_res, __pyx_n_s_i, __pyx_n_s_j); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple_); __Pyx_GIVEREF(__pyx_tuple_); /* … */ __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_e0260da0fe5bce3ea805711e620fd9f5_1cdist_cython, NULL, __pyx_n_s_cython_magic_e0260da0fe5bce3ea8); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_cdist_cython, __pyx_t_1) < 0) __PYX_ERR(0, 4, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
05: """Returns pairwise distance between row vectors in xs and ys.
06:
07: xs has shape (m, p)
08: ys has shape (n, p)
09:
10: Return value has shape (m, n)
11: """
12:
+13: m, p = xs.shape
__pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_xs, __pyx_n_s_shape); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if ((likely(PyTuple_CheckExact(__pyx_t_1))) || (PyList_CheckExact(__pyx_t_1))) { PyObject* sequence = __pyx_t_1; Py_ssize_t size = __Pyx_PySequence_SIZE(sequence); if (unlikely(size != 2)) { if (size > 2) __Pyx_RaiseTooManyValuesError(2); else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size); __PYX_ERR(0, 13, __pyx_L1_error) } #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS if (likely(PyTuple_CheckExact(sequence))) { __pyx_t_2 = PyTuple_GET_ITEM(sequence, 0); __pyx_t_3 = PyTuple_GET_ITEM(sequence, 1); } else { __pyx_t_2 = PyList_GET_ITEM(sequence, 0); __pyx_t_3 = PyList_GET_ITEM(sequence, 1); } __Pyx_INCREF(__pyx_t_2); __Pyx_INCREF(__pyx_t_3); #else __pyx_t_2 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); #endif __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } else { Py_ssize_t index = -1; __pyx_t_4 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 13, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_5 = Py_TYPE(__pyx_t_4)->tp_iternext; index = 0; __pyx_t_2 = __pyx_t_5(__pyx_t_4); if (unlikely(!__pyx_t_2)) goto __pyx_L3_unpacking_failed; __Pyx_GOTREF(__pyx_t_2); index = 1; __pyx_t_3 = __pyx_t_5(__pyx_t_4); if (unlikely(!__pyx_t_3)) goto __pyx_L3_unpacking_failed; __Pyx_GOTREF(__pyx_t_3); if (__Pyx_IternextUnpackEndCheck(__pyx_t_5(__pyx_t_4), 2) < 0) __PYX_ERR(0, 13, __pyx_L1_error) __pyx_t_5 = NULL; __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; goto __pyx_L4_unpacking_done; __pyx_L3_unpacking_failed:; __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __pyx_t_5 = NULL; if (__Pyx_IterFinish() == 0) __Pyx_RaiseNeedMoreValuesError(index); __PYX_ERR(0, 13, __pyx_L1_error) __pyx_L4_unpacking_done:; } __pyx_v_m = __pyx_t_2; __pyx_t_2 = 0; __pyx_v_p = __pyx_t_3; __pyx_t_3 = 0;
+14: n, p = ys.shape
__pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_ys, __pyx_n_s_shape); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 14, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if ((likely(PyTuple_CheckExact(__pyx_t_1))) || (PyList_CheckExact(__pyx_t_1))) { PyObject* sequence = __pyx_t_1; Py_ssize_t size = __Pyx_PySequence_SIZE(sequence); if (unlikely(size != 2)) { if (size > 2) __Pyx_RaiseTooManyValuesError(2); else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size); __PYX_ERR(0, 14, __pyx_L1_error) } #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS if (likely(PyTuple_CheckExact(sequence))) { __pyx_t_3 = PyTuple_GET_ITEM(sequence, 0); __pyx_t_2 = PyTuple_GET_ITEM(sequence, 1); } else { __pyx_t_3 = PyList_GET_ITEM(sequence, 0); __pyx_t_2 = PyList_GET_ITEM(sequence, 1); } __Pyx_INCREF(__pyx_t_3); __Pyx_INCREF(__pyx_t_2); #else __pyx_t_3 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 14, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_2 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 14, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); #endif __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } else { Py_ssize_t index = -1; __pyx_t_4 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 14, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_5 = Py_TYPE(__pyx_t_4)->tp_iternext; index = 0; __pyx_t_3 = __pyx_t_5(__pyx_t_4); if (unlikely(!__pyx_t_3)) goto __pyx_L5_unpacking_failed; __Pyx_GOTREF(__pyx_t_3); index = 1; __pyx_t_2 = __pyx_t_5(__pyx_t_4); if (unlikely(!__pyx_t_2)) goto __pyx_L5_unpacking_failed; __Pyx_GOTREF(__pyx_t_2); if (__Pyx_IternextUnpackEndCheck(__pyx_t_5(__pyx_t_4), 2) < 0) __PYX_ERR(0, 14, __pyx_L1_error) __pyx_t_5 = NULL; __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; goto __pyx_L6_unpacking_done; __pyx_L5_unpacking_failed:; __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __pyx_t_5 = NULL; if (__Pyx_IterFinish() == 0) __Pyx_RaiseNeedMoreValuesError(index); __PYX_ERR(0, 14, __pyx_L1_error) __pyx_L6_unpacking_done:; } __pyx_v_n = __pyx_t_3; __pyx_t_3 = 0; __Pyx_DECREF_SET(__pyx_v_p, __pyx_t_2); __pyx_t_2 = 0;
15:
+16: res = np.empty((m, n))
__pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_empty); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_INCREF(__pyx_v_m); __Pyx_GIVEREF(__pyx_v_m); PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_v_m); __Pyx_INCREF(__pyx_v_n); __Pyx_GIVEREF(__pyx_v_n); PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_v_n); __pyx_t_4 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3); if (likely(__pyx_t_4)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); __Pyx_INCREF(__pyx_t_4); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_3, function); } } if (!__pyx_t_4) { __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_2}; __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } else #endif #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_2}; __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } else #endif { __pyx_t_6 = PyTuple_New(1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_6); __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); __pyx_t_4 = NULL; __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_6, 0+1, __pyx_t_2); __pyx_t_2 = 0; __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; } } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_res = __pyx_t_1; __pyx_t_1 = 0;
+17: for i in range(m):
__pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_range, __pyx_v_m); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 17, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) { __pyx_t_3 = __pyx_t_1; __Pyx_INCREF(__pyx_t_3); __pyx_t_7 = 0; __pyx_t_8 = NULL; } else { __pyx_t_7 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 17, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_8 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 17, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; for (;;) { if (likely(!__pyx_t_8)) { if (likely(PyList_CheckExact(__pyx_t_3))) { if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS __pyx_t_1 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_1); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 17, __pyx_L1_error) #else __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 17, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { if (__pyx_t_7 >= PyTuple_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_1); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 17, __pyx_L1_error) #else __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 17, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } } else { __pyx_t_1 = __pyx_t_8(__pyx_t_3); if (unlikely(!__pyx_t_1)) { PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); else __PYX_ERR(0, 17, __pyx_L1_error) } break; } __Pyx_GOTREF(__pyx_t_1); } __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_1); __pyx_t_1 = 0; /* … */ } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+18: for j in range(n):
__pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_range, __pyx_v_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) { __pyx_t_6 = __pyx_t_1; __Pyx_INCREF(__pyx_t_6); __pyx_t_9 = 0; __pyx_t_10 = NULL; } else { __pyx_t_9 = -1; __pyx_t_6 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 18, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_6); __pyx_t_10 = Py_TYPE(__pyx_t_6)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 18, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; for (;;) { if (likely(!__pyx_t_10)) { if (likely(PyList_CheckExact(__pyx_t_6))) { if (__pyx_t_9 >= PyList_GET_SIZE(__pyx_t_6)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS __pyx_t_1 = PyList_GET_ITEM(__pyx_t_6, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 18, __pyx_L1_error) #else __pyx_t_1 = PySequence_ITEM(__pyx_t_6, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { if (__pyx_t_9 >= PyTuple_GET_SIZE(__pyx_t_6)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_6, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 18, __pyx_L1_error) #else __pyx_t_1 = PySequence_ITEM(__pyx_t_6, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } } else { __pyx_t_1 = __pyx_t_10(__pyx_t_6); if (unlikely(!__pyx_t_1)) { PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); else __PYX_ERR(0, 18, __pyx_L1_error) } break; } __Pyx_GOTREF(__pyx_t_1); } __Pyx_XDECREF_SET(__pyx_v_j, __pyx_t_1); __pyx_t_1 = 0; /* … */ } __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+19: res[i, j] = np.sqrt(np.sum((ys[j] - xs[i])**2))
__pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_sqrt); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_11 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_11); __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_t_11, __pyx_n_s_sum); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_12); __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; __pyx_t_11 = __Pyx_PyObject_GetItem(__pyx_v_ys, __pyx_v_j); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_11); __pyx_t_13 = __Pyx_PyObject_GetItem(__pyx_v_xs, __pyx_v_i); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_13); __pyx_t_14 = PyNumber_Subtract(__pyx_t_11, __pyx_t_13); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_14); __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; __pyx_t_13 = PyNumber_Power(__pyx_t_14, __pyx_int_2, Py_None); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_13); __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; __pyx_t_14 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_12))) { __pyx_t_14 = PyMethod_GET_SELF(__pyx_t_12); if (likely(__pyx_t_14)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_12); __Pyx_INCREF(__pyx_t_14); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_12, function); } } if (!__pyx_t_14) { __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_12, __pyx_t_13); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; __Pyx_GOTREF(__pyx_t_2); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_12)) { PyObject *__pyx_temp[2] = {__pyx_t_14, __pyx_t_13}; __pyx_t_2 = __Pyx_PyFunction_FastCall(__pyx_t_12, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; } else #endif #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_12)) { PyObject *__pyx_temp[2] = {__pyx_t_14, __pyx_t_13}; __pyx_t_2 = __Pyx_PyCFunction_FastCall(__pyx_t_12, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; } else #endif { __pyx_t_11 = PyTuple_New(1+1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_11); __Pyx_GIVEREF(__pyx_t_14); PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_14); __pyx_t_14 = NULL; __Pyx_GIVEREF(__pyx_t_13); PyTuple_SET_ITEM(__pyx_t_11, 0+1, __pyx_t_13); __pyx_t_13 = 0; __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_12, __pyx_t_11, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; } } __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; __pyx_t_12 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_4))) { __pyx_t_12 = PyMethod_GET_SELF(__pyx_t_4); if (likely(__pyx_t_12)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4); __Pyx_INCREF(__pyx_t_12); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_4, function); } } if (!__pyx_t_12) { __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_4)) { PyObject *__pyx_temp[2] = {__pyx_t_12, __pyx_t_2}; __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_12); __pyx_t_12 = 0; __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } else #endif #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) { PyObject *__pyx_temp[2] = {__pyx_t_12, __pyx_t_2}; __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_12); __pyx_t_12 = 0; __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } else #endif { __pyx_t_11 = PyTuple_New(1+1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_11); __Pyx_GIVEREF(__pyx_t_12); PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_12); __pyx_t_12 = NULL; __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_11, 0+1, __pyx_t_2); __pyx_t_2 = 0; __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_11, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; } } __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __pyx_t_4 = PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_INCREF(__pyx_v_i); __Pyx_GIVEREF(__pyx_v_i); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_v_i); __Pyx_INCREF(__pyx_v_j); __Pyx_GIVEREF(__pyx_v_j); PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_v_j); if (unlikely(PyObject_SetItem(__pyx_v_res, __pyx_t_4, __pyx_t_1) < 0)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+20: return res
__Pyx_XDECREF(__pyx_r); __Pyx_INCREF(__pyx_v_res); __pyx_r = __pyx_v_res; goto __pyx_L0;
Check
In [20]:
assert(np.allclose(cdist(xs, ys), cdist_cython(xs, ys)))
In [21]:
%%time
Z = cdist_cython(X, Y)
CPU times: user 9.88 s, sys: 8 ms, total: 9.88 s
Wall time: 9.88 s
In [22]:
t_cython = timeit(lambda : cdist_cython(X, Y), number=1)
In [23]:
%%cython -a
import cython
import numpy as np
from libc.math cimport sqrt, pow
@cython.boundscheck(False)
@cython.wraparound(False)
def cdist_cython1(double[:, :] xs, double[:, :] ys):
"""Returns pairwise distance between row vectors in xs and ys.
xs has shape (m, p)
ys has shape (n, p)
Return value has shape (m, n)
"""
cdef int m, n, p
m = xs.shape[0]
n = ys.shape[0]
p = xs.shape[1]
cdef double[:, :] res = np.empty((m, n))
cdef int i, j
cdef double s
for i in range(m):
for j in range(n):
s = 0.0
for k in range(p):
s += pow(ys[j,k] - xs[i,k], 2)
res[i, j] = sqrt(s)
return res
Out[23]:
Generated by Cython 0.28.5
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
01:
+02: import cython
__pyx_t_1 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+03: import numpy as np
__pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 3, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 3, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
04: from libc.math cimport sqrt, pow
05:
06: @cython.boundscheck(False)
07: @cython.wraparound(False)
+08: def cdist_cython1(double[:, :] xs, double[:, :] ys):
/* Python wrapper */ static PyObject *__pyx_pw_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_1cdist_cython1(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static char __pyx_doc_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_cdist_cython1[] = "Returns pairwise distance between row vectors in xs and ys.\n \n xs has shape (m, p)\n ys has shape (n, p)\n \n Return value has shape (m, n) \n "; static PyMethodDef __pyx_mdef_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_1cdist_cython1 = {"cdist_cython1", (PyCFunction)__pyx_pw_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_1cdist_cython1, METH_VARARGS|METH_KEYWORDS, __pyx_doc_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_cdist_cython1}; static PyObject *__pyx_pw_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_1cdist_cython1(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { __Pyx_memviewslice __pyx_v_xs = { 0, 0, { 0 }, { 0 }, { 0 } }; __Pyx_memviewslice __pyx_v_ys = { 0, 0, { 0 }, { 0 }, { 0 } }; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("cdist_cython1 (wrapper)", 0); { static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_xs,&__pyx_n_s_ys,0}; PyObject* values[2] = {0,0}; if (unlikely(__pyx_kwds)) { Py_ssize_t kw_args; const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); switch (pos_args) { case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); CYTHON_FALLTHROUGH; case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); CYTHON_FALLTHROUGH; case 0: break; default: goto __pyx_L5_argtuple_error; } kw_args = PyDict_Size(__pyx_kwds); switch (pos_args) { case 0: if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_xs)) != 0)) kw_args--; else goto __pyx_L5_argtuple_error; CYTHON_FALLTHROUGH; case 1: if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_ys)) != 0)) kw_args--; else { __Pyx_RaiseArgtupleInvalid("cdist_cython1", 1, 2, 2, 1); __PYX_ERR(0, 8, __pyx_L3_error) } } if (unlikely(kw_args > 0)) { if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "cdist_cython1") < 0)) __PYX_ERR(0, 8, __pyx_L3_error) } } else if (PyTuple_GET_SIZE(__pyx_args) != 2) { goto __pyx_L5_argtuple_error; } else { values[0] = PyTuple_GET_ITEM(__pyx_args, 0); values[1] = PyTuple_GET_ITEM(__pyx_args, 1); } __pyx_v_xs = __Pyx_PyObject_to_MemoryviewSlice_dsds_double(values[0], PyBUF_WRITABLE); if (unlikely(!__pyx_v_xs.memview)) __PYX_ERR(0, 8, __pyx_L3_error) __pyx_v_ys = __Pyx_PyObject_to_MemoryviewSlice_dsds_double(values[1], PyBUF_WRITABLE); if (unlikely(!__pyx_v_ys.memview)) __PYX_ERR(0, 8, __pyx_L3_error) } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; __Pyx_RaiseArgtupleInvalid("cdist_cython1", 1, 2, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 8, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("_cython_magic_c23979ecdfeb0fdf6011e4df6437b314.cdist_cython1", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; __pyx_r = __pyx_pf_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_cdist_cython1(__pyx_self, __pyx_v_xs, __pyx_v_ys); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } static PyObject *__pyx_pf_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_cdist_cython1(CYTHON_UNUSED PyObject *__pyx_self, __Pyx_memviewslice __pyx_v_xs, __Pyx_memviewslice __pyx_v_ys) { int __pyx_v_m; int __pyx_v_n; int __pyx_v_p; __Pyx_memviewslice __pyx_v_res = { 0, 0, { 0 }, { 0 }, { 0 } }; int __pyx_v_i; int __pyx_v_j; double __pyx_v_s; int __pyx_v_k; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("cdist_cython1", 0); /* … */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_2); __Pyx_XDECREF(__pyx_t_3); __Pyx_XDECREF(__pyx_t_4); __Pyx_XDECREF(__pyx_t_5); __PYX_XDEC_MEMVIEW(&__pyx_t_6, 1); __Pyx_AddTraceback("_cython_magic_c23979ecdfeb0fdf6011e4df6437b314.cdist_cython1", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; __PYX_XDEC_MEMVIEW(&__pyx_v_res, 1); __PYX_XDEC_MEMVIEW(&__pyx_v_xs, 1); __PYX_XDEC_MEMVIEW(&__pyx_v_ys, 1); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } /* … */ __pyx_tuple__22 = PyTuple_Pack(10, __pyx_n_s_xs, __pyx_n_s_ys, __pyx_n_s_m, __pyx_n_s_n, __pyx_n_s_p, __pyx_n_s_res, __pyx_n_s_i, __pyx_n_s_j, __pyx_n_s_s, __pyx_n_s_k); if (unlikely(!__pyx_tuple__22)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__22); __Pyx_GIVEREF(__pyx_tuple__22); /* … */ __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_c23979ecdfeb0fdf6011e4df6437b314_1cdist_cython1, NULL, __pyx_n_s_cython_magic_c23979ecdfeb0fdf60); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_cdist_cython1, __pyx_t_1) < 0) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_codeobj__23 = (PyObject*)__Pyx_PyCode_New(2, 0, 10, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__22, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_jovyan_cache_ipython_cytho, __pyx_n_s_cdist_cython1, 8, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__23)) __PYX_ERR(0, 8, __pyx_L1_error)
09: """Returns pairwise distance between row vectors in xs and ys.
10:
11: xs has shape (m, p)
12: ys has shape (n, p)
13:
14: Return value has shape (m, n)
15: """
16:
17: cdef int m, n, p
18:
+19: m = xs.shape[0]
__pyx_v_m = (__pyx_v_xs.shape[0]);
+20: n = ys.shape[0]
__pyx_v_n = (__pyx_v_ys.shape[0]);
+21: p = xs.shape[1]
__pyx_v_p = (__pyx_v_xs.shape[1]);
22:
+23: cdef double[:, :] res = np.empty((m, n))
__pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_empty); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_m); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_n); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __pyx_t_5 = PyTuple_New(2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_4); __pyx_t_2 = 0; __pyx_t_4 = 0; __pyx_t_4 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3); if (likely(__pyx_t_4)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); __Pyx_INCREF(__pyx_t_4); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_3, function); } } if (!__pyx_t_4) { __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_5}; __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } else #endif #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_5}; __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } else #endif { __pyx_t_2 = PyTuple_New(1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_4); __pyx_t_4 = NULL; __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_2, 0+1, __pyx_t_5); __pyx_t_5 = 0; __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_t_6 = __Pyx_PyObject_to_MemoryviewSlice_dsds_double(__pyx_t_1, PyBUF_WRITABLE); if (unlikely(!__pyx_t_6.memview)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_res = __pyx_t_6; __pyx_t_6.memview = NULL; __pyx_t_6.data = NULL;
24:
25: cdef int i, j
26:
27: cdef double s
+28: for i in range(m):
__pyx_t_7 = __pyx_v_m; __pyx_t_8 = __pyx_t_7; for (__pyx_t_9 = 0; __pyx_t_9 < __pyx_t_8; __pyx_t_9+=1) { __pyx_v_i = __pyx_t_9;
+29: for j in range(n):
__pyx_t_10 = __pyx_v_n; __pyx_t_11 = __pyx_t_10; for (__pyx_t_12 = 0; __pyx_t_12 < __pyx_t_11; __pyx_t_12+=1) { __pyx_v_j = __pyx_t_12;
+30: s = 0.0
__pyx_v_s = 0.0;
+31: for k in range(p):
__pyx_t_13 = __pyx_v_p; __pyx_t_14 = __pyx_t_13; for (__pyx_t_15 = 0; __pyx_t_15 < __pyx_t_14; __pyx_t_15+=1) { __pyx_v_k = __pyx_t_15;
+32: s += pow(ys[j,k] - xs[i,k], 2)
__pyx_t_16 = __pyx_v_j; __pyx_t_17 = __pyx_v_k; __pyx_t_18 = __pyx_v_i; __pyx_t_19 = __pyx_v_k; __pyx_v_s = (__pyx_v_s + pow(((*((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_ys.data + __pyx_t_16 * __pyx_v_ys.strides[0]) ) + __pyx_t_17 * __pyx_v_ys.strides[1]) ))) - (*((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_xs.data + __pyx_t_18 * __pyx_v_xs.strides[0]) ) + __pyx_t_19 * __pyx_v_xs.strides[1]) )))), 2.0)); }
+33: res[i, j] = sqrt(s)
__pyx_t_20 = __pyx_v_i; __pyx_t_21 = __pyx_v_j; *((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_res.data + __pyx_t_20 * __pyx_v_res.strides[0]) ) + __pyx_t_21 * __pyx_v_res.strides[1]) )) = sqrt(__pyx_v_s); } }
+34: return res
__Pyx_XDECREF(__pyx_r); __pyx_t_1 = __pyx_memoryview_fromslice(__pyx_v_res, 2, (PyObject *(*)(char *)) __pyx_memview_get_double, (int (*)(char *, PyObject *)) __pyx_memview_set_double, 0);; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 34, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0;
Check
In [24]:
assert(np.allclose(cdist(xs, ys), cdist_cython(xs, ys)))
In [25]:
%%time
Z = cdist_cython1(X, Y)
CPU times: user 112 ms, sys: 8 ms, total: 120 ms
Wall time: 120 ms
In [26]:
t_cython1 = timeit(lambda : cdist_cython1(X, Y), number=1)
Using pybind11
¶
In [29]:
%%file funcs.cpp
<%
cfg['compiler_args'] = ['-std=c++11']
cfg['include_dirs'] = ['/usr/include/eigen3']
setup_pybind11(cfg)
%>
#include <pybind11/pybind11.h>
#include <pybind11/eigen.h>
#include <cmath>
#include <Eigen/LU>
namespace py = pybind11;
using Eigen::MatrixXd;
MatrixXd cdist(MatrixXd xs, MatrixXd ys) {
int m = xs.rows();
int n = ys.rows();
int p = ys.cols();
MatrixXd res(m, n);
double s;
for (int i=0; i<m; i++) {
for (int j=0; j<n; j++) {
s = 0;
for (int k=0; k<p; k++) {
s += pow(ys(j,k) - xs(i,k), 2);
}
res(i,j) = sqrt(s);
}
}
return res;
}
PYBIND11_MODULE(funcs, m) {
m.doc() = "auto-compiled c++ extension";
m.def("cdist", &cdist);
}
Overwriting funcs.cpp
Check. Note that the cppimport.imp
only needs to be done once to
build and wrap the C++ module. Once the module is built, it can
subsequently be used like any other module.
In [30]:
import cppimport
funcs = cppimport.imp("funcs")
funcs.cdist(xs, ys)
assert(np.allclose(cdist(xs, ys), cdist_cython1(xs, ys)))
In [31]:
%%time
Z = funcs.cdist(X, Y)
CPU times: user 156 ms, sys: 4 ms, total: 160 ms
Wall time: 157 ms
In [32]:
t_pybind11 = timeit(lambda : funcs.cdist(X, Y), number=1)
Tabulation¶
In [33]:
perf = pd.DataFrame(dict(
methods = ['python', 'numba', 'numba1', 'cython', 'cython1', 'pybind11'],
times = [t0, t_numba, t_numba1, t_cython, t_cython1, t_pybind11],
))
In [34]:
perf['speed-up'] = np.around(perf['times'][0]/perf['times'], 1)
perf
Out[34]:
methods | times | speed-up | |
---|---|---|---|
0 | python | 9.340464 | 1.0 |
1 | numba | 0.277997 | 33.6 |
2 | numba1 | 0.111793 | 83.6 |
3 | cython | 9.911234 | 0.9 |
4 | cython1 | 0.119332 | 78.3 |
5 | pybind11 | 0.154224 | 60.6 |
Using multiple cores¶
The standard implementation of Python uses a Global Interpreter Lock (GIL). This means that only one thread can be run at any one time, and multiple threads work by time-slicing. Hence multi-threaded code with lots of latency can result in speed-ups, but multi-threaded code which is computationally intensive will not see any speed-up. For numerically intensive code, parallel code needs to be run in separate processes to see speed-ups.
First we see how to split the computation into pieces using a loop.
In [35]:
xs
Out[35]:
array([[0., 1.],
[2., 3.],
[4., 5.]])
In [36]:
ys
Out[36]:
array([[0., 1.],
[2., 3.]])
In [37]:
cdist(xs, ys)
Out[37]:
array([[0. , 2.82842712],
[2.82842712, 0. ],
[5.65685425, 2.82842712]])
In [38]:
res = np.concatenate([cdist(x, ys) for x in np.split(xs, 3, 0)])
res
Out[38]:
array([[0. , 2.82842712],
[2.82842712, 0. ],
[5.65685425, 2.82842712]])
In [39]:
%%time
Z = cdist(X, Y)
CPU times: user 9.98 s, sys: 8 ms, total: 9.99 s
Wall time: 9.99 s
Using multiprocessing
¶
In [40]:
from multiprocessing import Pool
In [41]:
%%time
with Pool(processes=4) as p:
Z1 = p.starmap(cdist, [(X_, Y) for X_ in np.split(X, 100, 0)])
Z1 = np.concatenate(Z1)
CPU times: user 32 ms, sys: 44 ms, total: 76 ms
Wall time: 2.94 s
Check
In [42]:
np.testing.assert_allclose(Z, Z1)
Using `concurrent.futures¶
In [43]:
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
In [44]:
def cdist_(args):
return cdist(*args)
In [45]:
%%time
with ProcessPoolExecutor(max_workers=4) as pool:
Z2 = list(pool.map(cdist_, [(X_, Y) for X_ in np.split(X, 100, 0)]))
Z2 = np.concatenate(Z2)
CPU times: user 136 ms, sys: 88 ms, total: 224 ms
Wall time: 2.7 s
Check
In [46]:
np.testing.assert_allclose(Z, Z2)
Using joblib
¶
joblib
provides parallel processing using a comprehension syntax.
In [47]:
from joblib import Parallel, delayed
In [48]:
%%time
Z3 = Parallel(n_jobs=4)(delayed(cdist)(X_, Y) for X_ in np.split(X, 100, 0))
Z3 = np.concatenate(Z3)
CPU times: user 144 ms, sys: 60 ms, total: 204 ms
Wall time: 2.68 s
Check
In [49]:
np.testing.assert_allclose(Z, Z3)
Using threads¶
Note that there is no gain with using multiple threads for computationally intensive tasks because of the GIL.
In [50]:
%%time
with ThreadPoolExecutor(max_workers=4) as pool:
Z4 = list(pool.map(cdist_, [(X_, Y) for X_ in np.split(X, 100, 0)]))
Z4 = np.concatenate(Z4)
CPU times: user 10.3 s, sys: 184 ms, total: 10.4 s
Wall time: 10.2 s
Check
In [51]:
np.testing.assert_allclose(Z, Z4)