In [1]:
import numpy as np
import matplotlib.pyplot as plt
import cython
import timeit
import math
In [2]:
%load_ext cython

Native code compilation

We will see how to convert Python code to native compiled code. We will use the example of calculating the pairwise distance between a set of vectors, a \(O(n^2)\) operation.

For native code compilation, it is usually preferable to use explicit for loops and minimize the use of numpy vectorization and broadcasting because

  • It makes it easier for the numba JIT to optimize
  • It is easier to “cythonize”
  • It is easier to port to C++

However, use of vectors and matrices is fine especially if you will be porting to use a C++ library such as Eigen.

Timing code

Manual

In [3]:
import time

def f(n=1):
    start = time.time()
    time.sleep(n)
    elapsed = time.time() - start
    return elapsed
In [4]:
f(1)
Out[4]:
1.0016651153564453

Clock time

In [5]:
%%time

time.sleep(1)
CPU times: user 634 µs, sys: 1.11 ms, total: 1.74 ms
Wall time: 1 s

Using timeit

The -r argument says how many runs to average over, and -n says how many times to run the function in a loop per run.

In [6]:
%timeit time.sleep(0.01)
11.3 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [7]:
%timeit -r3 time.sleep(0.01)
11.1 ms ± 124 µs per loop (mean ± std. dev. of 3 runs, 100 loops each)
In [8]:
%timeit -n10 time.sleep(0.01)
11.2 ms ± 215 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [9]:
%timeit -r3 -n10 time.sleep(0.01)
11 ms ± 99.5 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)

Time unit conversions

1 s = 1,000 ms
1 ms = 1,000 µs
1 µs = 1,000 ns

Profiling

If you want to identify bottlenecks in a Python script, do the following:

  • First make sure that the script is modular - i.e. it consists mainly of function calls
  • Each function should be fairly small and only do one thing
  • Then run a profiler to identify the bottleneck function(s) and optimize them

See the Python docs on profiling Python code

Profiling can be done in a notebook with %prun, with the following readouts as column headers:

  • ncalls
    • for the number of calls,
  • tottime
    • for the total time spent in the given function (and excluding time made in calls to sub-functions),
  • percall
    • is the quotient of tottime divided by ncalls
  • cumtime
    • is the total time spent in this and all subfunctions (from invocation till exit). This figure is accurate even for recursive functions.
  • percall
    • is the quotient of cumtime divided by primitive calls
  • filename:lineno(function)
    • provides the respective data of each function
In [10]:
def foo1(n):
    return np.sum(np.square(np.arange(n)))

def foo2(n):
    return sum(i*i for i in range(n))

def foo3(n):
    [foo1(n) for i in range(10)]
    foo2(n)

def foo4(n):
    return [foo2(n) for i in range(100)]

def work(n):
    foo1(n)
    foo2(n)
    foo3(n)
    foo4(n)
In [11]:
%%time

work(int(1e5))
CPU times: user 1.33 s, sys: 4.25 ms, total: 1.33 s
Wall time: 1.34 s
In [12]:
%prun -q -D work.prof work(int(1e5))

*** Profile stats marshalled to file 'work.prof'.
In [13]:
import pstats
p = pstats.Stats('work.prof')
p.print_stats()
pass
Fri Mar 30 15:17:26 2018    work.prof

         10200380 function calls in 2.535 seconds

   Random listing order was used

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    2.535    2.535 {built-in method builtins.exec}
       11    0.000    0.000    0.000    0.000 {built-in method builtins.isinstance}
      102    1.088    0.011    2.531    0.025 {built-in method builtins.sum}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        1    0.000    0.000    0.003    0.003 <ipython-input-10-32cd1fde8562>:8(<listcomp>)
       11    0.001    0.000    0.001    0.000 {built-in method numpy.core.multiarray.arange}
       11    0.001    0.000    0.001    0.000 {method 'reduce' of 'numpy.ufunc' objects}
       11    0.000    0.000    0.001    0.000 /usr/local/lib/python3.6/site-packages/numpy/core/fromnumeric.py:1778(sum)
       11    0.000    0.000    0.001    0.000 /usr/local/lib/python3.6/site-packages/numpy/core/_methods.py:31(_sum)
      102    0.000    0.000    2.531    0.025 <ipython-input-10-32cd1fde8562>:4(foo2)
        1    0.000    0.000    2.480    2.480 <ipython-input-10-32cd1fde8562>:12(<listcomp>)
        1    0.000    0.000    0.028    0.028 <ipython-input-10-32cd1fde8562>:7(foo3)
       11    0.002    0.000    0.003    0.000 <ipython-input-10-32cd1fde8562>:1(foo1)
 10200102    1.443    0.000    1.443    0.000 <ipython-input-10-32cd1fde8562>:5(<genexpr>)
        1    0.000    0.000    2.535    2.535 <ipython-input-10-32cd1fde8562>:14(work)
        1    0.000    0.000    2.480    2.480 <ipython-input-10-32cd1fde8562>:11(foo4)
        1    0.000    0.000    2.535    2.535 <string>:1(<module>)


In [14]:
p.sort_stats('time', 'cumulative').print_stats('foo')
pass
Fri Mar 30 15:17:26 2018    work.prof

         10200380 function calls in 2.535 seconds

   Ordered by: internal time, cumulative time
   List reduced from 17 to 4 due to restriction <'foo'>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       11    0.002    0.000    0.003    0.000 <ipython-input-10-32cd1fde8562>:1(foo1)
      102    0.000    0.000    2.531    0.025 <ipython-input-10-32cd1fde8562>:4(foo2)
        1    0.000    0.000    0.028    0.028 <ipython-input-10-32cd1fde8562>:7(foo3)
        1    0.000    0.000    2.480    2.480 <ipython-input-10-32cd1fde8562>:11(foo4)


In [15]:
p.sort_stats('ncalls').print_stats(5)
pass
Fri Mar 30 15:17:26 2018    work.prof

         10200380 function calls in 2.535 seconds

   Ordered by: call count
   List reduced from 17 to 5 due to restriction <5>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
 10200102    1.443    0.000    1.443    0.000 <ipython-input-10-32cd1fde8562>:5(<genexpr>)
      102    1.088    0.011    2.531    0.025 {built-in method builtins.sum}
      102    0.000    0.000    2.531    0.025 <ipython-input-10-32cd1fde8562>:4(foo2)
       11    0.000    0.000    0.000    0.000 {built-in method builtins.isinstance}
       11    0.001    0.000    0.001    0.000 {built-in method numpy.core.multiarray.arange}


Optimizing a function

Our example will be to optimize a function that calculates the pairwise distance between a set of vectors.

We first use a built-in function fromscipy to check that our answers are right and also to benchmark how our code compares in speed to an optimized compiled routine.

In [16]:
from scipy.spatial.distance import squareform, pdist
In [17]:
n = 100
p = 100
xs = np.random.random((n, p))
In [18]:
sol = squareform(pdist(xs))
In [19]:
%timeit -r3 -n10 squareform(pdist(xs))
492 µs ± 14.1 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)

Python

Simple version

In [20]:
def pdist_py(xs):
    """Unvectorized Python."""
    n, p = xs.shape
    A = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            for k in range(p):
                A[i,j] += (xs[i, k] - xs[j, k])**2
            A[i,j] = np.sqrt(A[i,j])
    return A

Note that we

  • first check that the output is right
  • then check how fast the code is
In [21]:
func = pdist_py
print(np.allclose(func(xs), sol))
%timeit -r3 -n10 func(xs)
True
1.17 s ± 2.98 ms per loop (mean ± std. dev. of 3 runs, 10 loops each)

Exploiting symmetry

In [22]:
def pdist_sym(xs):
    """Unvectorized Python."""
    n, p = xs.shape
    A = np.zeros((n, n))
    for i in range(n):
        for j in range(i+1, n):
            for k in range(p):
                A[i,j] += (xs[i, k] - xs[j, k])**2
            A[i,j] = np.sqrt(A[i,j])
    A += A.T
    return A
In [23]:
func = pdist_sym
print(np.allclose(func(xs), sol))
%timeit -r3 -n10 func(xs)
True
573 ms ± 2.35 ms per loop (mean ± std. dev. of 3 runs, 10 loops each)

Vectorizing inner loop

In [24]:
def pdist_vec(xs):
    """Vectorize inner loop."""
    n, p = xs.shape
    A = np.zeros((n, n))
    for i in range(n):
        for j in range(i+1, n):
            A[i,j] = np.sqrt(np.sum((xs[i] - xs[j])**2))
    A += A.T
    return A
In [25]:
func = pdist_vec
print(np.allclose(func(xs), sol))
%timeit -r3 -n10 func(xs)
True
70.8 ms ± 275 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)

Broadcasting and vectorizing

Note that the broadcast version does twice as much work as it does not exploit symmetry.

In [26]:
def pdist_numpy(xs):
    """Fully vectroized version."""
    return np.sqrt(np.square(xs[:, None] - xs[None, :]).sum(axis=-1))
In [27]:
func = pdist_numpy
print(np.allclose(func(xs), sol))
%timeit -r3 -n10 squareform(func(xs))
True
4.23 ms ± 165 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)

JIT with numba

We use the numba.jit decorator which will trigger generation and execution of compiled code when the function is first called.

In [28]:
from numba import jit

Using jit as a function

In [29]:
pdist_numba_py = jit(pdist_py, nopython=True, cache=True)
In [30]:
func = pdist_numba_py
print(np.allclose(func(xs), sol))
%timeit -r3 -n10 func(xs)
True
1.8 ms ± 25 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)

Using jit as a decorator

In [31]:
@jit(nopython=True, cache=True)
def pdist_numba_py_1(xs):
    """Unvectorized Python."""
    n, p = xs.shape
    A = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            for k in range(p):
                A[i,j] += (xs[i, k] - xs[j, k])**2
            A[i,j] = np.sqrt(A[i,j])
    return A
In [32]:
func = pdist_numba_py_1
print(np.allclose(func(xs), sol))
%timeit -r3 -n10 func(xs)
True
1.75 ms ± 34.8 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)

Can we make the code faster?

Note that in the inner loop, we are updating a matrix when we only need to update a scalar. Let’s fix this.

In [33]:
@jit(nopython=True, cache=True)
def pdist_numba_py_2(xs):
    """Unvectorized Python."""
    n, p = xs.shape
    A = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            d = 0.0
            for k in range(p):
                d += (xs[i, k] - xs[j, k])**2
            A[i,j] = np.sqrt(d)
    return A
In [34]:
func = pdist_numba_py_2
print(np.allclose(func(xs), sol))
%timeit -r3 -n10 func(xs)
True
954 µs ± 25.2 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)

Can we make the code even faster?

We can also try to exploit symmetry.

In [35]:
@jit(nopython=True, cache=True)
def pdist_numba_py_sym(xs):
    """Unvectorized Python."""
    n, p = xs.shape
    A = np.zeros((n, n))
    for i in range(n):
        for j in range(i+1, n):
            d = 0.0
            for k in range(p):
                d += (xs[i, k] - xs[j, k])**2
            A[i,j] = np.sqrt(d)
    A += A.T
    return A
In [36]:
func = pdist_numba_py_sym
print(np.allclose(func(xs), sol))
%timeit -r3 -n10 func(xs)
True
521 µs ± 7.12 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)

Does jit work with vectorized code?

In [37]:
pdist_numba_vec = jit(pdist_vec, nopython=True, cache=True)
In [38]:
%timeit -r3 -n10 pdist_vec(xs)
70.1 ms ± 454 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)
In [39]:
func = pdist_numba_vec
print(np.allclose(func(xs), sol))
%timeit -r3 -n10 func(xs)
True
1.58 ms ± 17.8 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)

Does jit work with broadcasting?

In [40]:
pdist_numba_numpy = jit(pdist_numpy, nopython=True, cache=True)
In [41]:
%timeit -r3 -n10 pdist_numpy(xs)
4.01 ms ± 159 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)
In [42]:
func = pdist_numba_numpy
try:
    print(np.allclose(func(xs), sol))
    %timeit -r3 -n10 func(xs)
except Exception as e:
    print(e)
Failed at nopython (nopython frontend)
Internal error at <numba.typeinfer.StaticGetItemConstraint object at 0x11310b9e8>:
--%<-----------------------------------------------------------------
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/numba/errors.py", line 259, in new_error_context
    yield
  File "/usr/local/lib/python3.6/site-packages/numba/typeinfer.py", line 503, in __call__
    self.resolve(typeinfer, typeinfer.typevars, fnty=self.func)
  File "/usr/local/lib/python3.6/site-packages/numba/typeinfer.py", line 441, in resolve
    sig = typeinfer.resolve_call(fnty, pos_args, kw_args, literals=literals)
  File "/usr/local/lib/python3.6/site-packages/numba/typeinfer.py", line 1115, in resolve_call
    literals=literals)
  File "/usr/local/lib/python3.6/site-packages/numba/typing/context.py", line 191, in resolve_function_type
    res = defn.apply(args, kws)
  File "/usr/local/lib/python3.6/site-packages/numba/typing/templates.py", line 207, in apply
    sig = generic(args, kws)
  File "/usr/local/lib/python3.6/site-packages/numba/typing/arraydecl.py", line 165, in generic
    out = get_array_index_type(ary, idx)
  File "/usr/local/lib/python3.6/site-packages/numba/typing/arraydecl.py", line 71, in get_array_index_type
    % (ty, idx))
TypeError: unsupported array index type none in (slice<a:b>, none)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/numba/typeinfer.py", line 137, in propagate
    constraint(typeinfer)
  File "/usr/local/lib/python3.6/site-packages/numba/typeinfer.py", line 341, in __call__
    self.fallback(typeinfer)
  File "/usr/local/lib/python3.6/site-packages/numba/typeinfer.py", line 503, in __call__
    self.resolve(typeinfer, typeinfer.typevars, fnty=self.func)
  File "/usr/local/Cellar/python/3.6.4_4/Frameworks/Python.framework/Versions/3.6/lib/python3.6/contextlib.py", line 99, in __exit__
    self.gen.throw(type, value, traceback)
  File "/usr/local/lib/python3.6/site-packages/numba/errors.py", line 265, in new_error_context
    six.reraise(type(newerr), newerr, sys.exc_info()[2])
  File "/usr/local/lib/python3.6/site-packages/numba/six.py", line 658, in reraise
    raise value.with_traceback(tb)
  File "/usr/local/lib/python3.6/site-packages/numba/errors.py", line 259, in new_error_context
    yield
  File "/usr/local/lib/python3.6/site-packages/numba/typeinfer.py", line 503, in __call__
    self.resolve(typeinfer, typeinfer.typevars, fnty=self.func)
  File "/usr/local/lib/python3.6/site-packages/numba/typeinfer.py", line 441, in resolve
    sig = typeinfer.resolve_call(fnty, pos_args, kw_args, literals=literals)
  File "/usr/local/lib/python3.6/site-packages/numba/typeinfer.py", line 1115, in resolve_call
    literals=literals)
  File "/usr/local/lib/python3.6/site-packages/numba/typing/context.py", line 191, in resolve_function_type
    res = defn.apply(args, kws)
  File "/usr/local/lib/python3.6/site-packages/numba/typing/templates.py", line 207, in apply
    sig = generic(args, kws)
  File "/usr/local/lib/python3.6/site-packages/numba/typing/arraydecl.py", line 165, in generic
    out = get_array_index_type(ary, idx)
  File "/usr/local/lib/python3.6/site-packages/numba/typing/arraydecl.py", line 71, in get_array_index_type
    % (ty, idx))
numba.errors.InternalError: unsupported array index type none in (slice<a:b>, none)
[1] During: typing of intrinsic-call at <ipython-input-26-f5984e680640> (3)
[2] During: typing of static-get-item at <ipython-input-26-f5984e680640> (3)
--%<-----------------------------------------------------------------

File "<ipython-input-26-f5984e680640>", line 3

We need to use reshape to broadcast

In [43]:
def pdist_numpy_(xs):
    """Fully vectroized version."""
    return np.sqrt(np.square(xs.reshape(n,1,p) - xs.reshape(1,n,p)).sum(axis=-1))
In [44]:
pdist_numba_numpy_ = jit(pdist_numpy_, nopython=True, cache=True)
In [45]:
%timeit -r3 -n10 pdist_numpy_(xs)
4.17 ms ± 310 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)
In [46]:
func = pdist_numba_numpy_
print(np.allclose(func(xs), sol))
%timeit -r3 -n10 func(xs)
True
6.02 ms ± 65.9 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)

Summary

  • numba appears to work best with converting fairly explicit Python code
  • This might change in the future as the numba JIT compiler becomes more sophisticated
  • Always check optimized code for correctness
  • We can use timeit magic as a simple way to benchmark functions

Cython

Cython is an Ahead Of Time (AOT) compiler. It compiles the code and replaces the function invoked with the compiled version.

In the notebook, calling %cython -a magic shows code colored by how many Python C API calls are being made. You want to reduce the yellow as much as possible.

In [47]:
%%cython -a

import numpy as np

def pdist_cython_1(xs):
    n, p = xs.shape
    A = np.zeros((n, n))
    for i in range(n):
        for j in range(i+1, n):
            d = 0.0
            for k in range(p):
                d += (xs[i,k] - xs[j,k])**2
            A[i,j] = np.sqrt(d)
    A += A.T
    return A
Out[47]:
Cython: _cython_magic_a0e00db7609c25c75f864fdfa1538fee.pyx

Generated by Cython 0.28

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

 01: 
+02: import numpy as np
  __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
 03: 
+04: def pdist_cython_1(xs):
/* Python wrapper */
static PyObject *__pyx_pw_46_cython_magic_a0e00db7609c25c75f864fdfa1538fee_1pdist_cython_1(PyObject *__pyx_self, PyObject *__pyx_v_xs); /*proto*/
static PyMethodDef __pyx_mdef_46_cython_magic_a0e00db7609c25c75f864fdfa1538fee_1pdist_cython_1 = {"pdist_cython_1", (PyCFunction)__pyx_pw_46_cython_magic_a0e00db7609c25c75f864fdfa1538fee_1pdist_cython_1, METH_O, 0};
static PyObject *__pyx_pw_46_cython_magic_a0e00db7609c25c75f864fdfa1538fee_1pdist_cython_1(PyObject *__pyx_self, PyObject *__pyx_v_xs) {
  PyObject *__pyx_r = 0;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("pdist_cython_1 (wrapper)", 0);
  __pyx_r = __pyx_pf_46_cython_magic_a0e00db7609c25c75f864fdfa1538fee_pdist_cython_1(__pyx_self, ((PyObject *)__pyx_v_xs));

  /* function exit code */
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}

static PyObject *__pyx_pf_46_cython_magic_a0e00db7609c25c75f864fdfa1538fee_pdist_cython_1(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_xs) {
  PyObject *__pyx_v_n = NULL;
  PyObject *__pyx_v_p = NULL;
  PyObject *__pyx_v_A = NULL;
  PyObject *__pyx_v_i = NULL;
  PyObject *__pyx_v_j = NULL;
  PyObject *__pyx_v_d = NULL;
  PyObject *__pyx_v_k = NULL;
  PyObject *__pyx_r = NULL;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("pdist_cython_1", 0);
/* … */
  /* function exit code */
  __pyx_L1_error:;
  __Pyx_XDECREF(__pyx_t_1);
  __Pyx_XDECREF(__pyx_t_2);
  __Pyx_XDECREF(__pyx_t_3);
  __Pyx_XDECREF(__pyx_t_4);
  __Pyx_XDECREF(__pyx_t_6);
  __Pyx_XDECREF(__pyx_t_13);
  __Pyx_AddTraceback("_cython_magic_a0e00db7609c25c75f864fdfa1538fee.pdist_cython_1", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __pyx_r = NULL;
  __pyx_L0:;
  __Pyx_XDECREF(__pyx_v_n);
  __Pyx_XDECREF(__pyx_v_p);
  __Pyx_XDECREF(__pyx_v_A);
  __Pyx_XDECREF(__pyx_v_i);
  __Pyx_XDECREF(__pyx_v_j);
  __Pyx_XDECREF(__pyx_v_d);
  __Pyx_XDECREF(__pyx_v_k);
  __Pyx_XGIVEREF(__pyx_r);
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}
/* … */
  __pyx_tuple_ = PyTuple_Pack(8, __pyx_n_s_xs, __pyx_n_s_n, __pyx_n_s_p, __pyx_n_s_A, __pyx_n_s_i, __pyx_n_s_j, __pyx_n_s_d, __pyx_n_s_k); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 4, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_tuple_);
  __Pyx_GIVEREF(__pyx_tuple_);
/* … */
  __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_a0e00db7609c25c75f864fdfa1538fee_1pdist_cython_1, NULL, __pyx_n_s_cython_magic_a0e00db7609c25c75f); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 4, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_pdist_cython_1, __pyx_t_1) < 0) __PYX_ERR(0, 4, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+05:     n, p = xs.shape
  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_xs, __pyx_n_s_shape); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if ((likely(PyTuple_CheckExact(__pyx_t_1))) || (PyList_CheckExact(__pyx_t_1))) {
    PyObject* sequence = __pyx_t_1;
    Py_ssize_t size = __Pyx_PySequence_SIZE(sequence);
    if (unlikely(size != 2)) {
      if (size > 2) __Pyx_RaiseTooManyValuesError(2);
      else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size);
      __PYX_ERR(0, 5, __pyx_L1_error)
    }
    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
    if (likely(PyTuple_CheckExact(sequence))) {
      __pyx_t_2 = PyTuple_GET_ITEM(sequence, 0);
      __pyx_t_3 = PyTuple_GET_ITEM(sequence, 1);
    } else {
      __pyx_t_2 = PyList_GET_ITEM(sequence, 0);
      __pyx_t_3 = PyList_GET_ITEM(sequence, 1);
    }
    __Pyx_INCREF(__pyx_t_2);
    __Pyx_INCREF(__pyx_t_3);
    #else
    __pyx_t_2 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 5, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
    __pyx_t_3 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 5, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_3);
    #endif
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  } else {
    Py_ssize_t index = -1;
    __pyx_t_4 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 5, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_4);
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
    __pyx_t_5 = Py_TYPE(__pyx_t_4)->tp_iternext;
    index = 0; __pyx_t_2 = __pyx_t_5(__pyx_t_4); if (unlikely(!__pyx_t_2)) goto __pyx_L3_unpacking_failed;
    __Pyx_GOTREF(__pyx_t_2);
    index = 1; __pyx_t_3 = __pyx_t_5(__pyx_t_4); if (unlikely(!__pyx_t_3)) goto __pyx_L3_unpacking_failed;
    __Pyx_GOTREF(__pyx_t_3);
    if (__Pyx_IternextUnpackEndCheck(__pyx_t_5(__pyx_t_4), 2) < 0) __PYX_ERR(0, 5, __pyx_L1_error)
    __pyx_t_5 = NULL;
    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
    goto __pyx_L4_unpacking_done;
    __pyx_L3_unpacking_failed:;
    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
    __pyx_t_5 = NULL;
    if (__Pyx_IterFinish() == 0) __Pyx_RaiseNeedMoreValuesError(index);
    __PYX_ERR(0, 5, __pyx_L1_error)
    __pyx_L4_unpacking_done:;
  }
  __pyx_v_n = __pyx_t_2;
  __pyx_t_2 = 0;
  __pyx_v_p = __pyx_t_3;
  __pyx_t_3 = 0;
+06:     A = np.zeros((n, n))
  __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 6, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_zeros); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 6, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 6, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __Pyx_INCREF(__pyx_v_n);
  __Pyx_GIVEREF(__pyx_v_n);
  PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_v_n);
  __Pyx_INCREF(__pyx_v_n);
  __Pyx_GIVEREF(__pyx_v_n);
  PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_v_n);
  __pyx_t_4 = NULL;
  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) {
    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_2);
    if (likely(__pyx_t_4)) {
      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2);
      __Pyx_INCREF(__pyx_t_4);
      __Pyx_INCREF(function);
      __Pyx_DECREF_SET(__pyx_t_2, function);
    }
  }
  if (!__pyx_t_4) {
    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error)
    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
    __Pyx_GOTREF(__pyx_t_1);
  } else {
    #if CYTHON_FAST_PYCALL
    if (PyFunction_Check(__pyx_t_2)) {
      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_3};
      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error)
      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
      __Pyx_GOTREF(__pyx_t_1);
      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
    } else
    #endif
    #if CYTHON_FAST_PYCCALL
    if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) {
      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_3};
      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error)
      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
      __Pyx_GOTREF(__pyx_t_1);
      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
    } else
    #endif
    {
      __pyx_t_6 = PyTuple_New(1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 6, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_6);
      __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); __pyx_t_4 = NULL;
      __Pyx_GIVEREF(__pyx_t_3);
      PyTuple_SET_ITEM(__pyx_t_6, 0+1, __pyx_t_3);
      __pyx_t_3 = 0;
      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_1);
      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
    }
  }
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __pyx_v_A = __pyx_t_1;
  __pyx_t_1 = 0;
+07:     for i in range(n):
  __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_range, __pyx_v_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 7, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) {
    __pyx_t_2 = __pyx_t_1; __Pyx_INCREF(__pyx_t_2); __pyx_t_7 = 0;
    __pyx_t_8 = NULL;
  } else {
    __pyx_t_7 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 7, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_2);
    __pyx_t_8 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 7, __pyx_L1_error)
  }
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  for (;;) {
    if (likely(!__pyx_t_8)) {
      if (likely(PyList_CheckExact(__pyx_t_2))) {
        if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_2)) break;
        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
        __pyx_t_1 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_7); __Pyx_INCREF(__pyx_t_1); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 7, __pyx_L1_error)
        #else
        __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 7, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_1);
        #endif
      } else {
        if (__pyx_t_7 >= PyTuple_GET_SIZE(__pyx_t_2)) break;
        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
        __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_7); __Pyx_INCREF(__pyx_t_1); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 7, __pyx_L1_error)
        #else
        __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 7, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_1);
        #endif
      }
    } else {
      __pyx_t_1 = __pyx_t_8(__pyx_t_2);
      if (unlikely(!__pyx_t_1)) {
        PyObject* exc_type = PyErr_Occurred();
        if (exc_type) {
          if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear();
          else __PYX_ERR(0, 7, __pyx_L1_error)
        }
        break;
      }
      __Pyx_GOTREF(__pyx_t_1);
    }
    __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_1);
    __pyx_t_1 = 0;
/* … */
  }
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+08:         for j in range(i+1, n):
    __pyx_t_1 = __Pyx_PyInt_AddObjC(__pyx_v_i, __pyx_int_1, 1, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_1);
    __pyx_t_6 = PyTuple_New(2); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 8, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_6);
    __Pyx_GIVEREF(__pyx_t_1);
    PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_1);
    __Pyx_INCREF(__pyx_v_n);
    __Pyx_GIVEREF(__pyx_v_n);
    PyTuple_SET_ITEM(__pyx_t_6, 1, __pyx_v_n);
    __pyx_t_1 = 0;
    __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_range, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_1);
    __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
    if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) {
      __pyx_t_6 = __pyx_t_1; __Pyx_INCREF(__pyx_t_6); __pyx_t_9 = 0;
      __pyx_t_10 = NULL;
    } else {
      __pyx_t_9 = -1; __pyx_t_6 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 8, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_6);
      __pyx_t_10 = Py_TYPE(__pyx_t_6)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 8, __pyx_L1_error)
    }
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
    for (;;) {
      if (likely(!__pyx_t_10)) {
        if (likely(PyList_CheckExact(__pyx_t_6))) {
          if (__pyx_t_9 >= PyList_GET_SIZE(__pyx_t_6)) break;
          #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
          __pyx_t_1 = PyList_GET_ITEM(__pyx_t_6, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 8, __pyx_L1_error)
          #else
          __pyx_t_1 = PySequence_ITEM(__pyx_t_6, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error)
          __Pyx_GOTREF(__pyx_t_1);
          #endif
        } else {
          if (__pyx_t_9 >= PyTuple_GET_SIZE(__pyx_t_6)) break;
          #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
          __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_6, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 8, __pyx_L1_error)
          #else
          __pyx_t_1 = PySequence_ITEM(__pyx_t_6, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error)
          __Pyx_GOTREF(__pyx_t_1);
          #endif
        }
      } else {
        __pyx_t_1 = __pyx_t_10(__pyx_t_6);
        if (unlikely(!__pyx_t_1)) {
          PyObject* exc_type = PyErr_Occurred();
          if (exc_type) {
            if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear();
            else __PYX_ERR(0, 8, __pyx_L1_error)
          }
          break;
        }
        __Pyx_GOTREF(__pyx_t_1);
      }
      __Pyx_XDECREF_SET(__pyx_v_j, __pyx_t_1);
      __pyx_t_1 = 0;
/* … */
    }
    __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+09:             d = 0.0
      __Pyx_INCREF(__pyx_float_0_0);
      __Pyx_XDECREF_SET(__pyx_v_d, __pyx_float_0_0);
+10:             for k in range(p):
      __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_range, __pyx_v_p); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_1);
      if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) {
        __pyx_t_3 = __pyx_t_1; __Pyx_INCREF(__pyx_t_3); __pyx_t_11 = 0;
        __pyx_t_12 = NULL;
      } else {
        __pyx_t_11 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_3);
        __pyx_t_12 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 10, __pyx_L1_error)
      }
      __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
      for (;;) {
        if (likely(!__pyx_t_12)) {
          if (likely(PyList_CheckExact(__pyx_t_3))) {
            if (__pyx_t_11 >= PyList_GET_SIZE(__pyx_t_3)) break;
            #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
            __pyx_t_1 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_11); __Pyx_INCREF(__pyx_t_1); __pyx_t_11++; if (unlikely(0 < 0)) __PYX_ERR(0, 10, __pyx_L1_error)
            #else
            __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_11); __pyx_t_11++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
            __Pyx_GOTREF(__pyx_t_1);
            #endif
          } else {
            if (__pyx_t_11 >= PyTuple_GET_SIZE(__pyx_t_3)) break;
            #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
            __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_11); __Pyx_INCREF(__pyx_t_1); __pyx_t_11++; if (unlikely(0 < 0)) __PYX_ERR(0, 10, __pyx_L1_error)
            #else
            __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_11); __pyx_t_11++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
            __Pyx_GOTREF(__pyx_t_1);
            #endif
          }
        } else {
          __pyx_t_1 = __pyx_t_12(__pyx_t_3);
          if (unlikely(!__pyx_t_1)) {
            PyObject* exc_type = PyErr_Occurred();
            if (exc_type) {
              if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear();
              else __PYX_ERR(0, 10, __pyx_L1_error)
            }
            break;
          }
          __Pyx_GOTREF(__pyx_t_1);
        }
        __Pyx_XDECREF_SET(__pyx_v_k, __pyx_t_1);
        __pyx_t_1 = 0;
/* … */
      }
      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+11:                 d += (xs[i,k] - xs[j,k])**2
        __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_1);
        __Pyx_INCREF(__pyx_v_i);
        __Pyx_GIVEREF(__pyx_v_i);
        PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_i);
        __Pyx_INCREF(__pyx_v_k);
        __Pyx_GIVEREF(__pyx_v_k);
        PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_v_k);
        __pyx_t_4 = __Pyx_PyObject_GetItem(__pyx_v_xs, __pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 11, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_4);
        __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
        __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_1);
        __Pyx_INCREF(__pyx_v_j);
        __Pyx_GIVEREF(__pyx_v_j);
        PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_j);
        __Pyx_INCREF(__pyx_v_k);
        __Pyx_GIVEREF(__pyx_v_k);
        PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_v_k);
        __pyx_t_13 = __Pyx_PyObject_GetItem(__pyx_v_xs, __pyx_t_1); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 11, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_13);
        __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
        __pyx_t_1 = PyNumber_Subtract(__pyx_t_4, __pyx_t_13); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_1);
        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
        __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0;
        __pyx_t_13 = PyNumber_Power(__pyx_t_1, __pyx_int_2, Py_None); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 11, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_13);
        __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
        __pyx_t_1 = PyNumber_InPlaceAdd(__pyx_v_d, __pyx_t_13); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_1);
        __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0;
        __Pyx_DECREF_SET(__pyx_v_d, __pyx_t_1);
        __pyx_t_1 = 0;
+12:             A[i,j] = np.sqrt(d)
      __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 12, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_1);
      __pyx_t_13 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_sqrt); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 12, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_13);
      __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
      __pyx_t_1 = NULL;
      if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_13))) {
        __pyx_t_1 = PyMethod_GET_SELF(__pyx_t_13);
        if (likely(__pyx_t_1)) {
          PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_13);
          __Pyx_INCREF(__pyx_t_1);
          __Pyx_INCREF(function);
          __Pyx_DECREF_SET(__pyx_t_13, function);
        }
      }
      if (!__pyx_t_1) {
        __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_13, __pyx_v_d); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 12, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_3);
      } else {
        #if CYTHON_FAST_PYCALL
        if (PyFunction_Check(__pyx_t_13)) {
          PyObject *__pyx_temp[2] = {__pyx_t_1, __pyx_v_d};
          __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_13, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 12, __pyx_L1_error)
          __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
          __Pyx_GOTREF(__pyx_t_3);
        } else
        #endif
        #if CYTHON_FAST_PYCCALL
        if (__Pyx_PyFastCFunction_Check(__pyx_t_13)) {
          PyObject *__pyx_temp[2] = {__pyx_t_1, __pyx_v_d};
          __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_13, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 12, __pyx_L1_error)
          __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0;
          __Pyx_GOTREF(__pyx_t_3);
        } else
        #endif
        {
          __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 12, __pyx_L1_error)
          __Pyx_GOTREF(__pyx_t_4);
          __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_1); __pyx_t_1 = NULL;
          __Pyx_INCREF(__pyx_v_d);
          __Pyx_GIVEREF(__pyx_v_d);
          PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_d);
          __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_13, __pyx_t_4, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 12, __pyx_L1_error)
          __Pyx_GOTREF(__pyx_t_3);
          __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
        }
      }
      __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0;
      __pyx_t_13 = PyTuple_New(2); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 12, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_13);
      __Pyx_INCREF(__pyx_v_i);
      __Pyx_GIVEREF(__pyx_v_i);
      PyTuple_SET_ITEM(__pyx_t_13, 0, __pyx_v_i);
      __Pyx_INCREF(__pyx_v_j);
      __Pyx_GIVEREF(__pyx_v_j);
      PyTuple_SET_ITEM(__pyx_t_13, 1, __pyx_v_j);
      if (unlikely(PyObject_SetItem(__pyx_v_A, __pyx_t_13, __pyx_t_3) < 0)) __PYX_ERR(0, 12, __pyx_L1_error)
      __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0;
      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+13:     A += A.T
  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_A, __pyx_n_s_T); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 13, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __pyx_t_6 = PyNumber_InPlaceAdd(__pyx_v_A, __pyx_t_2); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 13, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_6);
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __Pyx_DECREF_SET(__pyx_v_A, __pyx_t_6);
  __pyx_t_6 = 0;
+14:     return A
  __Pyx_XDECREF(__pyx_r);
  __Pyx_INCREF(__pyx_v_A);
  __pyx_r = __pyx_v_A;
  goto __pyx_L0;
In [48]:
def pdist_base(xs):
    n, p = xs.shape
    A = np.zeros((n, n))
    for i in range(n):
        for j in range(i+1, n):
            d = 0.0
            for k in range(p):
                d += (xs[i,k] - xs[j,k])**2
            A[i,j] = np.sqrt(d)
    A += A.T
    return A
In [49]:
%timeit -r3 -n1 pdist_base(xs)
424 ms ± 1.52 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
In [50]:
func = pdist_cython_1
print(np.allclose(func(xs), sol))
%timeit -r3 -n1 func(xs)
True
399 ms ± 7.19 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)

Cython with static types

  • We provide types for all variables so that Cython can optimize their compilation to C code.
  • Note numpy functions are optimized for working with ndarrays and have unnecessary overhead for scalars. We therefor replace them with math functions from the C math library.
In [51]:
%%cython -a

import cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt, pow

@cython.boundscheck(False)
@cython.wraparound(False)
def pdist_cython_2(double[:, :] xs):
    cdef int n, p
    cdef int i, j, k
    cdef double[:, :] A
    cdef double d

    n = xs.shape[0]
    p = xs.shape[1]
    A = np.zeros((n, n))
    for i in range(n):
        for j in range(i+1, n):
            d = 0.0
            for k in range(p):
                d += pow(xs[i,k] - xs[j,k],2)
            A[i,j] = sqrt(d)
    for i in range(1, n):
        for j in range(i):
            A[i, j] = A[j, i]
    return A
Out[51]:
Cython: _cython_magic_e68147339654c54aed8e40a1749baa8e.pyx

Generated by Cython 0.28

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

 01: 
+02: import cython
  __pyx_t_1 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+03: import numpy as np
  __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 3, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 3, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
 04: cimport numpy as np
 05: from libc.math cimport sqrt, pow
 06: 
 07: @cython.boundscheck(False)
 08: @cython.wraparound(False)
+09: def pdist_cython_2(double[:, :] xs):
/* Python wrapper */
static PyObject *__pyx_pw_46_cython_magic_e68147339654c54aed8e40a1749baa8e_1pdist_cython_2(PyObject *__pyx_self, PyObject *__pyx_arg_xs); /*proto*/
static PyMethodDef __pyx_mdef_46_cython_magic_e68147339654c54aed8e40a1749baa8e_1pdist_cython_2 = {"pdist_cython_2", (PyCFunction)__pyx_pw_46_cython_magic_e68147339654c54aed8e40a1749baa8e_1pdist_cython_2, METH_O, 0};
static PyObject *__pyx_pw_46_cython_magic_e68147339654c54aed8e40a1749baa8e_1pdist_cython_2(PyObject *__pyx_self, PyObject *__pyx_arg_xs) {
  __Pyx_memviewslice __pyx_v_xs = { 0, 0, { 0 }, { 0 }, { 0 } };
  PyObject *__pyx_r = 0;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("pdist_cython_2 (wrapper)", 0);
  assert(__pyx_arg_xs); {
    __pyx_v_xs = __Pyx_PyObject_to_MemoryviewSlice_dsds_double(__pyx_arg_xs, PyBUF_WRITABLE); if (unlikely(!__pyx_v_xs.memview)) __PYX_ERR(0, 9, __pyx_L3_error)
  }
  goto __pyx_L4_argument_unpacking_done;
  __pyx_L3_error:;
  __Pyx_AddTraceback("_cython_magic_e68147339654c54aed8e40a1749baa8e.pdist_cython_2", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __Pyx_RefNannyFinishContext();
  return NULL;
  __pyx_L4_argument_unpacking_done:;
  __pyx_r = __pyx_pf_46_cython_magic_e68147339654c54aed8e40a1749baa8e_pdist_cython_2(__pyx_self, __pyx_v_xs);

  /* function exit code */
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}

static PyObject *__pyx_pf_46_cython_magic_e68147339654c54aed8e40a1749baa8e_pdist_cython_2(CYTHON_UNUSED PyObject *__pyx_self, __Pyx_memviewslice __pyx_v_xs) {
  int __pyx_v_n;
  int __pyx_v_p;
  int __pyx_v_i;
  int __pyx_v_j;
  int __pyx_v_k;
  __Pyx_memviewslice __pyx_v_A = { 0, 0, { 0 }, { 0 }, { 0 } };
  double __pyx_v_d;
  PyObject *__pyx_r = NULL;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("pdist_cython_2", 0);
/* … */
  /* function exit code */
  __pyx_L1_error:;
  __Pyx_XDECREF(__pyx_t_1);
  __Pyx_XDECREF(__pyx_t_2);
  __Pyx_XDECREF(__pyx_t_3);
  __Pyx_XDECREF(__pyx_t_4);
  __Pyx_XDECREF(__pyx_t_5);
  __PYX_XDEC_MEMVIEW(&__pyx_t_6, 1);
  __Pyx_AddTraceback("_cython_magic_e68147339654c54aed8e40a1749baa8e.pdist_cython_2", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __pyx_r = NULL;
  __pyx_L0:;
  __PYX_XDEC_MEMVIEW(&__pyx_v_xs, 1);
  __PYX_XDEC_MEMVIEW(&__pyx_v_A, 1);
  __Pyx_XGIVEREF(__pyx_r);
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}
/* … */
  __pyx_tuple__31 = PyTuple_Pack(9, __pyx_n_s_xs, __pyx_n_s_xs, __pyx_n_s_n, __pyx_n_s_p, __pyx_n_s_i, __pyx_n_s_j, __pyx_n_s_k, __pyx_n_s_A, __pyx_n_s_d); if (unlikely(!__pyx_tuple__31)) __PYX_ERR(0, 9, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_tuple__31);
  __Pyx_GIVEREF(__pyx_tuple__31);
/* … */
  __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_e68147339654c54aed8e40a1749baa8e_1pdist_cython_2, NULL, __pyx_n_s_cython_magic_e68147339654c54aed); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_pdist_cython_2, __pyx_t_1) < 0) __PYX_ERR(0, 9, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_codeobj__32 = (PyObject*)__Pyx_PyCode_New(1, 0, 9, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__31, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_Users_cliburn_ipython_cython__c, __pyx_n_s_pdist_cython_2, 9, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__32)) __PYX_ERR(0, 9, __pyx_L1_error)
 10:     cdef int n, p
 11:     cdef int i, j, k
 12:     cdef double[:, :] A
 13:     cdef double d
 14: 
+15:     n = xs.shape[0]
  __pyx_v_n = (__pyx_v_xs.shape[0]);
+16:     p = xs.shape[1]
  __pyx_v_p = (__pyx_v_xs.shape[1]);
+17:     A = np.zeros((n, n))
  __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_zeros); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __pyx_t_2 = __Pyx_PyInt_From_int(__pyx_v_n); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_n); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_4);
  __pyx_t_5 = PyTuple_New(2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_5);
  __Pyx_GIVEREF(__pyx_t_2);
  PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_2);
  __Pyx_GIVEREF(__pyx_t_4);
  PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_4);
  __pyx_t_2 = 0;
  __pyx_t_4 = 0;
  __pyx_t_4 = NULL;
  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) {
    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3);
    if (likely(__pyx_t_4)) {
      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3);
      __Pyx_INCREF(__pyx_t_4);
      __Pyx_INCREF(function);
      __Pyx_DECREF_SET(__pyx_t_3, function);
    }
  }
  if (!__pyx_t_4) {
    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 17, __pyx_L1_error)
    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
    __Pyx_GOTREF(__pyx_t_1);
  } else {
    #if CYTHON_FAST_PYCALL
    if (PyFunction_Check(__pyx_t_3)) {
      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_5};
      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 17, __pyx_L1_error)
      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
      __Pyx_GOTREF(__pyx_t_1);
      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
    } else
    #endif
    #if CYTHON_FAST_PYCCALL
    if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) {
      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_5};
      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 17, __pyx_L1_error)
      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
      __Pyx_GOTREF(__pyx_t_1);
      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
    } else
    #endif
    {
      __pyx_t_2 = PyTuple_New(1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 17, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_2);
      __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_4); __pyx_t_4 = NULL;
      __Pyx_GIVEREF(__pyx_t_5);
      PyTuple_SET_ITEM(__pyx_t_2, 0+1, __pyx_t_5);
      __pyx_t_5 = 0;
      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 17, __pyx_L1_error)
      __Pyx_GOTREF(__pyx_t_1);
      __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
    }
  }
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __pyx_t_6 = __Pyx_PyObject_to_MemoryviewSlice_dsds_double(__pyx_t_1, PyBUF_WRITABLE); if (unlikely(!__pyx_t_6.memview)) __PYX_ERR(0, 17, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_v_A = __pyx_t_6;
  __pyx_t_6.memview = NULL;
  __pyx_t_6.data = NULL;
+18:     for i in range(n):
  __pyx_t_7 = __pyx_v_n;
  __pyx_t_8 = __pyx_t_7;
  for (__pyx_t_9 = 0; __pyx_t_9 < __pyx_t_8; __pyx_t_9+=1) {
    __pyx_v_i = __pyx_t_9;
+19:         for j in range(i+1, n):
    __pyx_t_10 = __pyx_v_n;
    __pyx_t_11 = __pyx_t_10;
    for (__pyx_t_12 = (__pyx_v_i + 1); __pyx_t_12 < __pyx_t_11; __pyx_t_12+=1) {
      __pyx_v_j = __pyx_t_12;
+20:             d = 0.0
      __pyx_v_d = 0.0;
+21:             for k in range(p):
      __pyx_t_13 = __pyx_v_p;
      __pyx_t_14 = __pyx_t_13;
      for (__pyx_t_15 = 0; __pyx_t_15 < __pyx_t_14; __pyx_t_15+=1) {
        __pyx_v_k = __pyx_t_15;
+22:                 d += pow(xs[i,k] - xs[j,k],2)
        __pyx_t_16 = __pyx_v_i;
        __pyx_t_17 = __pyx_v_k;
        __pyx_t_18 = __pyx_v_j;
        __pyx_t_19 = __pyx_v_k;
        __pyx_v_d = (__pyx_v_d + pow(((*((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_xs.data + __pyx_t_16 * __pyx_v_xs.strides[0]) ) + __pyx_t_17 * __pyx_v_xs.strides[1]) ))) - (*((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_xs.data + __pyx_t_18 * __pyx_v_xs.strides[0]) ) + __pyx_t_19 * __pyx_v_xs.strides[1]) )))), 2.0));
      }
+23:             A[i,j] = sqrt(d)
      __pyx_t_20 = __pyx_v_i;
      __pyx_t_21 = __pyx_v_j;
      *((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_A.data + __pyx_t_20 * __pyx_v_A.strides[0]) ) + __pyx_t_21 * __pyx_v_A.strides[1]) )) = sqrt(__pyx_v_d);
    }
  }
+24:     for i in range(1, n):
  __pyx_t_7 = __pyx_v_n;
  __pyx_t_8 = __pyx_t_7;
  for (__pyx_t_9 = 1; __pyx_t_9 < __pyx_t_8; __pyx_t_9+=1) {
    __pyx_v_i = __pyx_t_9;
+25:         for j in range(i):
    __pyx_t_10 = __pyx_v_i;
    __pyx_t_11 = __pyx_t_10;
    for (__pyx_t_12 = 0; __pyx_t_12 < __pyx_t_11; __pyx_t_12+=1) {
      __pyx_v_j = __pyx_t_12;
+26:             A[i, j] = A[j, i]
      __pyx_t_22 = __pyx_v_j;
      __pyx_t_23 = __pyx_v_i;
      __pyx_t_24 = __pyx_v_i;
      __pyx_t_25 = __pyx_v_j;
      *((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_A.data + __pyx_t_24 * __pyx_v_A.strides[0]) ) + __pyx_t_25 * __pyx_v_A.strides[1]) )) = (*((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_A.data + __pyx_t_22 * __pyx_v_A.strides[0]) ) + __pyx_t_23 * __pyx_v_A.strides[1]) )));
    }
  }
+27:     return A
  __Pyx_XDECREF(__pyx_r);
  __pyx_t_1 = __pyx_memoryview_fromslice(__pyx_v_A, 2, (PyObject *(*)(char *)) __pyx_memview_get_double, (int (*)(char *, PyObject *)) __pyx_memview_set_double, 0);; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 27, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_r = __pyx_t_1;
  __pyx_t_1 = 0;
  goto __pyx_L0;
In [52]:
func = pdist_cython_2
print(np.allclose(func(xs), sol))
%timeit -r3 -n1 func(xs)
True
693 µs ± 342 µs per loop (mean ± std. dev. of 3 runs, 1 loop each)

Wrapping C++ cdoe

Function to port

def pdist_base(xs):
    n, p = xs.shape
    A = np.zeros((n, n))
    for i in range(n):
        for j in range(i+1, n):
            d = 0.0
            for k in range(p):
                d += (xs[i,k] - xs[j,k])**2
            A[i,j] = np.sqrt(d)
    A += A.T
    return A

First check that the function works as expected

In [53]:
%%file main.cpp
#include <iostream>
#include <Eigen/Dense>
#include <cmath>

using std::cout;

// takes numpy array as input and returns another numpy array
Eigen::MatrixXd pdist(Eigen::MatrixXd xs) {
    int n = xs.rows() ;
    int p = xs.cols();

    Eigen::MatrixXd A = Eigen::MatrixXd::Zero(n, n);
    for (int i=0; i<n; i++) {
        for (int j=i+1; j<n; j++) {
            double d = 0;
            for (int k=0; k<p; k++) {
                d += std::pow(xs(i,k) - xs(j,k), 2);
            }
            A(i, j) = std::sqrt(d);
        }
    }
    A += A.transpose().eval();

    return A;
}

int main() {
    using namespace Eigen;

    MatrixXd A(3,2);
    A << 0, 0,
         3, 4,
         5, 12;
    std::cout << pdist(A) << "\n";
}
Overwriting main.cpp
In [54]:
%%bash

g++ -o main.exe main.cpp -I./eigen3
In [55]:
%%bash

./main.exe
      0       5      13
      5       0 8.24621
     13 8.24621       0
In [56]:
A = np.array([
    [0, 0],
    [3, 4],
    [5, 12]
])
In [57]:
squareform(pdist(A))
Out[57]:
array([[ 0.        ,  5.        , 13.        ],
       [ 5.        ,  0.        ,  8.24621125],
       [13.        ,  8.24621125,  0.        ]])

Now use the boiler plate for wrapping

In [58]:
%%file wrap.cpp
<%
cfg['compiler_args'] = ['-std=c++11']
cfg['include_dirs'] = ['./eigen3']
setup_pybind11(cfg)
%>

#include <pybind11/pybind11.h>
#include <pybind11/eigen.h>

// takes numpy array as input and returns another numpy array
Eigen::MatrixXd pdist(Eigen::MatrixXd xs) {
    int n = xs.rows() ;
    int p = xs.cols();

    Eigen::MatrixXd A = Eigen::MatrixXd::Zero(n, n);
    for (int i=0; i<n; i++) {
        for (int j=i+1; j<n; j++) {
            double d = 0;
            for (int k=0; k<p; k++) {
                d += std::pow(xs(i,k) - xs(j,k), 2);
            }
            A(i, j) = std::sqrt(d);
        }
    }
    A += A.transpose().eval();

    return A;
}

PYBIND11_PLUGIN(wrap) {
    pybind11::module m("wrap", "auto-compiled c++ extension");
    m.def("pdist", &pdist);
    return m.ptr();
}
Overwriting wrap.cpp
In [59]:
import cppimport
import numpy as np

code = cppimport.imp("wrap")
print(code.pdist(A))
[[ 0.          5.         13.        ]
 [ 5.          0.          8.24621125]
 [13.          8.24621125  0.        ]]
In [60]:
func = code.pdist
print(np.allclose(func(xs), sol))
%timeit -r3 -n1 func(xs)
True
614 µs ± 144 µs per loop (mean ± std. dev. of 3 runs, 1 loop each)