Why is C++ much faster than Python with Boost?

My goal is to write a small library for spectral finite elements in Python, and to that end I tried extending Python with a C++ library using Boost, hoping that it would make my code faster.

class Quad {
    public:
        Quad(int, int);

        // Integrate a function over the quadrature nodes
        double integrate(boost::function<double(std::vector<double> const&)> const&);

        // Same integration, but taking a Python callable
        double integrate_wrapper(boost::python::object const&);

        std::vector< std::vector<double> > nodes;
        std::vector<double> weights;
};

...

namespace std {
    typedef std::vector< std::vector< std::vector<double> > > cube;
    typedef std::vector< std::vector<double> > mat;
    typedef std::vector<double> vec;
}

...

double Quad::integrate(boost::function<double(vec const&)> const& func) {

    double result = 0.;
    for (unsigned int i = 0; i < nodes.size(); ++i) {
        result += func(nodes[i]) * weights[i];
    }
    return result;
}

// ---- PYTHON WRAPPER ----
double Quad::integrate_wrapper(boost::python::object const& func) {
    std::function<double(vec const&)> lambda;
    switch (this->nodes[0].size()) {
        case 1: lambda = [&func](vec const& v) -> double { return boost::python::extract<double>(func(v[0])); }; break;
        case 2: lambda = [&func](vec const& v) -> double { return boost::python::extract<double>(func(v[0], v[1])); }; break;
        case 3: lambda = [&func](vec const& v) -> double { return boost::python::extract<double>(func(v[0], v[1], v[2])); }; break;
        default: cout << "Dimension must be 1, 2, or 3" << endl; exit(0);
    }
    return integrate(lambda);
}

// ---- EXPOSE TO PYTHON ----
BOOST_PYTHON_MODULE(hermite)
{
    using namespace boost::python;

    class_<std::vec>("double_vector")
        .def(vector_indexing_suite<std::vec>())
        ;

    class_<std::mat>("double_mat")
        .def(vector_indexing_suite<std::mat>())
        ;

    class_<Quad>("Quad", init<int,int>())
        .def("integrate", &Quad::integrate_wrapper)
        .def_readonly("nodes", &Quad::nodes)
        .def_readonly("weights", &Quad::weights)
        ;
}

I compared the performance of three different methods to calculate the integral of two functions. The two functions are:

  • The function f1(x,y,z) = x*x
  • A function that is more difficult to evaluate (note that the same expression is deliberately repeated five times): f2(x,y,z) = np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) + np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) + np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) + np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) + np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z)

The methods used are:

  1. Call the library from a C++ program:

    double func(vector<double> v) {
        return F1_OR_F2;
    }
    
    int main() {
        hermite::Quad quadrature(100, 3);
        double result = quadrature.integrate(func);
        cout << "Result = " << result << endl;
    }
    
  2. Call the library from a Python script:

    import hermite
    def function(x, y, z): return F1_OR_F2
    my_quad = hermite.Quad(100, 3)
    result = my_quad.integrate(function)
    
  3. Use a for loop in Python:

    import hermite
    def function(x, y, z): return F1_OR_F2
    my_quad = hermite.Quad(100, 3)
    weights = my_quad.weights
    nodes = my_quad.nodes
    result = 0.
    for i in range(len(weights)):
        result += weights[i] * function(nodes[i][0], nodes[i][1], nodes[i][2])
    

Here are the execution times for each method. The time was measured using the time command for method 1 and the Python time module for methods 2 and 3 (a minimal sketch of that harness is shown after the results), and the C++ code was compiled with CMake using set(CMAKE_BUILD_TYPE Release).

  • For f1:

    • Method 1: 0.07s user 0.01s system 99% cpu 0.083 total
    • Method 2: 0.19s
    • Method 3: 3.06s
  • For f2:

    • Method 1: 0.28s user 0.01s system 99% cpu 0.289 total
    • Method 2: 12.47s
    • Method 3: 16.31s
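
For reference, here is a minimal sketch of how the Python-side times for methods 2 and 3 were presumably measured with the time module (the question does not show the exact harness; function and my_quad are the objects defined in method 2):

    # Hypothetical timing harness, assuming the definitions from method 2 above.
    import time

    start = time.time()
    result = my_quad.integrate(function)   # or the explicit loop of method 3
    print("Time = " + str(time.time() - start))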

Based on these results, my questions are the following:

  • Why is the first method so much faster than the second?

  • Could the python wrapper be improved to reach comparable performance between methods 1 and 2?

  • Why is method 2 more sensitive than method 3 to the difficulty of the function to integrate?


EDIT: I also tried defining a function that accepts a string as an argument, writes it to a file, compiles the file, and dynamically loads the resulting .so file:

double Quad::integrate_from_string(string const& function_body) {

    // Write function to file
    ofstream helper_file;
    helper_file.open("/tmp/helper_function.cpp");
    helper_file << "#include <vector>\n#include <cmath>\n";
    helper_file << "extern \"C\" double toIntegrate(std::vector<double> v) {\n";
    helper_file << "    return " << function_body << ";\n}";
    helper_file.close();

    // Compile file
    system("c++ /tmp/helper_function.cpp -o /tmp/helper_function.so -shared -fPIC");

    // Load function dynamically
    typedef double (*vec_func)(vec);
    void *function_so = dlopen("/tmp/helper_function.so", RTLD_NOW);
    vec_func func = (vec_func) dlsym(function_so, "toIntegrate");
    double result = integrate(func);
    dlclose(function_so);
    return result;
}

It's quite dirty and probably not very portable, so I'd be happy to find a better solution, but it works well and plays nicely with the ccode function of sympy.
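
For example, the function body can be generated directly from a sympy expression with sympy's ccode function. A minimal sketch (the symbol names v[0], v[1], v[2] are chosen to match the generated C++ signature, and the hermite module's integrate_from_string is assumed to be exposed to Python like the other methods):

    # Sketch: generate a C expression with sympy and hand it to the
    # (assumed) integrate_from_string binding.
    import sympy as sym
    import hermite

    x, y, z = [sym.Symbol('v[%d]' % i) for i in range(3)]
    expr = sym.cos(2*x + 2*y + 2*z) + x*y + sym.exp(-z*z)
    body = sym.ccode(expr)   # a C expression in terms of v[0], v[1], v[2]

    my_quad = hermite.Quad(100, 3)
    result = my_quad.integrate_from_string(body)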


SECOND EDIT: I have rewritten the function in pure Python using NumPy.

import numpy as np
import numpy.polynomial.hermite_e as herm
import time

def integrate(function, degrees):
    dim = len(degrees)
    nodes_multidim = []
    weights_multidim = []
    for i in range(dim):
        nodes_1d, weights_1d = herm.hermegauss(degrees[i])
        nodes_multidim.append(nodes_1d)
        weights_multidim.append(weights_1d)
    grid_nodes = np.meshgrid(*nodes_multidim)
    grid_weights = np.meshgrid(*weights_multidim)
    nodes_flattened = []
    weights_flattened = []
    for i in range(dim):
        nodes_flattened.append(grid_nodes[i].flatten())
        weights_flattened.append(grid_weights[i].flatten())
    nodes = np.vstack(nodes_flattened)
    weights = np.prod(np.vstack(weights_flattened), axis=0)
    return np.dot(function(nodes), weights)

def function(v): return F1_OR_F2

start = time.time()
result = integrate(function, [100, 100, 100])
end = time.time()
print("-> Result = " + str(result) + ", Time = " + str(end - start))

Somewhat surprisingly (at least to me), there is no significant difference in performance between this method and the pure C++ implementation. In particular, it takes 0.059s for f1 and 0.36s for f2.
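
Note that with this interface the integrand receives the whole (dim, n_points) array of nodes at once, so it must be written in vectorized form. For instance, a vectorized f1 might look like this (illustrative, not from the original post):

    # Vectorized f1 for the NumPy integrate() above: v has shape (dim, n_points),
    # so v[0] is the array of all x coordinates and the result has shape (n_points,).
    def function(v):
        return v[0] * v[0]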

asked Feb 12 '18 by Rastapopoulos




2 Answers

Your functions take vectors by value, which involves copying the vector, and integrate_wrapper does extra copies on top of that.

It also makes sense to accept the boost::function by const reference and to capture func by reference in those lambdas.

Change these to the following (note the & and const& bits):

double integrate(boost::function<double(std::vector<double> const&)> const&);

double Quad::integrate_wrapper(boost::python::object const& func) {
    std::function<double(vec const&)> lambda;
    switch (this->nodes[0].size()) {
        case 1: lambda = [&func](vec const& v) -> double { return boost::python::extract<double>(func(v[0])); }; break;
        case 2: lambda = [&func](vec const& v) -> double { return boost::python::extract<double>(func(v[0], v[1])); }; break;
        case 3: lambda = [&func](vec const& v) -> double { return boost::python::extract<double>(func(v[0], v[1], v[2])); }; break;
        default: cout << "Dimension must be 1, 2, or 3" << endl; exit(0);
    }
    return integrate(lambda);
}

Still though, calling a Python function from C++ is more expensive than calling a C++ function.
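
To get a feel for that per-call cost, here is a rough sketch (not from the original answer) that times a million trivial Python-level calls, roughly the number of nodes in a 100^3 grid:

    # Rough sketch: interpreter overhead (frame setup, argument boxing) is paid
    # on every call and dominates when the integrand itself is cheap.
    import time

    def f(x, y, z):
        return x * x

    t0 = time.time()
    acc = 0.0
    for _ in range(10**6):
        acc += f(0.1, 0.2, 0.3)
    print("1e6 Python-level calls: " + str(time.time() - t0) + "s")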


People normally use numpy for fast linear algebra in Python, since it uses SIMD for many common operations. You should probably consider using numpy before rolling out a C++ implementation. In C++, you would have to use something like Intel MKL or Eigen to get comparable vectorization.

answered by Maxim Egorushkin


An alternative way

If you accept a somewhat less general solution, your problem can be solved much more easily: you can write both the integration and the function in pure Python code and compile them with Numba.

First approach (about 0.025s per integration on an i7-4771, after the first run)

The function is compiled at the first call; this takes about 0.5s.

The function f2:

@nb.njit(fastmath=True)
def function_to_integrate(x, y, z):
    return np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) + np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) + np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) + np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) + np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z)

Integration

@nb.jit(fastmath=True)
def integrate3(num_int_Points):
  nodes_1d, weights_1d = herm.hermegauss(num_int_Points)

  result=0.

  for i in range(num_int_Points):
    for j in range(num_int_Points):
      result+=np.sum(function_to_integrate(nodes_1d[i],nodes_1d[j],nodes_1d[:])*weights_1d[i]*weights_1d[j]*weights_1d[:])

  return result

Testing

import numpy as np
import numpy.polynomial.hermite_e as herm
import numba as nb
import time

num_int_Points = 100
# Only needed for the second approach below, where nodes and weights are precomputed
nodes_1d, weights_1d = herm.hermegauss(num_int_Points)

t1 = time.time()
for i in range(100):
  # result = integrate3(nodes_1d, weights_1d, num_int_Points)  # second approach
  result = integrate3(num_int_Points)

print(time.time() - t1)
print(result)

Second approach

The function can also be run in parallel; when integrating over many elements, the Gauss points and weights need to be calculated only once. This results in a runtime of about 0.005s.

@nb.njit(fastmath=True,parallel=True)
def integrate3(nodes_1d,weights_1d,num_int_Points):

  result=0.

  for i in nb.prange(num_int_Points):
    for j in range(num_int_Points):
      result+=np.sum(function_to_integrate(nodes_1d[i],nodes_1d[j],nodes_1d[:])*weights_1d[i]*weights_1d[j]*weights_1d[:])

  return result

Passing an arbitrary function

import numpy as np
import numpy.polynomial.hermite_e as herm
import numba as nb
import time

def f(x,y,z):
  return np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) +np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) +np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) +np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) +np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z)

def make_integrate3(f):
  f_jit=nb.njit(f,fastmath=True)
  @nb.njit(fastmath=True,parallel=True)
  def integrate_3(nodes_1d,weights_1d,num_int_Points):
      result=0.
      for i in nb.prange(num_int_Points):
        for j in range(num_int_Points):
          result+=np.sum(f_jit(nodes_1d[i],nodes_1d[j],nodes_1d[:])*weights_1d[i]*weights_1d[j]*weights_1d[:])

      return result

  return integrate_3


int_fun=make_integrate3(f)
num_int_Points=100
nodes_1d, weights_1d = herm.hermegauss(num_int_Points)
#Calling it the first time (takes about 1s)
result = int_fun(nodes_1d,weights_1d,100)

t1=time.time()
for i in range(100):
  result = int_fun(nodes_1d,weights_1d,100)

print(time.time()-t1)
print(result)

After the first call this takes about 0.002s, using Numba 0.38 with Intel SVML.

answered by max9111