The following questions
and their respective answers made me think how I could parse a single mathematical expression (in general terms along the lines of this answer https://stackoverflow.com/a/594294/1672565) given by a (more or less trusted) user efficiently for 20k to 30k input values coming from a database. I implemented a quick and dirty benchmark so I could compare different solutions.
# Runs with Python 3(.4)
import pprint
import time
# This is what I have
userinput_function = '5*(1-(x*0.1))' # String - numbers should be handled as floats
demo_len = 20000 # Parameter for benchmark (20k to 30k in real life)
print_results = False
# Some database, represented by an array of dicts (simplified for this example)
database_xy = []
for a in range(1, demo_len, 1):
database_xy.append({
'x':float(a),
'y_eval':0,
'y_sympya':0,
'y_sympyb':0,
'y_sympyc':0,
'y_aevala':0,
'y_aevalb':0,
'y_aevalc':0,
'y_numexpr': 0,
'y_simpleeval':0
})
# Solution #1: eval [yep, totally unsafe]
time_start = time.time()
func = eval("lambda x: " + userinput_function)
for item in database_xy:
item['y_eval'] = func(item['x'])
time_end = time.time()
if print_results:
pprint.pprint(database_xy)
print('1 eval: ' + str(round(time_end - time_start, 4)) + ' seconds')
# Solution #2a: sympy - evalf (http://www.sympy.org)
import sympy
time_start = time.time()
x = sympy.symbols('x')
sympy_function = sympy.sympify(userinput_function)
for item in database_xy:
item['y_sympya'] = float(sympy_function.evalf(subs={x:item['x']}))
time_end = time.time()
if print_results:
pprint.pprint(database_xy)
print('2a sympy: ' + str(round(time_end - time_start, 4)) + ' seconds')
# Solution #2b: sympy - lambdify (http://www.sympy.org)
from sympy.utilities.lambdify import lambdify
import sympy
import numpy
time_start = time.time()
sympy_functionb = sympy.sympify(userinput_function)
func = lambdify(x, sympy_functionb, 'numpy') # returns a numpy-ready function
xx = numpy.zeros(len(database_xy))
for index, item in enumerate(database_xy):
xx[index] = item['x']
yy = func(xx)
for index, item in enumerate(database_xy):
item['y_sympyb'] = yy[index]
time_end = time.time()
if print_results:
pprint.pprint(database_xy)
print('2b sympy: ' + str(round(time_end - time_start, 4)) + ' seconds')
# Solution #2c: sympy - lambdify with numexpr [and numpy] (http://www.sympy.org)
from sympy.utilities.lambdify import lambdify
import sympy
import numpy
import numexpr
time_start = time.time()
sympy_functionb = sympy.sympify(userinput_function)
func = lambdify(x, sympy_functionb, 'numexpr') # returns a numpy-ready function
xx = numpy.zeros(len(database_xy))
for index, item in enumerate(database_xy):
xx[index] = item['x']
yy = func(xx)
for index, item in enumerate(database_xy):
item['y_sympyc'] = yy[index]
time_end = time.time()
if print_results:
pprint.pprint(database_xy)
print('2c sympy: ' + str(round(time_end - time_start, 4)) + ' seconds')
# Solution #3a: asteval [based on ast] - with string magic (http://newville.github.io/asteval/index.html)
from asteval import Interpreter
aevala = Interpreter()
time_start = time.time()
aevala('def func(x):\n\treturn ' + userinput_function)
for item in database_xy:
item['y_aevala'] = aevala('func(' + str(item['x']) + ')')
time_end = time.time()
if print_results:
pprint.pprint(database_xy)
print('3a aeval: ' + str(round(time_end - time_start, 4)) + ' seconds')
# Solution #3b (M Newville): asteval [based on ast] - parse & run (http://newville.github.io/asteval/index.html)
from asteval import Interpreter
aevalb = Interpreter()
time_start = time.time()
exprb = aevalb.parse(userinput_function)
for item in database_xy:
aevalb.symtable['x'] = item['x']
item['y_aevalb'] = aevalb.run(exprb)
time_end = time.time()
print('3b aeval: ' + str(round(time_end - time_start, 4)) + ' seconds')
# Solution #3c (M Newville): asteval [based on ast] - parse & run with numpy (http://newville.github.io/asteval/index.html)
from asteval import Interpreter
import numpy
aevalc = Interpreter()
time_start = time.time()
exprc = aevalc.parse(userinput_function)
x = numpy.array([item['x'] for item in database_xy])
aevalc.symtable['x'] = x
y = aevalc.run(exprc)
for index, item in enumerate(database_xy):
item['y_aevalc'] = y[index]
time_end = time.time()
print('3c aeval: ' + str(round(time_end - time_start, 4)) + ' seconds')
# Solution #4: simpleeval [based on ast] (https://github.com/danthedeckie/simpleeval)
from simpleeval import simple_eval
time_start = time.time()
for item in database_xy:
item['y_simpleeval'] = simple_eval(userinput_function, names={'x': item['x']})
time_end = time.time()
if print_results:
pprint.pprint(database_xy)
print('4 simpleeval: ' + str(round(time_end - time_start, 4)) + ' seconds')
# Solution #5 numexpr [and numpy] (https://github.com/pydata/numexpr)
import numpy
import numexpr
time_start = time.time()
x = numpy.zeros(len(database_xy))
for index, item in enumerate(database_xy):
x[index] = item['x']
y = numexpr.evaluate(userinput_function)
for index, item in enumerate(database_xy):
item['y_numexpr'] = y[index]
time_end = time.time()
if print_results:
pprint.pprint(database_xy)
print('5 numexpr: ' + str(round(time_end - time_start, 4)) + ' seconds')
On my old test machine (Python 3.4, Linux 3.11 x86_64, two cores, 1.8GHz) I get the following results:
1 eval: 0.0185 seconds
2a sympy: 10.671 seconds
2b sympy: 0.0315 seconds
2c sympy: 0.0348 seconds
3a aeval: 2.8368 seconds
3b aeval: 0.5827 seconds
3c aeval: 0.0246 seconds
4 simpleeval: 1.2363 seconds
5 numexpr: 0.0312 seconds
What sticks out is the incredible speed of eval, though I do not want to use this in real life. The second best solution seems to be numexpr, which depends on numpy - a dependency I would like to avoid, although this is not a hard requirement. The next best thing is simpleeval, which is build around ast. aeval, another ast-based solution, suffers from the fact that I have to convert every single float input value into a string first, around which I could not find a way. sympy was initially my favorite because it offers the most flexible and apparently safest solution, but it ended up being last with some impressive distance to the second to last solution.
Update 1: There is a much faster approach using sympy. See solution 2b. It is almost as good as numexpr, though I am not sure whether sympy is actually using it internally.
Update 2: The sympy implementations now use sympify instead of simplify (as recommended by its lead developer, asmeurer - thanks). It is not using numexpr unless it is explicitly asked to do so (see solution 2c). I also added two significantly faster solutions based on asteval (thanks to M Newville).
What options do I have to speed any of the relatively safer solutions up even further? Are there other, safe(-ish) approaches using ast directly for instance?
I have used the C++ ExprTK library in the past with great success. Here is a benchmark speed test amongst other C++ parsers (e.g. Muparser, MathExpr, ATMSP etc...) and ExprTK comes out on top.
There is a Python wrapper to ExprTK called cexprtk which I have used and have found to be very fast. You are able to compile the mathematical expression just once and then evaluate this serialised expression as many times as required. Here is a simple example code using cexprtk
with the userinput_function
:
import cexprtk
import time
userinput_function = '5*(1-(x*0.1))' # String - numbers should be handled as floats
demo_len = 20000 # Parameter for benchmark (20k to 30k in real life)
time_start = time.time()
x = 1
st = cexprtk.Symbol_Table({"x":x}, add_constants = True) # Setup the symbol table
Expr = cexprtk.Expression(userinput_function, st) # Apply the symbol table to the userinput_function
for x in range(0,demo_len,1):
st.variables['x'] = x # Update the symbol table with the new x value
Expr() # evaluate expression
time_end = time.time()
print('1 cexprtk: ' + str(round(time_end - time_start, 4)) + ' seconds')
On my machine (Linux, dual core, 2.5GHz), for a demo length of 20000 this completes in 0.0202 seconds.
For a demo length of 2,000,000 cexprtk
finishes in 1.23 seconds.
Since you asked about asteval, there is a way to use it and get faster results:
aeval = Interpreter()
time_start = time.time()
expr = aeval.parse(userinput_function)
for item in database_xy:
aeval.symtable['x'] = item['x']
item['y_aeval'] = aeval.run(expr)
time_end = time.time()
That is, you can first parse ("pre-compile") the user input function, and then insert each new value of x
into the symbol table and the use Interpreter.run()
to evaluate the compiled expression for that value. On your scale, I think this will get you close to 0.5 seconds.
If you are willing to use numpy
, a hybrid solution:
aeval = Interpreter()
time_start = time.time()
expr = aeval.parse(userinput_function)
x = numpy.array([item['x'] for item in database_xy])
aeval.symtable['x'] = x
y = aeval.run(expr)
time_end = time.time()
should be much faster, and comparable in run time to using numexpr
.
CPython (and pypy) use a very simple stack language for executing functions, and it is fairly easy to write the bytecode yourself, using the ast module.
import sys
PY3 = sys.version_info.major > 2
import ast
from ast import parse
import types
from dis import opmap
ops = {
ast.Mult: opmap['BINARY_MULTIPLY'],
ast.Add: opmap['BINARY_ADD'],
ast.Sub: opmap['BINARY_SUBTRACT'],
ast.Div: opmap['BINARY_TRUE_DIVIDE'],
ast.Pow: opmap['BINARY_POWER'],
}
LOAD_CONST = opmap['LOAD_CONST']
RETURN_VALUE = opmap['RETURN_VALUE']
LOAD_FAST = opmap['LOAD_FAST']
def process(consts, bytecode, p, stackSize=0):
if isinstance(p, ast.Expr):
return process(consts, bytecode, p.value, stackSize)
if isinstance(p, ast.BinOp):
szl = process(consts, bytecode, p.left, stackSize)
szr = process(consts, bytecode, p.right, stackSize)
if type(p.op) in ops:
bytecode.append(ops[type(p.op)])
else:
print(p.op)
raise Exception("unspported opcode")
return max(szl, szr) + stackSize + 1
if isinstance(p, ast.Num):
if p.n not in consts:
consts.append(p.n)
idx = consts.index(p.n)
bytecode.append(LOAD_CONST)
bytecode.append(idx % 256)
bytecode.append(idx // 256)
return stackSize + 1
if isinstance(p, ast.Name):
bytecode.append(LOAD_FAST)
bytecode.append(0)
bytecode.append(0)
return stackSize + 1
raise Exception("unsupported token")
def makefunction(inp):
def f(x):
pass
if PY3:
oldcode = f.__code__
kwonly = oldcode.co_kwonlyargcount
else:
oldcode = f.func_code
stack_size = 0
consts = [None]
bytecode = []
p = ast.parse(inp).body[0]
stack_size = process(consts, bytecode, p, stack_size)
bytecode.append(RETURN_VALUE)
bytecode = bytes(bytearray(bytecode))
consts = tuple(consts)
if PY3:
code = types.CodeType(oldcode.co_argcount, oldcode.co_kwonlyargcount, oldcode.co_nlocals, stack_size, oldcode.co_flags, bytecode, consts, oldcode.co_names, oldcode.co_varnames, oldcode.co_filename, 'f', oldcode.co_firstlineno, b'')
f.__code__ = code
else:
code = types.CodeType(oldcode.co_argcount, oldcode.co_nlocals, stack_size, oldcode.co_flags, bytecode, consts, oldcode.co_names, oldcode.co_varnames, oldcode.co_filename, 'f', oldcode.co_firstlineno, '')
f.func_code = code
return f
This has the distinct advantage of generating essentially the same function as eval
, and it scales almost exactly as well as compile
+eval
(the compile
step is slightly slower than eval
's, and eval
will precompute anything it can (1+1+x
gets compiled as 2+x
).
For comparison, eval
finishes your 20k test in 0.0125 seconds, and makefunction
finishes in 0.014 seconds. Increasing the number of iterations to 2,000,000, eval
finishes in 1.23 seconds and makefunction
finishes in 1.32 seconds.
An interesting note, pypy recognizes that eval
and makefunction
produce essentially the same function, so the JIT warmup for the first accelerates the second.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With