So I have some data files that look like this:
47
425 425 -3 15000 15000 900 385 315 3 370 330 2 340 330 2
325 315 2 325 240 2 340 225 2 370 225 2 385 240 2
385 315 2 475 240 3 460 240 2 460 255 2 475 255 2
475 240 2 595 315 3 580 330 2 550 330 2 535 315 2
535 240 2 550 225 2 580 225 2 595 240 2 595 315 2
700 315 3 685 330 2 655 330 2 640 315 2 640 240 2
655 225 2 685 225 2 700 240 2 700 315 2 700 315 3
9076 456 2 9102 449 2 9127 443 2 9152 437 2 9178 433 2
9203 430 2 9229 428 2 9254 427 2 9280 425 2 9305 425 2
0 0 999 6865 259999
20
425 425 -3 15000 15000 900 385 315 3 370 330 2 340 330 2
325 315 2 325 240 2 340 225 2 370 225 2 385 240 2
385 315 2 475 240 3 460 240 2 460 255 2 475 255 2
475 240 2 595 315 3 580 330 2 550 330 2 535 315 2
The first number is the number of points in the block that follows, and the block then has that many points, with up to 5 points per line. Each point has 3 components (I'll call them x, y, z). x and y get 6 characters each, while z gets 4, so each point takes 16 characters. Occasionally z is 9999, which leaves no space between y and z, so using split() will mess up parsing those lines. Also, all the numbers are integers (no decimals), but there are some negatives.
In the actual file the blocks are generally 1000 points long, with some blocks being smaller (at the end of a "page", where page breaks are denoted by z=9999).
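To illustrate the split() problem, here is a quick check on a made-up 16-character chunk (the value combination is hypothetical, not taken from the file above):

chunk = "  9305   4259999"                      # x=9305, y=425, z=9999: no space before z

print(chunk.split())                            # ['9305', '4259999']: y and z are fused
print(chunk[0:6], chunk[6:12], chunk[12:16])    # fixed-width slicing still separates them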
My initial solution was to use regex:
import re

def get_points_regex(filename):
    with open(filename, 'r') as f:
        text = f.read()
    points = []
    for m in re.finditer(r'([ \d-]{6})([ \d-]{6})([ \d\-]{4})', text):
        point = tuple(int(i) for i in m.groups())
        points.append(point)
    return points
My test file is 55283 lines long (4.4 MB) and contains 274761 points.
Using timeit on get_points_regex I get 560 ms.
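For reference, these timings come from timeit; outside a notebook, an equivalent measurement would look roughly like this ('test.dat' is just a placeholder path for my test file):

import timeit

t = timeit.timeit(lambda: get_points_regex('test.dat'), number=10)
print('{:.0f} ms per call'.format(t / 10 * 1000))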
I then figured that while finditer is memory efficient, generating thousands of match objects is slow when I don't need any of their features, so I made a version using re.findall:
def get_points_regex2(filename):
    with open(filename, 'r') as f:
        text = f.read()
    points = re.findall(r'([ \d-]{6})([ \d-]{6})([ \d\-]{4})', text)
    points = [tuple(map(int, point)) for point in points]
    return points
This version runs in 414 ms, 1.35x faster than finditer.
Then I was thinking that for such simple patterns regex might be overkill, so I made a version using pure Python:
def get_points_simple(filename):
    points = []
    with open(filename, 'r') as f:
        for line in f:
            n_chunks = int(len(line)/16)
            for i in range(n_chunks):
                chunk = line[16*i:16*(i+1)]
                x = int(chunk[0:6])
                y = int(chunk[6:12])
                z = int(chunk[12:16])
                points.append((x, y, z))
    return points
This runs in 386 ms, 1.07x faster than regex.
Then I broke down and tried Cython for the first time. I'm just running it with the %%cython cell magic in a Jupyter notebook. I came up with this:
%%cython

def get_points_cython(filename):
    cdef int i, x, y, z
    points = []
    f = open(filename, 'r')
    for line in f:
        n_chunks = int(len(line)/16)
        for i in range(n_chunks):
            chunk = line[16*i:16*(i+1)]
            x = int(chunk[0:6])
            y = int(chunk[6:12])
            z = int(chunk[12:16])
            points.append((x, y, z))
    f.close()
    return points
The Cython function runs in 196 ms (2x faster than pure Python).
I tried to simplify some expressions, like not using a context manager for file opening. While I declared the integers, I wasn't sure what else to do, so I left the rest alone. I made a couple of attempts at using a 2D integer array instead of a list of tuples for points, but Python segfaulted (I'm assuming that's what happened; the IPython kernel died). I had cdef int points[1000000][3] and then assigned with statements like points[j][1] = x while incrementing j. From some light reading and very little C background, I think that might be a rather large array? Stack vs. heap (I don't really know what these are)? Need things like malloc? I'm a bit lost on that stuff.
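For reference, a 1000000 x 3 array of 4-byte ints is about 12 MB, which is larger than a typical C stack, so a stack overflow would explain the dead kernel. A heap-allocated version (just a sketch of the malloc idea, not something I benchmarked) would look roughly like this:

%%cython
from libc.stdlib cimport malloc, free

def count_points_heap(filename):
    # Heap-allocate room for 1,000,000 points of 3 ints each, instead of a
    # stack array like `cdef int points[1000000][3]`, which can overflow.
    cdef int *points = <int *> malloc(1000000 * 3 * sizeof(int))
    if points == NULL:
        raise MemoryError()
    cdef int i, j = 0
    try:
        with open(filename, 'r') as f:
            for line in f:
                for i in range(len(line) // 16):
                    chunk = line[16*i:16*(i+1)]
                    points[3*j + 0] = int(chunk[0:6])
                    points[3*j + 1] = int(chunk[6:12])
                    points[3*j + 2] = int(chunk[12:16])
                    j += 1
        return j  # the flat buffer would still need to be copied out to Python
    finally:
        free(points)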
Next I had read that maybe I should just use NumPy, since Cython is good at that. Following this I was able to create this function:
%%cython
import numpy as np
cimport numpy as np

DTYPE = np.int
ctypedef np.int_t DTYPE_t

def get_points_cython_numpy(filename):
    cdef int i, j, x, y, z
    cdef np.ndarray points = np.zeros([1000000, 3], dtype=DTYPE)
    f = open(filename, 'r')
    j = 0
    for line in f:
        n_chunks = int(len(line)/16)
        for i in range(n_chunks):
            chunk = line[16*i:16*(i+1)]
            x = int(chunk[0:6])
            y = int(chunk[6:12])
            z = int(chunk[12:16])
            points[j, 0] = x
            points[j, 1] = y
            points[j, 2] = z
            j = j + 1
    f.close()
    return points
Unfortunately this takes 263 ms, so a little slower.
Am I missing something obvious with cython or python std lib that would make parsing this any faster, or is this about as fast as it gets for a file of this size?
I thought about pandas and numpy loading functions, but I figured the chunk-size rows would complicate things too much. At one point I almost had something working with pandas read_fwf followed by DataFrame.values.reshape(-1, 3), then dropping rows with NaNs, but I knew that had to be slower by that point.
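For completeness, the read_fwf version I was circling around would have looked roughly like this (a sketch, with the NaN-dropping done in NumPy; get_points_pandas is just an illustrative name):

import numpy as np
import pandas as pd

def get_points_pandas(filename):
    # 5 points per line, each point is 3 fixed-width fields of 6, 6 and 4 chars.
    widths = [6, 6, 4] * 5
    df = pd.read_fwf(filename, widths=widths, header=None)
    arr = df.values.reshape(-1, 3)
    # Short lines and the point-count lines leave NaNs after the reshape; drop them.
    return arr[~np.isnan(arr).any(axis=1)].astype(int)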
Any ideas to speed this up would be very appreciated!
I'd love to get this below 100ms so that a GUI can be updated rapidly from reading these files as they get generated. (Move slider > run background analysis > load data > plot results in real time).
Here is a faster example. It uses fast_atoi() to convert strings to ints, and it's 2x faster than get_points_cython() on my PC. If the point-count lines all had the same width (8 chars), then I think it could be sped up further, to about 12x faster than get_points_cython().
%%cython
import numpy as np
cimport numpy as np
import cython

cdef int fast_atoi(char *buff):
    cdef int c = 0, sign = 0, x = 0
    cdef char *p = buff
    while True:
        c = p[0]
        if c == 0:                # NUL terminator: end of the string
            break
        if c == 45:               # ASCII '-'
            sign = 1
        elif c > 47 and c < 58:   # ASCII '0'..'9'
            x = x * 10 + c - 48
        p += 1
    return -x if sign else x

@cython.boundscheck(False)
@cython.wraparound(False)
def get_points_cython_numpy(filename):
    cdef int i, j, x, y, z, n_chunks
    cdef bytes line, chunk
    cdef int[:, ::1] points = np.zeros([500000, 3], np.int32)
    f = open(filename, 'rb')
    j = 0
    for line in f:
        n_chunks = int(len(line)/16)
        for i in range(n_chunks):
            chunk = line[16*i:16*(i+1)]
            x = fast_atoi(chunk[0:6])
            y = fast_atoi(chunk[6:12])
            z = fast_atoi(chunk[12:16])
            points[j, 0] = x
            points[j, 1] = y
            points[j, 2] = z
            j = j + 1
    f.close()
    # .base is the ndarray backing the memoryview; slice off the unused rows.
    return points.base[:j]
Here is the fastest method. The idea is to read the whole file content into a bytes object and get the points data straight from it.
%%cython
import numpy as np
cimport numpy as np
import cython

@cython.boundscheck(False)
@cython.wraparound(False)
cdef inline int fast_atoi(char *buf, int size):
    cdef int i = 0, c = 0, sign = 0, x = 0
    for i in range(size):
        c = buf[i]
        if c == 0:                # NUL terminator: end of the buffer
            break
        if c == 45:               # ASCII '-'
            sign = 1
        elif c > 47 and c < 58:   # ASCII '0'..'9'
            x = x * 10 + c - 48
    return -x if sign else x

@cython.boundscheck(False)
@cython.wraparound(False)
def fastest_read_points(fn):
    cdef bytes buf
    with open(fn, "rb") as f:
        buf = f.read().replace(b"\n", b"")  # adjust for your line endings
    cdef char *p = buf
    cdef int length = len(buf)
    cdef char *buf_end = p + length
    cdef int count = length // 16 * 2       # over-allocate so the array is large enough
    cdef int[:, ::1] res = np.zeros((count, 3), np.int32)
    cdef int i, j, block_count
    i = 0
    while p < buf_end:
        block_count = fast_atoi(p, 10)      # assumes the count field occupies 10 chars in the file
        p += 10
        for j in range(block_count):
            res[i, 0] = fast_atoi(p, 6)
            res[i, 1] = fast_atoi(p + 6, 6)
            res[i, 2] = fast_atoi(p + 12, 4)
            p += 16
            i += 1
    return res.base[:i]
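Usage is the same as the functions in the question; once the cell compiles, something like this should work ("test.dat" is a placeholder path):

pts = fastest_read_points("test.dat")
print(pts.shape, pts.dtype)    # (number_of_points, 3) int32

In a notebook you can compare it against the question's versions with %timeit fastest_read_points("test.dat").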