I have the following piece of code that I execute around 2 million times in my application to parse that many records. This part seems to be the bottleneck, and I was wondering if anyone could suggest some nifty tricks to make these simple string manipulations faster.
try:
    data = []
    start = 0
    end = 0
    for info in self.Columns():
        end = start + (info.columnLength)
        slice = line[start:end]
        if slice == '' or len(slice) != info.columnLength:
            raise 'Wrong Input'
        if info.hasSignage:
            if(slice[0:1].strip() != '+' and slice[0:1].strip() != '-'):
                raise 'Wrong Input'
        if not info.skipColumn:
            data.append(slice)
        start = end
    parsedLine = data
except:
    parsedLine = False
def fubarise(data):
    try:
        if nasty(data):
            raise ValueError("Look, Ma, I'm doing a big fat GOTO ...") # sheesh #1
        more_of_the_same()
        parsed_line = data
    except ValueError:
        parsed_line = False
    # so it can be a "data" or False -- sheesh #2
    return parsed_line
There is no point in having different error messages in the raise
statement; they are never seen. Sheesh #3.
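For contrast, here is a minimal sketch of the same per-line checks written without the exception-as-GOTO and without the data-or-False result, assuming the question's Columns()/Info interface; a bad line is reported as None:

# A minimal sketch, assuming the question's Columns()/Info interface: the same
# checks expressed with early returns instead of exceptions used as a GOTO,
# and None instead of False to signal a bad line.
def parse_line(self, line):
    data = []
    start = 0
    for info in self.Columns():
        end = start + info.columnLength
        field = line[start:end]
        if len(field) != info.columnLength:
            return None   # short or malformed line
        if info.hasSignage and field[0] not in '+-':
            return None   # missing leading sign
        if not info.skipColumn:
            data.append(field)
        start = end
    return data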
Update: Here is a suggested improvement which uses struct.unpack to partition input lines rapidly. It also illustrates better exception handling, under the assumption that the writer of the code is also running it and that stopping on the first error is acceptable. A robust implementation which logs all errors in all columns of all lines for a user audience is another matter. Note that typically the error checking for each column would be much more extensive; e.g., checking for a leading sign but not checking whether the column contains a valid number seems a little odd.
import struct

def unpacked_records(self):
    cols = self.Columns()
    unpack_fmt = ""
    sign_checks = []
    start = 0
    for colx, info in enumerate(cols, 1):
        clen = info.columnLength
        if clen < 1:
            raise ValueError("Column %d: Bad columnLength %r" % (colx, clen))
        if info.skipColumn:
            unpack_fmt += str(clen) + "x"
        else:
            unpack_fmt += str(clen) + "s"
        if info.hasSignage:
            sign_checks.append(start)
        start += clen
    expected_len = start
    unpack = struct.Struct(unpack_fmt).unpack
    for linex, line in enumerate(self.whatever_the_list_of_lines_is, 1):
        if len(line) != expected_len:
            raise ValueError(
                "Line %d: Actual length %d, expected %d"
                % (linex, len(line), expected_len))
        if not all(line[i] in '+-' for i in sign_checks):
            raise ValueError("Line %d: At least one column fails sign check" % linex)
        yield unpack(line)  # a tuple
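For illustration, here is how a format string built from the "<n>s" (keep) and "<n>x" (skip) codes partitions a fixed-width record in a single call; the field widths and data below are made up, and in Python 3 struct.unpack operates on bytes:

# Made-up fixed-width record: keep 5 chars, keep 5 chars, skip 6, keep 5.
import struct
record = b"+1234-5678IGNORE+9999"
print(struct.unpack("5s5s6x5s", record))   # (b'+1234', b'-5678', b'+9999')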
What about this (using some classes to make an executable example):
class Info(object):
    columnLength = 5
    hasSignage = True
    skipColumn = False

class Something(object):
    def Columns(self):
        return [Info()] * 4

    def bottleneck(self):
        try:
            data = []
            start = 0
            end = 0
            line = '+this-is just a line for testing'
            for info in self.Columns():
                start = end
                collength = info.columnLength
                end = start + collength
                if info.skipColumn:  # start with this
                    continue
                elif collength == 0:
                    raise ValueError('Wrong Input')
                slice = line[start:end]  # only now slicing, because it
                                         # is probably the most expensive part
                if len(slice) != collength:
                    raise ValueError('Wrong Input')
                elif info.hasSignage and slice[0] not in '+-':  # bit more compact
                    raise ValueError('Wrong Input')
                else:
                    data.append(slice)
            parsedLine = data
        except:
            parsedLine = False
Something().bottleneck()
edit: when the length of slice is 0, slice[0] does not exist, so collength == 0 has to be checked first.
edit2: You are using this bit of code for many, many lines, but the column info does not change, right? That allows you to do the per-column work (offsets, lengths, sign positions) once, outside the per-line loop, as sketched below.
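A sketch of what that precomputation could look like, reusing the Info/Something classes from the example above; build_line_parser is a made-up helper name, not part of the question's code:

# Hypothetical helper: build the slicing plan once, then reuse it per line.
def build_line_parser(columns):
    keep = []            # (start, end) pairs for columns that are kept
    sign_positions = []  # offsets that must hold '+' or '-'
    start = 0
    for info in columns:
        end = start + info.columnLength
        if info.hasSignage:
            sign_positions.append(start)
        if not info.skipColumn:
            keep.append((start, end))
        start = end
    expected_len = start
    def parse(line):
        if len(line) != expected_len:
            return None
        if any(line[i] not in '+-' for i in sign_positions):
            return None
        return [line[s:e] for s, e in keep]
    return parse

parse = build_line_parser(Something().Columns())   # done once
# parse(line) is then called for each of the ~2 million lines

This does once per file what the original loop repeats for every single line.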
edit3: I wonder how you will know which data index belongs to which column if you use 'skipColumn'...
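One way to keep that association, sketched against the example classes above without changing Info:

# Remember which original column index each kept value came from,
# so data[k] can be traced back to column kept_cols[k].
columns = Something().Columns()
kept_cols = [i for i, info in enumerate(columns) if not info.skipColumn]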