Pandas' cummin and cummax functions seem to be really slow for my use case with many groups. How can I speed them up?
Update
import pandas as pd
import numpy as np
from collections import defaultdict

def cummax(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)
    result = df.groupby('group').cummax()
    result = result.values
    return result

def transform(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)
    result = df.groupby('group').transform(lambda x: x.cummax())
    result = result.values
    return result

def itertuples(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)
    d = defaultdict(list)
    result = [np.nan] * len(g)
    def d_(g, v):
        # keep a running tuple of per-column maxima for each group
        d[g].append(v)
        if len(d[g]) > 1:
            d[g][-1] = tuple(max(a, b) for (a, b) in zip(d[g][-2], d[g][-1]))
        return d[g][-1]
    for row in df.itertuples(index=True):
        index = row[0]
        result[index] = d_(row[1], row[2:])
    result = np.asarray(result)
    return result

def numpy(g, v):
    d = defaultdict(list)
    result = [np.nan] * len(g)
    def d_(g, v):
        # keep a running elementwise maximum for each group
        d[g].append(v)
        if len(d[g]) > 1:
            d[g][-1] = np.maximum(d[g][-2], d[g][-1])
        return d[g][-1]
    for i in range(len(g)):
        result[i] = d_(g[i], v[i])
    result = np.asarray(result)
    return result
LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
v = np.random.rand(LENGTH, 40)
%timeit r1 = cummax(g, v)
%timeit r2 = transform(g, v)
%timeit r3 = itertuples(g, v)
%timeit r4 = numpy(g, v)
gives
1 loop, best of 3: 22.5 s per loop
1 loop, best of 3: 18.4 s per loop
1 loop, best of 3: 1.56 s per loop
1 loop, best of 3: 325 ms per loop
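For reference, a quick sanity check (not part of the timings above) that the dict-based variant agrees with the pandas result:

r1 = cummax(g, v)
r4 = numpy(g, v)
print(np.allclose(r1, r4))  # True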
Do you have any further suggestions on how I can improve my code?
Old
import pandas as pd
import numpy as np
LENGTH = 100000
df = pd.DataFrame(
    np.random.randint(low=0, high=LENGTH/2, size=(LENGTH, 2)),
    columns=['group', 'value'])
df.groupby('group').cummax()
We'll use a defaultdict whose default value is -np.inf, because I'll be taking max values and I want a default that every value is greater than. Given an array of groups g and an array of values v over which to accumulate maximums:
def pir1(g, v):
    d = defaultdict(lambda: -np.inf)
    result = np.empty(len(g))
    def d_(g, v):
        d[g] = max(d[g], v)
        return d[g]
    for i in range(len(g)):
        result[i] = d_(g[i], v[i])
    return result
LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
v = np.random.rand(LENGTH)
vm = pd.DataFrame(dict(group=g, value=v)).groupby('group').value.cummax()
vm.eq(pir1(g, v)).all()
True
headline chart
(bar chart of the per-method timings across group sizes, produced by results.T.plot.barh() in the time testing section below)
code
I took some liberties with Divakar's function to make it accurate.
def pir1(g, v):
    d = defaultdict(lambda: -np.inf)
    result = np.empty(len(g))
    def d_(g, v):
        d[g] = max(d[g], v)
        return d[g]
    for i in range(len(g)):
        result[i] = d_(g[i], v[i])
    return result

%%cython
import numpy as np
from collections import defaultdict

# pir2 is a cythonized version of pir1 above
def pir2(g, v):
    d = defaultdict(lambda: -np.inf)
    result = np.empty(len(g))
    def d_(g, v):
        d[g] = max(d[g], v)
        return d[g]
    for i in range(len(g)):
        result[i] = d_(g[i], v[i])
    return result
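(A note if you're reproducing this in Jupyter: the %%cython cell magic needs the extension loaded first, e.g. %load_ext Cython, with the Cython package installed.)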
def argsort_unique(idx):
    # Original idea: http://stackoverflow.com/a/41242285/3293881
    n = idx.size
    sidx = np.empty(n, dtype=int)
    sidx[idx] = np.arange(n)
    return sidx

def div1(groupby, value):
    sidx = np.argsort(groupby, kind='mergesort')
    sorted_groupby, sorted_value = groupby[sidx], value[sidx]
    # Get shifts to be used for shifting each group
    mx = sorted_value.max() + 1
    shifts = sorted_groupby * mx
    # Shift, then take the maximum accumulation along the value col.
    # The shifts keep the cumulative max within each group.
    group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
    return group_cummaxed[argsort_unique(sidx)]

def div2(groupby, value):
    sidx = np.argsort(groupby, kind='mergesort')
    sorted_groupby, sorted_value = groupby[sidx], value[sidx]
    # factorize groups to integers
    sorted_groupby = np.append(
        0, sorted_groupby[1:] != sorted_groupby[:-1]).cumsum()
    # Get shifts to be used for shifting each group
    mx = sorted_value.max() + 1
    shifts = (sorted_groupby - sorted_groupby.min()) * mx
    # Shift, then take the maximum accumulation along the value col.
    # The shifts keep the cumulative max within each group.
    group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
    return group_cummaxed[argsort_unique(sidx)]
NOTES:
integer groups
Over integer-based groups, both div1 and div2 produce the same results:
np.isclose(div1(g, v), pir1(g, v)).all()
True
np.isclose(div2(g, v), pir1(g, v)).all()
True
general groups
Over string- and float-based groups, div1 becomes inaccurate, but the factorization in div2 fixes this easily:
g = g / 1000000
np.isclose(div1(g, v), pir1(g, v)).all()
False
np.isclose(div2(g, v), pir1(g, v)).all()
True
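That factorization step can be seen in isolation: it reduces the already-sorted labels to dense integer codes before the shifts are computed. A minimal sketch (the arrays here are illustrative, not from the benchmark):

import numpy as np

# dense integer codes for already-sorted group labels, as in div2
sorted_groupby = np.array([0.1, 0.1, 0.5, 0.5, 2.0])
codes = np.append(0, sorted_groupby[1:] != sorted_groupby[:-1]).cumsum()
print(codes)  # [0 0 1 1 2]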
time testing
results = pd.DataFrame(
    index=pd.Index(['pir1', 'pir2', 'div1', 'div2'], name='method'),
    columns=pd.Index(['Large', 'Medium', 'Small'], name='group size'))

size_map = dict(Large=100, Medium=10, Small=1)

from timeit import timeit

for i in results.index:
    for j in results.columns:
        results.set_value(
            i, j,
            timeit(
                '{}(g // size_map[j], v)'.format(i),
                'from __main__ import {}, size_map, g, v, j'.format(i),
                number=100
            )
        )
results
results.T.plot.barh()
Let's bring some NumPy magic to the table! Well, we will exploit np.maximum.accumulate.
Explanation
To see how maximum.accumulate could help us, let's assume we have the groups lined up sequentially. Let's consider a sample groupby:
groupby column : [0, 0, 0, 1, 1, 2, 2, 2, 2, 2]
Let's consider a sample value:
value column : [3, 1, 4, 1, 3, 3, 1, 5, 2, 4]
Using maximum.accumulate simply on value won't give us the desired output, as we need to do these accumulations only within each group. To do so, one trick would be to offset each group from the group before it.
There could be a few methods to do that offsetting work. One easy way would be to offset each group by max(value) + 1 more than the previous one. For the sample, that offset would be 6. So, for the second group we add 6, for the third 12, and so on. Thus, the modified value would be -
value column : [3, 1, 4, 7, 9, 15, 13, 17, 14, 16]
Now, we can use maximum.accumulate and the accumulations would be restricted within each group -
value cummaxed: [3, 3, 4, 7, 9, 15, 15, 17, 17, 17]
To go back to the original values, subtract the offsets that were added before.
value cummaxed: [3, 3, 4, 1, 3, 3, 3, 5, 5, 5]
That's our desired result!
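Here is that walkthrough as a runnable sketch (variable names are mine, following the implementation below):

import numpy as np

groupby = np.array([0, 0, 0, 1, 1, 2, 2, 2, 2, 2])
value = np.array([3, 1, 4, 1, 3, 3, 1, 5, 2, 4])

mx = value.max() + 1               # 6: a gap larger than any value
shifts = groupby * mx              # 0, 6 or 12 per row
group_cummaxed = np.maximum.accumulate(value + shifts) - shifts
print(group_cummaxed)              # [3 3 4 1 3 3 3 5 5 5]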
At the start, we assumed the groups to be sequential. To get the data in that format, we will use np.argsort(groupby, kind='mergesort') to get the sorted indices such that it keeps the order for the same numbers, and then use these indices to index into the groupby column.
To account for negative groupby elements, we just need to offset by max() - min() rather than just max().
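As a small illustration of the stable sort and its inverse permutation (the role argsort_unique plays below; the toy array is mine):

import numpy as np

groupby = np.array([2, 0, 1, 0, 2])
sidx = np.argsort(groupby, kind='mergesort')  # stable: ties keep their order
inv = np.empty(sidx.size, dtype=int)
inv[sidx] = np.arange(sidx.size)              # inverse permutation
# sorting and then applying the inverse restores the original order
assert (groupby[sidx][inv] == groupby).all()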
Thus, the implementation would look something like this -
def argsort_unique(idx):
    # Original idea: http://stackoverflow.com/a/41242285/3293881
    n = idx.size
    sidx = np.empty(n, dtype=int)
    sidx[idx] = np.arange(n)
    return sidx

def numpy_cummmax(groupby, value, factorize_groupby=0):
    # Inputs: 1D arrays.
    # Get sorted indices, keeping the order. Sort groupby and value cols.
    sidx = np.argsort(groupby, kind='mergesort')
    sorted_groupby, sorted_value = groupby[sidx], value[sidx]
    if factorize_groupby == 1:
        # factorize groups into 0..k-1 integer labels
        sf = np.concatenate(([0], np.flatnonzero(
            sorted_groupby[1:] != sorted_groupby[:-1]) + 1,
            [sorted_groupby.size]))
        sorted_groupby = np.repeat(np.arange(sf.size - 1), sf[1:] - sf[:-1])
    # Get shifts to be used for shifting each group
    mx = sorted_groupby.max() - sorted_groupby.min() + 1
    shifts = sorted_groupby * mx
    # Shift, then take the maximum accumulation along the value col.
    # The shifts keep the cumulative max within each group.
    group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
    return group_cummaxed[argsort_unique(sidx)]
Verification
1) Groupby as ints :
In [58]: # Setup with groupby as ints
...: LENGTH = 1000
...: g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
...: v = np.random.rand(LENGTH)
...:
In [59]: df = pd.DataFrame(np.column_stack((g,v)),columns=['group', 'value'])
In [60]: # Verify results
...: out1 = df.groupby('group').cummax()
...: out2 = numpy_cummmax(df['group'].values, df['value'].values)
...: print np.allclose(out1.values.ravel(), out2, atol=1e-5)
...:
True
2) Groupby as floats :
In [10]: # Setup with groupby as floats
...: LENGTH = 100000
...: df = pd.DataFrame(np.random.randint(0,LENGTH//2,(LENGTH,2))/10.0, \
...: columns=['group', 'value'])
In [18]: # Verify results
...: out1 = df.groupby('group').cummax()
...: out2 = numpy_cummmax(df['group'].values, df['value'].values, factorize_groupby=1)
...: print np.allclose(out1.values.ravel(), out2, atol=1e-5)
...:
True
Timings -
1) Groupby as ints (same as the setup used for timings in the question) :
In [24]: LENGTH = 100000
...: g = np.random.randint(0,LENGTH//2,(LENGTH))
...: v = np.random.rand(LENGTH)
...:
In [25]: %timeit numpy(g, v) # Best solution from posted question
1 loops, best of 3: 373 ms per loop
In [26]: %timeit pir1(g, v) # @piRSquared's solution-1
1 loops, best of 3: 165 ms per loop
In [27]: %timeit pir2(g, v) # @piRSquared's solution-2
1 loops, best of 3: 157 ms per loop
In [28]: %timeit numpy_cummmax(g, v)
100 loops, best of 3: 18.3 ms per loop
2) Groupby as floats :
In [29]: LENGTH = 100000
...: g = np.random.randint(0,LENGTH//2,(LENGTH))/10.0
...: v = np.random.rand(LENGTH)
...:
In [30]: %timeit pir1(g, v) # @piRSquared's solution-1
1 loops, best of 3: 157 ms per loop
In [31]: %timeit pir2(g, v) # @piRSquared's solution-2
1 loops, best of 3: 156 ms per loop
In [32]: %timeit numpy_cummmax(g, v, factorize_groupby=1)
10 loops, best of 3: 20.8 ms per loop
In [34]: np.allclose(pir1(g, v),numpy_cummmax(g, v, factorize_groupby=1),atol=1e-5)
Out[34]: True