Speed up Pandas cummin/cummax

The pandas cummin and cummax functions seem to be really slow for my use case with many groups. How can I speed them up?

Update

import pandas as pd
import numpy as np

from collections import defaultdict

# Baseline: build a DataFrame and call groupby('group').cummax() directly.
def cummax(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)

    result = df.groupby('group').cummax()
    result = result.values
    return result


# Same DataFrame, but via groupby('group').transform() with a per-group cummax.
def transform(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)

    result = df.groupby('group').transform(lambda x: x.cummax())
    result = result.values
    return result

# Row-by-row over df.itertuples(), tracking each group's running max in a dict.
def itertuples(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)

    d = defaultdict(list)

    result = [np.nan] * len(g)

    def d_(g, v):
        d[g].append(v)
        if len(d[g]) > 1:
            d[g][-1] = tuple(max(a,b) for (a,b) in zip(d[g][-2], d[g][-1]))
        return d[g][-1]

    for row in df.itertuples(index=True):
        index = row[0]
        result[index] = d_(row[1], row[2:])

    result = np.asarray(result)
    return result

# Same running-max idea, but looping over the raw arrays with np.maximum.
def numpy(g, v):
    d = defaultdict(list)

    result = [np.nan] * len(g)

    def d_(g, v):
        d[g].append(v)
        if len(d[g]) > 1:
            d[g][-1] = np.maximum(d[g][-2], d[g][-1])
        return d[g][-1]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    result = np.asarray(result)
    return result



LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH // 2, size=LENGTH)
v = np.random.rand(LENGTH, 40)

%timeit r1 = cummax(g, v)
%timeit r2 = transform(g, v)
%timeit r3 = itertuples(g, v)
%timeit r4 = numpy(g, v)

gives

1 loop, best of 3: 22.5 s per loop   # cummax
1 loop, best of 3: 18.4 s per loop   # transform
1 loop, best of 3: 1.56 s per loop   # itertuples
1 loop, best of 3: 325 ms per loop   # numpy

Do you have any further suggestions how I can improve my code?

Old

import pandas as pd
import numpy as np

LENGTH = 100000

df = pd.DataFrame(
    np.random.randint(low=0, high=LENGTH // 2, size=(LENGTH, 2)),
    columns=['group', 'value'])

df.groupby('group').cummax()
— asked by David, Jan 07 '17

2 Answers

We'll use a defaultdict whose default value is -np.inf, because we're taking running maximums and want a default that every value beats.
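As a minimal illustration of that default behavior:

from collections import defaultdict
import numpy as np

d = defaultdict(lambda: -np.inf)
print(max(d['unseen group'], 3.5))  # 3.5 -- the -inf default loses to any real value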

solution

Given an array of groups g and an array of values v over which to accumulate maximums:

def pir1(g, v):
    # Track each group's running max; the -inf default loses to any real value.
    d = defaultdict(lambda: -np.inf)
    result = np.empty(len(g))

    def d_(g, v):
        d[g] = max(d[g], v)
        return d[g]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    return result

demonstration

LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH // 2, size=LENGTH)
v = np.random.rand(LENGTH)

accuracy

vm = pd.DataFrame(dict(group=g, value=v)).groupby('group').value.cummax()
vm.eq(pir1(g, v)).all()

True



Deep Dive

Compare with Divakar's answer

headline chart
(image: timing comparison of pir1, pir2, div1, and div2 across group sizes; not preserved)

code
I took some liberties with Divakar's function to make it accurate.

%%cython
import numpy as np
from collections import defaultdict

# pir1: compiled with Cython; identical source to the pure-Python pir2 below
def pir1(g, v):
    d = defaultdict(lambda: -np.inf)
    result = np.empty(len(g))

    def d_(g, v):
        d[g] = max(d[g], v)
        return d[g]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    return result

# pir2: the same function in pure Python, defined outside the %%cython cell
def pir2(g, v):
    d = defaultdict(lambda: -np.inf)
    result = np.empty(len(g))

    def d_(g, v):
        d[g] = max(d[g], v)
        return d[g]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    return result

def argsort_unique(idx):
    # Inverse permutation of idx: maps sorted positions back to original order.
    # Original idea : http://stackoverflow.com/a/41242285/3293881
    n = idx.size
    sidx = np.empty(n,dtype=int)
    sidx[idx] = np.arange(n)
    return sidx

def div1(groupby, value): 
    sidx = np.argsort(groupby,kind='mergesort')
    sorted_groupby, sorted_value = groupby[sidx], value[sidx]

    # Get shifts to be used for shifting each group
    mx = sorted_value.max() + 1
    shifts = sorted_groupby * mx

    # Shift, then take the running max along the value col.
    # The shifts keep the cumulative max confined within each group.
    group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
    return group_cummaxed[argsort_unique(sidx)]

def div2(groupby, value):
    sidx = np.argsort(groupby, kind='mergesort')
    sorted_groupby, sorted_value = groupby[sidx], value[sidx]
    # factorize groups to integers
    sorted_groupby = np.append(
        0, sorted_groupby[1:] != sorted_groupby[:-1]).cumsum()

    # Get shifts to be used for shifting each group
    mx = sorted_value.max() + 1
    shifts = (sorted_groupby - sorted_groupby.min()) * mx 

    # Shift, then take the running max along the value col.
    # The shifts keep the cumulative max confined within each group.
    group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
    return group_cummaxed[argsort_unique(sidx)]

NOTES:

  • It was necessary to factorize the groups in Divakar's solution to generalize it to non-integer groups (see the sketch below)
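For reference, here is a minimal sketch of that factorization step in isolation (np.unique offers an equivalent route):

import numpy as np

g = np.array(['b', 'a', 'b', 'c', 'a'])
sorted_g = np.sort(g, kind='mergesort')          # ['a', 'a', 'b', 'b', 'c']
# consecutive-difference factorization, as used in div2
codes = np.append(0, sorted_g[1:] != sorted_g[:-1]).cumsum()
# codes -> [0, 0, 1, 1, 2]
# equivalently, np.unique(g, return_inverse=True)[1] gives codes in the original order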

accuracy

integer groups
Over integer-based groups, both div1 and div2 produce the same results:

np.isclose(div1(g, v), pir1(g, v)).all()

True

np.isclose(div2(g, v), pir1(g, v)).all()

True

general groups
Over string- and float-based groups, div1 becomes inaccurate, but this is easily fixed by factorizing the groups first, as div2 does:

g = g / 1000000  # turn the integer groups into floats

np.isclose(div1(g, v), pir1(g, v)).all()

False

np.isclose(div2(g, v), pir1(g, v)).all()

True
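To see why div1 breaks, here is a minimal sketch of the failure mode (contrived numbers, chosen only for illustration): with float groups, the shift groupby * mx can separate adjacent groups by less than the spread of the values, so the running max leaks across the group boundary.

import numpy as np

g = np.array([0.0, 0.0, 1e-6, 1e-6])   # two distinct float groups, already sorted
v = np.array([5.0, 1.0, 0.0, 2.0])
mx = v.max() + 1                        # 6.0, as in div1
shifts = g * mx                         # [0, 0, 6e-6, 6e-6] -- far too small
leaky = np.maximum.accumulate(shifts + v) - shifts
# leaky -> [5., 5., ~5., ~5.], but the correct cummax for the second group is [0., 2.]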

time testing

results = pd.DataFrame(
    index=pd.Index(['pir1', 'pir2', 'div1', 'div2'], name='method'),
    columns=pd.Index(['Large', 'Medium', 'Small'], name='group size'))

# The divisor coarsens the groups: 'Large' -> ~500 groups of ~200 rows each.
size_map = dict(Large=100, Medium=10, Small=1)

from timeit import timeit

for i in results.index:
    for j in results.columns:
        results.loc[i, j] = timeit(
            '{}(g // size_map[j], v)'.format(i),
            'from __main__ import {}, size_map, g, v, j'.format(i),
            number=100)

results

(image: the timings table; not preserved)

results.T.plot.barh()

(image: horizontal bar chart of the timings; not preserved)

— answered by piRSquared

Proposed approach

Let's bring some NumPy magic to the table! Specifically, we will exploit np.maximum.accumulate.
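As a quick reminder of what it does (a minimal sketch):

import numpy as np

np.maximum.accumulate([3, 1, 4, 1, 5])
# -> array([3, 3, 4, 4, 5])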

Explanation

To see how maximum.accumulate could help us, let's assume we have the groups lined up sequentially.

Let's consider a sample groupby column:

groupby column : [0, 0, 0, 1, 1, 2, 2, 2, 2, 2]

Let's also consider a sample value column:

value column  : [3, 1, 4, 1, 3, 3, 1, 5, 2, 4]

Using maximum.accumulate simply on value won't give us the desired output, as we need to do these accumulations only within each group. To do so, one trick would be to offset each group from the group before it.

There are a few ways to do that offsetting. One easy way is to shift each group by max(value) + 1 relative to the previous one. For the sample, that offset is 6: we add 6 to the second group, 12 to the third, and so on. Thus, the modified value column becomes -

value column  : [3, 1, 4, 7, 9, 15, 13, 17, 14, 16]

Now we can apply maximum.accumulate, and the accumulations stay confined within each group -

value cummaxed: [3, 3, 4, 7, 9, 15, 15, 17, 17, 17]

To go back to the original values, subtract the offsets that were added before.

value cummaxed: [3, 3, 4, 1, 3, 3, 3, 5, 5, 5]

That's our desired result!
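Putting that walkthrough into code (a minimal sketch over the sample arrays above):

import numpy as np

groupby = np.array([0, 0, 0, 1, 1, 2, 2, 2, 2, 2])
value   = np.array([3, 1, 4, 1, 3, 3, 1, 5, 2, 4])

shifts = groupby * (value.max() + 1)                 # offsets of 0, 6, 12 per group
out = np.maximum.accumulate(shifts + value) - shifts
# out -> array([3, 3, 4, 1, 3, 3, 3, 5, 5, 5])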

At the start, we assumed the groups were lined up sequentially. To get the data into that format, we use np.argsort(groupby, kind='mergesort'), which returns sorting indices while preserving the original order of equal elements, and then index into the groupby and value columns with them.

To account for negative groupby elements, we just need to offset by max() - min() rather than just max().
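Before the full implementation, here is a minimal sketch of that sort-and-unsort machinery; the inverse-permutation step is exactly what argsort_unique below implements:

import numpy as np

g = np.array([2, 0, 1, 0, 2])
sidx = np.argsort(g, kind='mergesort')   # stable: ties keep their original order
# sidx -> array([1, 3, 2, 0, 4])
inv = np.empty_like(sidx)
inv[sidx] = np.arange(sidx.size)         # inverse permutation
# g[sidx][inv] recovers g, so results computed in sorted order map back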

Thus, the implementation would look something like this -

def argsort_unique(idx):
    # Inverse permutation of idx: maps sorted positions back to original order.
    # Original idea : http://stackoverflow.com/a/41242285/3293881
    n = idx.size
    sidx = np.empty(n,dtype=int)
    sidx[idx] = np.arange(n)
    return sidx

def numpy_cummmax(groupby, value, factorize_groupby=0): 
    # Inputs : 1D arrays.

    # Get sorted indices keeping the order. Sort groupby and value cols.
    sidx = np.argsort(groupby,kind='mergesort')
    sorted_groupby, sorted_value = groupby[sidx], value[sidx]

    if factorize_groupby==1:
        sf = np.concatenate(([0], np.flatnonzero(sorted_groupby[1:] != \
                            sorted_groupby[:-1])+1, [sorted_groupby.size] ))
        sorted_groupby = np.repeat(np.arange(sf.size-1), sf[1:] - sf[:-1])

    # Get shifts to be used for shifting each group
    mx = sorted_groupby.max()-sorted_groupby.min()+1
    shifts = sorted_groupby*mx

    # Shift, then take the running max along the value col.
    # The shifts keep the cumulative max confined within each group.
    group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
    return group_cummaxed[argsort_unique(sidx)]

Runtime test and verification

Verification

1) Groupby as ints :

In [58]: # Setup with groupby as ints
    ...: LENGTH = 1000
    ...: g = np.random.randint(low=0, high=LENGTH//2, size=LENGTH)
    ...: v = np.random.rand(LENGTH)
    ...: 

In [59]: df = pd.DataFrame(np.column_stack((g,v)),columns=['group', 'value'])

In [60]: # Verify results
    ...: out1 = df.groupby('group').cummax()
    ...: out2 = numpy_cummmax(df['group'].values, df['value'].values)
    ...: print(np.allclose(out1.values.ravel(), out2, atol=1e-5))
    ...: 
True

2) Groupby as floats :

In [10]: # Setup with groupby as floats
    ...: LENGTH = 100000
    ...: df = pd.DataFrame(np.random.randint(0,LENGTH//2,(LENGTH,2))/10.0, \
    ...:                             columns=['group', 'value'])

In [18]: # Verify results
    ...: out1 = df.groupby('group').cummax()
    ...: out2 = numpy_cummmax(df['group'].values, df['value'].values, factorize_groupby=1)
    ...: print(np.allclose(out1.values.ravel(), out2, atol=1e-5))
    ...: 
True

Timings -

1) Groupby as ints (same as the setup used for timings in the question) :

In [24]: LENGTH = 100000
    ...: g = np.random.randint(0, LENGTH//2, (LENGTH))
    ...: v = np.random.rand(LENGTH)
    ...: 

In [25]: %timeit numpy(g, v) # Best solution from posted question
1 loops, best of 3: 373 ms per loop

In [26]: %timeit pir1(g, v) # @piRSquared's solution-1
1 loops, best of 3: 165 ms per loop

In [27]: %timeit pir2(g, v) # @piRSquared's solution-2
1 loops, best of 3: 157 ms per loop

In [28]: %timeit numpy_cummmax(g, v)
100 loops, best of 3: 18.3 ms per loop

2) Groupby as floats :

In [29]: LENGTH = 100000
    ...: g = np.random.randint(0,LENGTH//2,(LENGTH))/10.0
    ...: v = np.random.rand(LENGTH)
    ...: 

In [30]: %timeit pir1(g, v) # @piRSquared's solution-1
1 loops, best of 3: 157 ms per loop

In [31]: %timeit pir2(g, v) # @piRSquared's solution-2
1 loops, best of 3: 156 ms per loop

In [32]: %timeit numpy_cummmax(g, v, factorize_groupby=1)
10 loops, best of 3: 20.8 ms per loop

In [34]: np.allclose(pir1(g, v),numpy_cummmax(g, v, factorize_groupby=1),atol=1e-5)
Out[34]: True
— answered by Divakar