Say I have two numpy arrays, A of shape (d, f) and I of shape (d,) containing indices in 0..n, e.g.
I = np.array([0, 0, 1, 0, 2, 1])
A = np.arange(12).reshape(6, 2)
I am looking for a fast way to compute reductions, in particular sum, mean and max, over all the slices A[I == i, :]; a slow version would be
results = np.zeros((I.max() + 1, A.shape[1]))
for i in np.unique(I):
    results[i, :] = np.mean(A[I == i, :], axis=0)
which gives in this case
results = [[ 2.66666667,  3.66666667],
           [ 7.        ,  8.        ],
           [ 8.        ,  9.        ]]
EDIT: I did some timings based on Divakar's answer and a previous poster's (deleted) pandas-based answer.

Timing code:
from __future__ import division, print_function
import numpy as np, pandas as pd
from time import time
np.random.seed(0)
d = 500000
f = 500
n = 500
I = np.hstack((np.arange(n), np.random.randint(n, size=(d - n,))))
np.random.shuffle(I)
A = np.random.rand(d, f)
def reduce_naive(A, I, op="avg"):
    target_dtype = (np.float64 if op == "avg" else A.dtype)
    results = np.zeros((I.max() + 1, A.shape[1]), dtype=target_dtype)
    npop = {"avg": np.mean, "sum": np.sum, "max": np.max}.get(op)
    for i in np.unique(I):
        results[i, :] = npop(A[I == i, :], axis=0)
    return results
def reduce_reduceat(A, I, op="avg"):
    sidx = I.argsort()
    sI = I[sidx]
    sortedA = A[sidx]
    idx = np.r_[0, np.flatnonzero(sI[1:] != sI[:-1]) + 1]
    if op == "max":
        return np.maximum.reduceat(sortedA, idx, axis=0)
    sums = np.add.reduceat(sortedA, idx, axis=0)
    if op == "sum":
        return sums
    if op == "avg":
        count = np.r_[idx[1:] - idx[:-1], A.shape[0] - idx[-1]]
        return sums / count.astype(float)[:, None]
def reduce_bincount(A, I, op="avg"):
    ids = (I[:, None] + (I.max() + 1) * np.arange(A.shape[1])).ravel()
    sums = np.bincount(ids, A.ravel()).reshape(A.shape[1], -1).T
    if op == "sum":
        return sums
    if op == "avg":
        return sums / np.bincount(ids).reshape(A.shape[1], -1).T
def reduce_pandas(A, I, op="avg"):
    group = pd.concat([pd.DataFrame(A), pd.DataFrame(I, columns=("i",))],
                      axis=1).groupby('i')
    if op == "sum":
        return group.sum().values
    if op == "avg":
        return group.mean().values
    if op == "max":
        return group.max().values
def reduce_hybrid(A, I, op="avg"):
    sidx = I.argsort()
    sI = I[sidx]
    sortedA = A[sidx]
    idx = np.r_[0, np.flatnonzero(sI[1:] != sI[:-1]) + 1]
    unq_sI = sI[idx]
    m = I.max() + 1
    N = A.shape[1]
    target_dtype = (np.float64 if op == "avg" else A.dtype)
    out = np.zeros((m, N), dtype=target_dtype)
    ss_idx = np.r_[idx, A.shape[0]]
    npop = {"avg": np.mean, "sum": np.sum, "max": np.max}.get(op)
    for i in range(len(idx)):
        out[unq_sI[i]] = npop(sortedA[ss_idx[i]:ss_idx[i + 1]], axis=0)
    return out
for op in ("sum", "avg", "max"):
for name, method in (("naive ", reduce_naive),
("reduceat", reduce_reduceat),
("pandas ", reduce_pandas),
("bincount", reduce_bincount),
("hybrid ", reduce_hybrid)
("numba ", reduce_numba)
):
if op == "max" and name == "bincount":
continue
# if name is not "naive":
# assert np.allclose(method(A, I, op), reduce_naive(A, I, op))
times = []
for tries in range(3):
time0 = time(); method(A, I, op)
times.append(time() - time0);
print(name, op, "{:.2f}".format(np.min(times)))
print()
Timings:
naive sum 1.10
reduceat sum 4.62
pandas sum 5.29
bincount sum 1.54
hybrid sum 0.62
numba sum 0.31
naive avg 1.12
reduceat avg 4.45
pandas avg 5.23
bincount avg 2.43
hybrid avg 0.61
numba avg 0.33
naive max 1.19
reduceat max 3.18
pandas max 5.24
hybrid max 0.72
numba max 0.34
(I chose d and n as typical values for my use case; I have added the code for the numba versions in my answer.)
Approach #1: Using NumPy ufunc reduceat

We have ufuncs for those three reduction operations, and luckily we also have ufunc.reduceat to perform those reductions limited to particular intervals along an axis. So, using those, we would have those three operations computed like so -
# Gives us the sorted array based on input indices I and the indices at
# which the sorted array should be interval-limited for the reduceat
# operations applied later on
def sorted_array_intervals(A, I):
    # Compute sort indices for I, to be used for sorting A accordingly
    sidx = I.argsort()
    sI = I[sidx]
    sortedA = A[sidx]
    # Get indices at which the group intervals change
    idx = np.r_[0, np.flatnonzero(sI[1:] != sI[:-1]) + 1]
    return sortedA, idx
# Groupby sum reduction using the interval indices
# to perform interval-limited ufunc reductions
def groupby_sum(A, I):
    sortedA, idx = sorted_array_intervals(A, I)
    return np.add.reduceat(sortedA, idx, axis=0)

# Groupby mean reduction
def groupby_mean(A, I):
    sortedA, idx = sorted_array_intervals(A, I)
    sums = np.add.reduceat(sortedA, idx, axis=0)
    count = np.r_[idx[1:] - idx[:-1], A.shape[0] - idx[-1]]
    return sums / count.astype(float)[:, None]

# Groupby max reduction
def groupby_max(A, I):
    sortedA, idx = sorted_array_intervals(A, I)
    return np.maximum.reduceat(sortedA, idx, axis=0)
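As a quick sanity check (my own verification, not part of the original answer), running these on the toy A and I from the question reproduces the expected per-group results -

I = np.array([0, 0, 1, 0, 2, 1])
A = np.arange(12).reshape(6, 2)
groupby_sum(A, I)    # [[ 8, 11], [14, 16], [ 8,  9]]
groupby_mean(A, I)   # [[ 2.667,  3.667], [ 7.,  8.], [ 8.,  9.]]
groupby_max(A, I)    # [[ 6,  7], [10, 11], [ 8,  9]]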
Thus, if we need all those operations, we could reuse one call to sorted_array_intervals, like so -
def groupby_sum_mean_max(A, I):
    sortedA, idx = sorted_array_intervals(A, I)
    sums = np.add.reduceat(sortedA, idx, axis=0)
    count = np.r_[idx[1:] - idx[:-1], A.shape[0] - idx[-1]]
    avgs = sums / count.astype(float)[:, None]
    maxs = np.maximum.reduceat(sortedA, idx, axis=0)
    return sums, avgs, maxs
Approach #1-B: Hybrid version (sort + slice + reduce)

Here's a hybrid version that takes help from sorted_array_intervals to get the sorted array and the indices at which the intervals change into the next group, but at the last stage uses slicing to reduce each interval, iterating over the groups. The slicing helps here because we are working with views.

The implementation would look something like this -
def reduce_hybrid(A, I, op="avg"):
    sidx = I.argsort()
    sI = I[sidx]
    sortedA = A[sidx]

    # Get indices at which the group intervals change
    idx = np.r_[0, np.flatnonzero(sI[1:] != sI[:-1]) + 1]
    unq_sI = sI[idx]

    m = I.max() + 1
    N = A.shape[1]
    target_dtype = (np.float64 if op == "avg" else A.dtype)
    out = np.zeros((m, N), dtype=target_dtype)
    ss_idx = np.r_[idx, A.shape[0]]
    npop = {"avg": np.mean, "sum": np.sum, "max": np.max}.get(op)
    for i in range(len(idx)):
        out[unq_sI[i]] = npop(sortedA[ss_idx[i]:ss_idx[i + 1]], axis=0)
    return out
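For intuition on why the slicing stage is cheap, here is a minimal illustration (a toy check of my own, not from the original answer): basic slicing returns views into the array's buffer, whereas the boolean-mask indexing used in the naive version always materializes a copy -

import numpy as np
A = np.arange(12).reshape(6, 2)
print(np.shares_memory(A, A[1:4]))   # True  - a slice is a view
mask = np.array([True, True, False, True, False, False])
print(np.shares_memory(A, A[mask]))  # False - a boolean mask copies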
Runtime test (Using the setup from the benchmarks posted in the question) -
In [432]: d = 500000
...: f = 500
...: n = 500
...: I = np.hstack((np.arange(n), np.random.randint(n, size=(d - n,))))
...: np.random.shuffle(I)
...: A = np.random.rand(d, f)
...:
In [433]: %timeit reduce_naive(A, I, op="sum")
...: %timeit reduce_hybrid(A, I, op="sum")
...:
1 loops, best of 3: 1.03 s per loop
1 loops, best of 3: 549 ms per loop
In [434]: %timeit reduce_naive(A, I, op="avg")
...: %timeit reduce_hybrid(A, I, op="avg")
...:
1 loops, best of 3: 1.04 s per loop
1 loops, best of 3: 550 ms per loop
In [435]: %timeit reduce_naive(A, I, op="max")
...: %timeit reduce_hybrid(A, I, op="max")
...:
1 loops, best of 3: 1.14 s per loop
1 loops, best of 3: 631 ms per loop
Approach #2: Using NumPy bincount

Here's another approach using np.bincount that does bin-based summing. With it, we could compute the sum and average values while also avoiding the sort, like so -
ids = (I[:,None] + (I.max()+1)*np.arange(A.shape[1])).ravel()
sums = np.bincount(ids, A.ravel()).reshape(A.shape[1],-1).T
avgs = sums/np.bincount(ids).reshape(A.shape[1],-1).T
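The ids trick gives every (group, column) pair its own bin: element A[i, j] lands in bin I[i] + (I.max()+1)*j, so a single flat bincount over A.ravel() accumulates all columns at once. A quick check on the toy example from the question (my own verification, not part of the original answer) -

I = np.array([0, 0, 1, 0, 2, 1])
A = np.arange(12).reshape(6, 2)
ids = (I[:, None] + (I.max() + 1) * np.arange(A.shape[1])).ravel()
np.bincount(ids, A.ravel()).reshape(A.shape[1], -1).T
# array([[ 8., 11.],
#        [14., 16.],
#        [ 8.,  9.]])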
Using the Python/NumPy JIT compiler Numba, I was able to get shorter timings with just-in-time compilation of the intuitive linear algorithm:
from numba import jit

@jit
def reducenb_avg(A, I):
    d, f = A.shape
    n = I.max() + 1
    result = np.zeros((n, f), np.float64)
    count = np.zeros((n, 1), np.int64)
    for i in range(d):
        result[I[i], :] += A[i]
        count[I[i], 0] += 1
    return result / count

@jit
def reducenb_sum(A, I):
    d, f = A.shape
    n = I.max() + 1
    result = np.zeros((n, f), A.dtype)
    for i in range(d):
        result[I[i], :] += A[i]
    return result

@jit
def reducenb_max(A, I):
    d, f = A.shape
    n = I.max() + 1
    result = -np.inf * np.ones((n, f))
    for i in range(d):
        result[I[i], :] = np.maximum(A[i], result[I[i], :])
    return result

def reduce_numba(A, I, op="avg"):
    return {"sum": reducenb_sum, "avg": reducenb_avg, "max": reducenb_max}.get(op)(A, I)
On the benchmark problem, these finish in ~0.32 s, about half the time of the pure-NumPy sorting-based methods.