Pandas has a widely-used groupby facility to split up a DataFrame based on a corresponding mapping, from which you can apply a calculation on each subgroup and recombine the results.
Can this be done flexibly in NumPy without a native Python for-loop? With a Python loop, this would look like:
>>> import numpy as np
>>> X = np.arange(10).reshape(5, 2)
>>> groups = np.array([0, 0, 0, 1, 1])
# Split up elements (rows) of `X` based on their element wise group
>>> np.array([X[groups==i].sum() for i in np.unique(groups)])
array([15, 30])
Above 15 is the sum of the first three rows of X
, and 30 is the sum of the remaining two.
By "flexibly,” I just mean that we aren't focusing on one particular computation such as sum, count, maximum, etc, but rather passing any computation to the grouped arrays.
If not, is there a faster approach than the above?
How about using scipy sparse matrix
import numpy as np
from scipy import sparse
import time
x_len = 500000
g_len = 100
X = np.arange(x_len * 2).reshape(x_len, 2)
groups = np.random.randint(0, g_len, x_len)
# original
s = time.time()
a = np.array([X[groups==i].sum() for i in np.unique(groups)])
print(time.time() - s)
# using scipy sparse matrix
s = time.time()
x_sum = X.sum(axis=1)
b = np.array(sparse.coo_matrix(
(
x_sum,
(groups, np.arange(len(x_sum)))
),
shape=(g_len, x_len)
).sum(axis=1)).ravel()
print(time.time() - s)
#compare
print(np.abs((a-b)).sum())
result on my PC
0.15915322303771973
0.012875080108642578
0
More than 10 times faster.
Update!
Let's benchmark answers of @Paul Panzer and @Daniel F. It is summation only benchmark.
import numpy as np
from scipy import sparse
import time
# by @Daniel F
def groupby_np(X, groups, axis = 0, uf = np.add, out = None, minlength = 0, identity = None):
if minlength < groups.max() + 1:
minlength = groups.max() + 1
if identity is None:
identity = uf.identity
i = list(range(X.ndim))
del i[axis]
i = tuple(i)
n = out is None
if n:
if identity is None: # fallback to loops over 0-index for identity
assert np.all(np.in1d(np.arange(minlength), groups)), "No valid identity for unassinged groups"
s = [slice(None)] * X.ndim
for i_ in i:
s[i_] = 0
out = np.array([uf.reduce(X[tuple(s)][groups == i]) for i in range(minlength)])
else:
out = np.full((minlength,), identity, dtype = X.dtype)
uf.at(out, groups, uf.reduce(X, i))
if n:
return out
x_len = 500000
g_len = 200
X = np.arange(x_len * 2).reshape(x_len, 2)
groups = np.random.randint(0, g_len, x_len)
print("original")
s = time.time()
a = np.array([X[groups==i].sum() for i in np.unique(groups)])
print(time.time() - s)
print("use scipy coo matrix")
s = time.time()
x_sum = X.sum(axis=1)
b = np.array(sparse.coo_matrix(
(
x_sum,
(groups, np.arange(len(x_sum)))
),
shape=(g_len, x_len)
).sum(axis=1)).ravel()
print(time.time() - s)
#compare
print(np.abs((a-b)).sum())
print("use scipy csr matrix @Daniel F")
s = time.time()
x_sum = X.sum(axis=1)
c = np.array(sparse.csr_matrix(
(
x_sum,
groups,
np.arange(len(groups)+1)
),
shape=(len(groups), g_len)
).sum(axis=0)).ravel()
print(time.time() - s)
#compare
print(np.abs((a-c)).sum())
print("use bincount @Paul Panzer @Daniel F")
s = time.time()
d = np.bincount(groups, X.sum(axis=1), g_len)
print(time.time() - s)
#compare
print(np.abs((a-d)).sum())
print("use ufunc @Daniel F")
s = time.time()
e = groupby_np(X, groups)
print(time.time() - s)
#compare
print(np.abs((a-e)).sum())
STDOUT
original
0.2882847785949707
use scipy coo matrix
0.012301445007324219
0
use scipy csr matrix @Daniel F
0.01046299934387207
0
use bincount @Paul Panzer @Daniel F
0.007468223571777344
0.0
use ufunc @Daniel F
0.04431319236755371
0
The winner is the bincount solution. But the csr matrix solution is also very interesting.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With