Convert numpy rows into columns based on ID

Question

Suppose I have a numpy array that maps between IDs of two item types:

[[1, 12],
 [1, 13],
 [1, 14],
 [2, 13],
 [2, 14],
 [3, 11]]

I would like to rearrange this array such that each row in the new array represents all items that matched the same ID in the original array. Here, each column would represent one of the mappings in the original array, up to a specified shape restriction on the number of columns in the new array. If we wanted to obtain this result from the above array, ensuring we only had 2 columns, we would obtain:

[[12, 13],  #Represents 1 - 14 was not kept as only 2 columns are allowed
 [13, 14],  #Represents 2
 [11,  0]]  #Represents 3 - 0 was used as padding since 3 did not have 2 mappings

The naïve approach here would be to use a for-loop that populates the new array as it encounters rows in the original array. Is there a more efficient means of accomplishing this with numpy's functionality?

Mazdak · Accepted Answer

Here is a general and mostly Numpythonic approach:

In [144]: def array_packer(arr):
     ...:     cols = arr.shape[1]
     ...:     ids = arr[:, 0]
     ...:     inds = np.where(np.diff(ids) != 0)[0] + 1
     ...:     sp = np.split(arr[:,1:], inds)
     ...:     result = [np.unique(a[: cols]) if a.shape[0] >= cols else
     ...:                    np.pad(np.unique(a), (0, (cols - 1) * (cols - a.shape[0])), 'constant')
     ...:                 for a in sp]
     ...:     return result
     ...:     
     ...:

Demo:

In [145]: a = np.array([[1, 12, 15, 45],
     ...:  [1, 13, 23, 9],
     ...:  [1, 14, 14, 11],
     ...:  [2, 13, 90, 34],
     ...:  [2, 14, 23, 43],
     ...:  [3, 11, 123, 53]])
     ...:  

In [146]: array_packer(a)
Out[146]: 
[array([ 9, 11, 12, 13, 14, 15, 23, 45,  0,  0,  0]),
 array([13, 14, 23, 34, 43, 90,  0,  0,  0,  0,  0,  0]),
 array([ 11,  53, 123,   0,   0,   0,   0,   0,   0,   0,   0,   0])]

In [147]: a = np.array([[1, 12, 15],
     ...:  [1, 13, 23],
     ...:  [1, 14, 14],
     ...:  [2, 13, 90],
     ...:  [2, 14, 23],
     ...:  [3, 11, 123]])
     ...: 
     ...:   
     ...:  

In [148]: array_packer(a)
Out[148]: 
[array([12, 13, 14, 15, 23]),
 array([13, 14, 23, 90,  0,  0]),
 array([ 11, 123,   0,   0,   0,   0])]

Paul Panzer · Answer

Here is an approach using a sparse matrix:

def pp(map_, maxitems=2):
    M = sparse.csr_matrix((map_[:, 1], map_[:, 0], np.arange(map_.shape[0]+1)))
    M = M.tocsc()
    sizes = np.diff(M.indptr)
    ids, = np.where(sizes)
    D = np.concatenate([M.data, np.zeros((maxitems - 1,), dtype=M.data.dtype)])
    D = np.lib.stride_tricks.as_strided(D, (D.size - maxitems + 1, maxitems),
                                        2 * D.strides)
    result = D[M.indptr[ids]]
    result[np.arange(maxitems) >= sizes[ids, None]] = 0
    return result

Timings using @crisz's code but modified to use less repetitive test data. Also I added a bit of "validation": chrisz's and my solutions give the same answer, the other two output a different format, so I couldn't check them.

enter image description here

Code:

from scipy import sparse
import numpy as np
from collections import defaultdict, deque

def pp(map_, maxitems=2):
    M = sparse.csr_matrix((map_[:, 1], map_[:, 0], np.arange(map_.shape[0]+1)))
    M = M.tocsc()
    sizes = np.diff(M.indptr)
    ids, = np.where(sizes)
    D = np.concatenate([M.data, np.zeros((maxitems - 1,), dtype=M.data.dtype)])
    D = np.lib.stride_tricks.as_strided(D, (D.size - maxitems + 1, maxitems),
                                        2 * D.strides)
    result = D[M.indptr[ids]]
    result[np.arange(maxitems) >= sizes[ids, None]] = 0
    return result

def chrisz(a):
  return [[*a[a[:,0]==i,1],0][:2] for i in np.unique(a[:,0])]

def piotr(a):
  d = defaultdict(lambda: deque((0, 0), maxlen=2))
  for key, val in a:
    d[key].append(val)
  return d

def karams(arr):
  cols = arr.shape[1]
  ids = arr[:, 0]
  inds = np.where(np.diff(ids) != 0)[0] + 1
  sp = np.split(arr[:,1:], inds)
  result = [a[:2].ravel() if a.size >= cols else np.pad(a.ravel(), (0, cols -1 * (cols - a.size)), 'constant')for a in sp]
  return result

def make(nid, ntot):
    return np.c_[np.random.randint(0, nid, (ntot,)),
                 np.random.randint(0, 2**30, (ntot,))]

from timeit import timeit
import pandas as pd
import matplotlib.pyplot as plt

res = pd.DataFrame(
       index=['pp', 'chrisz', 'piotr', 'karams'],
       columns=[10, 50, 100, 500, 1000, 5000, 10000],# 50000],
       dtype=float
)

for c in res.columns:
#        l = np.repeat(np.array([[1, 12],[1, 13],[1, 14],[2, 13],[2, 14],[3, 11]]), c, axis=0)
    l = make(c // 2, c * 6)
    assert np.all(chrisz(l) == pp(l))
    for f in res.index:
        stmt = '{}(l)'.format(f)
        setp = 'from __main__ import l, {}'.format(f)
        res.at[f, c] = timeit(stmt, setp, number=30)

ax = res.div(res.min()).T.plot(loglog=True)
ax.set_xlabel("N");
ax.set_ylabel("time (relative)");

plt.show()

Convert numpy rows into columns based on ID

Tags:

python

arrays

numpy

scipy

n0shadow

2 Answers

Mazdak

Paul Panzer

Recent Activity

Donate For Us

Convert numpy rows into columns based on ID

Tags:

python

arrays

numpy

scipy

n0shadow

2 Answers

Mazdak

Paul Panzer

Related questions

Recent Activity

Donate For Us