Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Fast way to remove array of specific row values from 2D numpy array

I have a 2D array like this:

a = np.array([[25, 83, 18, 71],
       [75,  7,  0, 85],
       [25, 83, 18, 71],
       [25, 83, 18, 71],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [56, 75, 50,  0],
       [19, 49, 92, 57],
       [52, 93, 58,  9]])

and I want to remove every row that matches one of a given set of rows, for example:

b = np.array([[56, 75, 50,  0], [52, 93, 58,  9], [25, 83, 18, 71]])

What is the most efficient way to do this in numpy or pandas? Expected output:

np.array([[75,  7,  0, 85],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [19, 49, 92, 57]])

Update

The fastest approach is dimensionality reduction, but in general it places quite strict limits on the value ranges of the columns. Here is my perfplot benchmark:

import pandas as pd
import numexpr as ne
import perfplot
from time import time

def remove_pd(data):
    """Drop from ``a`` every row that also occurs in ``b`` using a pandas anti-join.

    ``data`` is an ``(a, b)`` pair of 2-D arrays. Each is wrapped in a
    DataFrame, ``a`` is left-merged with ``b`` with an indicator column, and
    only the rows flagged ``left_only`` (found in ``a`` but not in ``b``) are
    kept. The result is returned as a plain NumPy array.
    """
    a, b = data
    left = pd.DataFrame(a)
    right = pd.DataFrame(b)
    # indicator=True adds a `_merge` column recording where each row was found.
    merged = left.merge(right, how='left', indicator=True)
    only_in_a = merged.query('_merge == "left_only"')
    return only_in_a.drop(columns='_merge').values
    
def remove_smalldata(data):
    """Drop from ``a`` every row equal to some row of ``b`` by full broadcasting.

    ``data`` is an ``(a, b)`` pair of 2-D arrays with the same number of
    columns. Every row of ``a`` is compared against every row of ``b`` at
    once, so the intermediate boolean array has ``len(b) * len(a) * columns``
    entries.
    """
    a, b = data
    # differs[j, i, k] is True where column k of a[i] differs from column k of b[j].
    differs = a[None, :, :] != b[:, None, :]
    # A row of `a` survives when it differs from *every* row of `b`
    # in at least one column.
    survives = differs.any(-1).all(0)
    return a[survives]

# Disabled kernel: wrapped in a string literal, so the function is never
# defined and is not part of the benchmark.
# NOTE(review): the mask keeps a row only when *every* column differs from
# `arr`, so a row that merely shares one column value with `arr` would be
# dropped as well — that is not the same as removing exact row matches.
'''def remove_nploop(data):
    a, b = data
    for arr in b:
        a = a[np.all(~np.equal(a, arr), axis=1)]
    return a'''
        
def remove_looped(data):
    """Drop from ``a`` every row equal to some row of ``b`` with plain loops.

    ``data`` is an ``(a, b)`` pair of arrays. Each row of ``a`` is compared
    against the rows of ``b`` one at a time, stopping at the first match.
    The surviving rows of ``a`` are returned in their original order.
    """
    a, b = data
    # any() short-circuits on the first matching row of b.
    keep = [
        not any(np.array_equal(row, banned) for banned in b)
        for row in a
    ]
    return a[keep]

def remove_looped_boost(data):
    """Drop from ``a`` every row equal to some row of ``b`` using a set lookup.

    ``data`` is an ``(a, b)`` pair of 2-D arrays. The rows of ``b`` are
    converted to tuples and stored in a set, so each row of ``a`` is tested
    with a single hash lookup instead of a scan over ``b``.
    """
    a, b = data
    banned = {tuple(row) for row in b.tolist()}
    keep = [tuple(row) not in banned for row in a.tolist()]
    return a[keep]

def remove_reducedim(data):
    """Drop from ``a`` every row equal to some row of ``b`` via 1-D encoding.

    ``data`` is an ``(a, b)`` pair of 2-D integer arrays with the same number
    of columns. Each row is mapped to a single integer: its position inside
    the bounding box that spans the per-column minima and maxima of both
    arrays. Row membership then becomes a 1-D ``np.isin`` test.

    Returns the surviving rows of ``a`` (cast to ``int64``) in their original
    order. If either array has no rows there is nothing to remove, and ``a``
    is returned as is (still cast to ``int64``) rather than failing on the
    min/max of an empty array.

    NOTE(review): the bounding box must be small enough to index;
    ``np.ravel_multi_index`` is expected to raise ``ValueError`` when the
    product of the per-column ranges is too large — confirm against the
    NumPy documentation.
    """
    a, b = data
    a, b = a.astype(np.int64), b.astype(np.int64)  # make sure box is not too small
    if len(a) == 0 or len(b) == 0:
        # min/max over zero rows would raise; with no rows there is no match.
        return a
    lo = np.minimum(a.min(axis=0), b.min(axis=0))
    hi = np.maximum(a.max(axis=0), b.max(axis=0))
    box = hi - lo + 1  # number of distinct values per column
    ravel_a = np.ravel_multi_index((a - lo).T, box)
    ravel_b = np.ravel_multi_index((b - lo).T, box)
    return a[~np.isin(ravel_a, ravel_b)]

def remove_reducedim_boost(data):
    """Variant of the 1-D encoding kernel that uses numexpr; 4 columns only.

    ``data`` is an ``(a, b)`` pair of 2-D integer arrays that must each have
    exactly four columns (the columns are unpacked into four names). Each row
    is encoded as one integer: its offset inside the bounding box spanned by
    the per-column minima and maxima of both arrays, with the first column
    varying fastest. Membership is then a 1-D ``np.isin`` test.

    Returns the surviving rows of ``a`` (cast to ``int64``) in their original
    order. If either array has no rows there is nothing to remove, and ``a``
    is returned as is (still cast to ``int64``) rather than failing on the
    min/max of an empty array.

    NOTE(review): nothing here checks that the encoded value fits in int64;
    very wide column ranges may overflow — confirm how numexpr behaves.
    """
    a, b = data
    a, b = a.astype(np.int64), b.astype(np.int64)  # make sure box is not too small
    if len(a) == 0 or len(b) == 0:
        # min/max over zero rows would raise; with no rows there is no match.
        return a
    m1, m2, m3, m4 = np.minimum(a.min(axis=0), b.min(axis=0))
    # The extent of the last column is never needed as a multiplier.
    M1, M2, M3, _ = np.maximum(a.max(axis=0), b.max(axis=0))
    s1, s2, s3 = M1 - m1 + 1, M2 - m2 + 1, M3 - m3 + 1
    a1, a2, a3, a4 = a.T
    b1, b2, b3, b4 = b.T
    d = {'a1': a1, 'a2': a2, 'a3': a3, 'a4': a4,
         'b1': b1, 'b2': b2, 'b3': b3, 'b4': b4,
         's1': s1, 's2': s2, 's3': s3,
         'm1': m1, 'm2': m2, 'm3': m3, 'm4': m4}
    ravel_a = ne.evaluate('(a1-m1)+(a2-m2)*s1+(a3-m3)*s1*s2+(a4-m4)*s1*s2*s3', d)
    ravel_b = ne.evaluate('(b1-m1)+(b2-m2)*s1+(b3-m3)*s1*s2+(b4-m4)*s1*s2*s3', d)
    return a[~np.isin(ravel_a, ravel_b)]
    
def setup(x):
    a1 = np.random.randint(50000, size=(x,4))
    a2 = a1[np.random.randint(x, size=x)]
    return a1, a2
    
def build_args(figure):
    """Assemble the keyword arguments passed to ``perfplot.bench`` for one figure.

    ``figure`` is ``'A'`` (all six kernels, n = 2**0 .. 2**11) or ``'B'``
    (only the first three kernels, n = 2**11 .. 2**24). Any other value
    raises ``KeyError``.
    """
    kernels = [remove_reducedim, remove_reducedim_boost, remove_pd,
               remove_looped, remove_looped_boost, remove_smalldata]
    # Per-figure settings: (kernels to time, exponents of 2 for n, plot title).
    per_figure = {
        'A': (kernels, range(12), 'Testing removal of small dataset'),
        'B': (kernels[:3], range(11, 25), 'Testing removal of large dataset'),
    }
    chosen, exponents, title = per_figure[figure]
    return {
        'setup': setup,
        'kernels': chosen,
        'n_range': [2 ** k for k in exponents],
        'xlabel': 'Removing n rows from n rows',
        'title': title,
        'show_progress': False,
        'equality_check': lambda x, y: np.array_equal(x, y),
    }
    
# Run both benchmarks, then draw each result in its own subplot, one above
# the other, and report the total wall-clock time.
t = time()
outs = [perfplot.bench(**build_args(figure)) for figure in ('A', 'B')]
fig = plt.figure(figsize=(20, 20))
for i, out in enumerate(outs):
    # Create the subplot first, then plot the benchmark result.
    ax = fig.add_subplot(2, 1, i + 1)
    ax.grid(True, which="both")
    out.plot()
plt.show()
print('Overall testing time:', time() - t)

Output:

Overall testing time: 529.2596168518066

enter image description here

enter image description here

like image 884
mathfux Avatar asked Oct 13 '20 22:10

mathfux


People also ask

How do I remove a specific value from a NumPy array?

To remove an element from a NumPy array, specify the index of the element to remove and call the numpy.delete() function on the array with that index.

How do I delete a specific row in NumPy?

Using the NumPy function np.delete(), you can delete any row or column from a NumPy array (ndarray). Specify the axis (dimension) and the position (row number, column number, etc.). It is also possible to select multiple rows or columns using a slice or a list.

How do I delete a row from NumPy array based on condition?

np.delete(ndarray, index, axis) deletes rows or columns from a NumPy array at the given indices along the specified axis. The ndarray parameter is the array to operate on, index gives the positions of the rows to delete (for example, those selected by a condition), and axis=0 removes rows, as in our case.

How do I select a specific row in NumPy?

We can use the [][] operator to select an element from a NumPy array; for example, arr[1][2] selects the element at row index 1 and column index 2. Alternatively, we can pass a comma-separated pair of row and column indices, e.g. arr[1, 2].


1 Answers

Here's a pandas approach that does an "anti join" using merge and query.

# Anti-join: left-merge `a` with `b`, then keep only the rows that were
# found in `a` alone.
dfa, dfb = pd.DataFrame(a), pd.DataFrame(b)

# indicator=True adds a `_merge` column recording where each row was found.
df = dfa.merge(dfb, how='left', indicator=True)
df = df.query('_merge == "left_only"').drop(columns='_merge')

    0   1   2   3
1  75   7   0  85
4  75  48   8  43
5   7  47  96  94
6   7  47  96  94
8  19  49  92  57

Note: a plain numpy solution should be faster, but this should do fine.


Plain numpy but with a single loop:

# For each row to remove, keep only the rows of `a` that differ from it in
# at least one column. Requiring *all* columns to differ would also drop
# rows that merely share a single value with `arr`.
for arr in b:
    a = a[np.any(a != arr, axis=1)]

array([[75,  7,  0, 85],
       [75, 48,  8, 43],
       [ 7, 47, 96, 94],
       [ 7, 47, 96, 94],
       [19, 49, 92, 57]])
like image 75
Erfan Avatar answered Nov 14 '22 22:11

Erfan