I have a 2D array like this:
# Example input: 10 rows x 4 columns. Rows 0, 2 and 3 are identical, and so
# are rows 5 and 6, so duplicates must be handled by any solution.
a = np.array([[25, 83, 18, 71],
[75, 7, 0, 85],
[25, 83, 18, 71],
[25, 83, 18, 71],
[75, 48, 8, 43],
[ 7, 47, 96, 94],
[ 7, 47, 96, 94],
[56, 75, 50, 0],
[19, 49, 92, 57],
[52, 93, 58, 9]])
and I want to remove every row that exactly matches one of a given set of rows, for example:
# Rows to remove from `a`; each entry is a complete 4-value row.
b = np.array([[56, 75, 50, 0], [52, 93, 58, 9], [25, 83, 18, 71]])
What is the most efficient way to do this in numpy or pandas? Expected output:
np.array([[75, 7, 0, 85],
[75, 48, 8, 43],
[ 7, 47, 96, 94],
[ 7, 47, 96, 94],
[19, 49, 92, 57]])
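The fastest approach is dimensionality reduction (collapsing each row into a single integer), but in general it only works when the value ranges of the columns are small enough for the combined index to fit in an integer. Here is my perfplot benchmark: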
import pandas as pd
import numexpr as ne
import perfplot
from time import time
def remove_pd(data):
    """Remove from ``a`` every row that also occurs in ``b`` (pandas anti-join).

    Parameters
    ----------
    data : tuple
        Pair ``(a, b)`` of 2D arrays with the same number of columns.

    Returns
    -------
    numpy.ndarray
        The rows of ``a`` that match no row of ``b``; repeated rows of ``a``
        are kept as often as they occur.
    """
    a, b = data
    dfa = pd.DataFrame(a)
    # A left merge emits one output row per matching (a-row, b-row) pair, so
    # repeated rows in ``b`` only inflate the intermediate frame; they cannot
    # change which rows of ``a`` end up marked "left_only".
    dfb = pd.DataFrame(b).drop_duplicates()
    merged = dfa.merge(dfb, how='left', indicator=True)
    return merged.query('_merge == "left_only"').drop(columns='_merge').values
def remove_smalldata(data):
    """Drop the rows of ``a`` that equal some row of ``b`` using broadcasting.

    ``data`` is the pair ``(a, b)`` of 2D arrays. Every row of ``a`` is
    compared against every row of ``b`` at once, so the temporary comparison
    array has ``len(b) * len(a) * n_columns`` elements.
    """
    a, b = data
    # differs[j, i] is True when row i of ``a`` and row j of ``b`` disagree
    # in at least one column.
    differs = np.any(a[np.newaxis, :, :] != b[:, np.newaxis, :], axis=-1)
    # A row of ``a`` survives only if it disagrees with every row of ``b``.
    survives = np.all(differs, axis=0)
    return a[survives]
def remove_nploop(data):
    """Remove from ``a`` every row equal to a row of ``b``, one ``b`` row per pass.

    ``data`` is the pair ``(a, b)`` of 2D arrays with matching column counts.
    Returns the filtered rows of ``a``. This variant is not part of the
    benchmarked kernel list.
    """
    a, b = data
    for arr in b:
        # Keep a row when it differs from ``arr`` in at least one column.
        # Requiring every column to differ (np.all over the negated equality)
        # would also discard rows that merely share a single value with ``arr``.
        a = a[np.any(a != arr, axis=1)]
    return a
def remove_looped(data):
    """Remove from ``a`` every row equal to a row of ``b`` with nested Python loops.

    ``data`` is the pair ``(a, b)`` of 2D arrays. Each row of ``a`` is compared
    against the rows of ``b`` until the first match (``any`` short-circuits).
    Returns the rows of ``a`` for which no match was found.
    """
    a, b = data
    # Explicit boolean dtype keeps this a mask even when ``a`` has no rows.
    to_remain = np.array(
        [not any(np.array_equal(vec_a, vec_b) for vec_b in b) for vec_a in a],
        dtype=bool,
    )
    return a[to_remain]
def remove_looped_boost(data):
    """Remove from ``a`` every row equal to a row of ``b`` using a set of tuples.

    ``data`` is the pair ``(a, b)`` of 2D arrays. The rows of ``b`` are turned
    into a set of tuples once, so each row of ``a`` needs one hash lookup
    instead of a scan over ``b``. Returns the rows of ``a`` not found in ``b``.
    """
    a, b = data
    b_rows = set(map(tuple, b.tolist()))
    # Explicit boolean dtype keeps this a mask even when ``a`` has no rows.
    to_remain = np.array(
        [tuple(row) not in b_rows for row in a.tolist()],
        dtype=bool,
    )
    return a[to_remain]
def remove_reducedim(data):
    """Remove from ``a`` every row found in ``b`` by collapsing rows to integers.

    Each row is mapped to a single flat index inside the bounding box spanned
    by the per-column minima and maxima of ``a`` and ``b``; membership is then
    a 1D ``np.isin`` test.

    Parameters
    ----------
    data : tuple
        Pair ``(a, b)`` of 2D integer arrays with the same number of columns.

    Returns
    -------
    numpy.ndarray
        Rows of ``a`` (cast to ``int64``) that do not occur in ``b``.

    Notes
    -----
    The product of the per-column value ranges has to fit in a native index
    integer, which limits how wide the column ranges may be.
    """
    a, b = data
    # Widen first so the offsets and box sizes are computed in 64-bit.
    a, b = a.astype(np.int64), b.astype(np.int64)
    # min/max are undefined for empty input; with nothing to remove (or
    # nothing to filter) the answer is simply ``a``.
    if a.size == 0 or b.size == 0:
        return a
    lo = np.minimum(a.min(axis=0), b.min(axis=0))
    hi = np.maximum(a.max(axis=0), b.max(axis=0))
    dims = tuple((hi - lo + 1).tolist())
    ravel_a = np.ravel_multi_index(tuple((a - lo).T), dims)
    ravel_b = np.ravel_multi_index(tuple((b - lo).T), dims)
    return a[~np.isin(ravel_a, ravel_b)]
def remove_reducedim_boost(data):
    """Remove from ``a`` every row found in ``b``; flat indices built with numexpr.

    Same idea as collapsing each row into one integer inside the bounding box
    of ``a`` and ``b``, but the index arithmetic is evaluated by ``numexpr``.
    Works for any number of columns (the expression is generated per call).

    Parameters
    ----------
    data : tuple
        Pair ``(a, b)`` of 2D integer arrays with the same number of columns.

    Returns
    -------
    numpy.ndarray
        Rows of ``a`` (cast to ``int64``) that do not occur in ``b``.

    Notes
    -----
    The flat index is computed in ``int64``; the product of the per-column
    value ranges must stay below the ``int64`` limit or indices will collide.
    """
    a, b = data
    # Widen first so the index arithmetic below runs in 64-bit.
    a, b = a.astype(np.int64), b.astype(np.int64)
    # min/max are undefined for empty input; nothing can be removed then.
    if a.size == 0 or b.size == 0:
        return a
    lo = np.minimum(a.min(axis=0), b.min(axis=0))
    hi = np.maximum(a.max(axis=0), b.max(axis=0))
    sizes = hi - lo + 1
    n_cols = a.shape[1]

    # Variables handed to numexpr: one column vector per array plus the
    # per-column offset and box size.
    variables = {}
    for i in range(n_cols):
        variables[f'a{i}'] = a[:, i]
        variables[f'b{i}'] = b[:, i]
        variables[f'm{i}'] = lo[i]
        if i < n_cols - 1:
            # The size of the last column never appears as a stride.
            variables[f's{i}'] = sizes[i]

    def flat_index_expr(prefix):
        # Builds e.g. '(a0-m0)+(a1-m1)*s0+(a2-m2)*s0*s1+...'
        terms = []
        stride = ''
        for i in range(n_cols):
            terms.append(f'({prefix}{i}-m{i}){stride}')
            stride += f'*s{i}'
        return '+'.join(terms)

    ravel_a = ne.evaluate(flat_index_expr('a'), variables)
    ravel_b = ne.evaluate(flat_index_expr('b'), variables)
    return a[~np.isin(ravel_a, ravel_b)]
def setup(x):
a1 = np.random.randint(50000, size=(x,4))
a2 = a1[np.random.randint(x, size=x)]
return a1, a2
def build_args(figure):
    """Return the keyword arguments for ``perfplot.bench`` for one figure.

    Parameters
    ----------
    figure : str
        ``'A'`` benchmarks all kernels on sizes ``2**0 .. 2**11``;
        ``'B'`` benchmarks only the first three kernels on ``2**11 .. 2**24``.

    Returns
    -------
    dict
        Arguments to be unpacked into ``perfplot.bench``.

    Raises
    ------
    KeyError
        If ``figure`` is neither ``'A'`` nor ``'B'``.
    """
    kernels = [remove_reducedim, remove_reducedim_boost, remove_pd,
               remove_looped, remove_looped_boost, remove_smalldata]
    per_figure = {
        'A': {
            'kernels': kernels,
            'n_range': [2 ** k for k in range(12)],
            'title': 'Testing removal of small dataset',
        },
        'B': {
            # Only the first three kernels are run on the large sizes.
            'kernels': kernels[:3],
            'n_range': [2 ** k for k in range(11, 25)],
            'title': 'Testing removal of large dataset',
        },
    }
    chosen = per_figure[figure]
    return {'setup': setup,
            'kernels': chosen['kernels'],
            'n_range': chosen['n_range'],
            'xlabel': 'Removing n rows from n rows',
            'title': chosen['title'],
            'show_progress': False,
            'equality_check': lambda x, y: np.array_equal(x, y)}
# Run both benchmarks and draw them one above the other, timing the whole run.
# NOTE(review): `np` and `plt` are used in this file, but the visible import
# lines do not bring in numpy or matplotlib.pyplot; confirm they are imported.
t = time()
outs = [perfplot.bench(**build_args(n)) for n in ('A', 'B')]
fig = plt.figure(figsize=(20, 20))
for i, out in enumerate(outs):
    # One subplot row per benchmark; presumably out.plot() draws onto the
    # axes that was just created (the current axes) -- verify with perfplot.
    ax = fig.add_subplot(len(outs), 1, i + 1)
    ax.grid(True, which="both")
    out.plot()
plt.show()
print('Overall testing time:', time()-t)
Output:
Overall testing time: 529.2596168518066
To remove an element from a NumPy array, specify the index of the element to remove and call the numpy.delete() function on the array with that index.
Using the NumPy function np.delete(), you can delete any row or column from a NumPy ndarray. Specify the axis (dimension) and the position (row number, column number, etc.). It is also possible to select multiple rows or columns using a slice or a list.
np.delete(ndarray, index, axis): deletes rows or columns from a NumPy array at the given index along the specified axis and returns the result as a new array. Here ndarray is the array to operate on, index identifies the rows to delete (for example, the indices where a condition holds), and axis=0 removes rows, as in our case.
We can use the [][] operator to select an element from a NumPy array. Example 1: arr[1][2] selects the element at row index 1 and column index 2. Alternatively, we can pass a comma-separated pair of indices representing the row index and column index, i.e. arr[1, 2].
Here's a pandas approach doing an "anti join" using merge and query.
# Anti-join: keep the rows of `a` that have no exact match among the rows of `b`.
dfa = pd.DataFrame(a)
# De-duplicate `b` first: a left merge emits one row per matching pair, so
# repeated rows in `b` would only inflate the intermediate result.
dfb = pd.DataFrame(b).drop_duplicates()
df = (
    dfa.merge(dfb, how='left', indicator=True)  # adds a '_merge' marker column
    .query('_merge == "left_only"')             # rows found only in `a`
    .drop(columns='_merge')
)
0 1 2 3
1 75 7 0 85
4 75 48 8 43
5 7 47 96 94
6 7 47 96 94
8 19 49 92 57
Note: a plain numpy solution should be faster, but this should do fine.
Plain numpy but with a single loop:
for arr in b:
    # Keep the rows that differ from `arr` in at least one column. Requiring
    # every column to differ would also drop rows that merely share a single
    # value with `arr`.
    a = a[np.any(a != arr, axis=1)]
array([[75, 7, 0, 85],
[75, 48, 8, 43],
[ 7, 47, 96, 94],
[ 7, 47, 96, 94],
[19, 49, 92, 57]])
If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With