
Fastest way to find indices of highest value in a matrix iteratively and exclusionarily

For the sake of anyone who stumbles upon this in the future, I apologize – I had conceived of the problem and solution incorrectly. A proper implementation of finding the coordinates of pairwise best hits in a 2D matrix is:

import numpy as np

def pairwise_best_hit(matrix):
    # A hit is "pairwise best" when it is simultaneously the maximum
    # of its row and the maximum of its column.
    boolean_max_matrix = (matrix.max(axis=1, keepdims=True) == matrix) & (matrix.max(axis=0, keepdims=True) == matrix)
    return np.transpose(boolean_max_matrix.nonzero())
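
For example, on the similarity matrix used later in this question, this returns one (row, column) pair per line:

sim = np.array([[1,5,6,2],[7,10,3,4],[1,5,3,7]])
pairwise_best_hit(sim)
# array([[0, 2],
#        [1, 1],
#        [2, 3]])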

I'm attempting to find the "best hits" in a similarity matrix, i.e. an m×n matrix where entry (i, j) is the similarity between the ith position in one sequence and the jth position in the other. The simplest way to explain this is: find the indices of the highest values in the matrix iteratively, excluding previously selected rows and columns. This yields min(m, n) chosen index pairs.

Here's a minimal reproducible example of my current implementation, using pandas:

import numpy as np
import pandas as pd

def pairwise_best_hit(sim):
    # Long-format table: one row per matrix entry, holding (value, column, row)
    xdim, ydim = np.meshgrid(np.arange(sim.shape[1]), np.arange(sim.shape[0]))
    table = np.vstack((sim.ravel(), xdim.ravel(), ydim.ravel())).T
    df = pd.DataFrame(table).rename(columns={0: 'sim', 1: 'index2', 2: 'index1'}).sort_values('sim', ascending=False)
    seq1_hits = []
    seq2_hits = []
    while len(df):
        # Take the highest remaining value, then drop its whole row and column
        index1 = df.iloc[0]['index1']
        index2 = df.iloc[0]['index2']
        seq1_hits.append(index1)
        seq2_hits.append(index2)
        df = df[(df['index1'] != index1) & (df['index2'] != index2)]
    return [seq1_hits, seq2_hits]

and for a matrix

sim = np.array([[1,5,6,2],[7,10,3,4],[1,5,3,7]])
pairwise_best_hit(sim)

returns

[[1, 2, 0], [1, 3, 2]]
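
(Tracing the greedy selection: 10 at (1, 1) is the global maximum; with row 1 and column 1 excluded, the next highest value is 7 at (2, 3); excluding that row and column as well leaves 6 at (0, 2) as the final hit.)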

Figured an edit would be the best way to respond to all the comments at once. Re: typical data size: m and n are anywhere from 250 to 1000, and the values in the matrix are floats.

Now, for the results on a matrix of my actual data, which is about 300x350. Slightly tweaking the answers from LastDuckStanding, Julien, and Axel Donath, we have:

def original(sim):
    xdim,ydim = np.meshgrid(np.arange(sim.shape[1]),np.arange(sim.shape[0]))
    table = np.vstack((sim.ravel(),xdim.ravel(),ydim.ravel())).T
    df = pd.DataFrame(table).rename(columns={0:'sim',1:'index2',2:'index1'}).sort_values('sim',ascending=False)
    output_list = []
    while len(df):
        index1 = df.iloc[0]['index1']
        index2 = df.iloc[0]['index2']
        score = df.iloc[0]['sim']
        output_list.append((int(index1),int(index2),score))
        df = df[(df['index1']!=index1)&(df['index2']!=index2)]
    return output_list

def lastduckstanding(input_matrix):
    mat = input_matrix.copy()
    idxs = np.argsort(mat, None)
    output_list = []
    hit_is_set = set()
    hit_js_set = set()
    num_entries = min(mat.shape[0], mat.shape[1])
    for idx in reversed(idxs):
        i, j = divmod(idx, mat.shape[1])
        if i in hit_is_set or j in hit_js_set:
            continue
        hit_is_set.add(i)
        hit_js_set.add(j)
        output_list.append((i,j,mat[i,j]))
        if len(output_list) == num_entries:
            break
    return output_list

def julien(matrix: np.ndarray):
    out = []
    copy = matrix.copy()
    for _ in range(min(copy.shape)):
        ij = np.unravel_index(copy.argmax(), copy.shape)
        indices_plus_score = (ij[0], ij[1], copy[ij[0], ij[1]])
        out.append(indices_plus_score)
        copy[ij[0], :] = -np.inf
        copy[:, ij[1]] = -np.inf
    return out

def axeldonath(arr, indices):
    """Greedily collect best hits from a 2D array recursively."""
    if not np.any(np.isfinite(arr)):
        return indices

    idx_max = np.argmax(arr)
    idxs = np.unravel_index(idx_max, arr.shape)
    indices_plus_score = (idxs[0], idxs[1], arr[idxs[0], idxs[1]])
    arr[idxs[0], :] = -np.inf
    arr[:, idxs[1]] = -np.inf
    indices.append(indices_plus_score)
    return axeldonath(arr, indices)

def axeldonath_wrapper(similarity):
    # Recursion depth is min(m, n) + 1, so matrices with min(m, n) near
    # Python's default recursion limit (1000) may raise RecursionError.
    testsim = similarity.copy()
    return axeldonath(testsim, [])

def timing_test(S,functionlist):
    for function in functionlist:
        testsim = S['matrix']
        print(function)
        %timeit function(testsim)
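
As an aside, %timeit is IPython magic, so timing_test only runs in IPython/Jupyter. A rough stand-alone equivalent using the standard-library timeit module might look like this sketch (timing_test_stdlib is a hypothetical helper, not part of the original benchmark):

import timeit

def timing_test_stdlib(S, functionlist, repeats=7):
    # Best-of-N single runs, loosely comparable to %timeit's report
    for function in functionlist:
        testsim = S['matrix']
        best = min(timeit.repeat(lambda: function(testsim), number=1, repeat=repeats))
        print(f"{function.__name__}: {best * 1e3:.1f} ms (best of {repeats})")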

With the following timing results:

<function original at 0x7f405006d1c0>
287 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
<function lastduckstanding at 0x7f405006d120>
30 ms ± 2.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
<function julien at 0x7f405006d260>
7.9 ms ± 30.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
<function axeldonath_wrapper at 0x7f405006d3a0>
16.9 ms ± 42.9 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
asked Nov 17 '25 by derived bee

2 Answers

Here's a candidate using numpy only, about 100x faster on your small example:

import numpy as np
import pandas as pd
import timeit

sim = np.array([[1,5,6,2],[7,10,3,4],[1,5,3,7]])

def mine(sim):
    out = []
    copy = sim.copy()
    MIN = np.iinfo(copy.dtype).min  # or -np.inf if using floats...
    for _ in range(min(copy.shape)):
        # Take the global argmax, then knock out its row and column
        ij = np.unravel_index(copy.argmax(), copy.shape)
        out.append(ij)
        copy[ij[0]] = MIN
        copy[:, ij[1]] = MIN
    return np.transpose(out)

def yours(sim):
    xdim,ydim = np.meshgrid(np.arange(sim.shape[1]),np.arange(sim.shape[0]))
    table = np.vstack((sim.ravel(),xdim.ravel(),ydim.ravel())).T
    df = pd.DataFrame(table).rename(columns={0:'sim',1:'index2',2:'index1'}).sort_values('sim',ascending=False)
    seq1_hits = []
    seq2_hits = []
    while len(df):
        index1 = df.iloc[0]['index1']
        index2 = df.iloc[0]['index2']
        seq1_hits.append(index1)
        seq2_hits.append(index2)
        df = df[(df['index1']!=index1)&(df['index2']!=index2)]
    return [seq1_hits,seq2_hits]

assert np.all(mine(sim) == yours(sim))

%timeit yours(sim)
# 1.05 ms ± 6.78 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

%timeit mine(sim)
# 8.18 µs ± 19.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

The advantage slowly degrades for larger arrays, but it stays ahead (still 10x faster on 1000x1000 arrays):

sim = np.arange(10000)
np.random.shuffle(sim)
sim.shape = 100,100
%timeit yours(sim)
# 26.2 ms ± 64.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit mine(sim)
# 397 µs ± 534 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

sim = np.arange(1000000)
np.random.shuffle(sim)
sim.shape = 1000,1000
%timeit yours(sim)
# 2.45 s ± 18.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit mine(sim)
# 203 ms ± 2.17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
answered Nov 19 '25 by Julien


Your solution looks like it is O(min(m, n) * mn), because on every iteration you go through the whole DataFrame, remove the entries that share a row or column with the chosen hit, and recreate a new DataFrame. Julien's solution appears to have the same complexity (though it likely makes good use of SIMD and parallelism), because it still scans the whole array for each argmax.
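
To make the gap concrete (rough arithmetic, not from the answer itself): for m = n = 1000, min(m, n) * mn is on the order of 10^9 elementary operations, while mn * log2(mn) is roughly 10^6 * 20 = 2 * 10^7, about 50x fewer.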

To achieve O(mn * log(mn)), don't modify your DataFrame at all: iterate through the entries one by one from the highest value down, and discard an entry when it shares a row or column with an entry you have already extracted. You can use two sets (or arrays of flags) to store the previously chosen rows and columns. I call this method sort_w_set (see the listing below).

I also have another method with O(min(m, n) * mn) worst-case time complexity but unknown average-case complexity. It keeps track of the maximum of each unpicked row and where it is, and only updates a row when that maximum's column gets picked. I've named this lazy_update (also below). Unfortunately, this solution is slightly slower according to my tests.

import timeit

import numpy as np
import pandas as pd


def original(sim):
    xdim, ydim = np.meshgrid(np.arange(sim.shape[1]), np.arange(sim.shape[0]))
    table = np.vstack((sim.ravel(), xdim.ravel(), ydim.ravel())).T
    df = pd.DataFrame(table).rename(columns={
        0: 'sim',
        1: 'index2',
        2: 'index1'
    }).sort_values('sim', ascending=False)
    seq1_hits = []
    seq2_hits = []
    while len(df):
        index1 = df.iloc[0]['index1']
        index2 = df.iloc[0]['index2']
        seq1_hits.append(index1)
        seq2_hits.append(index2)
        df = df[(df['index1'] != index1) & (df['index2'] != index2)]
    return np.asanyarray([seq1_hits, seq2_hits])


def sort_w_set(mat):
    # Flat indices of all entries, sorted ascending by value
    idxs = np.argsort(mat, None)
    hit_is = []
    hit_js = []
    hit_is_set = set()
    hit_js_set = set()
    num_entries = min(mat.shape[0], mat.shape[1])
    for idx in reversed(idxs):
        # Recover (row, col) from the flat index
        i, j = idx // mat.shape[1], idx % mat.shape[1]
        if i in hit_is_set or j in hit_js_set:
            continue
        hit_is.append(i)
        hit_is_set.add(i)
        hit_js.append(j)
        hit_js_set.add(j)
        if len(hit_is) == num_entries:
            break
    return np.asanyarray([hit_is, hit_js])


def lazy_update(mat, neg_inf=float("-inf")):
    m = mat.shape[0]
    n = mat.shape[1]
    # Per-row maxima and their column positions
    max_js = np.argmax(mat, axis=1)
    maxes = np.take_along_axis(mat, max_js[:, np.newaxis], axis=1)
    is_j_hit = np.full(n, False)
    hit_is = []
    hit_js = []
    for _ in range(min(m, n)):
        max_i = np.argmax(maxes)
        max_j = max_js[max_i]
        hit_is.append(max_i)
        hit_js.append(max_j)
        max_js[max_i] = -1
        maxes[max_i] = neg_inf
        is_j_hit[max_j] = True
        # Lazily recompute the row maximum only for rows whose current
        # maximum sat in the column that was just picked
        for i in range(m):
            if max_js[i] != -1:
                if max_js[i] == max_j:
                    max_js[i] = new_max_j = np.argmax(
                        np.where(is_j_hit, neg_inf, mat[i]))
                    maxes[i] = mat[i, new_max_j]
    return np.asanyarray([hit_is, hit_js])


def julien(sim: np.ndarray):
    out = []
    copy = sim.copy()
    for _ in range(min(copy.shape)):
        ij = np.unravel_index(copy.argmax(), copy.shape)
        out.append(ij)
        copy[ij[0]] = -1.0  # assumes all similarities lie above the -1.0 sentinel
        copy[:, ij[1]] = -1.0
    return np.transpose(out)


def main():
    np_rand = np.random.default_rng()

    # Correctness check: all four implementations must agree
    mat = np_rand.random([100, 200])
    res = None
    for f in [original, sort_w_set, lazy_update, julien]:
        res2 = f(mat)
        if res is None:
            res = res2
        else:
            assert np.all(res2 == res)

    # Timing runs on progressively larger matrices
    for m in [100, 200, 300, 500, 1000]:
        n = m + 50
        print(f'{m=} {n=}')
        mat = None

        def setup_func():
            nonlocal mat
            mat = np.round(np_rand.random([m, n]), 3)

        for f in [original, sort_w_set, lazy_update, julien]:
            timing = timeit.Timer(lambda: f(mat), setup_func).repeat(10, 1)
            print(f.__name__, np.median(timing))


if __name__ == "__main__":
    main()

To achieve O(min(m, n) * log(mn)) you probably need to do something much more complicated (or maybe it's impossible).

Results:

m=100 n=150
original 0.05974135349993048
sort_w_set 0.002159014999961073
lazy_update 0.002811349000012342
julien 0.001682339000126376
m=200 n=250
original 0.14275815500013778
sort_w_set 0.005666553500077498
lazy_update 0.007546098000148049
julien 0.007552466000106506
m=300 n=350
original 0.223872012999891
sort_w_set 0.009630193000020881
lazy_update 0.013982388499925946
julien 0.016382625000005646
m=500 n=550
original 0.6410600660001364
sort_w_set 0.02666090999991866
lazy_update 0.0368077354999059
julien 0.05437794899989967
m=1000 n=1050
original 4.4048256545002005
sort_w_set 0.13431666199971914
lazy_update 0.17222096199975567
julien 0.4341630219998933
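
(Reading the table: julien is fastest at m=100, but sort_w_set pulls ahead from m=200 onward and leads by roughly 3x at m=1000.)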
answered Nov 19 '25 by LastDuckStanding