
Vectorizing calculation in matrix with interdependent values

I am tracking multiple discrete time-series at multiple temporal resolutions, resulting in an SxRxB matrix where S is the number of time-series, R is the number of different resolutions and B is the buffer, i.e. how many values each series remembers. Each series is discrete and uses a limited range of natural numbers to represent its values. I will call these "symbols" here.

For each series I want to calculate how often any of the previous measurement's symbols directly precedes any of the current measurement's symbols, over all measurements. I have solved this with a for-loop as seen below, but would like to vectorize it for obvious reasons.

I'm not sure if my way of structuring the data is efficient, so I'm open to suggestions there. The ratios matrix especially could probably be done differently, I think.

Thanks in advance!

import itertools

import numpy as np


def supports_loop(data, num_series, resolutions, buffer_size, vocab_size):
    # For small test matrices we can calculate the complete matrix without problems
    indices = [
        xrange(num_series),   # s0: series we calculate for
        xrange(vocab_size),   # v0: symbol we calculate for
        xrange(num_series),   # s1: preceding series
        xrange(vocab_size),   # v1: preceding symbol
        xrange(resolutions),  # res
    ]

    # This is huge! :/
    # dimensions:
    #   series and value for which we calculate,
    #   series and value which precedes that measurement,
    #   resolution
    ratios = np.full((num_series, vocab_size, num_series, vocab_size, resolutions), 0.0)

    for idx in itertools.product(*indices):
        # the series/symbol we calculate for, the series/symbol that should
        # precede it, and the resolution
        s0, v0, s1, v1, res = idx

        # Find the positions where s0==v0
        found0 = np.where(data[s0, res, :] == v0)[0]
        if found0.size == 0:
            continue
        #print('found {}={} at {}'.format(s0, v0, found0))

        # Check how often s1==v1 right before s0==v0
        candidates = (s1, res, (found0 - 1 + buffer_size) % buffer_size)
        found01 = np.count_nonzero(data[candidates] == v1)
        if found01 == 0:
            continue

        print('found {}={} following {}={} at {}'.format(s0, v0, s1, v1, found01))
        # total01 = number of positions where either s0 or s1 is defined (i.e. >=0)
        total01 = len(np.argwhere((data[s0, res, :] >= 0) & (data[s1, res, :] >= 0)))
        ratio = (float(found01) / total01) if total01 > 0 else 0.0
        ratios[idx] = ratio

    return ratios


def stackoverflow_example(fnc):
    data = np.array([
        [[0, 0, 1],  # series 0, resolution 0
         [1, 3, 2]], # series 0, resolution 1

        [[2, 1, 2],  # series 1, resolution 0
         [3, 3, 3]], # series 1, resolution 1
    ])

    num_series = data.shape[0]
    resolutions = data.shape[1]
    buffer_size = data.shape[2]
    vocab_size = np.max(data)+1

    ratios = fnc(data, num_series, resolutions, buffer_size, vocab_size)
    coordinates = np.argwhere(ratios > 0.0)
    nz_values = ratios[ratios > 0.0]
    print(np.hstack((coordinates, nz_values[:,None])))
    print('0/0 precedes 0/0 in 1 out of 3 cases: {}'.format(np.isclose(ratios[0,0,0,0,0], 1.0/3.0)))
    print('1/2 precedes 0/0 in 2 out of 3 cases: {}'.format(np.isclose(ratios[0,0,1,2,0], 2.0/3.0)))

Expected output (21 pairs, 5 columns for coordinates, followed by found count):

[[0 0 0 0 0 1]
 [0 0 0 1 0 1]
 [0 0 1 2 0 2]
 [0 1 0 0 0 1]
 [0 1 0 2 1 1]
 [0 1 1 1 0 1]
 [0 1 1 3 1 1]
 [0 2 0 3 1 1]
 [0 2 1 3 1 1]
 [0 3 0 1 1 1]
 [0 3 1 3 1 1]
 [1 1 0 0 0 1]
 [1 1 1 2 0 1]
 [1 2 0 0 0 1]
 [1 2 0 1 0 1]
 [1 2 1 1 0 1]
 [1 2 1 2 0 1]
 [1 3 0 1 1 1]
 [1 3 0 2 1 1]
 [1 3 0 3 1 1]
 [1 3 1 3 1 3]]

In the example above, the 0 in series 0 follows a 2 in series 1 in two out of three cases (since the buffers are circular), so the ratio at [0, 0, 1, 2, 0] will be ~0.6666. Likewise, series 0, value 0 follows itself in one out of three cases, so the ratio at [0, 0, 0, 0, 0] will be ~0.3333. There are some others which are > 0.0 as well.
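
To make the definition concrete, here is a quick sanity check of those two ratios on the example data (a small sketch; since every position in the example is defined, i.e. >= 0, the denominator is simply the buffer length):

import numpy as np

data = np.array([[[0, 0, 1], [1, 3, 2]],
                 [[2, 1, 2], [3, 3, 3]]])

s0_buf = data[0, 0]               # series 0, resolution 0: [0, 0, 1]
s1_prev = np.roll(data[1, 0], 1)  # circular predecessors from series 1: [2, 2, 1]
s0_prev = np.roll(s0_buf, 1)      # circular predecessors from series 0: [1, 0, 0]

print(np.mean((s0_buf == 0) & (s1_prev == 2)))  # 0.666... -> ratios[0, 0, 1, 2, 0]
print(np.mean((s0_buf == 0) & (s0_prev == 0)))  # 0.333... -> ratios[0, 0, 0, 0, 0]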


I am testing each answer on two datasets: the tiny one shown above and a more realistic one (100 series, 5 resolutions, a buffer of 10 values per series, 50 symbols).
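
For reference, a random dataset of roughly that shape can be generated like this (just a sketch, not the exact code I use):

import numpy as np

S, R, B, vocab_size = 100, 5, 10, 50
data = np.random.randint(vocab_size, size=(S, R, B))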

Results

Answer        Time (tiny)     Time (huge)     All pairs found (tiny=21)
-----------------------------------------------------------------------
Baseline      ~1ms            ~675s (!)       Yes
Saedeas       ~0.13ms         ~1.4ms          No (!)
Saedeas2      ~0.20ms         ~4.0ms          Yes, +cross resolutions
Elliot_1      ~0.70ms         ~100s (!)       Yes
Elliot_2      ~1ms            ~21s (!)        Yes
Kuppern_1     ~0.39ms         ~2.4s (!)       Yes
Kuppern_2     ~0.18ms         ~28ms           Yes
Kuppern_3     ~0.19ms         ~24ms           Yes
David         ~0.21ms         ~27ms           Yes

Saedeas' 2nd approach is the clear winner! Thank you so much, all of you :)

Asked Jul 05 '18 by Managarm

4 Answers

To start, you're doing yourself a bit of a disservice by not explicitly nesting the for loops. You wind up repeating a lot of effort and not saving anything in terms of memory. When the loop is nested, you can move some of the computations from one level to another and figure out which inner loops can be vectorized over.

def supports_5_loop(data, num_series, resolutions, buffer_size, vocab_size):
    ratios = np.full((num_series, vocab_size, num_series, vocab_size, resolutions), 0.0)
    for res in xrange(resolutions):
        for s0 in xrange(num_series):
            # Find the positions where s0==v0
            for v0 in np.unique(data[s0, res]):
                # only need to find indices once for each series and value
                found0 = np.where(data[s0, res, :] == v0)[0]
                for s1 in xrange(num_series):
                    # Check how often s1==v1 right before s0==v0
                    candidates = (s1, res, (found0 - 1 + buffer_size) % buffer_size)
                    total01 = np.logical_or(data[s0, res, :] >= 0, data[s1, res, :] >= 0).sum()
                    # can skip inner loops if there are no candidates
                    if total01 == 0:
                        continue
                    for v1 in xrange(vocab_size):
                        found01 = np.count_nonzero(data[candidates] == v1)
                        if found01 == 0:
                            continue

                        ratio = (float(found01) / total01)
                        ratios[(s0, v0, s1, v1, res)] = ratio

    return ratios

You'll see in the timings that the majority of the speed pickup comes from not duplicating effort.

Once you've made the nested structure, you can start looking at vectorizations and other optimizations.

def supports_4_loop(data, num_series, resolutions, buffer_size, vocab_size):
    # For small test matrices we can calculate the complete matrix without problems

    # This is huge! :/
    # dimensions:
    #   series and value for which we calculate,
    #   series and value which precedes that measurement,
    #   resolution
    ratios = np.full((num_series, vocab_size, num_series, vocab_size, resolutions), 0.0)

    for res in xrange(resolutions):
        for s0 in xrange(num_series):
            # find the counts where either s0 or s1 are present
            total01 = np.logical_or(data[s0, res] >= 0,
                                    data[:, res] >= 0).sum(axis=1)
            s1s = np.where(total01)[0]
            # Find the positions where s0==v0
            v0s, counts = np.unique(data[s0, res], return_counts=True)
            # sorting before searching will show gains as the datasets
            # get larger
            indarr = np.argsort(data[s0, res])
            i0 = 0
            for v0, count in itertools.izip(v0s, counts):
                found0 = indarr[i0:i0+count]
                i0 += count
                for s1 in s1s:
                    candidates = data[(s1, res, (found0 - 1) % buffer_size)]
                    # can replace the innermost loop with numpy functions
                    v1s, counts = np.unique(candidates, return_counts=True)
                    ratios[s0, v0, s1, v1s, res] = counts / total01[s1]

    return ratios

Unfortunately I could only really vectorize over the innermost loop, and that only bought an additional 10% speedup. Outside of the innermost loop you can't guarantee that all the vectors are the same size, so you can't build an array.
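
A tiny illustration (with made-up data) of the ragged-size problem: the index sets returned for each symbol have different lengths, so they cannot be stacked into one rectangular array without padding or masking.

import numpy as np

row = np.array([0, 0, 1, 2, 2, 2])
lengths = [np.where(row == v)[0].size for v in np.unique(row)]
print(lengths)  # [2, 1, 3] -- ragged, so there is no clean array to vectorize over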

In [121]: (np.all(supports_loop(data, num_series, resolutions, buffer_size, vocab_size) == supports_5_loop(data, num_series, resolutions, buffer_size, vocab_size)))
Out[121]: True

In [122]: (np.all(supports_loop(data, num_series, resolutions, buffer_size, vocab_size) == supports_4_loop(data, num_series, resolutions, buffer_size, vocab_size)))
Out[122]: True
In [123]: %timeit(supports_loop(data, num_series, resolutions, buffer_size, vocab_size))
2.29 ms ± 73.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [124]: %timeit(supports_5_loop(data, num_series, resolutions, buffer_size, vocab_size))
949 µs ± 5.37 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [125]: %timeit(supports_4_loop(data, num_series, resolutions, buffer_size, vocab_size))
843 µs ± 3.21 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Answered by Elliot

If I'm understanding your problem correctly, I think this bit of code will get you the symbol pairs you're looking for in a relatively quick, vectorized fashion.

import numpy as np
import time
from collections import Counter

series = 2
resolutions = 2
buffer_len = 3
symbols = range(3)

#mat = np.random.choice(symbols, size=(series, resolutions, buffer_len)).astype('uint8')

mat = np.array([
        [[0, 0, 1],  # series 0, resolution 0
         [1, 3, 2]],  # series 0, resolution 1
        [[2, 1, 2],  # series 1, resolution 0
         [3, 3, 3]],  # series 1, resolution 1
    ])

start = time.time()
index_mat = np.indices(mat.shape)

right_shift_indices = np.roll(index_mat, -1, axis=3)
mat_shifted = mat[right_shift_indices[0], right_shift_indices[1], right_shift_indices[2]]

# These construct all the pairs directly
first_series = np.repeat(range(series), series*resolutions*buffer_len)
second_series = np.tile(np.repeat(range(series), resolutions*buffer_len), series)
res_loop = np.tile(np.repeat(range(resolutions), buffer_len), series*series)
mat_unroll = np.repeat(mat, series, axis=0)
shift_unroll = np.tile(mat_shifted, series)

# Constructs the pairs
pairs = zip(np.ravel(first_series),
            np.ravel(second_series),
            np.ravel(res_loop),
            np.ravel(mat_unroll),
            np.ravel(shift_unroll))

pair_time = time.time() - start
results = Counter(pairs)
end = time.time() - start

print("Mat: {}".format(mat))
print("Pairs: {}".format(results))
print("Number of Pairs: {}".format(len(pairs)))
print("Pair time is: {}".format(pair_time))
print("Count time is: {}".format(end-pair_time))
print("Total time is: {}".format(end))

The basic idea was to circularly shift each buffer by the appropriate amount depending on which time series it was (I think this is what your current code was doing). I can then generate all the symbol pairs by simply zipping lists offset by 1 together along the series axis.
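
Boiled down to a single pair of buffers (a sketch of the idea, not the code above), the shift-and-zip step looks like this; the circular wrap-around comes for free from the roll:

import numpy as np

cur = np.array([0, 0, 1])               # series 0, resolution 0
prev = np.roll(np.array([2, 1, 2]), 1)  # series 1, shifted right by one: [2, 2, 1]
print(list(zip(prev, cur)))             # [(2, 0), (2, 0), (1, 1)]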

Example output:

Mat: [[[0 0 1]
  [1 3 2]]

 [[2 1 2]
  [3 3 3]]]
Pairs: Counter({(1, 1, 1, 3, 3): 3, (1, 0, 0, 2, 0): 2, (0, 0, 0, 0, 0): 1, (1, 1, 0, 2, 2): 1, (1, 1, 0, 2, 1): 1, (0, 1, 0, 0, 2): 1, (1, 0, 1, 3, 3): 1, (0, 0, 1, 1, 3): 1, (0, 0, 1, 3, 2): 1, (1, 0, 0, 1, 1): 1, (0, 1, 0, 0, 1): 1, (0, 1, 1, 2, 3): 1, (0, 1, 0, 1, 2): 1, (1, 1, 0, 1, 2): 1, (0, 1, 1, 3, 3): 1, (1, 0, 1, 3, 2): 1, (0, 0, 0, 0, 1): 1, (0, 1, 1, 1, 3): 1, (0, 0, 1, 2, 1): 1, (0, 0, 0, 1, 0): 1, (1, 0, 1, 3, 1): 1})
Number of Pairs: 24
Pair time is: 0.000135183334351
Count time is: 5.10215759277e-05
Total time is: 0.000186204910278

Edit: True final attempt. Fully vectorized.

Answered by Saedeas

A trick that makes this vectorizable is to build an array comb[i] = buffer1[i] + buffer2[i-1]*vocab_size for each pair of series. Each combination then gets a unique value in the array, and one can recover the pair with v1[i] = comb[i] % vocab_size and v2[i] = comb[i] // vocab_size. As long as the number of series is not very high (< 10000, I think) there is no point in doing any further vectorisation.
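
A small sketch of the encoding (made-up buffers, vocab_size = 5): the current symbols and the circularly shifted previous symbols are packed into one integer each, so np.unique can count every (previous, current) pair in one pass, and modulo/integer division recovers the pair afterwards.

import numpy as np

vocab_size = 5
current = np.array([0, 3, 2])               # buffer of the series we look at
previous = np.roll(np.array([2, 1, 2]), 1)  # predecessor buffer, shifted: [2, 2, 1]

comb = current + previous * vocab_size      # unique code per (previous, current) pair
print(comb % vocab_size)                    # -> current symbols  [0 3 2]
print(comb // vocab_size)                   # -> previous symbols [2 2 1]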

def support_vectorized(data, num_series, resolutions, buffer_size, vocab_size):
    ratios = np.zeros((num_series, vocab_size, num_series, vocab_size, resolutions))
    prev = np.roll(data, 1, axis=2)  # Get previous values
    prev *= vocab_size  # To separate prev from data
    for i, series in enumerate(data):
        for j, prev_series in enumerate(prev):
            comb = series + prev_series
            for k, buffer in enumerate(comb):
                idx, counts = np.unique(buffer, return_counts=True)
                v = idx % vocab_size    
                v2 = idx // vocab_size
                ratios[i, v, j, v2, k] = counts/buffer_size
    return ratios

If, however, S or R is large, a full vectorization is possible, but it uses a lot of memory:

def row_unique(comb):
    comb.sort(axis=-1)
    changes = np.concatenate((
        np.ones((comb.shape[0], comb.shape[1], comb.shape[2], 1), dtype="bool"),
        comb[:, :,:, 1:] != comb[:, :, :, :-1]), axis=-1)
    vals = comb[changes]
    idxs = np.nonzero(changes)
    tmp = np.hstack((idxs[-1], 0))
    counts = np.where(tmp[1:], np.diff(tmp), comb.shape[-1]-tmp[:-1])
    return idxs, vals, counts


def supports_full_vectorized(data, num_series, resolutions, buffer_size, vocab_size):
    ratios = np.zeros((num_series, vocab_size, num_series, vocab_size, resolutions))
    prev = np.roll(data, 1, axis=2)*vocab_size
    comb = data + prev[:, None]  # Create every combination
    idxs, vals, counts = row_unique(comb)  # Get unique values and counts for each row
    ratios[idxs[1], vals % vocab_size, idxs[0], vals // vocab_size, idxs[2]] = counts/buffer_size
    return ratios

However, for S=100 this is slower than the previous solution. A middle ground is to keep a for loop over the series to reduce the memory usage:

def row_unique2(comb):
    comb.sort(axis=-1)
    changes = np.concatenate((
        np.ones((comb.shape[0], comb.shape[1], 1), dtype="bool"),
        comb[:, :, 1:] != comb[:, :, :-1]), axis=-1)
    vals = comb[changes]
    idxs = np.nonzero(changes)
    tmp = np.hstack((idxs[-1], 0))
    counts = np.where(tmp[1:], np.diff(tmp), comb.shape[-1]-tmp[:-1])
    return idxs, vals, counts


def supports_half_vectorized(data, num_series, resolutions, buffer_size, vocab_size):
    prev = np.roll(data, 1, axis=2)*vocab_size
    ratios = np.zeros((num_series, vocab_size, num_series, vocab_size, resolutions))
    for i, series in enumerate(data):
        comb = series + prev
        idxs, vals, counts = row_unique2(comb)
        ratios[i, vals % vocab_size, idxs[0], vals // vocab_size, idxs[1]] = counts/buffer_size
    return ratios

The running times for the different solutions show that supports_half_vectorized is the fastest:

In [41]: S, R, B, voc_size = (100, 5, 1000, 29)

In [42]: data = np.random.randint(voc_size, size=S*R*B).reshape((S, R, B))

In [43]: %timeit support_vectorized(data, S, R, B, voc_size)
1 loop, best of 3: 4.84 s per loop

In [44]: %timeit supports_full_vectorized(data, S, R, B, voc_size)
1 loop, best of 3: 5.3 s per loop

In [45]: %timeit supports_half_vectorized(data, S, R, B, voc_size)
1 loop, best of 3: 4.36 s per loop

In [46]: %timeit supports_4_loop(data, S, R, B, voc_size)
1 loop, best of 3: 36.7 s per loop
Answered by kuppern87

So this is kind of a cop-out answer, but I've been working with @Saedeas's answer and, based on timings on my machine, have been able to optimize it slightly. I do believe that there is a way to do this without the loop, but the size of the intermediate array may be prohibitive.

The first change I made was to remove the concatenation that happens at the end of the run() function. It was creating a new array and is unnecessary. Instead we create the full-size array at the beginning and just don't use the last row until the end.

The other change is that the tiling of single was slightly inefficient; I have replaced it with very slightly faster code.
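
For reference, the three variants (they are the commented-out lines further down) all build the same array, one column of single per remaining series; the repeat/reshape form simply avoids the transpose. A minimal sketch with made-up sizes:

import numpy as np

single = np.array([1, 3, 2, 3, 3, 3])
k = 3  # stands in for series - 1

a = np.tile(single, (k, 1)).T
b = single.reshape(-1, 1).repeat(k, 1)
c = single.repeat(k).reshape(-1, k)
print(np.array_equal(a, b) and np.array_equal(b, c))  # True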

I do believe that this can be made faster still, but it would take some work. I was testing with larger sizes, so please let me know what timings you get on your machine.

Code is below:

import numpy as np
import logging
import sys
import time
import itertools
import timeit


logging.basicConfig(stream=sys.stdout,
                    level=logging.DEBUG,
                    format='%(message)s')


def run():
    series = 2
    resolutions = 2
    buffer_len = 3
    symbols = range(50)

    #mat = np.random.choice(symbols, size=(series, resolutions, buffer_len))

    mat = np.array([
            [[0, 0, 1],  # series 0, resolution 0
             [1, 3, 2]],  # series 0, resolution 1
            [[2, 1, 2],  # series 1, resolution 0
             [3, 3, 3]],  # series 1, resolution 1
            # [[4, 5, 6, 10],
            #  [7, 8, 9, 11]],
        ])

    # logging.debug("Original:")
    # logging.debug(mat)

    start = time.time()
    index_mat = np.indices((series, resolutions, buffer_len))

    # This loop shifts all series but the one being looked at, and zips the
    # element being looked at with every other member of that row
    cross_pairs = np.empty((series, resolutions, buffer_len, series, 2), int)
    #cross_pairs = []
    right_shift_indices = [index_mat[0], index_mat[1], (index_mat[2] - 1) % buffer_len]

    for i in range(series):
        right_shift_indices[2][i] = (right_shift_indices[2][i] + 1) % buffer_len


        # create a new matrix from the modified indices
        mat_shifted = mat[right_shift_indices]
        mat_shifted_t = mat_shifted.T.reshape(-1, series)
        single = mat_shifted_t[:, i]

        #print np.tile(single,(series-1,1)).T
        #print single.reshape(-1,1).repeat(series-1,1)
        #print single.repeat(series-1).reshape(-1,series-1)

        mat_shifted_t = np.delete(mat_shifted_t, i, axis=1)

        #cross_pairs[i,:,:,:-1] = (np.dstack((np.tile(single, (mat_shifted_t.shape[1], 1)).T, mat_shifted_t))).reshape(resolutions, buffer_len, (series-1), 2, order='F')
        #cross_pairs[i,:,:,:-1] = (np.dstack((single.reshape(-1,1).repeat(series-1,1), mat_shifted_t))).reshape(resolutions, buffer_len, (series-1), 2, order='F')
        cross_pairs[i,:,:,:-1] = np.dstack((single.repeat(series-1).reshape(-1,series-1), mat_shifted_t)).reshape(resolutions, buffer_len, (series-1), 2, order='F')

        right_shift_indices[2][i] = (right_shift_indices[2][i] - 1) % buffer_len
        #cross_pairs.extend([zip(itertools.repeat(x[i]), np.append(x[:i], x[i+1:])) for x in mat_shifted_t])

    #consecutive_pairs = np.empty((series, resolutions, buffer_len, 2, 2), int)
    #print "1", consecutive_pairs.shape
    # tedious code to put this stuff in the right shape
    in_series_zips = np.stack([mat[:, :, :-1], mat[:, :, 1:]], axis=3)
    circular_in_series_zips = np.stack([mat[:, :, -1], mat[:, :, 0]], axis=2)
    # This creates the final array.
    # Index 0 is the preceding series
    # Index 1 is the resolution
    # Index 2 is the location in the buffer
    # Index 3 is for the first n-1 elements, the following series, and for the last element
    #         it's the next element of the Index 0 series
    # Index 4 is the index into the two element pair
    cross_pairs[:,:,:-1,-1] = in_series_zips
    cross_pairs[:,:,-1,-1] = circular_in_series_zips

    end = time.time()
    #logging.debug("Pairs encountered:")
    #logging.debug(pairs)
    logging.info("Elapsed: {}".format(end - start))

if __name__ == '__main__':
    run()
Answered by David