
Generate filtered binary cartesian products

Problem statement

I'm looking for an efficient way to generate full binary cartesian products (tables with all combinations of True and False over a certain number of columns), filtered by certain exclusive conditions. For example, for three columns/bits (n=3) we would get the full table:

n = 3
df_combs = pd.DataFrame(itertools.product(*([[True, False]] * n)))
       0      1      2
0   True   True   True
1   True   True  False
2   True  False   True
3   True  False  False
...

This is supposed to be filtered by dictionaries defining mutually exclusive combinations as follows:

mutually_excl = [{0: False, 1: False, 2: True},
                 {0: True, 2: True}]

The keys denote the columns in the table above. The example would be read as:

  • If 0 is False and 1 is False, 2 can't be True
  • If 0 is True, 2 can't be True
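
In other words, a row survives only if no filter matches it completely. A minimal sketch of this predicate (the helper row_allowed is my own, not part of the solutions below):

def row_allowed(row, filters):
    # keep a row only if it does not fully match any excluded combination
    return not any(all(row[col] == val for col, val in filt.items())
                   for filt in filters)

row_allowed((True, True, False), mutually_excl)  # True  -> row is kept
row_allowed((True, False, True), mutually_excl)  # False -> excluded by {0: True, 2: True}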

Based on these filters, the expected output is:

       0      1      2
1   True   True  False
3   True  False  False
4  False   True   True
5  False   True  False
7  False  False  False

In my use case, the filtered table is multiple orders of magnitude smaller than the full cartesian product (e.g. around 1000 rows instead of 2**24 = 16777216).

Below are my three current solutions, each with their own pros and cons, discussed at the very end.


import random
import pandas as pd
import itertools
import wrapt
import time
import operator
import functools

def get_mutually_excl(n, nfilt):  # generate random example filters
    ''' Example: `get_mutually_excl(9, 2)` creates a list of two filters with
    column indices drawn from `range(9)` and between 2 and `int(n/3)` entries
    each (fewer if randomly chosen keys collide):
    `[{1: True, 2: False}, {3: False, 2: True, 6: False}]` '''
    random.seed(2)
    return [{random.choice(range(n)): random.choice([True, False])
             for _ in range(random.randint(2, int(n/3)))}
            for _ in range(nfilt)]

@wrapt.decorator
def timediff(f, _, args, kwargs):
    t = time.perf_counter()
    res = f(*args, **kwargs)
    return res, time.perf_counter() - t
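
Since the decorator changes the return value, here is a quick usage sketch (slow_identity is just for illustration): every decorated function returns a (result, elapsed_seconds) tuple, which is why the benchmark code further down indexes into [1].

@timediff
def slow_identity(x):
    time.sleep(0.1)
    return x

res, elapsed = slow_identity(42)  # res == 42, elapsed roughly 0.1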

Solution 1: Filter first, then merge

Expand each single filter entry (e.g. {0: True, 2: True}) into a sub-table with columns corresponding to the indices in this filter entry ([0, 2]). Remove the single excluded row ([True, True]) from this sub-table. Merge with the table of the remaining (unfiltered) columns to get the complete list of filtered combinations.

@timediff
def make_df_comb_filt_merge(n, nfilt):

    mutually_excl = get_mutually_excl(n, nfilt)

    # determine missing (unfiltered) columns
    cols_missing = set(range(n)) - set(itertools.chain.from_iterable(mutually_excl))

    # complete dataframe of unfiltered columns with column "temp" for full outer merge
    df_comb = pd.DataFrame(itertools.product(*([[True, False]] * len(cols_missing))),
                            columns=cols_missing).assign(temp=1)

    for filt in mutually_excl:  # loop through individual filters

        # get columns and bool values of this filter as two tuples in matching order
        list_col, list_bool = zip(*filt.items())

        # construct dataframe
        df = pd.DataFrame(itertools.product(*([[True, False]] * len(list_col))),
                                columns=list_col)

        # remove the *single* excluded row (by definition each filter
        # excludes exactly one combination of its columns)
        df = df.loc[df.apply(tuple, axis=1) != list_bool]

        # determine which columns to merge on
        merge_cols = list(set(df.columns) & set(df_comb.columns))
        if not merge_cols:
            merge_cols = ['temp']
            df['temp'] = 1

        # merge with full dataframe
        df_comb = pd.merge(df_comb, df, on=merge_cols)

    df_comb.drop('temp', axis=1, inplace=True)
    df_comb = df_comb[range(n)]
    df_comb = df_comb.sort_values(df_comb.columns.tolist(), ascending=False)

    return df_comb.reset_index(drop=True)
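
As an aside, the temp=1 column implements a cross join (older pandas versions lack how='cross'). A minimal sketch of the trick in isolation:

a = pd.DataFrame({0: [True, False]}).assign(temp=1)
b = pd.DataFrame({1: [True, False]}).assign(temp=1)
pd.merge(a, b, on='temp').drop('temp', axis=1)  # 4 rows: all (0, 1) combinations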

Solution 2: Full expansion, then filter

Generate a DataFrame for the full cartesian product: the entire thing ends up in memory. Loop through the filters and create a mask for each. Apply each mask to the table.


@timediff
def make_df_comb_exp_filt(n, nfilt):

    mutually_excl = get_mutually_excl(n, nfilt)

    # expand all bool combinations into dataframe
    df_comb = pd.DataFrame(itertools.product(*([[True, False]] * n)),
                           dtype=bool)

    for filt in mutually_excl:

        # generate total filter mask for given excluded combination
        mask = pd.Series(True, index=df_comb.index)
        for col, bool_act in filt.items():
            mask = mask & (df_comb[col] == bool_act)

        # filter dataframe
        df_comb = df_comb.loc[~mask]

    return df_comb.reset_index(drop=True)
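
The inner mask loop can also be written as a single vectorised comparison. A sketch of a drop-in replacement for the filter loop above (not benchmarked; DataFrame.eq aligns the Series index with the columns, so the behaviour should be identical):

for filt in mutually_excl:
    # compare the filter columns against the filter values in one call
    mask = df_comb[list(filt)].eq(pd.Series(filt)).all(axis=1)
    df_comb = df_comb.loc[~mask]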

Solution 3: Filter iterator

Keep the full cartesian product as an iterator. Loop through it, checking for each row whether it is excluded by any of the filters.

@timediff
def make_df_iter_filt(n, nfilt):

    mutually_excl = get_mutually_excl(n, nfilt)

    # restructure each filter into a (columns, values) pair of tuples,
    # e.g. [[(1, 13), (True, False)], [(4, 9), (False, True)], ...]
    mutually_excl_index = [list(zip(*comb.items()))
                                for comb in mutually_excl]

    # create iterator
    combs_iter = itertools.product(*([[True, False]] * n))

    @functools.lru_cache(maxsize=1024, typed=True)  # small benefit
    def get_getter(list_):
        # Used to access combs_iter row values as indexed by the filter
        return operator.itemgetter(*list_)

    def check_comb(comb_inp, comb_check):
        return get_getter(comb_check[0])(comb_inp) == comb_check[1]

    # loop through the iterator
    # drop row if any of the filter matches
    df_comb = pd.DataFrame([comb_inp for comb_inp in combs_iter
                       if not any(check_comb(comb_inp, comb_check)
                                  for comb_check in mutually_excl_index)])

    return df_comb.reset_index(drop=True)
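
A small variant worth trying: precompute one itemgetter per filter instead of caching lookups. A sketch of a drop-in replacement for the list comprehension above (note that both versions assume each filter has at least two columns; with a single column, itemgetter returns a scalar while the value side is a 1-tuple, so the comparison silently never matches):

getters = [(operator.itemgetter(*cols), vals)
           for cols, vals in mutually_excl_index]
df_comb = pd.DataFrame([comb for comb in combs_iter
                        if not any(getter(comb) == vals
                                   for getter, vals in getters)])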

Run examples

dict_time = dict.fromkeys(itertools.product(range(16, 23, 2), range(3, 20)))

for n, nfilt in dict_time:
    dict_time[(n, nfilt)] = {'exp_filt': make_df_comb_exp_filt(n, nfilt)[1],
                             'filt_merge': make_df_comb_filt_merge(n, nfilt)[1],
                             'iter_filt': make_df_iter_filt(n, nfilt)[1]}

Analysis

import seaborn as sns
import matplotlib.pyplot as plt

df_time = pd.DataFrame.from_dict(dict_time, orient='index',
                                 ).rename_axis(["n", "nfilt"]
                                 ).stack().reset_index().rename(columns={'level_2': 'solution', 0: 'time'})

g = sns.FacetGrid(df_time.query('n in %s' % str([16,18,20,22])),
                  col="n",  hue="solution", sharey=False)
g = (g.map(plt.plot, "nfilt", "time", marker="o").add_legend())

[Figure: running time vs. nfilt, one panel per n in {16, 18, 20, 22}, one line per solution]

Solution 3: The iterator-based approach (iter_filt) has dismal running times, but no significant use of memory. I feel there is room for improvement, though the unavoidable Python-level loop likely imposes hard bounds on running time.

Solution 2: Expanding the full cartesian product into a DataFrame (exp_filt) causes significant spikes in memory, which I would like to avoid. Running times are ok though.

Solution 1: Merging DataFrames created from the individual filters (filt_merge) feels like a good solution for my practical application (note the reduction in running time for greater numbers of filters, which is a result of the smaller cols_missing table). Still, this approach is not entirely satisfying: if a single filter includes all columns, the whole cartesian product (2**n) would end up in memory, making this solution worse than iter_filt.

Question: Any other ideas? A crazy smart numpy two-liner? Could the iterator based approach be improved somehow?



2 Answers

Try timing the following:

import numpy as np

def in_filter(arr, arr_filt, n):
    # True for integers whose bits at the filter columns all match the filter values
    return ((arr[:, None] >> (n-1-arr_filt[:, 0])) & 1 == arr_filt[:, 1]).all(axis=1)

def bits_to_boolean(arr, n):
    # expand each integer into its n bits, one boolean column per bit (column 0 = highest bit)
    return ((arr[:, None] >> np.arange(n, dtype=arr.dtype)[::-1]) & 1).astype(bool)

@timediff
def recursive_filter(n, nfilt, dtype='uint32'):
    filts = get_mutually_excl(n, nfilt)
    out = np.arange(2**n, dtype=dtype)
    for filt in filts:
        arr_filt = np.array(list(filt.items()))
        out = out[~in_filter(out, arr_filt, n)]
    return bits_to_boolean(out, n)[::-1]

It treats the Cartesian binary products as the bits encoded in the integer range 0 to 2**n - 1 and uses vectorised functions to successively remove numbers whose bit sequences match the given filters.
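
For intuition, a quick check of the encoding with the helpers above: the integer 6 is 0b110, i.e. (for n=3) columns 0 and 1 set, column 2 unset.

bits_to_boolean(np.array([6]), 3)  # -> array([[ True,  True, False]])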

The memory efficiency is better than allocating all [True, False] Cartesian products, since each Boolean would be stored with at least 8 bits (7 bits more than required), but it will use more memory than an iterator-based approach. If you require a solution for large n, you could break this task down by allocating and operating on one sub-range at a time, as sketched below. I did have this in my first implementation, but it didn't offer much benefit for n<=22, and it required calculating the size of the output array, which gets complicated when there are overlapping filters.
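
A rough sketch of that chunked variant (my own reconstruction of the idea, not the answer's original code; chunk is a tuning parameter):

@timediff
def recursive_filter_chunked(n, nfilt, chunk=2**20, dtype='uint32'):
    filts = get_mutually_excl(n, nfilt)
    arr_filts = [np.array(list(f.items())) for f in filts]
    parts = []
    for start in range(0, 2**n, chunk):
        # allocate and filter only one sub-range of integers at a time
        out = np.arange(start, min(start + chunk, 2**n), dtype=dtype)
        for arr_filt in arr_filts:
            out = out[~in_filter(out, arr_filt, n)]
        parts.append(out)
    return bits_to_boolean(np.concatenate(parts), n)[::-1]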



Based on @ayhan's comment I implemented an or-tools SAT-based solution. While the idea is great, it really struggles for larger numbers of binary variables. I suspect this behaves similarly to large IP problems, which aren't a walk in the park either. However, the strong dependence on the number of filters could make this a valid option for certain parameter configurations. But as a general solution I wouldn't use it.

from ortools.sat.python import cp_model

class VarArraySolutionCollector(cp_model.CpSolverSolutionCallback):

    def __init__(self, variables):
        cp_model.CpSolverSolutionCallback.__init__(self)
        self.__variables = variables
        self.solution_list = []

    def on_solution_callback(self):
        self.solution_list.append([self.Value(v) for v in self.__variables])


@timediff
def make_df_comb_sat(n, nfilt):

    mutually_excl = get_mutually_excl(n, nfilt)

    model = cp_model.CpModel()

    make_var_name = 'x{:02d}'.format
    vrs = dict.fromkeys(map(make_var_name, range(n)))
    for var_name in vrs:
        vrs[var_name] = model.NewBoolVar(var_name)

    for filt in mutually_excl:
        # for each excluded combination, require that at least one variable
        # differs from its filter value: NOT(AND(...)) == OR(NOT(...))
        list_expr = [vrs[make_var_name(iv)]
                     if not bool_ else vrs[make_var_name(iv)].Not()
                     for iv, bool_ in filt.items()]
        model.AddBoolOr(list_expr)

    solver = cp_model.CpSolver()
    solution_printer = VarArraySolutionCollector(vrs.values())
    solver.SearchForAllSolutions(model, solution_printer)

    df_comb = pd.DataFrame(solution_printer.solution_list).astype(bool)
    df_comb = df_comb.sort_values(df_comb.columns.tolist(), ascending=False)
    df_comb = df_comb.reset_index(drop=True)

    return df_comb
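
For intuition on the AddBoolOr encoding, this is the clause the loop would add for the question's filter {0: True, 2: True} (a minimal standalone sketch):

model = cp_model.CpModel()
x00 = model.NewBoolVar('x00')
x02 = model.NewBoolVar('x02')
# excluding {0: True, 2: True} means NOT(x00 AND x02),
# i.e. by De Morgan: (NOT x00) OR (NOT x02)
model.AddBoolOr([x00.Not(), x02.Not()])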

[Figure: timing results for the SAT-based solution]
