I'm looking for an efficient way to generate full binary cartesian products
(tables with all combinations of True and False with a certain number of columns),
filtered by certain exclusive conditions. For example, for three columns/bits (n=3) we would get the full table:
df_combs = pd.DataFrame(itertools.product(*([[True, False]] * n)))
0 1 2
0 True True True
1 True True False
2 True False True
3 True False False
...
This is supposed to be filtered by dictionaries defining mutually exclusive combinations as follows:
mutually_excl = [{0: False, 1: False, 2: True},
                 {0: True, 2: True}]
where the keys denote the columns in the table above. The example would be read as:
- exclude all rows in which column 0 is False, column 1 is False and column 2 is True (first filter), and
- exclude all rows in which column 0 is True and column 2 is True (second filter).
Based on these filters, the expected output is:
0 1 2
1 True True False
3 True False False
4 False True True
5 False True False
7 False False False
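For reference, the expected table can be reproduced at this small scale with a brute-force row filter over df_combs. This is far too slow for the real problem size, but it makes the filter semantics explicit:

# brute force: drop every row that matches all (column, value) pairs of any filter
mask_excl = df_combs.apply(lambda row: any(all(row[col] == val
                                               for col, val in filt.items())
                                           for filt in mutually_excl),
                           axis=1)
df_expected = df_combs[~mask_excl]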
In my use case, the filtered table is multiple orders of magnitude smaller than the full cartesian product (e.g. some 1000 rows instead of 2**24 = 16777216).
Below are my three current solutions, each with their own pros and cons, discussed at the very end.
import random
import pandas as pd
import itertools
import wrapt
import time
import operator
import functools
def get_mutually_excl(n, nfilt):  # generate random example filters
    ''' Example: `get_mutually_excl(9, 2)` creates a list of two filters with
    column indices drawn from `range(9)`, each built from between 2 and
    `int(n/3)` (column, bool) picks (duplicate column picks collapse), e.g.
    `[{1: True, 2: False}, {3: False, 2: True, 6: False}]` '''
    random.seed(2)
    return [{random.choice(range(n)): random.choice([True, False])
             for _ in range(random.randint(2, int(n/3)))}
            for _ in range(nfilt)]
@wrapt.decorator
def timediff(f, _, args, kwargs):
    t = time.perf_counter()
    res = f(*args, **kwargs)  # forward keyword arguments as well
    return res, time.perf_counter() - t
Solution 1 (filt_merge): Expand each single filter entry (e.g. {0: True, 2: True}) into a sub-table whose columns correspond to the indices in this filter entry ([0, 2]). Remove the single filtered row from this sub-table ([True, True]). Merge with the full table to get the complete list of filtered combinations.
@timediff
def make_df_comb_filt_merge(n, nfilt):
    mutually_excl = get_mutually_excl(n, nfilt)

    # determine missing (unfiltered) columns
    cols_missing = set(range(n)) - set(itertools.chain.from_iterable(mutually_excl))

    # complete dataframe of unfiltered columns with column "temp" for cross join
    df_comb = pd.DataFrame(itertools.product(*([[True, False]] * len(cols_missing))),
                           columns=list(cols_missing)).assign(temp=1)

    for filt in mutually_excl:  # loop through individual filters

        # get columns and bool values of this filter as two tuples with same order
        list_col, list_bool = zip(*filt.items())

        # construct dataframe with the full product over this filter's columns
        df = pd.DataFrame(itertools.product(*([[True, False]] * len(list_col))),
                          columns=list_col)

        # remove the *single* excluded row (by definition)
        df = df.loc[[row != list_bool for row in df.itertuples(index=False, name=None)]]

        # determine which columns to merge on
        merge_cols = list(set(df.columns) & set(df_comb.columns))
        if not merge_cols:
            merge_cols = ['temp']
            df['temp'] = 1

        # merge with full dataframe
        df_comb = pd.merge(df_comb, df, on=merge_cols)

    df_comb.drop('temp', axis=1, inplace=True)
    df_comb = df_comb[list(range(n))]
    df_comb = df_comb.sort_values(df_comb.columns.tolist(), ascending=False)

    return df_comb.reset_index(drop=True)
Solution 2 (exp_filt): Generate the DataFrame for the full cartesian product (so the entire thing ends up in memory). Loop through the filters and create a mask for each. Apply each mask to the table.
@timediff
def make_df_comb_exp_filt(n, nfilt):
    mutually_excl = get_mutually_excl(n, nfilt)

    # expand all bool combinations into dataframe
    df_comb = pd.DataFrame(itertools.product(*([[True, False]] * n)),
                           dtype=bool)

    for filt in mutually_excl:
        # generate total filter mask for given excluded combination
        mask = pd.Series(True, index=df_comb.index)
        for col, bool_act in filt.items():
            mask = mask & (df_comb[col] == bool_act)

        # filter dataframe
        df_comb = df_comb.loc[~mask]

    return df_comb.reset_index(drop=True)
Solution 3 (iter_filt): Keep the full cartesian product as an iterator. Loop through it, checking for each row whether it is excluded by any of the filters.
@timediff
def make_df_iter_filt(n, nfilt):
    mutually_excl = get_mutually_excl(n, nfilt)

    # switch to [[(1, 13), (True, False)], [(4, 9), (False, True)], ...]
    mutually_excl_index = [list(zip(*comb.items()))
                           for comb in mutually_excl]

    # create iterator
    combs_iter = itertools.product(*([[True, False]] * n))

    @functools.lru_cache(maxsize=1024, typed=True)  # small benefit
    def get_getter(list_):
        # used to access combs_iter row values as indexed by the filter
        return operator.itemgetter(*list_)

    def check_comb(comb_inp, comb_check):
        cols, vals = comb_check
        if len(cols) == 1:  # itemgetter returns a scalar for a single index
            return comb_inp[cols[0]] == vals[0]
        return get_getter(cols)(comb_inp) == vals

    # loop through the iterator,
    # dropping a row if any of the filters matches
    df_comb = pd.DataFrame([comb_inp for comb_inp in combs_iter
                            if not any(check_comb(comb_inp, comb_check)
                                       for comb_check in mutually_excl_index)])

    return df_comb.reset_index(drop=True)
dict_time = dict.fromkeys(itertools.product(range(16, 23, 2), range(3, 20)))
for n, nfilt in dict_time:
    dict_time[(n, nfilt)] = {'exp_filt': make_df_comb_exp_filt(n, nfilt)[1],
                             'filt_merge': make_df_comb_filt_merge(n, nfilt)[1],
                             'iter_filt': make_df_iter_filt(n, nfilt)[1]}
import seaborn as sns
import matplotlib.pyplot as plt

df_time = pd.DataFrame.from_dict(dict_time, orient='index',
                                 ).rename_axis(["n", "nfilt"]
                                 ).stack().reset_index().rename(columns={'level_2': 'solution', 0: 'time'})

g = sns.FacetGrid(df_time.query('n in %s' % str([16, 18, 20, 22])),
                  col="n", hue="solution", sharey=False)
g = (g.map(plt.plot, "nfilt", "time", marker="o").add_legend())
Solution 3: The iterator-based approach (iter_filt) has dismal running times, but no significant use of memory. I feel there is room for improvement, though the inevitable loop likely imposes hard bounds in terms of running time.
Solution 2: Expanding the full cartesian product into a DataFrame (exp_filt) causes significant spikes in memory, which I would like to avoid. Running times are OK, though.
Solution 1: Merging DataFrames created from the individual filters (filt_merge) feels like a good solution for my practical application (note the reduction in running time for greater numbers of filters, which is a result of the smaller cols_missing table). Still, this approach is not entirely satisfying: if a single filter includes all columns, the whole cartesian product (2**n rows) would end up in memory, making this solution worse than iter_filt.
Question: Any other ideas? A crazy smart numpy two-liner? Could the iterator-based approach be improved somehow?
Try timing the following:
import numpy as np

def in_filter(arr, arr_filt, n):
    # bit test: does each integer in arr match all (column, value) pairs of the filter?
    return ((arr[:, None] >> (n - 1 - arr_filt[:, 0])) & 1 == arr_filt[:, 1]).all(axis=1)

def bits_to_boolean(arr, n):
    # expand each integer into its n bits as a boolean row (column 0 = most significant bit)
    return ((arr[:, None] >> np.arange(n, dtype=arr.dtype)[::-1]) & 1).astype(bool)

@timediff
def recursive_filter(n, nfilt, dtype='uint32'):
    filts = get_mutually_excl(n, nfilt)
    out = np.arange(2**n, dtype=dtype)
    for filt in filts:
        arr_filt = np.array(list(filt.items()))
        out = out[~in_filter(out, arr_filt, n)]
    return bits_to_boolean(out, n)[::-1]
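For example, a quick head-to-head against the question's filt_merge solution (the parameters here are arbitrary; both calls reuse the same seeded filters, so the row sets should agree):

arr_np, t_np = recursive_filter(20, 6)
df_ref, t_ref = make_df_comb_filt_merge(20, 6)
# order differs at most in representation, so compare as sets of rows
assert set(map(tuple, arr_np)) == set(map(tuple, df_ref.to_numpy()))
print(f'numpy: {t_np:.3f}s, filt_merge: {t_ref:.3f}s')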
It treats the Cartesian binary products as the bits encoded in the integers 0 to 2**n - 1 and uses vectorised functions to recursively remove numbers whose bit sequences match the given filters.
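For instance, for n=3 the integer 6 (binary 110) encodes the row (True, True, False):

bits_to_boolean(np.array([6]), 3)
# array([[ True,  True, False]])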
The memory efficiency is better than allocating all [True, False] Cartesian products, as each Boolean would be stored with at least 8 bits (using 7 bits more than required), but it will use more memory than an iterator-based approach. If you require a solution for large n, you could break down this task by allocating and operating on one sub-range at a time, as sketched below. I did have this in my first implementation, but it didn't offer much benefit for n <= 22, and it required the calculation of the size of the output array, which gets complicated when there are overlapping filters.
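A minimal sketch of that sub-range idea (recursive_filter_chunked is my naming, not part of the code above; it reuses in_filter, bits_to_boolean and get_mutually_excl, and simply concatenates the survivors instead of pre-computing the output size):

def recursive_filter_chunked(n, nfilt, chunk=2**20):
    # process the integer range in fixed-size blocks so that only one
    # block is resident in memory at a time; int64 keeps the bit shifts
    # well-defined for n up to 62
    filts = get_mutually_excl(n, nfilt)
    arr_filts = [np.array(list(filt.items())) for filt in filts]
    parts = []
    for start in range(0, 2**n, chunk):
        block = np.arange(start, min(start + chunk, 2**n), dtype='int64')
        for arr_filt in arr_filts:
            block = block[~in_filter(block, arr_filt, n)]
        parts.append(block)
    return bits_to_boolean(np.concatenate(parts), n)[::-1]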
Based on @ayhan's comment, I implemented an or-tools SAT-based solution. While the idea is great, it really struggles for larger numbers of binary variables. I suspect this is similar to large IP problems, which ain't no walk in the park either. However, the strong dependency on the number of filters could make this a valid option for certain parameter configurations. But as a general solution I wouldn't use it. Each excluded combination maps to one clause: the filter {0: True, 2: True} becomes AddBoolOr([x00.Not(), x02.Not()]), i.e. at least one of the filtered values must differ.
from ortools.sat.python import cp_model

class VarArraySolutionCollector(cp_model.CpSolverSolutionCallback):

    def __init__(self, variables):
        cp_model.CpSolverSolutionCallback.__init__(self)
        self.__variables = variables
        self.solution_list = []

    def on_solution_callback(self):
        self.solution_list.append([self.Value(v) for v in self.__variables])

@timediff
def make_df_comb_sat(n, nfilt):
    mutually_excl = get_mutually_excl(n, nfilt)

    model = cp_model.CpModel()

    make_var_name = 'x{:02d}'.format
    vrs = dict.fromkeys(map(make_var_name, range(n)))
    for var_name in vrs:
        vrs[var_name] = model.NewBoolVar(var_name)

    for filt in mutually_excl:
        # at least one variable must differ from the excluded combination
        list_expr = [vrs[make_var_name(iv)] if not bool_
                     else vrs[make_var_name(iv)].Not()
                     for iv, bool_ in filt.items()]
        model.AddBoolOr(list_expr)

    solver = cp_model.CpSolver()
    solution_printer = VarArraySolutionCollector(vrs.values())
    solver.SearchForAllSolutions(model, solution_printer)

    df_comb = pd.DataFrame(solution_printer.solution_list).astype(bool)
    df_comb = df_comb.sort_values(df_comb.columns.tolist(), ascending=False)
    df_comb = df_comb.reset_index(drop=True)

    return df_comb
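Example call, assuming timediff and get_mutually_excl from the question are in scope (the parameters are arbitrary):

df_sat, t_sat = make_df_comb_sat(16, 6)
print(df_sat.shape, f'{t_sat:.2f}s')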