Let's say I have the following set:
{(2,), (3,), (1, 4), (1, 2, 3), (2, 3), (3, 4), (2, 4)}
This gives each number the following frequency:
2: 4, 3: 4, 4: 3, 1: 2
Can you propose a way of reducing the set such that each number still appears in it at least 2 times, but the number of tuples in the set is reduced to a minimum?
For example, the tuple (3, 4) could be removed from the set, giving these frequencies:
2: 4, 3: 3, 4: 2, 1: 2
Here is my very feeble attempt at solving this:
from collections import Counter

def reduce(a, limit):
    while True:
        remove = None
        for i in a:
            c = Counter([i for s in a for i in s])
            if c.most_common(1)[0][0] in i:
                if min([c[j] for j in i]) > limit:
                    remove = i
                    break
        if remove:
            a.remove(remove)
        else:
            break

reduce(a, 2)  # we want at least two of each number
The problem with this solution is that it might well reduce the set, but not necessarily leave me with the smallest possible set.
For my particular example, the set I wish to reduce contains strings, let's say something like this:
a = [("one","eighty one","three"), ("eighty five","sixty one","three", "eleven"), ...]
where the length of a is 1000. The length of each tuple in a is from 3 to 9. There are 100 unique values of which the tuples can be composed; for example, "one" is one such value. I want each unique value represented at least 25 times after I have reduced the set. How long might it take a PC to compute the reduced set? Are we talking a few seconds or many minutes?
As noted in the comments, the NP-hard problem Set Cover is a special case of this problem where the minimum frequency is k = 1, making this problem NP-hard as well. I would recommend a library like PuLP with the following integer program.
minimize sum over tuples T of x(T)
subject to
y(e): for all elements e, (sum over tuples T of (count of e in T) * x(T)) >= k
z(T): for all tuples T, x(T) in {0, 1}
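For illustration, here is a minimal sketch of how that integer program might be set up with PuLP (the function and variable names are mine, and it assumes a solver such as the bundled CBC is available):

import pulp

def min_tuples(tuples, k):
    tuples = list(tuples)
    elements = {e for t in tuples for e in t}
    prob = pulp.LpProblem('min_tuples', pulp.LpMinimize)
    # x[i] = 1 if tuples[i] is kept, 0 otherwise
    x = [pulp.LpVariable('x_%d' % i, cat='Binary') for i in range(len(tuples))]
    prob += pulp.lpSum(x)  # minimize the number of kept tuples
    for e in elements:
        # element e must appear at least k times among the kept tuples
        prob += pulp.lpSum(t.count(e) * x[i] for i, t in enumerate(tuples)) >= k
    prob.solve()
    return [t for i, t in enumerate(tuples) if x[i].value() > 0.5]

print(min_tuples({(2,), (3,), (1, 4), (1, 2, 3), (2, 3), (3, 4), (2, 4)}, 2))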
The one downside of PuLP is that it requires an external solver. I was in the mood to hack, however, so I wrote a (very lightly tested) pure Python solver. It uses depth-first search with best-first backtracking, with a simple propagation strategy to determine which tuples must or must not be chosen and a heuristic function based on a primal-dual approximation to the following dual of the previous program (so it’s a sophisticated toy, but still a toy).
maximize (sum over elements e of k * y(e)) - (sum over tuples T of z(T))
subject to
x(T): for all tuples T, (sum over elements e in T of y(e)) - z(T) <= 1
for all elements e, y(e) >= 0
for all tuples T, z(T) >= 0
The primal-dual strategy is to increase, at the same rate, those values of y whose increase does not require an unprofitable corresponding increase in z.
from collections import Counter, defaultdict, namedtuple
from fractions import Fraction
from heapq import heappop, heappush
from math import ceil
from operator import itemgetter


class _BestFirstSearchDepthFirstBacktracking:
    def optimize(self):
        node = self._make_root_node()
        heap = []
        upper_bound = None
        while True:
            lower_bound = ceil(node.lower_bound)
            if upper_bound is None or lower_bound < upper_bound:
                child_nodes = list(self._make_child_nodes(node))
                if child_nodes:
                    i, node = min(enumerate(child_nodes), key=itemgetter(1))
                    del child_nodes[i]
                    for child_node in child_nodes:
                        heappush(heap, child_node)
                    continue
                upper_bound = lower_bound
                solution = node
            if not heap:
                return (upper_bound, solution)
            node = heappop(heap)


Node = namedtuple('Node', ('lower_bound', 'index', 'maybes', 'yeses', 'variable'))


class UnsolvableException(Exception):
    pass


class _Optimizer(_BestFirstSearchDepthFirstBacktracking):
    def __init__(self, tuples, min_freq):
        self._index = 0
        self._tuples = set(tuples)
        self._min_freq = min_freq
        self._elements = set()
        for t in self._tuples:
            self._elements.update(t)

    def _propagate(self, maybes, yeses):
        upper_count = Counter()
        for t in maybes:
            upper_count.update(t)
        for t in yeses:
            upper_count.update(t)
        if any(upper_count[e] < self._min_freq for e in self._elements):
            raise UnsolvableException()
        forced_yeses = set()
        forced_yeses = {t for t in maybes if any(upper_count[e] - k < self._min_freq for e, k in Counter(t).items())}
        maybes = maybes - forced_yeses
        yeses = yeses | forced_yeses
        lower_count = Counter()
        for t in yeses:
            lower_count.update(t)
        residual = {e for e in self._elements if lower_count[e] < self._min_freq}
        maybes = {t for t in maybes if any(e in residual for e in t)}
        return (maybes, yeses)

    def _compute_heuristic(self, maybes, yeses):
        lower_count = Counter()
        for t in yeses:
            lower_count.update(t)
        residual_count = {e: max(self._min_freq - lower_count[e], 0) for e in self._elements}
        y = defaultdict(int)
        z = defaultdict(int)
        variable = None
        while True:
            slack = {t: 1 + z[t] - sum(y[e] for e in t) for t in maybes}
            assert all(s >= 0 for s in slack.values())
            inactive_maybes = {t for t, s in slack.items() if s > 0}
            if not inactive_maybes:
                break
            active_maybes = {t for t, s in slack.items() if s == 0}
            active_count = Counter()
            for t in active_maybes:
                active_count.update(t)
            dy = {e: 1 for e, k in residual_count.items() if active_count[e] < k}
            if not dy:
                break
            delta_inverse, variable = max(((Fraction(sum(dy.get(e, 0) for e in t), slack[t]), t) for t in inactive_maybes), key=itemgetter(0))
            delta = Fraction(1, delta_inverse)
            for e, dy_e in dy.items():
                y[e] += delta * dy_e
            for t in active_maybes:
                z[t] += delta * sum(dy.get(e, 0) for e in t)
        return (sum(residual_count[e] * y_e for e, y_e in y.items()) - sum(z.values()), variable)

    def _make_node(self, maybes, yeses):
        maybes, yeses = self._propagate(maybes, yeses)
        heuristic, variable = self._compute_heuristic(maybes, yeses)
        node = Node(len(yeses) + heuristic, self._index, maybes, yeses, variable)
        self._index += 1
        return node

    def _make_root_node(self):
        return self._make_node(self._tuples, set())

    def _make_child_nodes(self, node):
        if node.variable is None:
            return
        variable = {node.variable}
        maybes = node.maybes - variable
        yield self._make_node(maybes, node.yeses)
        yield self._make_node(maybes, node.yeses | variable)


def optimize(tuples, min_freq):
    optimizer = _Optimizer(tuples, min_freq)
    node = optimizer.optimize()[1]
    print('Nodes examined:', optimizer._index)
    return node.yeses


print(optimize({(2,), (3,), (1, 4), (1, 2, 3), (2, 3), (3, 4), (2, 4)}, 2))
print(optimize({(1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14), (1, 2, 3, 4, 8, 9, 10, 11), (5, 6, 12, 13), (7, 14)}, 1))
Here's a quick and dirty method. Hopefully it's enough to get you going.
Unfortunately it isn't guaranteed to find the exact smallest result set. It gets rid of smaller tuples first, so if there tend to be more small tuples and fewer long tuples, it could work for you.
The input also starts off as an ordered set (a list), but I didn't get around to restoring the order. It needs to be ordered at least within the function so the calculated values correlate correctly. I'd like to clean it up and refactor it, but it's late.
def reduce(source, min_count=2):
    print "source: {}".format(source)
    # [(2,), (3,), (1, 4), (1, 2, 3), (2, 3), (3, 4), (2, 4)]
    answer = []
    freq = {}
    lens = []
    for t in source:
        lens.append(len(t))
        for i in t:
            freq[i] = freq.get(i, 0) + 1
    print "freq: {}".format(freq)  # {1: 2, 2: 4, 3: 4, 4: 3}
    print "lens: {}".format(lens)  # [1, 1, 2, 3, 2, 2, 2]

    from collections import defaultdict
    slens = defaultdict(list)
    for l, t in zip(lens, source):
        slens[l].append(t)
    print "slens: {}".format(slens)
    # {1: [(2,), (3,)], 2: [(1, 4), (2, 3), (3, 4), (2, 4)], 3: [(1, 2, 3)]}

    for l in sorted(slens.keys()):
        for t in slens[l]:
            save = False
            for i in t:
                if (freq[i] <= min_count):
                    save = True
                freq[i] -= 1
            if save:
                answer.append(t)
    print "answer: {}".format(answer)  # [(1, 4), (1, 2, 3), (3, 4), (2, 4)]

    freq = {}
    for t in answer:
        for i in t:
            freq[i] = freq.get(i, 0) + 1
    print "freq: {}".format(freq)  # {1: 2, 2: 2, 3: 2, 4: 3}
My original thought was to iterate through, save any tuples below the min_count and reduce the working set. Then score the remaining tuples where lower frequency elements counted for more. Then discard the lowest scoring tuples that wouldn't reduce the frequency of any component below min_count when removed. Then re-calculate the frequencies and start over.
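As a rough illustration of that scoring idea (this is my own sketch, not the code above; greedy_reduce is a hypothetical helper), it could look something like this:

from collections import Counter

def greedy_reduce(source, min_count=2):
    kept = list(source)
    while True:
        freq = Counter(e for t in kept for e in t)
        # a tuple is removable if dropping it keeps every element at min_count
        removable = [t for t in kept
                     if all(freq[e] - t.count(e) >= min_count for e in set(t))]
        if not removable:
            return kept
        # low-frequency elements count for more, so discard the
        # lowest-scoring removable tuple and start over
        kept.remove(min(removable, key=lambda t: sum(1.0 / freq[e] for e in t)))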
The problem is at least NP-hard, which means you will not be able to find an efficient (polynomial-time) algorithm unless P = NP. However, there are ways to decrease the constant factors. Besides using better algorithms, consider using faster runtimes, like PyPy.
The following code, if run to completion, will return a subset of the smallest possible size. Additionally, it only considers valid candidate covers, and it can progressively output increasingly small covering subsets.
from collections import defaultdict
from itertools import product, combinations

def covering_set(sets, min_freq, print_approximations=False):
    # dictionary mapping each unique value to the sets that contain it
    d = defaultdict(list)
    for set_ in sets:
        for elem in set_:
            d[elem].append(set_)

    # we need min_freq occurrences of each unique value
    combos = [combinations(x, min_freq) for x in d.values()]

    # initial solution
    min_cover = sets
    min_length = len(sets)

    # iterate through valid solutions
    # cover is a list of lists of sets
    for cover in product(*combos):
        # we must flatten and remove the duplicates in the cover
        covering_set = set()
        for elem_cover in cover:
            for set_ in elem_cover:
                if set_ not in covering_set:
                    covering_set.add(set_)
        # now, we check if it is the smallest solution so far
        if len(covering_set) < min_length:
            min_cover = covering_set
            min_length = len(covering_set)
            if print_approximations:
                print(min_length, min_cover)
    return min_cover
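An illustrative call on the question's small example (not part of the original answer). Note that the product over combinations grows astronomically for the 1000-tuple case, so for large inputs this is only practical as an anytime approximation with print_approximations=True:

print(covering_set([(2,), (3,), (1, 4), (1, 2, 3), (2, 3), (3, 4), (2, 4)],
                   min_freq=2))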
So here is my solution. I saw that you used a set of at most 1000 elements, so I decided to implement the algorithm recursively.
First of all, let's define the function that gets the frequency of each number in the tuples:
def get_frequency(tuple_list):
    frequency = {}

    def mapper(element):
        if frequency.has_key(element):
            frequency[element] += 1
        else:
            frequency[element] = 1

    map(lambda t: map(mapper, t), tuple_list)
    return frequency
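For example (an illustrative call, not in the original answer):

print get_frequency([(1, 4), (1, 2, 3)])  # counts: 1 -> 2, 2 -> 1, 3 -> 1, 4 -> 1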
This part is straightforward, so I won't spend much time on it. After that, I implemented the main function, called recursive. This function returns a tuple composed of the list of tuples that can be deleted and the maximum depth the algorithm could reach.
This is the pseudocode I wrote before implementing it:
if tuple_list is empty: return ([], iteration)
best_deletion = None
for each element:
    if the element can't be deleted: continue
    launch the next recursion without the current element in the list
    if that recursion went deeper than best_deletion, or best_deletion is None:
        store the result of the recursion in best_deletion
if best_deletion is None: return ([], iteration)
return best_deletion with the current element added to it, and the iteration incremented
Here is the result:
def recursive(tuple_list, limit, iteration):
    if tuple_list == []:
        return ([], iteration)

    frequency = get_frequency(tuple_list)
    value = None
    for i in xrange(len(tuple_list)):
        impossible_to_delete = False
        # work on a copy so the shared counts are not corrupted between candidates
        candidate_frequency = dict(frequency)
        for number in tuple_list[i]:
            candidate_frequency[number] -= 1
            if candidate_frequency[number] < limit:
                impossible_to_delete = True
                break
        if impossible_to_delete:
            continue
        next_recursion_list = tuple_list[:]
        next_recursion_list.pop(i)
        maximum_deletion = recursive(next_recursion_list, limit, iteration + 1)
        if value is None:
            maximum_deletion[0].insert(0, tuple_list[i])
            value = (maximum_deletion[0], maximum_deletion[1] + 1)
        else:
            if value[1] < maximum_deletion[1]:
                maximum_deletion[0].insert(0, tuple_list[i])
                value = (maximum_deletion[0], maximum_deletion[1] + 1)

    if value is None:
        return ([], iteration)
    return value
After that, just call the function like this:
items_to_delete, depth = recursive(list(tuple_set), 2, 0)
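As a hypothetical follow-up (not part of the original answer), the reduced set can then be built by filtering out the tuples marked for deletion:

reduced = [t for t in tuple_set if t not in items_to_delete]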
Hope it helps. I will test which of the previously proposed algorithms is the fastest if I get some time.