I am trying to make a list of groups that I extract from a list of lists. The main list I have contains lists which are of different lengths. I want to group up all the sub-lists that contain at least one value from another sub-list efficiently. For instance I want this:
[[2],[5],[5,8,16],[7,9,12],[9,20]]
to be grouped up like this
my_groups = {"group1":[2], "group2":[5,8,16], "group3":[7,9,12,20]}
The way I thought of doing this is to create convert the sub-lists into sets and using reduce intersect1d
reduce(np.intersect1d, (SUPERLIST))
and then put the results in a dictionary. But I don't know how to convert it to an array of sets without iterating through the list. Is there a way to do this or a more efficient way that I am missing?
Without numpy I would do something like this:
my_dict = dict()
unique_id = 0
for sub_list_ref in super_list:
sub_set_ref = set(sub_list_ref)
for sub_list_comp in super_list:
sub_set_comp = set(sub_list_comp)
if len(sub_set_ref.intersection(sub_set_comp))>0:
my_dict[unique_id] = sub_set_ref.union(sub_set_comp)
updated_id = unique_id+1
if updated_id == unique_id:
my_dict[unique_id] = sub_list_ref
else:
unique_id = updated_id
Yesterday this question got an excellent answer from DarryIG and today I managed to make a more efficient version.
def coalesce_groups(current_list, super_list, iterations):
if iterations <= 0:
print("YOU HAVE ITERATED MORE THAN 3 TIMES")
tmp_list = current_list
for element in current_list:
tmp_list = tmp_list + super_list[element]
# Take only the unique elements
tmp_list = list(set(tmp_list))
if tmp_list == list(set(current_list)):
return tmp_list
else:
iterations-=1
return coalesce_groups(tmp_list, super_list, iterations)
def dict_of_groups(original_list):
lst = list(original_list).copy()
result_list = []
for it in lst:
result = coalesce_groups(it, lst, iterations = 3)
if len(result)!=0:
result_list.append(result)
for t in result:
lst[t] = []
result_dict = { x : result_list[x] for x in range(0, len(result_list) ) }
return result_dict
In test (on jupyter notebook)
lst = [[0], [1], [2], [3], [4], [5], [16, 6], [8, 10, 18, 21, 27, 7], [10, 19, 21, 27, 40, 8], [13, 20, 22, 26, 28, 30, 34, 41, 42, 50, 9], [18, 21, 27, 10], [11], [12], [20, 22, 26, 28, 30, 34, 41, 42, 50, 13], [14], [15], [16], [25, 17], [21, 27, 40, 18], [21, 27, 40, 19], [22, 26, 28, 30, 34, 41, 42, 50, 20], [27, 40, 21], [26, 28, 30, 34, 41, 42, 50, 22], [23], [24], [25], [28, 30, 34, 41, 42, 50, 26], [40, 27], [30, 34, 41, 42, 50, 28], [29], [34, 41, 42, 50, 30], [33, 31], [32], [33], [41, 42, 50, 34], [35], [36], [37], [38], [39], [40], [42, 50, 41], [50, 42], [43], [44], [45], [46], [47], [49, 48], [49], [50]]
%lprun -T lprof0 -f generate_groups generate_groups(lst)
print(open('lprof0', 'r').read())
%lprun -T lprof1 -f dict_of_groups dict_of_groups(lst)
print(open('lprof1', 'r').read())
Additional answers will be taken into consideration but I believe his is still valid and the more complete for this question. DarryIG is still king
This can be done efficienly using Union-Find algorithm from graphs (see https://www.geeksforgeeks.org/union-find-algorithm-set-2-union-by-rank/)
We consider each sublist as a vertex in a graph.
Two vertexes are connected if their sublists overlap (i.e. intersect).
Union-find provides an efficient method of finding all disjoint subsets of non-overlapping vertices.
from collections import defaultdict
# a structure to represent a graph
class Graph:
def __init__(self, num_of_v):
self.num_of_v = num_of_v
self.edges = defaultdict(list)
# graph is represented as an
# array of edges
def add_edge(self, u, v):
self.edges[u].append(v)
class Subset:
def __init__(self, parent, rank):
self.parent = parent
self.rank = rank
def __repr__(self):
return {'name':self.parent, 'age':self.rank}
def __str__(self):
return 'Subset(parent='+str(self.parent)+', rank='+str(self.rank)+ ')'
# A utility function to find set of an element
# node(uses path compression technique)
def find(subsets, node):
if subsets[node].parent != node:
subsets[node].parent = find(subsets, subsets[node].parent)
return subsets[node].parent
# A function that does union of two sets
# of u and v(uses union by rank)
def union(subsets, u, v):
# Attach smaller rank tree under root
# of high rank tree(Union by Rank)
if subsets[u].rank > subsets[v].rank:
subsets[v].parent = u
elif subsets[v].rank > subsets[u].rank:
subsets[u].parent = v
# If ranks are same, then make one as
# root and increment its rank by one
else:
subsets[v].parent = u
subsets[u].rank += 1
def find_disjoint_sets(graph):
# Allocate memory for creating sets
subsets = []
for u in range(graph.num_of_v):
subsets.append(Subset(u, 0))
# Iterate through all edges of graph,
# find sets of both vertices of every
# edge, if sets are same, then there
# is cycle in graph.
for u in graph.edges:
u_rep = find(subsets, u)
for v in graph.edges[u]:
v_rep = find(subsets, v)
if u_rep == v_rep:
continue
else:
union(subsets, u_rep, v_rep)
return subsets
def generate_groups(lst):
""" Finds disjoint sublists in lst. Performs a union of sublists that intersect """
# Generate graph
g = Graph(len(lst))
# Loop over all pairs of subists,
# Place an edge in the graph for sublists that intersect
for i1, v1 in enumerate(lst):
set_v1 = set(v1)
for i2, v2 in enumerate(lst):
if i2 > i1 and set_v1.intersection(v2):
g.add_edge(i1, i2)
# Disjoint subsets of sublists
subsets = find_disjoint_sets(g)
# Union of sublists which are non-disjoint (i.e. have the same parent)
d = {}
for i in range(len(lst)):
sublist_index = find(subsets, i)
if not sublist_index in d:
d[sublist_index] = set()
d[sublist_index] = d[sublist_index].union(lst[i])
return d
# Test Code
lst = [[2],[5],[5,8,16],[7,9,12],[9,20]]
d = generate_groups(lst)
print(d)
Output
{0: {2}, 1: {8, 16, 5}, 3: {9, 12, 20, 7}}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With