I have a directed acyclic simple graph in NetworkX.
Now, for each edge, that edge has a "source" and a "target". If there exists a path from the "source" to the "target" besides this edge, then I want to delete this edge.
I really don't want to re-invent the wheel.
Here is an example of a DAG that needs to be cleaned:
The nodes are:
['termsequence', 'maximumdegree', 'emptymultigraph', 'minimum', 'multiset', 'walk', 'nonemptymultigraph', 'euleriantrail', 'nonnullmultigraph', 'cycle', 'loop', 'abwalk', 'endvertices', 'simplegraph', 'vertex', 'multipletrails', 'edge', 'set', 'stroll', 'union', 'trailcondition', 'nullmultigraph', 'trivialmultigraph', 'sequence', 'multiplepaths', 'path', 'degreevertex', 'onedgesonvertices', 'nontrivialmultigraph', 'adjacentedges', 'adjacentvertices', 'simpleedge', 'maximum', 'multipleloops', 'length', 'circuit', 'class', 'euleriangraph', 'incident', 'minimumdegree', 'orderedpair', 'unique', 'closedwalk', 'multipleedges', 'pathcondition', 'multigraph', 'trail']
The edges are:
[('termsequence', 'endvertices'), ('emptymultigraph', 'nonemptymultigraph'), ('minimum', 'minimumdegree'), ('multiset', 'trailcondition'), ('multiset', 'pathcondition'), ('multiset', 'multigraph'), ('walk', 'length'), ('walk', 'closedwalk'), ('walk', 'abwalk'), ('walk', 'trail'), ('walk', 'endvertices'), ('euleriantrail', 'euleriangraph'), ('loop', 'simplegraph'), ('loop', 'degreevertex'), ('loop', 'simpleedge'), ('loop', 'multipleloops'), ('endvertices', 'abwalk'), ('vertex', 'adjacentvertices'), ('vertex', 'onedgesonvertices'), ('vertex', 'walk'), ('vertex', 'adjacentedges'), ('vertex', 'multipleedges'), ('vertex', 'edge'), ('vertex', 'multipleloops'), ('vertex', 'degreevertex'), ('vertex', 'incident'), ('edge', 'adjacentvertices'), ('edge', 'onedgesonvertices'), ('edge', 'multipleedges'), ('edge', 'simpleedge'), ('edge', 'adjacentedges'), ('edge', 'loop'), ('edge', 'trailcondition'), ('edge', 'pathcondition'), ('edge', 'walk'), ('edge', 'incident'), ('set', 'onedgesonvertices'), ('set', 'edge'), ('union', 'multiplepaths'), ('union', 'multipletrails'), ('trailcondition', 'trail'), ('nullmultigraph', 'nonnullmultigraph'), ('sequence', 'walk'), ('sequence', 'endvertices'), ('path', 'cycle'), ('path', 'multiplepaths'), ('degreevertex', 'maximumdegree'), ('degreevertex', 'minimumdegree'), ('onedgesonvertices', 'multigraph'), ('maximum', 'maximumdegree'), ('circuit', 'euleriangraph'), ('class', 'multiplepaths'), ('class', 'multipletrails'), ('incident', 'adjacentedges'), ('incident', 'degreevertex'), ('incident', 'onedgesonvertices'), ('orderedpair', 'multigraph'), ('closedwalk', 'circuit'), ('closedwalk', 'cycle'), ('closedwalk', 'stroll'), ('pathcondition', 'path'), ('multigraph', 'euleriangraph'), ('multigraph', 'nullmultigraph'), ('multigraph', 'trivialmultigraph'), ('multigraph', 'nontrivialmultigraph'), ('multigraph', 'emptymultigraph'), ('multigraph', 'euleriantrail'), ('multigraph', 'simplegraph'), ('trail', 'path'), ('trail', 'circuit'), ('trail', 'multipletrails')]
My previous answer deals specifically with the direct question of whether there is a good way to test whether a single edge is redundant.
It looks like you really want a way to remove all redundant edges efficiently. This means you'd like a way to do them all at once. It's a different question, but here's an answer to it. I do not believe networkx has something built in for this, but it is not difficult to find a workable algorithm.
The idea is that since it's a DAG, there are some nodes that have no out-edges. Start with them and process them. Once all are processed, then a subset of their parents have no children who haven't been processed. Go through those parents. Repeat. At each stage, the set of unprocessed nodes is a DAG, and we are processing the "end nodes" of that DAG. It is guaranteed to finish (if the original network is finite).
In the implementation, whenever we process a node, we check first to see whether any child is also an indirect descendant. If so, delete the edge. If not, leave it. When all children are dealt with, we update information for its parents by adding all of its descendants to the set of indirect descendants for the parent. If all children of the parent are processed, we now add it to our list for next iteration.
import networkx as nx
from collections import defaultdict
def remove_redundant_edges(G):
processed_child_count = defaultdict(int) #when all of a nodes children are processed, we'll add it to nodes_to_process
descendants = defaultdict(set) #all descendants of a node (including children)
out_degree = {node:G.out_degree(node) for node in G.nodes_iter()}
nodes_to_process = [node for node in G.nodes_iter() if out_degree[node]==0] #initially it's all nodes without children
while nodes_to_process:
next_nodes = []
for node in nodes_to_process:
'''when we enter this loop, the descendants of a node are known, except for direct children.'''
for child in G.neighbors(node):
if child in descendants[node]: #if the child is already an indirect descendant, delete the edge
G.remove_edge(node,child)
else: #otherwise add it to the descendants
descendants[node].add(child)
for predecessor in G.predecessors(node): #update all parents' indirect descendants
descendants[predecessor].update(descendants[node])
processed_child_count[predecessor]+=1 #we have processed one more child of this parent
if processed_child_count[predecessor] == out_degree[predecessor]: #if all children processed, add to list for next iteration.
next_nodes.append(predecessor)
nodes_to_process=next_nodes
To test it:
G=nx.DiGraph()
G.add_nodes_from(['termsequence', 'maximumdegree', 'emptymultigraph', 'minimum', 'multiset', 'walk', 'nonemptymultigraph', 'euleriantrail', 'nonnullmultigraph', 'cycle', 'loop', 'abwalk', 'endvertices', 'simplegraph', 'vertex', 'multipletrails', 'edge', 'set', 'stroll', 'union', 'trailcondition', 'nullmultigraph', 'trivialmultigraph', 'sequence', 'multiplepaths', 'path', 'degreevertex', 'onedgesonvertices', 'nontrivialmultigraph', 'adjacentedges', 'adjacentvertices', 'simpleedge', 'maximum', 'multipleloops', 'length', 'circuit', 'class', 'euleriangraph', 'incident', 'minimumdegree', 'orderedpair', 'unique', 'closedwalk', 'multipleedges', 'pathcondition', 'multigraph', 'trail'])
G.add_edges_from([('termsequence', 'endvertices'), ('emptymultigraph', 'nonemptymultigraph'), ('minimum', 'minimumdegree'), ('multiset', 'trailcondition'), ('multiset', 'pathcondition'), ('multiset', 'multigraph'), ('walk', 'length'), ('walk', 'closedwalk'), ('walk', 'abwalk'), ('walk', 'trail'), ('walk', 'endvertices'), ('euleriantrail', 'euleriangraph'), ('loop', 'simplegraph'), ('loop', 'degreevertex'), ('loop', 'simpleedge'), ('loop', 'multipleloops'), ('endvertices', 'abwalk'), ('vertex', 'adjacentvertices'), ('vertex', 'onedgesonvertices'), ('vertex', 'walk'), ('vertex', 'adjacentedges'), ('vertex', 'multipleedges'), ('vertex', 'edge'), ('vertex', 'multipleloops'), ('vertex', 'degreevertex'), ('vertex', 'incident'), ('edge', 'adjacentvertices'), ('edge', 'onedgesonvertices'), ('edge', 'multipleedges'), ('edge', 'simpleedge'), ('edge', 'adjacentedges'), ('edge', 'loop'), ('edge', 'trailcondition'), ('edge', 'pathcondition'), ('edge', 'walk'), ('edge', 'incident'), ('set', 'onedgesonvertices'), ('set', 'edge'), ('union', 'multiplepaths'), ('union', 'multipletrails'), ('trailcondition', 'trail'), ('nullmultigraph', 'nonnullmultigraph'), ('sequence', 'walk'), ('sequence', 'endvertices'), ('path', 'cycle'), ('path', 'multiplepaths'), ('degreevertex', 'maximumdegree'), ('degreevertex', 'minimumdegree'), ('onedgesonvertices', 'multigraph'), ('maximum', 'maximumdegree'), ('circuit', 'euleriangraph'), ('class', 'multiplepaths'), ('class', 'multipletrails'), ('incident', 'adjacentedges'), ('incident', 'degreevertex'), ('incident', 'onedgesonvertices'), ('orderedpair', 'multigraph'), ('closedwalk', 'circuit'), ('closedwalk', 'cycle'), ('closedwalk', 'stroll'), ('pathcondition', 'path'), ('multigraph', 'euleriangraph'), ('multigraph', 'nullmultigraph'), ('multigraph', 'trivialmultigraph'), ('multigraph', 'nontrivialmultigraph'), ('multigraph', 'emptymultigraph'), ('multigraph', 'euleriantrail'), ('multigraph', 'simplegraph'), ('trail', 'path'), ('trail', 'circuit'), ('trail', 'multipletrails')])
print G.size()
>71
print G.order()
>47
descendants = {} #for testing below
for node in G.nodes():
descendants[node] = nx.descendants(G,node)
remove_redundant_edges(G) #this removes the edges
print G.size() #lots of edges gone
>56
print G.order() #no nodes changed.
>47
newdescendants = {} #for comparison with above
for node in G.nodes():
newdescendants[node] = nx.descendants(G,node)
for node in G.nodes():
if descendants[node] != newdescendants[node]:
print 'descendants changed!!' #all nodes have the same descendants
for child in G.neighbors(node):
if len(list(nx.all_simple_paths(G,node, child)))>1:
print 'bad edge' #no alternate path exists from a node to its child.
This will be efficient: it has to process every node at the beginning to see whether it is an "end" node. Then it processes each edge reaching those and checks whether all children of that parent have been processed. It then looks at those parents and repeats.
Thus it will process each edge once (including glancing at the parent), and each vertex will be dealt with once at the beginning, and then processed once.
Here is a simple general-purpose algorithm. The algorithm could be run forwards or backwards. This and Joel's answer are essentially duals -- his runs backward and this runs forward:
def remove_redundant_edges(G):
"""
Remove redundant edges from a DAG using networkx (nx).
An edge is redundant if there is an alternate path
from its start node to its destination node.
This algorithm could work front to back, or back to front.
We choose to work front to back.
The main persistent variable (in addition to the graph
itself) is indirect_pred_dict, which is a dictionary with
one entry per graph node. Each entry is a set of indirect
predecessors of this node.
The algorithmic complexity of the code on a worst-case
fully-connected graph is O(V**3), where V is the number
of nodes.
"""
indirect_pred_dict = collections.defaultdict(set)
for node in nx.topological_sort(G):
indirect_pred = indirect_pred_dict[node]
direct_pred = G.predecessors(node)
for pred in direct_pred:
if pred in indirect_pred:
G.remove_edge(pred, node)
indirect_pred.update(direct_pred)
for succ in G.successors(node):
indirect_pred_dict[succ] |= indirect_pred
For a minimally connected graph where each node is only connected to a single edge, the complexity is O(V+E)
. However, even for a simple linear graph (where each node has an incoming edge and an outgoing edge), the complexity is O(V*E)
, and for a maximally connected graph (which is the worst case, where each node is connected to every downstream node on the graph), the complexity is O(V**3)
. For this case, the number of ops follows sequence A000292, which is n * (n+1) * (n+2) / 6
where n is the number of nodes (V) minus 3.
Depending on the shape of your graph, there are additional optimizations you can do. Here is a version with a few different optimizers that can dramatically reduce complexity and run-time for some types of graphs:
def remove_redundant_edges(G, optimize_dense=True, optimize_chains=True,
optimize_tree=False, optimize_funnel=False):
"""
Remove redundant edges from a DAG using networkx (nx).
An edge is redundant if there is an alternate path
from its start node to its destination node.
This algorithm could work equally well front to back,
or back to front. We choose to work front to back.
The main persistent variable (in addition to the graph
itself) is indirect_pred_dict, which is a dictionary with
one entry per graph node. Each entry is a set of indirect
predecessors of this node.
The main processing algorithm uses this dictionary to
iteratively calculate indirect predecessors and direct
predecessors for every node, and prune the direct
predecessors edges if they are also accessible indirectly.
The algorithmic complexity is O(V**3), where V is the
number of nodes in the graph.
There are also several graph shape-specific optimizations
provided. These optimizations could actually increase
run-times, especially for small graphs that are not amenable
to the optimizations, so if your execution time is slow,
you should test different optimization combinations.
But for the right graph shape, these optimizations can
provide dramatic improvements. For the fully connected
graph (which is worst-case), optimize_dense reduces the
algorithmic complexity from O(V**3) to O(V**2).
For a completely linear graph, any of the optimize_tree,
optimize_chains, or optimize_funnel options would decrease
complexity from O(V**2) to O(V).
If the optimize_dense option is set to True, then an
optimization phase is before the main algorithm. This
optimization phase works by looking for matches between
each node's successors and that same node's successor's
successors (by only looking one level ahead at a time).
If the optimize_tree option is set true, then a phase is
run that will optimize trees by working right-to-left and
recursively removing leaf nodes with a single predecessor.
This will also optimize linear graphs, which are degenerate
trees.
If the optimize_funnel option is set true, then funnels
(inverted trees) will be optimized.
If the optimize_chains option is set true, then chains
(linear sections) will be optimized by sharing the
indirect_pred_dict sets. This works because Python
checks to see if two sets are the same instance before
combining them.
For a completely linear graph, optimize_funnel or optimize_tree
execute more quickly than optimize_chains. Nonetheless,
optimize_chains option is enabled by default, because
it is a balanced algorithm that works in more cases than
the other two.
"""
ordered = nx.topological_sort(G)
if optimize_dense:
succs= dict((node, set(G.successors(node))) for node in ordered)
for node in ordered:
my_succs = succs.pop(node)
kill = set()
while my_succs:
succ = my_succs.pop()
if succ not in kill:
check = succs[succ]
kill.update(x for x in my_succs if x in check)
for succ in kill:
G.remove_edge(node, succ)
indirect_pred_dict = dict((node, set()) for node in ordered)
if optimize_tree:
remaining_nodes = set(ordered)
for node in reversed(ordered):
if G.in_degree(node) == 1:
if not (set(G.successors(node)) & remaining_nodes):
remaining_nodes.remove(node)
ordered = [node for node in ordered if node in remaining_nodes]
if optimize_funnel:
remaining_nodes = set(ordered)
for node in ordered:
if G.out_degree(node) == 1:
if not (set(G.predecessors(node)) & remaining_nodes):
remaining_nodes.remove(node)
ordered = [node for node in ordered if node in remaining_nodes]
if optimize_chains:
# This relies on Python optimizing the set |= operation
# by seeing if the objects are identical.
for node in ordered:
succs = G.successors(node)
if len(succs) == 1 and len(G.predecessors(succs[0])) == 1:
indirect_pred_dict[succs[0]] = indirect_pred_dict[node]
for node in ordered:
indirect_pred = indirect_pred_dict.pop(node)
direct_pred = G.predecessors(node)
for pred in direct_pred:
if pred in indirect_pred:
G.remove_edge(pred, node)
indirect_pred.update(direct_pred)
for succ in G.successors(node):
indirect_pred_dict[succ] |= indirect_pred
I have not analyzed whether it is possible to construct a dense, yet non-maximally-connected graph that executes with complexity greater than O(V**2)
with the optimize_dense option enabled, but I have no reason, a priori, to believe that it is not possible. The optimization works best for a a maximally-connected graph, and would not do anything, for example, in the case where each node shares successors with its grandchilden instead of its children, and I have not analyzed the run-time of this case.
I have stripped down the code for the base algorithm and added instrumentation that records the number of ops required by the worst-case path, and an example test generator that generates maximally-connected graphs.
import collections
import networkx as nx
def makegraph(numnodes):
"""
Make a fully-connected graph given a number of nodes
"""
edges = []
for i in range(numnodes):
for j in range(i+1, numnodes):
edges.append((i, j))
return nx.DiGraph(edges)
def remove_redundant_edges(G):
ops = 0
indirect_pred_dict = collections.defaultdict(set)
for node in nx.topological_sort(G):
indirect_pred = indirect_pred_dict[node]
direct_pred = G.predecessors(node)
for pred in direct_pred:
if pred in indirect_pred:
G.remove_edge(pred, node)
indirect_pred.update(direct_pred)
for succ in G.successors(node):
indirect_pred_dict[succ] |= indirect_pred
ops += len(indirect_pred)
return ops
def test_1(f, numnodes):
G = makegraph(numnodes)
e1 = nx.number_of_edges(G)
ops = f(G)
e2 = nx.number_of_edges(G)
return ops, e1, e2
for numnodes in range(30):
a = test_1(remove_redundant_edges, numnodes)
print numnodes, a[0]
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With