I have a big string say "aaaaaaaaaaabbbbbbbbbcccccccccccddddddddddd" (but maybe longer) and I have a collection of lots of little strings. I want to count (overlap is OK) how many times the little strings are found in the big string. I care only about speed. KMP seemed good but it looked like Rabin-Karp dealt with multiple but was slow.
The problem with most string searching algorithms is that they will take at least time O(k) to return k matches, so if you have a string with say 1 million "a"s, and 1 million queries of the little string "a", then it will take around 1 million, million iterations to count all the matches!
An alternative linear time approach would be to:
This will take time O(n+p) where n is the length of the big string, and p is the total length of all the small strings.
As requested, here is some small(ish) example code in Python that uses this approach:
from collections import defaultdict
class SuffixTree:
def __init__(self):
"""Returns an empty suffix tree"""
self.T=''
self.E={}
self.nodes=[-1] # 0th node is empty string
def add(self,s):
"""Adds the input string to the suffix tree.
This inserts all substrings into the tree.
End the string with a unique character if you want a leaf-node for every suffix.
Produces an edge graph keyed by (node,character) that gives (first,last,end)
This means that the edge has characters from T[first:last+1] and goes to node end."""
origin,first,last = 0,len(self.T),len(self.T)-1
self.T+=s
nc = len(self.nodes)
self.nodes += [-1]*(2*len(s))
T=self.T
E=self.E
nodes=self.nodes
Lm1=len(T)-1
for last_char_index in xrange(first,len(T)):
c=T[last_char_index]
last_parent_node = -1
while 1:
parent_node = origin
if first>last:
if (origin,c) in E:
break
else:
key = origin,T[first]
edge_first, edge_last, edge_end = E[key]
span = last - first
A = edge_first+span
m = T[A+1]
if m==c:
break
E[key] = (edge_first, A, nc)
nodes[nc] = origin
E[nc,m] = (A+1,edge_last,edge_end)
parent_node = nc
nc+=1
E[parent_node,c] = (last_char_index, Lm1, nc)
nc+=1
if last_parent_node>0:
nodes[last_parent_node] = parent_node
last_parent_node = parent_node
if origin==0:
first+=1
else:
origin = nodes[origin]
if first <= last:
edge_first,edge_last,edge_end=E[origin,T[first]]
span = edge_last-edge_first
while span <= last - first:
first+=span+1
origin = edge_end
if first <= last:
edge_first,edge_last,edge_end = E[origin,T[first]]
span = edge_last - edge_first
if last_parent_node>0:
nodes[last_parent_node] = parent_node
last+=1
if first <= last:
edge_first,edge_last,edge_end=E[origin,T[first]]
span = edge_last-edge_first
while span <= last - first:
first+=span+1
origin = edge_end
if first <= last:
edge_first,edge_last,edge_end = E[origin,T[first]]
span = edge_last - edge_first
return self
def make_choices(self):
"""Construct a sorted list for each node of the possible continuing characters"""
choices = [list() for n in xrange(len(self.nodes))] # Contains set of choices for each node
for (origin,c),edge in self.E.items():
choices[origin].append(c)
choices=[sorted(s) for s in choices] # should not have any repeats by construction
self.choices=choices
return choices
def count_suffixes(self,term):
"""Recurses through the tree finding how many suffixes are based at each node.
Strings assumed to use term as the terminating character"""
C = self.suffix_counts = [0]*len(self.nodes)
choices = self.make_choices()
def f(node=0):
t=0
X=choices[node]
if len(X)==0:
t+=1 # this node is a leaf node
else:
for c in X:
if c==term:
t+=1
continue
first,last,end = self.E[node,c]
t+=f(end)
C[node]=t
return t
return f()
def count_matches(self,needle):
"""Return the count of matches for this needle in the suffix tree"""
i=0
node=0
E=self.E
T=self.T
while i<len(needle):
c=needle[i]
key=node,c
if key not in E:
return 0
first,last,node = E[key]
while i<len(needle) and first<=last:
if needle[i]!=T[first]:
return 0
i+=1
first+=1
return self.suffix_counts[node]
big="aaaaaaaaaaabbbbbbbbbcccccccccccddddddddddd"
small_strings=["a","ab","abc"]
s=SuffixTree()
term=chr(0)
s.add(big+term)
s.count_suffixes(term)
for needle in small_strings:
x=s.count_matches(needle)
print needle,'has',x,'matches'
It prints:
a has 11 matches
ab has 1 matches
abc has 0 matches
However, in practice I would recommend you simply use a pre-existing Aho-Corasick implementation as I would expect this to be much faster in your particular case.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With