I have a list of 50K strings (city names) and I need the smallest list of character tri-grams (preferably n-grams) such that every string is hit at least once by one tri-gram. Consider the following list: ['amsterdam', 'rotterdam', 'haarlem', 'utrecht', 'groningen']
The list of identifying trigrams is 4 items long and could be (alternatives are possible):
['ter', 'haa', 'utr', 'gro']
I thought my solution below found the correct answer, but it gives wrong answers when used on other lists.
from collections import Counter

def identifying_grams(list, n=3):

    def f7(seq):
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]

    def ngrams(text, n=3):
        return [text[i:i + n] for i in range(len(text) - n + 1)]

    hits = []
    trigrams = []
    for item in list:
        # trigrams += ngrams(item)
        trigrams += f7(ngrams(item))

    counts = Counter(trigrams).most_common()

    for trigram, count in counts:
        items = []
        for item in list:
            if trigram in item:
                hits.append(trigram)
                items.append(item)
        for i in items:
            list.remove(i)

    return(f7(hits))
list1 = ['amsterdam','rotterdam','haarlem','utrecht','groningen']
print(identifying_grams(list1))
# Good, we get: ['ter', 'haa', 'utr', 'gro']
list2 = ['amsterdam','schiedam']
print(identifying_grams(list2))
# Good, we get: ['dam']
list3 = ['amsterdam','schiedam','terwolde','wolstad']
print(identifying_grams(list3))
# Ouch, we get: ['ter', 'dam', 'wol']
# this should be ['dam', 'wol'], as only 2 trigrams are needed to identify the list...
I got two answers so far, but both of them have flaws. The one from Rupesh is good for lists that are smaller than 10 items; my lists have over 50K items. The one from mujjiga does come up with a solution, albeit not the perfect one.
A bounty for the Python ninja who comes up with a perfect solution that scales. Bonus kudos if it performs well and gives the same solution every time it runs!
Here's a theoretical analysis of @mujjiga's answer:
You can create classes of words that share the same ngram. You want to pick the smallest number of those classes (that is, the smallest number of ngrams) that covers the whole set of words. This is the set cover problem. Unfortunately, this problem is NP-hard (not NP-complete, thanks @mujjiga). (EDIT: Hence, there is no known solution that will give you the expected result in a reasonable time.) The greedy algorithm is almost the best solution (see https://cs.stackexchange.com/questions/49777/is-greedy-algorithm-the-best-algorithm-for-set-cover-problem).
Note that even the greedy algorithm may give weird results. Take the sets {a, b}, {b, c}, {c, d} and the superset {a, b, c, d}. All three subsets are of maximum size. If you take {b, c} first, you need the two other subsets to cover the superset. If you take {a, b} or {c, d}, two subsets are enough.
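To make that concrete, here is a small sketch (my own illustration, not code from the question or answers) that runs the greedy choice on exactly those sets; depending on how ties are broken, it finds either two or three subsets:

subsets = {'s1': {'a', 'b'}, 's2': {'b', 'c'}, 's3': {'c', 'd'}}
universe = {'a', 'b', 'c', 'd'}

def greedy_cover(subsets, universe, prefer):
    # standard greedy set cover: repeatedly take the subset covering the most
    # uncovered elements; 'prefer' only decides how ties are broken
    uncovered = set(universe)
    chosen = []
    while uncovered:
        best = max(subsets, key=lambda k: (len(subsets[k] & uncovered), k == prefer))
        chosen.append(best)
        uncovered -= subsets[best]
    return chosen

print(greedy_cover(subsets, universe, prefer='s2'))  # ['s2', 's1', 's3'] -> 3 subsets
print(greedy_cover(subsets, universe, prefer='s1'))  # ['s1', 's3'] -> 2 subsets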
Let's use the greedy algorithm, and consider the implementation. The code to create the dictionary that maps ngrams to words is pretty straightforward:
all_words = ['amsterdam', 'schiedam', 'werkendam', 'amstelveen', 'schiebroek',
             'werkstad', 'den haag', 'rotjeknor', 'gouda']
n = 3
words_by_ngram = {}
for word in all_words:
    for ngram in (word[i:i+n] for i in range(0, len(word)-n+1)):
        words_by_ngram.setdefault(ngram, set()).add(word)
setdefault is equivalent to get if the key ngram exists, and creates an empty set otherwise. This step has O(|all_words| * |max word length|) complexity.
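For readers less used to setdefault, the same dictionary can be built with collections.defaultdict; this is only an equivalent sketch of the construction above, reusing all_words and n from that snippet:

from collections import defaultdict

# equivalent to the setdefault version: a missing ngram key gets a fresh empty set
words_by_ngram = defaultdict(set)
for word in all_words:
    for i in range(len(word) - n + 1):
        words_by_ngram[word[i:i + n]].add(word)

print(words_by_ngram['dam'])  # {'amsterdam', 'schiedam', 'werkendam'} (set order may vary)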
Now, we want to take the ngram with the most words and remove those words from the dictionary. Repeat until all the words are covered.
Here's the simple version:
s = set(all_words)  # the target
gs = set()
d = words_by_ngram.copy()  # for the display
while s:
    # take the best ngram
    ngram, words = max(d.items(), key=lambda i: len(i[1]))  # sort on word count
    # remove the words from the dictionary and delete the ngrams whose words have already been found
    d = {k: v for k, v in ((k, v - words) for k, v in d.items()) if len(v)}
    gs.add(ngram)  # add the ngram to the result
    s -= words  # remove the words from the target

# check
assert set().union(*[words_by_ngram[g] for g in gs]) == set(all_words)

# display
for g in gs:
    print("{} -> {}".format(g, words_by_ngram[g]))
Output:
ams -> {'amstelveen', 'amsterdam'}
gou -> {'gouda'}
wer -> {'werkstad', 'werkendam'}
rot -> {'rotjeknor'}
dam -> {'amsterdam', 'werkendam', 'schiedam'}
sch -> {'schiebroek', 'schiedam'}
den -> {'den haag'}
This second step has O(|all_words| * |ngrams|) complexity, because of the loop to find the max and the update of the dictionary. Hence, the overall complexity is O(|all_words| * |ngrams|).
It is possible to reduce the complexity with a priority queue. Retrieving the best ngram has a cost of O(1), but updating the len of the words mapped to an ngram costs O(lg |ngrams|):
import heapq

class PriorityQueue:
    """Adapted from https://docs.python.org/3/library/heapq.html#priority-queue-implementation-notes
    A priority of 1 invalidates the entries
    """
    def __init__(self, words_by_ngram):
        self._d = {ngram: [-len(words), (ngram, words)]
                   for ngram, words in words_by_ngram.items()}
        self._pq = list(self._d.values())
        heapq.heapify(self._pq)

    def pop(self):
        """get the ngram, words tuple with the max word count"""
        minus_len, (ngram, words) = heapq.heappop(self._pq)
        while minus_len == 1:  # entry is not valid
            minus_len, (ngram, words) = heapq.heappop(self._pq)
        return ngram, words

    def update(self, ngram, words_to_remove):
        """remove the words from the sets and update priorities"""
        del self._d[ngram]
        ngrams_to_inspect = set(word[i:i+n] for word in words_to_remove
                                for i in range(0, len(word)-n+1))
        for ngram in ngrams_to_inspect:
            if ngram not in self._d:
                continue
            self._d[ngram][0] = 1  # use the reference to invalidate the entry
            [L, (ngram, words)] = self._d[ngram]
            words -= words_to_remove
            if words:
                self._d[ngram] = [-len(words), (ngram, words)]  # new entry
                heapq.heappush(self._pq, self._d[ngram])  # add to the pq (O(lg ngrams))
            else:  # nothing left: remove it from dict
                del self._d[ngram]
pq = PriorityQueue(words_by_ngram)
gs = set()
s = set(all_words)  # the target
while s:
    # take the best ngram
    ngram, words = pq.pop()
    gs.add(ngram)  # add the ngram to the result
    s -= words  # remove the words from the target
    # remove the words from the dictionary and update priorities
    pq.update(ngram, words)
With this code, the overall complexity falls to O(|all_words| * lg |ngrams|). That being said, I would be curious to know whether this is faster than the naive previous version with your 50k items.
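If someone wants to answer that question empirically, a rough timing harness along these lines would do. Here greedy_naive and greedy_pq are assumed wrappers around the two versions above (they are not defined in this answer), each taking a word list and returning the chosen ngrams:

import random
import string
import time

def random_words(count, length=10, seed=0):
    # synthetic stand-ins for the 50k city names, so runs are reproducible
    rng = random.Random(seed)
    return [''.join(rng.choices(string.ascii_lowercase, k=length)) for _ in range(count)]

def timed(label, fn, words):
    start = time.perf_counter()
    result = fn(words)
    print('{}: {:.2f}s, {} ngrams'.format(label, time.perf_counter() - start, len(result)))

words = random_words(50000)
timed('naive greedy', greedy_naive, words)   # hypothetical wrapper around the first version
timed('priority queue', greedy_pq, words)    # hypothetical wrapper around the PriorityQueue version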
Below is an implementation of the greedy algorithm for set cover. It runs in about half a second on 50,000 English dictionary words on my machine. The output isn't always optimal, but it's often close in practice. You could probably solve your instances to optimality with an external library for integer programming, but I don't know if you want to go in that direction.
The code below dynamically maintains the bipartite graph of ngrams and uncovered texts. The one subtle bit is that, since Python lacks an intrusive heap in its standard library, I've exploited the fact that the keys only increase to fake one. Every ngram is in the heap with a score less than or equal to what it should be. When we pull the minimum, if it is less than it should be, we put it back with the updated value. Otherwise, we know that it's the true minimum.
This code should produce deterministic output. At each step it chooses the lexicographically minimum ngram that covers the maximum number of uncovered texts.
import collections
import heapq

def ngrams_from_text(text, n):
    return {text[i:i + n] for i in range(len(text) - n + 1)}

def greedy_ngram_cover(texts, n):
    neighbors_of_text = {text: ngrams_from_text(text, n) for text in texts}
    neighbors_of_ngram = collections.defaultdict(set)
    for text, ngrams in neighbors_of_text.items():
        for ngram in ngrams:
            neighbors_of_ngram[ngram].add(text)
    heap = [(-len(neighbors), ngram)
            for (ngram, neighbors) in neighbors_of_ngram.items()]
    heapq.heapify(heap)
    cover = []
    while neighbors_of_text:
        score, ngram = heapq.heappop(heap)
        neighbors = neighbors_of_ngram[ngram]
        if score != -len(neighbors):
            heapq.heappush(heap, (-len(neighbors), ngram))
            continue
        cover.append(ngram)
        for neighbor in list(neighbors):
            for neighbor_of_neighbor in neighbors_of_text[neighbor]:
                neighbors_of_ngram[neighbor_of_neighbor].remove(neighbor)
            del neighbors_of_text[neighbor]
    return cover

print(
    greedy_ngram_cover(
        ['amsterdam', 'rotterdam', 'haarlem', 'utrecht', 'groningen'], 3))
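On the "solve to optimality with an external library for integer programming" remark above: the same bipartite structure maps directly onto a 0/1 program. Here is a hedged sketch using PuLP (my choice of library, not something this answer prescribes); each ngram gets a binary variable and each text must be covered by at least one chosen ngram:

import pulp  # assumption: PuLP (or any ILP library) is available

def exact_ngram_cover(texts, n=3):
    # build the ngram sets per text, exactly as in the greedy versions
    ngrams_of_text = {t: {t[i:i + n] for i in range(len(t) - n + 1)} for t in texts}
    all_ngrams = sorted(set().union(*ngrams_of_text.values()))

    prob = pulp.LpProblem('minimum_ngram_cover', pulp.LpMinimize)
    use = pulp.LpVariable.dicts('use', all_ngrams, cat='Binary')  # 1 if the ngram is chosen
    prob += pulp.lpSum(use[g] for g in all_ngrams)                # minimise number of ngrams
    for ngrams in ngrams_of_text.values():
        prob += pulp.lpSum(use[g] for g in ngrams) >= 1           # every text must be covered

    prob.solve(pulp.PULP_CBC_CMD(msg=False))
    return [g for g in all_ngrams if use[g].value() > 0.5]

print(exact_ngram_cover(['amsterdam', 'schiedam', 'terwolde', 'wolstad']))
# expected: a 2-element cover such as ['dam', 'wol']

This is exact but much slower than the greedy heuristic; whether it finishes in acceptable time on 50K names depends on the solver.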
Your solution is failing because Counter returns trigrams in an unordered manner, so if you run your solution multiple times you will only get the expected solution by chance. Here I go through candidate trigram combinations from smallest to largest, and return as soon as I find a combination that covers the whole input list.
from itertools import permutations

def checkTrigramsPresentInList(trigrams_list, input_list):
    for input_string in input_list:
        flag = False
        for trigram in trigrams_list:
            if trigram in input_string:
                flag = True
        if not flag:
            return False
    return True

def ngrams(text, n=3):
    return [text[i:i + n] for i in range(len(text) - n + 1)]

def identifying_grams(input_list, n=3):
    trigrams = []
    for item in input_list:
        trigrams += ngrams(item)
    len_of_trigrams = len(trigrams)
    trigrams_unique = list(set(trigrams))
    idx = 1
    correct_tri_lists = []
    unique_trigrams_list = []
    while idx <= len_of_trigrams:
        trigrams_lists = permutations(trigrams_unique, idx)
        for trigrams_list in trigrams_lists:
            print(trigrams_list)
            if not trigrams_list in unique_trigrams_list:
                if checkTrigramsPresentInList(list(trigrams_list), input_list):
                    correct_tri_lists.append(list(trigrams_list))
                ## Uncomment below lines if only one combination is needed
                if correct_tri_lists:
                    return correct_tri_lists
            unique_trigrams_list.append(trigrams_list)
        idx = idx + 1
list1 = ['amsterdam','rotterdam','haarlem','utrecht','groningen']
print(identifying_grams(list1))
# Good, we get: ['ter', 'haa', 'utr', 'gro']
list2 = ['amsterdam','schiedam']
print(identifying_grams(list2))
# Good, we get: ['dam']
list3 = ['amsterdam','schiedam','terwolde','wolstad']
print(identifying_grams(list3))
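As a rough sense of why this brute force only works for very small lists (my own back-of-the-envelope numbers, not part of the answer): the number of ordered candidate tuples produced by permutations explodes even for a modest vocabulary of unique trigrams.

from math import comb, perm

unique_trigrams = 500             # a 50K-name list easily exceeds this
print(perm(unique_trigrams, 2))   # ordered pairs tried by permutations(): 249,500
print(comb(unique_trigrams, 2))   # unordered pairs (combinations) would already halve that: 124,750
print(perm(unique_trigrams, 3))   # ordered triples: about 124 million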