So I have successfully implemented the Levenshtein (edit minimum distance) algorithm with the help of Wikipedia and this Needleman tutorial, whereby custom, insertion and deletion cost 1, and substitution/replacement cost 2.
Executable gist https://gist.github.com/axelmukwena/8696ec4ec72849d3cf384f5d97321407
import numpy as np
def word_edit_distance(x, y):
rows = len(x) + 1
cols = len(y) + 1
distance = np.zeros((rows, cols), dtype=int)
for i in range(1, rows):
for k in range(1, cols):
distance[i][0] = i
distance[0][k] = k
for col in range(1, cols):
for row in range(1, rows):
if x[row - 1] == y[col - 1]:
cost = 0
else:
cost = 2
distance[row][col] = min(distance[row - 1][col] + 1,
distance[row][col - 1] + 1,
distance[row - 1][col - 1] + cost)
print(backtrace(x, y, distance))
edit_distance = distance[row][col]
return edit_distance, distance
result = word_edit_distance("AACGCA", "GAGCTA")
print(result[0])
print(result[1])
# output
4
[[0 1 2 3 4 5 6]
[1 2 1 2 3 4 5]
[2 3 2 3 4 5 4]
[3 4 3 4 3 4 5]
[4 3 4 3 4 5 6]
[5 4 5 4 3 4 5]
[6 5 4 5 4 5 4]]
And, I somehow also understand how to compute the backtracking, see my attempt below. However, there is a slight error. See Bottom
def backtrace(first, second, matrix):
f = [char for char in first]
s = [char for char in second]
new_f, new_s = [], []
row = len(f)
col = len(s)
trace = [[row, col]]
while True:
a = matrix[row - 1][col]
b = matrix[row - 1][col - 1]
c = matrix[row][col - 1]
which = min(a, b, c)
if which == matrix[row][col] or which == matrix[row][col] - 2:
# when diagonal backtrace substitution or no substitution
trace.append([row - 1, col - 1])
new_f = [f[row - 1]] + new_f
new_s = [s[col - 1]] + new_s
row, col = row - 1, col - 1
elif which == matrix[row][col] - 1:
# either deletion or insertion, find if minimum is up or left
if which == matrix[row - 1][col]:
trace.append([row - 1, col])
new_f = [f[row - 1]] + new_f
new_s = ["-"] + new_s
row, col = row - 1, col
elif which == matrix[row][col - 1]:
trace.append([row, col - 1])
new_f = ["-"] + new_f
new_s = [s[col - 1]] + new_s
row, col = row, col - 1
# Exit the loop
if row == 0 or col == 0:
return trace, new_f, new_s
Outcome
# trace => [[6, 6], [5, 5], [5, 4], [4, 3], [3, 2], [2, 2], [1, 2], [0, 1]]
['A', 'A', 'C', 'G', 'C', '-', 'A'], ['A', '-', '-', 'G', 'C', 'T', 'A']
Expected outcome:
# trace => [[6, 6], [5, 5], [5, 4], [4, 3], [3, 2], [2, 2], [1, 1], [0, 0]]
['A', 'A', 'C', 'G', 'C', '-', 'A'], ['A', 'A', '-', 'G', 'C', 'T', 'A']
Whats happening is:
# cost = 2
distance[row - 1][col] + 1 = 2 # orange
distance[row][col - 1] + 1 = 4 # yellow
distance[row - 1][col - 1] + cost = 2 # green
So here, both orange and green are candidates. But the ideal candidate is green because A == A

a = matrix[row - 1][col] # a = 1
b = matrix[row - 1][col - 1] # b = 2
c = matrix[row][col - 1] # c = 3
which = min(a, b, c) # which = a instead of b

Am I even using the correct backtracing algorithm?
It should not be min(a,b,c). You should select the node that minimizes the score of the other node plus cost of operation from the other node to the current one.
r = matrix[row][col] # current node
a = matrix[row - 1][col] # a = 1
b = matrix[row - 1][col - 1] # b = 2
c = matrix[row][col - 1] # c = 3
if x[row - 1] == y[col - 1]:
cost = 0
else:
cost = 2
if r == a + 1: return a
if r == b + cost: return b
if r == c + 1: return c
or in a more compressed form:
which = min(a + 1, b + cost, c + 1)
Based on @tstanisl 's answer, here is my integrated implementation:
def backtrace(first, second, matrix):
f = [char for char in first]
s = [char for char in second]
new_f, new_s = [], []
row = len(f)
col = len(s)
trace = [[row, col]]
while True:
if f[row - 1] == s[col - 1]:
cost = 0
else:
cost = 2
r = matrix[row][col]
a = matrix[row - 1][col]
b = matrix[row - 1][col - 1]
c = matrix[row][col - 1]
if r == b + cost:
# when diagonal backtrace substitution or no substitution
trace.append([row - 1, col - 1])
new_f = [f[row - 1]] + new_f
new_s = [s[col - 1]] + new_s
row, col = row - 1, col - 1
else:
# either deletion or insertion, find if minimum is up or left
if r == a + 1:
trace.append([row - 1, col])
new_f = [f[row - 1]] + new_f
new_s = ["-"] + new_s
row, col = row - 1, col
elif r == c + 1:
trace.append([row, col - 1])
new_f = ["-"] + new_f
new_s = [s[col - 1]] + new_s
row, col = row, col - 1
# Exit the loop
if row == 0 or col == 0:
return trace, new_f, new_s
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With