The query returns 21 million records
They way I loop through this table takes forever. What other solutions are there?
SqlDataReader rd = DbInfo.DataRdr(Conn,
"SELECT a.NAME AS ANAME, b.NAME AS BNAME, a.ID as AID, b.ID AS BUD " +
"FROM myTable a JOIN myTable b ON a.NUM = b.NUM AND a.ID <> b.ID");
while (rd.Read())
{
if (rd["ANAME"].ToString().LevenshteinDistance(rd["BNAME"].ToString()) <= 10)
{
Logging.Write(...);
}
}
public static int LevenshteinDistance(this string s, string t)
{
if (s == null)
throw new ArgumentNullException("s");
if (t == null)
throw new ArgumentNullException("t");
int n = s.Length;
int m = t.Length;
int[,] d = new int[n+1,m+1];
if (n == 0 || m == 0)
return Math.Max(m, n);
for (int i = 0; i <= n; i++)
{
d[i, 0] = i;
}
for (int i = 0; i < m; i++)
{
d[0, i] = i;
}
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
int cost = (t[j] == s[i]) ? 0 : 1;
d[i + 1, j + 1] = Math.Min(Math.Min(d[i, j + 1] + 1, d[i + 1, j] + 1), d[i, j] + cost);
}
}
return d[n, m];
}
You could start by using the following as your query instead, depending on how often the NUM
columns are actually equal:
SELECT a.NAME AS ANAME, b.NAME AS BNAME, other things
FROM myTable a
JOIN myTable b
ON a.NUM = b.NUM
AND a.id < b.id
Then your SQL query will give you pairs of rows with matching NUM
s that you can call LevenshteinDistance
on. Something like:
DataTable dt1 = new DataTable();
dt1.Load(DbInfo.DataRdr(Conn, "[the query I wrote above]"));
for (int i = 0; i < dt1.Rows.Count; i++)
{
if (dt1.Rows[i]["ANAME"].ToString().LevenshteinDistance(dt1.Rows[i]["BNAME"].ToString()) <= 10)
{
Logging.Write(...[modify the query so it returns the things you want to log]...);
}
}
You're comparing dt1.Rows[i]["Name"].ToString()
with dt1.Rows[j]["Name"].ToString()
even when i == j
.
Try looping 0 <= i < dt1.Rows.Count - 1
and i + 1 <= j < dt1.Rows.Count
.
Also, you're logging only if dt1.Rows[i]["NUM"].ToString() == dt1.Rows[i]["NUM"].ToString()
, which is probably a faster check. There's no point in doing Levenshtein if that's false.
EDIT: @John is right about the dt1.Rows[i]["NUM"].ToString() == dt1.Rows[i]["NUM"].ToString()
(both i
?).
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With