I am working on C# to find all the common substrings between two strings. For instance, if the input is: S1= "need asssitance with email" S2= "email assistance needed"
The output should be- 'need assistance email'
The below code returns the longest common substring, but I want my code to return all the common substrings. Any help is much appreciated!
static void commonsubstrings()
{
input1 = "need asssitance with email";
input2 = "email assistance needed"
if (input2.Length > input1.Length)
{
swap = input1;
input1 = input2;
input2 = swap;
}
int k = 1;
String temp;
String longTemp = "";
for (int i = 0; (i <= input1.Length); i++)
{
if ((i == input1.Length))
{
if (longest != null)
{
k = longest.Length + 1;
}
else
{
k = 1;
}
temp = input1.Substring(1, input1.Length - 1);
if (temp.Equals(""))
{
break;
}
if (k <= temp.Length)
{
i = k - 1;
input1 = temp;
if ((longest != null) && (longest.Length > longTemp.Length))
{
longTemp = longest;
}
}
}
holder1 = input1.Substring(0, k);
for (int j = 0; (j < input2.Length) && (j + k <= input2.Length); j++)
{
check1 = input2.Substring(j, k);
if (holder1.Equals(check1))
{
longest = holder1;
break;
}
}
k++;
}
Console.WriteLine(longest);
Console.ReadLine();
}
Approach: Count the frequencies of all the characters from both strings. Now, for every character if the frequency of this character in string s1 is freq1 and in string s2 is freq2 then total valid pairs with this character will be min(freq1, freq2). The sum of this value for all the characters is the required answer.
The total number of substrings formed by string of length N is (N*(N+1))/2, initialise count as (N*(N+1))/2.
A substring is a subset or part of another string, or it is a contiguous sequence of characters within a string.
A different approach: you could use levenshtein distance to find the similarity of two words. If the distance is less than a specified value you could consider two strings as equal. Then you can use the levenshtein comparer for Enumerable.Intersect
.
Then it's simple as:
string S1= "need asssitance with email" ;
string S2 = "email assistance needed";
string[] words1 = S1.Split();
string[] words2 = S2.Split();
var wordsIntersecting = words1.Intersect(words2, new LevenshteinComparer());
string output = string.Join(" ", wordsIntersecting);
output: need asssitance email
Here is the custom comparer:
class LevenshteinComparer : IEqualityComparer<string>
{
public int MaxDistance { get; set; }
private Levenshtein _Levenshtein = new Levenshtein();
public LevenshteinComparer() : this(50) { }
public LevenshteinComparer(int maxDistance)
{
this.MaxDistance = maxDistance;
}
public bool Equals(string x, string y)
{
int distance = _Levenshtein.iLD(x, y);
return distance <= MaxDistance;
}
public int GetHashCode(string obj)
{
return 0;
}
}
and here is an implementation of the levenshtein algoritm:
public class Levenshtein
{
///*****************************
/// Compute Levenshtein distance
/// Memory efficient version
///*****************************
public int iLD(String sRow, String sCol)
{
int RowLen = sRow.Length; // length of sRow
int ColLen = sCol.Length; // length of sCol
int RowIdx; // iterates through sRow
int ColIdx; // iterates through sCol
char Row_i; // ith character of sRow
char Col_j; // jth character of sCol
int cost; // cost
/// Test string length
if (Math.Max(sRow.Length, sCol.Length) > Math.Pow(2, 31))
throw (new Exception("\nMaximum string length in Levenshtein.iLD is " + Math.Pow(2, 31) + ".\nYours is " + Math.Max(sRow.Length, sCol.Length) + "."));
// Step 1
if (RowLen == 0)
{
return ColLen;
}
if (ColLen == 0)
{
return RowLen;
}
/// Create the two vectors
int[] v0 = new int[RowLen + 1];
int[] v1 = new int[RowLen + 1];
int[] vTmp;
/// Step 2
/// Initialize the first vector
for (RowIdx = 1; RowIdx <= RowLen; RowIdx++)
{
v0[RowIdx] = RowIdx;
}
// Step 3
/// Fore each column
for (ColIdx = 1; ColIdx <= ColLen; ColIdx++)
{
/// Set the 0'th element to the column number
v1[0] = ColIdx;
Col_j = sCol[ColIdx - 1];
// Step 4
/// Fore each row
for (RowIdx = 1; RowIdx <= RowLen; RowIdx++)
{
Row_i = sRow[RowIdx - 1];
// Step 5
if (Row_i == Col_j)
{
cost = 0;
}
else
{
cost = 1;
}
// Step 6
/// Find minimum
int m_min = v0[RowIdx] + 1;
int b = v1[RowIdx - 1] + 1;
int c = v0[RowIdx - 1] + cost;
if (b < m_min)
{
m_min = b;
}
if (c < m_min)
{
m_min = c;
}
v1[RowIdx] = m_min;
}
/// Swap the vectors
vTmp = v0;
v0 = v1;
v1 = vTmp;
}
// Step 7
/// Value between 0 - 100
/// 0==perfect match 100==totaly different
///
/// The vectors where swaped one last time at the end of the last loop,
/// that is why the result is now in v0 rather than in v1
//System.Console.WriteLine("iDist=" + v0[RowLen]);
int max = System.Math.Max(RowLen, ColLen);
return ((100 * v0[RowLen]) / max);
}
///*****************************
/// Compute the min
///*****************************
private int Minimum(int a, int b, int c)
{
int mi = a;
if (b < mi)
{
mi = b;
}
if (c < mi)
{
mi = c;
}
return mi;
}
///*****************************
/// Compute Levenshtein distance
///*****************************
public int LD(String sNew, String sOld)
{
int[,] matrix; // matrix
int sNewLen = sNew.Length; // length of sNew
int sOldLen = sOld.Length; // length of sOld
int sNewIdx; // iterates through sNew
int sOldIdx; // iterates through sOld
char sNew_i; // ith character of sNew
char sOld_j; // jth character of sOld
int cost; // cost
/// Test string length
if (Math.Max(sNew.Length, sOld.Length) > Math.Pow(2, 31))
throw (new Exception("\nMaximum string length in Levenshtein.LD is " + Math.Pow(2, 31) + ".\nYours is " + Math.Max(sNew.Length, sOld.Length) + "."));
// Step 1
if (sNewLen == 0)
{
return sOldLen;
}
if (sOldLen == 0)
{
return sNewLen;
}
matrix = new int[sNewLen + 1, sOldLen + 1];
// Step 2
for (sNewIdx = 0; sNewIdx <= sNewLen; sNewIdx++)
{
matrix[sNewIdx, 0] = sNewIdx;
}
for (sOldIdx = 0; sOldIdx <= sOldLen; sOldIdx++)
{
matrix[0, sOldIdx] = sOldIdx;
}
// Step 3
for (sNewIdx = 1; sNewIdx <= sNewLen; sNewIdx++)
{
sNew_i = sNew[sNewIdx - 1];
// Step 4
for (sOldIdx = 1; sOldIdx <= sOldLen; sOldIdx++)
{
sOld_j = sOld[sOldIdx - 1];
// Step 5
if (sNew_i == sOld_j)
{
cost = 0;
}
else
{
cost = 1;
}
// Step 6
matrix[sNewIdx, sOldIdx] = Minimum(matrix[sNewIdx - 1, sOldIdx] + 1, matrix[sNewIdx, sOldIdx - 1] + 1, matrix[sNewIdx - 1, sOldIdx - 1] + cost);
}
}
// Step 7
/// Value between 0 - 100
/// 0==perfect match 100==totaly different
System.Console.WriteLine("Dist=" + matrix[sNewLen, sOldLen]);
int max = System.Math.Max(sNewLen, sOldLen);
return (100 * matrix[sNewLen, sOldLen]) / max;
}
}
Credits to the Levenshtein class to: http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm
public static string [] CommonString(string left, string right)
{
List<string> result = new List<string>();
string [] rightArray = right.Split();
string [] leftArray = left.Split();
result.AddRange(rightArray.Where(r => leftArray.Any(l => l.StartsWith(r))));
// must check other way in case left array contains smaller words than right array
result.AddRange(leftArray.Where(l => rightArray.Any(r => r.StartsWith(l))));
return result.Distinct().ToArray();
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With