I have a list of utterances(strings)/corpus as the follows,
List<string> allUtterances = new List<string>
{
"c2's are above the hierarchy than c1's",
"c2's are better than c1's",
"get me a group of 10 c1's",
"he is a c2",
"he was a c two",
"hey i am a c1",
"jdsaxkjhasx",
"khndsmcsdfcs",
"my competency is c2",
"none intent",
"she is still a c 1",
"this is a none intent, please ignore",
"we are hiring fresh c1's"
};
This is the class schema:
public class ListEntity
{
public string name { get; set; }
public List<Sublist> subLists { get; set; }
}
public class Sublist
{
public string canonicalForm { get; set; }
public List<string> list { get; set; }
}
And this is a sample POCO:
List<ListEntity> listEntities = new List<ListEntity>
{
new ListEntity
{
name = "Competency",
subLists = new List<Sublist>
{
new Sublist
{
canonicalForm = "C1",
list = new List<string>
{
"c1",
"c one",
"c 1",
"C 1",
"C1",
"C one",
"C ONE"
}
},
new Sublist
{
canonicalForm = "C2",
list = new List<string>
{
"c2",
"c two",
"c 2",
"C 2",
"C2",
"C two",
"C TWO"
}
}
}
}
};
var canonicalForms = listEntities.Select(x => x.subLists.Select(y => y.list).ToList()).ToList();
Suppose if I have an utterance of the following from the above mentioned list allUtterances:
"query": "C2's are better than C1's"
I want to get the following output for the above utterance:
{
"entity": "c2",
"type": "Competency",
"startIndex": 0,
"endIndex": 1,
"resolution": {
"values": [
"C2"
]
}
},
{
"entity": "c1",
"type": "Competency",
"startIndex": 21,
"endIndex": 22,
"resolution": {
"values": [
"C1"
]
}
}
The rules with which I want to match is as follows:
For every utterance in the allUtterances list
, If the utterance text contains a value from the property list
of the class sublist I want to extract the start and end positions and mark them with the appropriate key which is the canonicalForm
in this case update the type key in my JSON payload with the name
property in the ListEntityClass.
I have tried the following approach:
using System;
using System.Linq;
using System.IO;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;
namespace ListEntityProblem
{
class Program
{
static void Main(string[] args)
{
List<string> allUtterances = new List<string>
{
"c2's are above the hierarchy than c1's",
"c2's are better than c1's",
"get me a group of 10 c1's",
"he is a c2",
"he was a c two",
"hey i am a c1",
"jdsaxkjhasx",
"khndsmcsdfcs",
"my competency is c2",
"none intent",
"she is still a c 1",
"this is a none intent, please ignore",
"we are hiring fresh c1's"
};
List<ListEntity> listEntities = new List<ListEntity>
{
new ListEntity
{
name = "Competency",
subLists = new List<Sublist>
{
new Sublist
{
canonicalForm = "C1",
list = new List<string>
{
"c1",
"c one",
"c 1",
"C 1",
"C1",
"C one",
"C ONE"
}
},
new Sublist
{
canonicalForm = "C2",
list = new List<string>
{
"c2",
"c two",
"c 2",
"C 2",
"C2",
"C two",
"C TWO"
}
}
}
}
};
List<Tuple<string, string, List<string>>> ListEntityLookup = new List<Tuple<string, string, List<string>>>();
//n^2, construct lookup for list entities
foreach (var item in listEntities)
{
string listEntityName = item.name;
foreach (var innerItem in item.subLists)
{
string normalizedValue = innerItem.canonicalForm;
List<string> synonymValues = innerItem.list;
ListEntityLookup.Add(Tuple.Create<string, string, List<string>>(listEntityName, normalizedValue, synonymValues));
}
}
List<JObject> parsedEntities = new List<JObject>();
//n^3, populate the parsed payload with start and end indices
foreach (var item in allUtterances)
{
foreach (var ll in ListEntityLookup)
{
foreach (var cf in ll.Item3)
{
int start = 0, end = 0;
if (item.Contains(cf))
{
start = item.IndexOf(cf);
end = start + cf.Length;
parsedEntities.Add(new JObject
{
new JProperty("Start", start),
new JProperty("End", end),
new JProperty("Query", item),
new JProperty("CanonicalForm", ll.Item2),
new JProperty("ListEntity", ll.Item1)
});
}
}
}
}
//Group by query
var groupedParsedEntities = parsedEntities.GroupBy(x => x["Query"]).ToList();
}
}
}
Edit:
I have tried to re-write the for-each loop, but this resulted, in more nesting.
foreach (var item in allUtterances)
{
foreach (var listEntity in listEntities)
{
foreach (var canonicalForm in listEntity.subLists)
{
foreach(var synonym in canonicalForm.list)
{
int start = item.IndexOf(synonym);
if(start != -1)
{
parsedEntities.Add(new JObject
{
new JProperty("Start", start),
new JProperty("End", start + synonym.Length),
new JProperty("Query", item),
new JProperty("CanonicalForm", canonicalForm.canonicalForm),
new JProperty("ListEntity", listEntity.name)
});
}
}
}
}
}
But this approach seems to slow for large number of utterances and does not scale very well. As the main loop runs n^3 times. Our server will have to do too many computations per second.
I am stuck thinking whether or not, should I use Regex if it would offer me some performance benefits or not.
Please help me with optimizing this algorithm.
Any help will be very much appreciated.
Have you tried using a Linq query, instead of looping?
This does not produce exactly what you need, but I believe that it does extract the relevant data:
allUtterances
.AsParallel()
.SelectMany(utterance => listEntities.SelectMany(l => l.subLists
.Where(sl => sl.list.Any(sle => utterance.Contains(sle)))
.SelectMany(sl => sl.list
.Where(sle => utterance.Contains(sle))
.Select(sle => new {
canonicalForm = sl.canonicalForm,
matchedValue = sle,
startindex = utterance.IndexOf(sle),
endindex = utterance.IndexOf(sle) + sle.Length - 1
})
)
.Select(o => new {
// not sure if 'entity' and 'resolutionValue' are swopped around
utterance = utterance,
entity = o.matchedValue,
type = l.name,
startIndex = o.startindex,
endIndex = o.endindex,
resolutionValue = o.canonicalForm,
}
)
/*
or change the Select above to create the JObjects:
.Select(jo => new JObject {
new JProperty("Start", jo.startIndex),
new JProperty("End", jo.endIndex),
new JProperty("Query", jo.utterance),
new JProperty("CanonicalForm", jo.resolutionValue),
new JProperty("ListEntity", jo.entity)
})
*/
)).ToList();
Or, you could try to parallelize your loops:
allUtterances.AsParallel().ForAll(ut => { .... });
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With