Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Algorithm for tagging list of substrings from a list of strings in C#

I have a list of utterances(strings)/corpus as the follows,

List<string> allUtterances = new List<string>
    {
    "c2's are above the hierarchy than c1's",
    "c2's are better than c1's",
    "get me a group of 10 c1's",
    "he is a c2",
    "he was a c two",
    "hey i am a c1",
    "jdsaxkjhasx",
    "khndsmcsdfcs",
    "my competency is c2",
    "none intent",
    "she is still a c 1",
    "this is a none intent, please ignore",
    "we are hiring fresh c1's"
};

This is the class schema:

public class ListEntity
{
        public string name { get; set; }
        public List<Sublist> subLists { get; set; }
}

public class Sublist
{
    public string canonicalForm { get; set; }
    public List<string> list { get; set; }
}

And this is a sample POCO:

    List<ListEntity> listEntities = new List<ListEntity>
    {
        new ListEntity
        {
            name = "Competency",
            subLists = new List<Sublist>
            {
                new Sublist
                {
                    canonicalForm = "C1",
                    list = new List<string>
                    {
                        "c1",
                        "c one",
                        "c 1",
                        "C 1",
                        "C1",
                        "C one",
                        "C ONE"
                    }
                },
                new Sublist
                {
                    canonicalForm = "C2",
                    list = new List<string>
                    {
                        "c2",
                        "c two",
                        "c 2",
                        "C 2",
                        "C2",
                        "C two",
                        "C TWO"
                    }
                }
            }
        }
    };

    var canonicalForms = listEntities.Select(x => x.subLists.Select(y => y.list).ToList()).ToList();

Suppose if I have an utterance of the following from the above mentioned list allUtterances:

 "query": "C2's are better than C1's"

I want to get the following output for the above utterance:

{
      "entity": "c2",
      "type": "Competency",
      "startIndex": 0,
      "endIndex": 1,
      "resolution": {
        "values": [
          "C2"
        ]
      }
},
{
      "entity": "c1",
      "type": "Competency",
      "startIndex": 21,
      "endIndex": 22,
      "resolution": {
        "values": [
          "C1"
        ]
      }
}

The rules with which I want to match is as follows:

For every utterance in the allUtterances list, If the utterance text contains a value from the property list of the class sublist I want to extract the start and end positions and mark them with the appropriate key which is the canonicalForm in this case update the type key in my JSON payload with the name property in the ListEntityClass.


I have tried the following approach:

using System;
using System.Linq;
using System.IO;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;

namespace ListEntityProblem
{
    class Program
    {
        static void Main(string[] args)
        {
            List<string> allUtterances = new List<string>
            {
                "c2's are above the hierarchy than c1's",
                "c2's are better than c1's",
                "get me a group of 10 c1's",
                "he is a c2",
                "he was a c two",
                "hey i am a c1",
                "jdsaxkjhasx",
                "khndsmcsdfcs",
                "my competency is c2",
                "none intent",
                "she is still a c 1",
                "this is a none intent, please ignore",
                "we are hiring fresh c1's"
            };

            List<ListEntity> listEntities = new List<ListEntity>
            {
                new ListEntity
                {
                    name = "Competency",
                    subLists = new List<Sublist>
                    {
                        new Sublist
                        {
                            canonicalForm = "C1",
                            list = new List<string>
                            {
                                "c1",
                                "c one",
                                "c 1",
                                "C 1",
                                "C1",
                                "C one",
                                "C ONE"
                            }
                        },
                        new Sublist
                        {
                            canonicalForm = "C2",
                            list = new List<string>
                            {
                                "c2",
                                "c two",
                                "c 2",
                                "C 2",
                                "C2",
                                "C two",
                                "C TWO"
                            }
                        }
                    }
                }
            };


            List<Tuple<string, string, List<string>>> ListEntityLookup = new List<Tuple<string, string, List<string>>>();

            //n^2, construct lookup for list entities
            foreach (var item in listEntities)
            {
                string listEntityName = item.name;
                foreach (var innerItem in item.subLists)
                {
                    string normalizedValue = innerItem.canonicalForm;
                    List<string> synonymValues = innerItem.list;

                    ListEntityLookup.Add(Tuple.Create<string, string, List<string>>(listEntityName, normalizedValue, synonymValues));
                }
            }

            List<JObject> parsedEntities = new List<JObject>();

            //n^3, populate the parsed payload with start and end indices
            foreach (var item in allUtterances)
            {
                foreach (var ll in ListEntityLookup)
                {
                    foreach (var cf in ll.Item3)
                    {
                        int start = 0, end = 0;
                        if (item.Contains(cf))
                        {
                            start = item.IndexOf(cf);
                            end = start + cf.Length;




                            parsedEntities.Add(new JObject
                            {
                                new JProperty("Start", start),
                                new JProperty("End", end),
                                new JProperty("Query", item),
                                new JProperty("CanonicalForm", ll.Item2),
                                new JProperty("ListEntity", ll.Item1)
                            });
                        }
                    }
                }
            }

            //Group by query
            var groupedParsedEntities = parsedEntities.GroupBy(x => x["Query"]).ToList();



        }
    }
}

Edit:

I have tried to re-write the for-each loop, but this resulted, in more nesting.

            foreach (var item in allUtterances)
            {
                foreach (var listEntity in listEntities)
                {
                    foreach (var canonicalForm in listEntity.subLists)
                    {
                        foreach(var synonym in canonicalForm.list)
                        {
                            int start = item.IndexOf(synonym);
                            if(start != -1)
                            {
                                parsedEntities.Add(new JObject
                                {
                                    new JProperty("Start", start),
                                    new JProperty("End", start + synonym.Length),
                                    new JProperty("Query", item),
                                    new JProperty("CanonicalForm", canonicalForm.canonicalForm),
                                    new JProperty("ListEntity", listEntity.name)
                                });
                            }
                        }
                    }
                }
            }

But this approach seems to slow for large number of utterances and does not scale very well. As the main loop runs n^3 times. Our server will have to do too many computations per second.

I am stuck thinking whether or not, should I use Regex if it would offer me some performance benefits or not.

Please help me with optimizing this algorithm.

Any help will be very much appreciated.

like image 764
Kunal Mukherjee Avatar asked Nov 07 '22 05:11

Kunal Mukherjee


1 Answers

Have you tried using a Linq query, instead of looping?

This does not produce exactly what you need, but I believe that it does extract the relevant data:

  allUtterances
    .AsParallel()
    .SelectMany(utterance => listEntities.SelectMany(l => l.subLists
                                .Where(sl => sl.list.Any(sle => utterance.Contains(sle)))
                                .SelectMany(sl => sl.list
                                                    .Where(sle => utterance.Contains(sle))
                                                            .Select(sle => new {
                                                                            canonicalForm = sl.canonicalForm,
                                                                            matchedValue = sle, 
                                                                            startindex = utterance.IndexOf(sle),
                                                                            endindex = utterance.IndexOf(sle) + sle.Length - 1
                                                                        })
                                )
                                .Select(o => new {
                                    // not sure if 'entity' and 'resolutionValue' are swopped around
                                        utterance = utterance,
                                        entity = o.matchedValue,
                                        type = l.name,
                                        startIndex = o.startindex,
                                        endIndex = o.endindex,
                                        resolutionValue = o.canonicalForm,
                                    }
                                )
                            /*
                            or change the Select above to create the JObjects:
                            .Select(jo => new JObject { 
                                new JProperty("Start", jo.startIndex),
                                new JProperty("End", jo.endIndex),
                                new JProperty("Query", jo.utterance),
                                new JProperty("CanonicalForm", jo.resolutionValue),
                                new JProperty("ListEntity", jo.entity)
                            })
                            */
                )).ToList();

Or, you could try to parallelize your loops:

allUtterances.AsParallel().ForAll(ut => {  .... });
like image 106
wnutt Avatar answered Nov 11 '22 10:11

wnutt