Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Performance ideas (in-memory C# hashset and contains too slow)

I have the following code

private void LoadIntoMemory()
{
    //Init large HashSet
    HashSet<document> hsAllDocuments = new HashSet<document>();

    //Get first rows from database
    List<document> docsList = document.GetAllAboveDocID(0, 500000);

    //Load objects into dictionary
    foreach (document d in docsList)
    {
        hsAllDocuments.Add(d);
    }

    Application["dicAllDocuments"] = hsAllDocuments;
}

private HashSet<document> documentHits(HashSet<document> hsRawHit, HashSet<document> hsAllDocuments, string query, string[] queryArray)
{
    int counter = 0;
    const int maxCount = 1000;

    foreach (document d in hsAllDocuments)
    {
        //Headline
        if (d.Headline.Contains(query))
        {
            if (counter >= maxCount)
                break;
            hsRawHit.Add(d);
            counter++;
        }

        //Description
        if (d.Description.Contains(query))
        {
            if (counter >= maxCount)
                break;
            hsRawHit.Add(d);
            counter++;
        }

        //splitted query word by word
        //string[] queryArray = query.Split(' ');
        if (queryArray.Count() > 1)
        {
            foreach (string q in queryArray)
            {
                if (d.Headline.Contains(q))
                {
                    if (counter >= maxCount)
                        break;
                    hsRawHit.Add(d);
                    counter++;
                }

                //Description
                if (d.Description.Contains(q))
                {
                    if (counter >= maxCount)
                        break;
                    hsRawHit.Add(d);
                    counter++;
                }
            }
        }

    }

    return hsRawHit;
}       

First I load all the data into a hashset (via Application for later use) - runs fine - totally OK to be slow for what I'm doing.

Will be running 4.0 framework in C# (can't update to the new upgrade for 4.0 with the async stuff).

The documentHits method runs fairly slow on my current setup - considering that it's all in memory. What can I do to speed up this method?

Examples would be awesome - thanks.

like image 246
mch_dk Avatar asked Feb 23 '11 20:02

mch_dk


People also ask

Can you allocate memory in C#?

Stack and HeapThe Common Language Runtime (CLR) allocates memory for objects in these parts. Stack is a simple LIFO(last-in-first-out) structure. Variables allocated on the stack are stored directly to the memory and access to this memory is very fast, and its allocation is done when the program is compiled .

How objects are stored in memory in C#?

In C# there are two places where an object can be stored -- the heap and the stack. Objects allocated on the stack are available only inside of a stack frame (execution of a method), while objects allocated on the heap can be accessed from anywhere.


1 Answers

I see that you are using a HashSet, but you are not using any of it's advantages, so you should just use a List instead.

What's taking time is looping through all the documents and looking for strings in other strings, so you should try to elliminate as much as possible of that.

One possibility is to set up indexes of which documents contains which character pairs. If the string query contains Hello, you would be looking in the documents that contains He, el, ll and lo.

You could set up a Dictionary<string, List<int>> where the dictionary key is the character combinations and the list contains indexes to the documents in your document list. Setting up the dictionary will take some time, of course, but you can focus on the character combinations that are less common. If a character combination exists in 80% of the documents, it's pretty useless for elliminating documents, but if a character combination exists in only 2% of the documents it has elliminated 98% of your work.

If you loop through the documents in the list and add occurances to the lists in the dictionary, the lists of indexes will be sorted, so it will be very easy to join the lists later on. When you add indexes to the list, you can throw away lists when they get too large and you see that they would not be useful for elliminating documents. That way you will only be keeping the shorter lists and they will not consume so much memory.

Edit:

It put together a small example:

public class IndexElliminator<T> {

  private List<T> _items;
  private Dictionary<string, List<int>> _index;
  private Func<T, string> _getContent;

  private static HashSet<string> GetPairs(string value) {
    HashSet<string> pairs = new HashSet<string>();
    for (int i = 1; i < value.Length; i++) {
      pairs.Add(value.Substring(i - 1, 2));
    }
    return pairs;
  }

  public IndexElliminator(List<T> items, Func<T, string> getContent, int maxIndexSize) {
    _items = items;
    _getContent = getContent;
    _index = new Dictionary<string, List<int>>();
    for (int index = 0;index<_items.Count;index++) {
      T item = _items[index];
      foreach (string pair in GetPairs(_getContent(item))) {
        List<int> list;
        if (_index.TryGetValue(pair, out list)) {
          if (list != null) {
            if (list.Count == maxIndexSize) {
              _index[pair] = null;
            } else {
              list.Add(index);
            }
          }
        } else {
          list = new List<int>();
          list.Add(index);
          _index.Add(pair, list);
        }
      }
    }
  }

  private static List<int> JoinLists(List<int> list1, List<int> list2) {
    List<int> result = new List<int>();
    int i1 = 0, i2 = 0;
    while (i1 < list1.Count && i2 < list2.Count) {
      switch (Math.Sign(list1[i1].CompareTo(list2[i2]))) {
        case 0: result.Add(list1[i1]); i1++; i2++; break;
        case -1: i1++; break;
        case 1: i2++; break;
      }
    }
    return result;
  }

  public List<T> Find(string query) {
    HashSet<string> pairs = GetPairs(query);
    List<List<int>> indexes = new List<List<int>>();
    bool found = false;
    foreach (string pair in pairs) {
      List<int> list;
      if (_index.TryGetValue(pair, out list)) {
        found = true;
        if (list != null) {
          indexes.Add(list);
        }
      }
    }
    List<T> result = new List<T>();
    if (found && indexes.Count == 0) {
      indexes.Add(Enumerable.Range(0, _items.Count).ToList());
    }
    if (indexes.Count > 0) {
      while (indexes.Count > 1) {
        indexes[indexes.Count - 2] = JoinLists(indexes[indexes.Count - 2], indexes[indexes.Count - 1]);
        indexes.RemoveAt(indexes.Count - 1);
      }
      foreach (int index in indexes[0]) {
        if (_getContent(_items[index]).Contains(query)) {
          result.Add(_items[index]);
        }
      }
    }
    return result;
  }

}

Test:

List<string> items = new List<string> {
  "Hello world",
  "How are you",
  "What is this",
  "Can this be true",
  "Some phrases",
  "Words upon words",
  "What to do",
  "Where to go",
  "When is this",
  "How can this be",
  "Well above margin",
  "Close to the center"
};
IndexElliminator<string> index = new IndexElliminator<string>(items, s => s, items.Count / 2);

List<string> found = index.Find("this");
foreach (string s in found) Console.WriteLine(s);

Output:

What is this
Can this be true
When is this
How can this be
like image 134
Guffa Avatar answered Oct 03 '22 14:10

Guffa