Inverted Index: Find a phrase in a set of documents

Question

I'm implementing an inverted index structure, in particular one that allows boolean queries, and word-level granularity.

I have large database of text, and I keep an index that tells me, for every word, in which file it is (IDdoc), and where in the file it is (position). (A word can be in many files and in many places in one file.)

Thus I keep a vector for each word:

vector<pair<IDdoc,position>> occurences_of_word;

(The vector is sorted by IDdoc and then by position, in ascending order.)

I have a string object made of words. This is the phrase I'm looking for.

For each word in the phrase I would like to know which documents contain this phrase, hence returning a vector of IDdocs.

Here is my attempt at a solution:

typedef std::string     Word_t;
typedef unsigned int    WordPosition_t;
typedef unsigned int    IDdocument_t;

vector<pair<IDdocument_t,WordPosition_t> > IndiceInvertidoBooleanoConPosicion::_interseccion_dos_listas
    (const vector<pair<IDdocument_t,WordPosition_t>> & v1,
    const vector<pair<IDdocument_t,WordPosition_t>> & v2)
{
vector<pair<IDdocument_t,WordPosition_t> > intersection;

IDdocument_t ID_doc_one, ID_doc_two;

int i = 0;
int j = 0;
const int MAX_INDEX_V1 = v1.size() -1;
const int MAX_INDEX_V2 = v2.size() -1;

while(i <= MAX_INDEX_V1  && j <= MAX_INDEX_V2)
{
    ID_doc_one = v1[i].first;
    ID_doc_two = v2[j].first;
    if (ID_doc_one < ID_doc_two)
        i++;
    else if (ID_doc_one > ID_doc_two)
        j++;
    else // The words were found in the same document!
    {
        WordPosition_t pos_word_one = v1[i].second;
        WordPosition_t pos_word_two = v2[j].second;

        // The words make a phrase!  Return pos_two for the next intersection finding step
        if (pos_word_one + 1 == pos_word_two)
        {
            intersection.push_back(make_pair(ID_doc_one,pos_word_two));
            i++;
            j++;
        }

        // Phrase not found
        else
        {
            if (pos_word_one < pos_word_two)
                i++;
            else
                j++;
        }

    }
}

return intersection;
}

int find_phrase(const string phrase, vector<IDdocument_t> & id_docs)
{
Word_t word;
id_docs.clear();
Text parsed_phrase;
// Extract the relevant words from the phrase
parsed_phrase.parse(phrase); 

vector<pair<IDdocument_t,WordPosition_t> > intersection;
vector<pair<IDdocument_t,WordPosition_t> > second_vector;

while (parsed_phrase.get_next_word(word) != RES_END)
{
    _find_vector_words(word,intersection);

    while (parsed_phrase.get_next_word(word) != RES_END)
    {
        _find_vector_words(word,second_vector);

        intersection = _intersect_two_words(intersection,second_vector);

    }
}

for (unsigned int i = 0; i < intersection.size(); i ++)
{
    IDdocument_t id_doc = intersection[i].first;
    if(std::find(id_docs.begin(), id_docs.end(), id_doc) == id_docs.end())
        id_docs.push_back(id_doc);
}

return RES_OK;
}

kfsone · Accepted Answer

For looking up a particular Word from the string representation, you probably want to look at something like map. For creating a simple union of results you probably want set. This implementation is written more as a demonstration than as a highly desirable final implementation (c.f. sloppy phrase parsing).

#include <vector>
#include <map>
#include <set>
#include <iostream>
#include <string>

typedef std::string IDdoc;
typedef int position;

typedef std::pair<IDdoc,position> Occurrence;
typedef std::vector<Occurrence> OccurrencesOfWord;
typedef std::map<std::string /*word*/, OccurrencesOfWord> Dictionary;
typedef std::set<IDdoc> Matches;

bool findMatchesForPhrase(const std::string& phrase, const Dictionary& dictionary, Matches& matches)
{
    size_t pos = 0;
    size_t len = 0;
    while (pos < phrase.length()) {
        size_t end = phrase.find(' ', pos);
        size_t len = ((end == phrase.npos) ? phrase.length() : end) - pos;
        std::string word(phrase, pos, len);
        pos += len + 1; // to skip the space.

        // ignore words not in the dictionary.
        auto dictIt = dictionary.find(word);
        if (dictIt == dictionary.end())
            continue;

        auto& occurrences = dictIt->second; // shortcut/alias,.
        for (auto& occurIt : occurrences) {
            // Add all the IDdoc's of this occurence to the set.
            matches.insert(occurIt.first);
        }
    }

    return !matches.empty();
}

void addToDictionary(Dictionary& dict, const char* word, const char* doc, int position)
{
    dict[word].push_back(std::make_pair(std::string(doc), position));
}

int main(int argc, const char** argv)
{
    std::string phrase("pizza is life");
    Dictionary dict;

    addToDictionary(dict, "pizza", "book1", 10);
    addToDictionary(dict, "pizza", "book2", 30);
    addToDictionary(dict, "life", "book1", 1);
    addToDictionary(dict, "life", "book3", 1);
    addToDictionary(dict, "goat", "book4", 99);

    Matches matches;
    bool result = findMatchesForPhrase(phrase, dict, matches);

    std::cout << "result = " << result << std::endl;
    for (auto& ent : matches) {
        std::cout << ent << std::endl;
    }

    return 0;
}

Online demo of this at: http://ideone.com/Zlhfua

Follow up to address your changes:

while(i < SIZE_VECTOR_ONE  && j < SIZE_VECTOR_TWO)
{
    if (ID_doc_one < ID_doc_two)
    {
        ID_doc_one = v1[++i].first;

Lets say "SIZE_VECTOR 1" is 1. That means that there is one element in the vector, element[0]. If ID_doc_one is 0 and ID_doc_two is 1, then

if (0 < 1) {
    ID_doc_one = v1[1].first;

which is invalid. You might be better off using iterators or pointers:

while (oneIt != v1.end() && twoIt != v2.end()) {
    if (oneIt->first < twoIt->first) {
        ++oneIt;
        continue;
    } else if (*twoIt < *oneIt) {
        ++twoIt;
        continue;
    }
    // same documentId in both lists, snag positions.
    ...
}

Next, this looks kinda broken:

    else {
    }   // To avoid "out of range" errors <-- but also ends the "else"
        if (i < SIZE_VECTOR_ONE - 1)
            ID_doc_one = v1[++i].first;
        if (j < SIZE_VECTOR_TWO - 1)
            ID_doc_two = v2[++j].first;
    }

And I wonder what happens if you have the same document but multiple positions?

This next is nit-picky, but it took me a long time to parse

    WordPosition_t pos_one = v1[i].second;
    WordPosition_t pos_two = v2[j].second;

    // The words make a phrase!  Return pos_two for the next intersection finding step
    if (pos_one + 1 == pos_two)

it seems vastly clearer to write this as you might say it "(if the second word is in the position after the first word):

    WordPosition_t posFirstWord = v1[i].second;
    WordPosition_t posSecondWord = v2[j].second;

    // The words make a phrase!  Return pos_two for the next intersection finding step
    if (posSecondWord == posFirstWord + 1)

This next part was kind of confusing, since both clauses appeared to be intended to increment i and j and update ID_doc_one and two, it would have made sense to hoist that part into a common section after the if block, but again the else {} made it hard to tell what you were actually doing.

    if (pos_one + 1 == pos_two)
    {
        intersection.push_back(make_pair(ID_doc_one,pos_two));
        ID_doc_one = v1[++i].first;
        ID_doc_two = v2[++j].first;
    }

    else {
    }   // To avoid "out of range" errors
        if (i < SIZE_VECTOR_ONE - 1)
            ID_doc_one = v1[++i].first;
        if (j < SIZE_VECTOR_TWO - 1)
            ID_doc_two = v2[++j].first;
    }

When you match both arrays, you always want to increment both i and j, it's not condition, I'm also not sure why you are using pos_two, since the phrase was actually found at pos_one?

This is how I would have written it:

#include<iostream>
#include<map>
#include<vector>
#include<string>

typedef std::string         Word_t;
typedef unsigned int        WordPosition_t;
typedef unsigned int        IDdocument_t;

typedef std::pair<IDdocument_t, WordPosition_t> DocumentPosition_t;
typedef std::vector<DocumentPosition_t> WordReferences_t;

WordReferences_t _intersect_two_words(const WordReferences_t& v1, const WordReferences_t& v2)
{
    // all the locations where the words occur one after the other.
    WordReferences_t intersection;

    auto firstIt = v1.begin();
    auto secondIt = v2.begin();
    while (firstIt != v1.end() && secondIt != v2.end())
    {
        if (firstIt->first < secondIt->first)
        {
            ++firstIt;
            continue;
        }
        // find the second word in the same document and AFTER the first word.
        if (secondIt->first < firstIt->first || secondIt->second < firstIt->second + 1)
        {
            ++secondIt;
            continue;
        }
        // first word wasn't just before the second, it's not a phrase.
        if (secondIt->second > firstIt->second + 1)
        {
            ++firstIt;
            continue;
        }
        // We found a phrase.
        intersection.emplace_back(*firstIt);
        ++firstIt;
        ++secondIt;
    }

    return intersection;
}

int main()
{
    WordReferences_t v1, v2;
    v1.push_back(std::make_pair(10, 5));
    v1.push_back(std::make_pair(10, 25));
    v1.push_back(std::make_pair(11, 10));
    v1.push_back(std::make_pair(12, 1));
    v1.push_back(std::make_pair(12, 11));
    v1.push_back(std::make_pair(12, 21));
    v1.push_back(std::make_pair(12, 31));
    v1.push_back(std::make_pair(15, 11));
    v1.push_back(std::make_pair(100, 1));
    v1.push_back(std::make_pair(100, 11));
    v1.push_back(std::make_pair(100, 21));
    v1.push_back(std::make_pair(101, 11));
    v1.push_back(std::make_pair(102, 11));
    v1.push_back(std::make_pair(102, 13));
    v1.push_back(std::make_pair(102, 14));
    v1.push_back(std::make_pair(103, 11));
    v1.push_back(std::make_pair(103, 13));

    v2.push_back(std::make_pair(10, 11));
    v2.push_back(std::make_pair(12, 10));
    v2.push_back(std::make_pair(12, 40));
    v2.push_back(std::make_pair(16, 11));
    v2.push_back(std::make_pair(100, 12)); // match
    v2.push_back(std::make_pair(101, 12)); // match
    v2.push_back(std::make_pair(101, 13));
    v2.push_back(std::make_pair(101, 14));
    v2.push_back(std::make_pair(102, 12)); //match
    v2.push_back(std::make_pair(103, 1));
    v2.push_back(std::make_pair(103, 10));
    v2.push_back(std::make_pair(103, 12)); // match
    v2.push_back(std::make_pair(103, 15));

    auto intersection = _intersect_two_words(v1, v2);
    for (auto entry : intersection)
    {
        std::cout << entry.first << ", " << entry.second << "+" << (entry.second + 1) << std::endl;
    }

    return 0;
}

Live example: http://ideone.com/XRfhAI

Inverted Index: Find a phrase in a set of documents

Tags:

c++

algorithm

intersection

inverted-index

Maria Ines Parnisari

1 Answers

kfsone

Recent Activity

Donate For Us

Inverted Index: Find a phrase in a set of documents

Tags:

c++

algorithm

intersection

inverted-index

Maria Ines Parnisari

1 Answers

kfsone

Related questions

Recent Activity

Donate For Us