Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Most efficient way to find the count of matches a string has against an array of words?

Tags:

java

let's say I have a string

String test = "This is a test string and I have some stopwords in here";

and I want to see how many times the words in the array below match against my string

psudocode

array = a,and,the,them,they,I

so the answer would be "3"

just curious what the most efficient way to do that in java is?

like image 201
James Avatar asked Jul 09 '10 00:07

James


2 Answers

Something like this? Not sure about 'most efficient', but simple enough.

Set<String> s1 = new HashSet<String>(Arrays.asList("This is a test string and I have some stopwords in here".split("\\s")));
Set<String> s2 = new HashSet<String>(Arrays.asList("a", "and", "the", "them", "they", "I"));
s1.retainAll(s2);
System.out.println(s1.size());

Just intersection of two sets of words.

like image 185
Nikita Rybak Avatar answered Sep 22 '22 01:09

Nikita Rybak


I'd probably store the words in the input into a HashSet and then iterate over the array and see if each word in the array is .contains in the set.

Here it is in code... the input is "Around the world in 80 days".

import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Scanner;
import java.util.Set;

public class Main
{
    public static void main(final String[] argv)
        throws FileNotFoundException
    {
        final File     file;
        final String[] wordsToFind;

        file        = new File(argv[0]);
        wordsToFind = getWordsToFind(file);
        a(file, wordsToFind);
        b(file, wordsToFind);
        c(file, wordsToFind);
        d(file, wordsToFind);
    }

    // this just reads the file into the disk cache
    private static String[] getWordsToFind(final File file)
        throws FileNotFoundException
    {
        final Scanner     scanner;
        final Set<String> words;

        scanner = new Scanner(file);
        words   = new HashSet<String>();

        while(scanner.hasNext())
        {
            final String word;

            word = scanner.next();
            words.add(word);
        }

        return (words.toArray(new String[words.size()]));
    }

    // bad way, read intpo a list and then iterate over the list until you find a match
    private static void a(final File     file,
                          final String[] wordsToFind)
        throws FileNotFoundException
    {
        final long start;
        final long end;
        final long total;
        final Scanner      scanner;
        final List<String> words;
        int                matches;

        scanner = new Scanner(file);
        words   = new ArrayList<String>();

        while(scanner.hasNext())
        {
            final String word;

            word = scanner.next();
            words.add(word);
        }

        start = System.nanoTime();

        {
            matches = 0;

            for(final String wordToFind : wordsToFind)
            {
                for(final String word : words)
                {
                    if(word.equals(wordToFind))
                    {
                        matches++;
                        break;
                    }
                }
            }

            System.out.println(matches);
        }

        end   = System.nanoTime();
        total = end - start;
        System.out.println("a: " + total);
    }

    // slightly better way, read intpo a list and then iterate over the set (which reduces the number of things you progbably
    // have to read until you find a match), until you find a match
    private static void b(final File     file,
                          final String[] wordsToFind)
        throws FileNotFoundException
    {
        final long start;
        final long end;
        final long total;
        final Scanner     scanner;
        final Set<String> words;
        int               matches;

        scanner = new Scanner(file);
        words   = new HashSet<String>();

        while(scanner.hasNext())
        {
            final String word;

            word = scanner.next();
            words.add(word);
        }

        start = System.nanoTime();

        {
            matches = 0;

            for(final String wordToFind : wordsToFind)
            {
                for(final String word : words)
                {
                    if(word.equals(wordToFind))
                    {
                        matches++;
                        break;
                    }
                }
            }

            System.out.println(matches);
        }

        end   = System.nanoTime();
        total = end - start;
        System.out.println("b: " + total);
    }

    // my way
    private static void c(final File     file,
                          final String[] wordsToFind)
        throws FileNotFoundException
    {
        final long start;
        final long end;
        final long total;
        final Scanner     scanner;
        final Set<String> words;
        int               matches;

        scanner = new Scanner(file);
        words   = new HashSet<String>();

        while(scanner.hasNext())
        {
            final String word;

            word = scanner.next();
            words.add(word);
        }

        start = System.nanoTime();

        {
            matches = 0;

            for(final String wordToFind : wordsToFind)
            {
                if(words.contains(wordToFind))
                {
                    matches++;
                }
            }

            System.out.println(matches);
        }

        end   = System.nanoTime();
        total = end - start;
        System.out.println("c: " + total);
    }

    // Nikita Rybak way
    private static void d(final File     file,
                          final String[] wordsToFind)
        throws FileNotFoundException
    {
        final long start;
        final long end;
        final long total;
        final Scanner     scanner;
        final Set<String> words;
        int               matches;

        scanner = new Scanner(file);
        words   = new HashSet<String>();

        while(scanner.hasNext())
        {
            final String word;

            word = scanner.next();
            words.add(word);
        }

        start = System.nanoTime();

        {
            words.retainAll(new HashSet<String>(Arrays.asList(wordsToFind)));
            matches = words.size();
            System.out.println(matches);
        }

        end   = System.nanoTime();
        total = end - start;
        System.out.println("d: " + total);
    }
}

results (after a few runs, each run is pretty much the same though):

12596
a: 2440699000
12596
b: 2531635000
12596
c: 4507000
12596
d: 5597000

If you modify it by adding "XXX" to each of the words in getWordsToFind (so no words are found) you get:

0
a: 7415291000
0
b: 4688973000
0
c: 2849000
0
d: 7981000

And, for completeness, I tried it just searching for the word "I", and the results are:

1
a: 235000
1
b: 351000
1
c: 75000
1
d: 10725000
like image 27
TofuBeer Avatar answered Sep 24 '22 01:09

TofuBeer