Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Count occurences in byte list/array using another byte list/array

I am trying to get a count of all the times a byte sequences occurs in another byte sequences. It cannot however re-use a bytes if it already counted them. For example given the string
k.k.k.k.k.k. let's assume the byte sequence was k.k it would then find only 3 occurrences rather than 5 because they would be broke down like: [k.k].[k.k].[k.k]. and not like [k.[k].[k].[k].[k].k] where they over lap and essentially just shift 2 to the right.

Ideally the idea is to get an idea how a compression dictionary or run time encoding might look. so the goal would be to get

k.k.k.k.k.k. down to just 2 parts, as (k.k.k.) is the biggest and best symbol you can have.

Here is source so far:

using System;
using System.Collections.Generic;
using System.Collections;
using System.Linq;
using System.Text;
using System.IO;


    static class Compression 
    {
        static int Main(string[] args)
        {

            List<byte> bytes = File.ReadAllBytes("ok.txt").ToList();
            List<List<int>> list = new List<List<int>>();

            // Starting Numbers of bytes - This can be changed manually.
            int StartingNumBytes = bytes.Count;
            for (int i = StartingNumBytes; i > 0; i--)
            {
                Console.WriteLine("i: " + i);

                for (int ii = 0; ii < bytes.Count - i; ii++)
                {
                    Console.WriteLine("ii: " + i);
                    // New pattern comes with refresh data.
                    List<byte> pattern = new List<byte>();

                    for (int iii = 0; iii < i; iii++)
                    {
                        pattern.Add(bytes[ii + iii]);
                    }



                    DisplayBinary(bytes, "red");
                    DisplayBinary(pattern, "green");

                    int matches = 0;
                   // foreach (var position in bytes.ToArray().Locate(pattern.ToArray()))
                    for (int position = 0; position < bytes.Count; position++) {
                        if (pattern.Count > (bytes.Count - position))
                        {
                            continue;
                        }


                        for (int iiii = 0; iiii < pattern.Count; iiii++)
                        {
                            if (bytes[position + iiii] != pattern[iiii])
                            {
                                //Have to use goto because C# doesn't support continue <level>
                                goto outer;
                            }

                        }

                        // If it made it this far, it has found a match.
                        matches++;
                        Console.WriteLine("Matches: " + matches + " Orig Count: " + bytes.Count + " POS: " + position);
                        if (matches > 1)
                        {
                            int numBytesToRemove = pattern.Count;
                            for (int ra = 0; ra < numBytesToRemove; ra++)
                            {
                                // Remove it at the position it was found at, once it
                                // deletes the first one, the list will shift left and you'll need to be here again.
                                bytes.RemoveAt(position);
                            }
                            DisplayBinary(bytes, "red");
                            Console.WriteLine(pattern.Count + " Bytes removed.");

                            // Since you deleted some bytes, set the position less because you will need to redo the pos.
                            position = position - 1;
                        }


                        outer:
                            continue;
                    }

                    List<int> sublist = new List<int>();
                    sublist.Add(matches);
                    sublist.Add(pattern.Count);
                    // Some sort of calculation to determine how good the symbol was
                    sublist.Add(bytes.Count-((matches * pattern.Count)-matches));
                    list.Add(sublist);

                }

            }



            Display(list);
            Console.Read();
            return 0;
        }


        static void DisplayBinary(List<byte> bytes, string color="white")
        {
            switch(color){
                case "green":
                    Console.ForegroundColor = ConsoleColor.Green;
                    break;
                case "red":
                    Console.ForegroundColor = ConsoleColor.Red;
                    break;
                default:
                    break;
            }


            for (int i=0; i<bytes.Count; i++)
            {
                if (i % 8 ==0)
                    Console.WriteLine();
                Console.Write(GetIntBinaryString(bytes[i]) + " ");
            }
            Console.WriteLine();
            Console.ResetColor();
        }
        static string GetIntBinaryString(int n)
        {
            char[] b = new char[8];
            int pos = 7;
            int i = 0;

            while (i < 8)
            {
                if ((n & (1 << i)) != 0)
                {
                    b[pos] = '1';
                }
                else
                {
                    b[pos] = '0';
                }
                pos--;
                i++;
            }
            //return new string(b).TrimStart('0');
            return new string(b);
        }

        static void Display(List<List<int>> list)
        {
            //
            // Display everything in the List.
            //
            Console.WriteLine("Elements:");
            foreach (var sublist in list)
            {
                foreach (var value in sublist)
                {
                    Console.Write("{0,4}", value);

                }
                Console.WriteLine();
            }

            //
            // Display total count.
            //
            int count = 0;
            foreach (var sublist in list)
            {
                count += sublist.Count;
            }
            Console.WriteLine("Count:");
            Console.WriteLine(count);
        }

        static public int SearchBytePattern(byte[] pattern, byte[] bytes)
        {
            int matches = 0;
            // precomputing this shaves some seconds from the loop execution
            int maxloop = bytes.Length - pattern.Length;
            for (int i = 0; i < maxloop; i++)
            {
                if (pattern[0] == bytes[i])
                {
                    bool ismatch = true;
                    for (int j = 1; j < pattern.Length; j++)
                    {
                        if (bytes[i + j] != pattern[j])
                        {
                            ismatch = false;
                            break;
                        }
                    }
                    if (ismatch)
                    {
                        matches++;
                        i += pattern.Length - 1;
                    }
                }
            }
            return matches;
        }
    }

Refer to the post to get the non binary of the file should be, here is the binary data: 011010110010111001101011001011100110101100101110011010110010111001101011001011100110101100101110 I am hope to have it smaller than how it started.

like image 705
ParoX Avatar asked Jun 20 '11 00:06

ParoX


3 Answers

private static int CountOccurences(byte[] target, byte[] pattern)
{
    var targetString = BitConverter.ToString(target);
    var patternString = BitConverter.ToString(pattern);
    return new Regex(patternString).Matches(targetString).Count;
}
like image 140
Yuriy Rozhovetskiy Avatar answered Oct 21 '22 21:10

Yuriy Rozhovetskiy


With this solution you'd have access to the individual indexes that matched (while enumerating) or you could call Count() on the result to see how many matches there were:

public static IEnumerable<int> Find<T>(T[] pattern, T[] sequence, bool overlap)
{
    int i = 0;
    while (i < sequence.Length - pattern.Length + 1)
    {
        if (pattern.SequenceEqual(sequence.Skip(i).Take(pattern.Length)))
        {
            yield return i;
            i += overlap ? 1 : pattern.Length;
        }
        else
        {
            i++;
        }
    }
}

Call it with overlap: false to solve your problem or overlap: true to see the overlapped matches (if you're interested.)

I have a couple of other methods with slightly different API (along with better performance) here, including one that work directly on streams of bytes.

like image 25
Jono Avatar answered Oct 21 '22 21:10

Jono


quick and dirty with no regex. although i'm not sure if it answers the intent of the question, it should be relatively fast. i think i am going to run some timing tests against regex to see for sure the relative speeds:

    private int CountOccurrences(string TestString, string TestPattern)
    {
        int PatternCount = 0;
        int SearchIndex = 0;

        if (TestPattern.Length == 0)
            throw new ApplicationException("CountOccurrences: Unable to process because TestPattern has zero length.");

        if (TestString.Length == 0)
            return 0;

        do
        {
            SearchIndex = TestString.IndexOf(TestPattern, SearchIndex);

            if (SearchIndex >= 0)
            {
                ++PatternCount;
                SearchIndex += TestPattern.Length;
            }
        }
        while ((SearchIndex >= 0) && (SearchIndex < TestString.Length));

        return PatternCount;
    }

    private void btnTest_Click(object sender, EventArgs e)
    {
        string TestString1 = "k.k.k.k.k.k.k.k.k.k.k.k";
        string TestPattern1 = "k.k";

        System.Console.WriteLine(CountOccurrences(TestString1, TestPattern1).ToString()); // outputs 6
        System.Console.WriteLine(CountOccurrences(TestString1 + ".k", TestPattern1).ToString()); // still 6
        System.Console.WriteLine(CountOccurrences(TestString1, TestPattern1 + ".").ToString()); // only 5
    }
like image 1
shelleybutterfly Avatar answered Oct 21 '22 20:10

shelleybutterfly