Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

First and last occurrence for binary search in C

I'm trying to understand how do I modify the binary search for it work for first and last occurrences, surely I can find some code on the web but I'm trying to reach deep understanding, here is some basic non-recursive binary search I found:

int BinarySearch(int *array, int number_of_elements, int key)
{
    int low = 0, high = number_of_elements-1, mid;
    while(low <= high)
    {
            mid = (low + high)/2;
            if(array[mid] < key)
            {
                    low = mid + 1; 
            }
            else if(array[mid] == key)
            {
                    return mid;
            }
            else if(array[mid] > key)
            {
                    high = mid-1;
            }

    }
    return -1;
}

What modifications do I need to do and what are the logic behind them?

Edit: I would like for it to be efficient and not done linearly.

like image 249
Dannz Avatar asked Sep 25 '22 12:09

Dannz


2 Answers

Binary search on an array which contains a sorted set of values, where values may occur more than once does not necessarily yield the first or last element.

It yields the first matching element it finds.

Since this element could be surrounded by by more matching elements, a second step is required, in order to find the first and last matching element. This can be done with linear search as suggested by other posts or it can also be done in logarithmic time.

Let i be the index of the first found match, as reported by binary search.

Then, the start of the "sequence of equals" is in [0..i]. And the end of the "sequence of equals" is in [i..N-1] where N is the length of the sequence. Recursively bisecting those intervals until the border is found eventually yields the first and last match.

The following (f#) program shows the idea in a few lines. It should be a trivial matter to write an equivalent C-function.

let equal_range (a : int[]) i =
    let rec first i0 i1 = 
        if a.[i0] = a.[i1] || (i1-i0) < 2 then
            if a.[i0] <> a.[i1] 
            then
                i1
            else
                i0
        else
            let mid = (i1 - i0) / 2 + i0
            if a.[mid] = a.[i1] then first i0 mid else first mid i1
    let rec last i0 i1 = 
        if a.[i1] = a.[i0] || i1-i0 < 2 then 
            if a.[i0] <> a.[i1] 
            then
                i0
            else
                i1
        else
            let mid = (i1 - i0) / 2 + i0
            if a.[mid] = a.[i0] then last mid i1 else last i0 mid
    (first 0 i),(last i (Array.length a - 1))

let test_arrays = 
    [
        Array.ofList ([1..4] @ [5;5;5;5;5] @ [6..10])
        [|1|]
        [|1;1;1;1;1|]
    ]

test_arrays
|> List.iter(fun a -> 
        printfn "%A" a 
        for i = 0 to Array.length a - 1 do
            printfn "%d(a.[%d] = %d): %A" i i (a.[i]) (equal_range a i)
    )

Here the equivalent, non-recursive C- code:

#include <stdio.h>
#include <assert.h>
#include <stdbool.h>

typedef struct IndexPair_tag
{
    size_t a;
    size_t b;
} IndexPair_t;

bool equal_range(const int * a, size_t n, size_t i, IndexPair_t * result)
{
    if (NULL == a) return false;
    if (NULL == result) return false;
    if (i >= n) return false;

    size_t i0, i1, mid;

    i0 = 0;
    i1 = i;
    while (a[i0] != a[i1] && ((i1 - i0) > 1))
    {
        mid = (i1 - i0) / 2 + i0;
        if (a[mid] == a[i1])
        {
            i1 = mid;
        }
        else
        {
            i0 = mid;
        }
    }
    if (a[i0] != a[i1])
        result->a = i1;
    else
        result->a = i0;

    i0 = i;
    i1 = n - 1;
    while (a[i0] != a[i1] && ((i1 - i0) > 1))
    {
        mid = (i1 - i0) / 2 + i0;
        if (a[mid] == a[i0])
        {
            i0 = mid;
        }
        else
        {
            i1 = mid;
        }
    }
    if (a[i0] != a[i1] )
        result->b = i0;
    else
        result->b = i1;

    return true;
}

static void ShowArray(int *a, size_t N)
{
    if (N > 0)
    {
        printf("[%d", a[0]);
        for (size_t i = 1; i < N; i++)
        {
            printf(", %d", a[i]);
        }
        printf("]\n");
    }
    else
        printf("[]\n");

}

int main()
{
    {
        const size_t N = 14;
        int a[N] = { 1,2,3,4,5,5,5,5,5,6,7,8,9,10 };
        ShowArray(a, N);
        IndexPair_t result;
        for (size_t i = 0; i < N; i++)
        {
            if (equal_range(a, 14, i, &result))
            {
                printf("%d(a[%d] = %d): (%d,%d)\n", i, i, a[i], result.a, result.b);
                assert(a[result.a] == a[result.b]);
            }
            else
            {
                printf("For i = %d, equal_range() returned false.\n", i);
                assert(false);
            }
        }
    }
    {
        const size_t N = 1;
        int a[N] = { 1 };
        ShowArray(a, N);
        IndexPair_t result;
        for (size_t i = 0; i < N; i++)
        {
            if (equal_range(a, N, i, &result))
            {
                printf("%d(a[%d] = %d): (%d,%d)\n", i, i, a[i], result.a, result.b);
                assert(a[result.a] == a[result.b]);
            }
            else
            {
                printf("For i = %d, equal_range() returned false.\n", i);
                assert(false);
            }
        }
    }
    {
        const size_t N = 5;
        int a[N] = { 1,1,1,1,1 };
        ShowArray(a, N);
        IndexPair_t result;
        for (size_t i = 0; i < N; i++)
        {
            if (equal_range(a, N, i, &result))
            {
                printf("%d(a[%d] = %d): (%d,%d)\n", i, i, a[i], result.a, result.b);
                assert(a[result.a] == a[result.b]);
            }
            else
            {
                printf("For i = %d, equal_range() returned false.\n", i);
                assert(false);
            }
        }
    }

    return 0;
}

Update: Jonathan was right, the design of the function was sloppy and had some corner case issues.

  • Fixed the fact that the function cannot report argument errors.
  • Added defensive argument tests to equal_range().
  • Fixed the fact, that for edge cases, wrong results were produced.
  • Changed test driver (main) so all edge cases are covered.

The fact, that the function takes an index, not a value is okay, IMHO, as it is supposed to be the second step, after a first step which produces the index of the element looked for.

like image 152
BitTickler Avatar answered Oct 11 '22 16:10

BitTickler


Here are 4 variants of binary search, plus test code. In abstract terms, these search an ordered array, X[1..N], which may contain repeated data values, for a particular value T.

  • BinSearch_A() — find an arbitrary index P for which X[P] = T.
  • BinSearch_B() — find the minimum index P for which X[P] = T.
  • BinSearch_C() — find the maximum index P for which X[P] = Y.
  • BinSearch_D() — find the range of indexes P..Q for which X[P] = T (but X[P-1] != T) and for which X[Q] = T (but X[Q+1] != T).

BinSearch_D() is what is wanted in the question, but understanding the other three will help with understanding the final result.

The code is based on material from chapter 8 of Programming Pearls, 1st Edn. It shows BinSearch_A() — which it develops in an earlier chapter — and then modifies it to BinSearch_B() as the first step in the optimization of the search for a value in an array of exactly 1000 entries.

The code presented contains the pseudo-code for 1-based arrays, actual C code for 0-based arrays, and a test harness which validates the data and results extensively. It is 330 lines, of which 30 are blank, about nearly 100 are comments, 12 are headers, types and declarations, 90 are the 4 implementations, and 100 are test code.

Note that if you know that the values in the array are unique, then BinSearch_D() is not the correct algorithm to use (use BinSearch_A()). Similarly, if it doesn't matter which element of a set of identical elements is selected, BinSearch_D() is not the correct algorithm to use (use BinSearch_A()). The two variants BinSearch_B() and BinSearch_C() are specialized, but do less work than BinSearch_D() if you only need the minimum index or maximum index at which the searched-for value occurs.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct Pair
{
    int lo;
    int hi;
} Pair;

extern Pair BinSearch_D(int N, const int X[N], int T);
extern int BinSearch_A(int N, const int X[N], int T);
extern int BinSearch_B(int N, const int X[N], int T);
extern int BinSearch_C(int N, const int X[N], int T);

/*
** Programming Pearls, 1st Edn, 8.3 Major Surgery - Binary Search
**
** In each fragment, P contains the result (0 indicates 'not found').
**
** Search for T in X[1..N] - BinSearch-A
**
**      L := 1; U := N
**      loop
**          # Invariant: if T is in X, it is in X[L..U]
**          if L > U then
**              P := 0; break;
**          M := (L+U) div 2
**          case
**              X[M] < T:  L := M+1
**              X[M] = T:  P := M; break
**              X[M] > T:  U := M-1
**
** Search for first occurrence of T in X[1..N] - BinSearch-B
**
**      L := 0; U := N+1
**      while L+1 != U do
**          # Invariant: X[L] < T and X[U] >= T and L < U
**          M := (L+U) div 2
**          if X[M] < T then
**              L := M
**          else
**              U := M
**      # Assert: L+1 = U and X[L] < T and X[U] >= T
**      P := U
**      if P > N or X[P] != T then P := 0
**
** Search for last occurrence of T in X[1..N] - BinSearch-C
** Adapted from BinSearch-B (not in Programming Pearls)
**
**      L := 0; U := N+1
**      while L+1 != U do
**          # Invariant: X[L] <= T and X[U] > T and L < U
**          M := (L+U) div 2
**          if X[M] <= T then
**              L := M
**          else
**              U := M
**      # Assert: L+1 = U and X[L] < T and X[U] >= T
**      P := L
**      if P = 0 or X[P] != T then P := 0
**
** C implementations of these deal with X[0..N-1] instead of X[1..N],
** and return -1 when the value is not found.
*/

int BinSearch_A(int N, const int X[N], int T)
{
    int L = 0;
    int U = N-1;
    while (1)
    {
        /* Invariant: if T is in X, it is in X[L..U] */
        if (L > U)
            return -1;
        int M = (L + U) / 2;
        if (X[M] < T)
            L = M + 1;
        else if (X[M] > T)
            U = M - 1;
        else
            return M;
    }
    assert(0);
}

int BinSearch_B(int N, const int X[N], int T)
{
    int L = -1;
    int U = N;
    while (L + 1 != U)
    {
        /* Invariant: X[L] < T and X[U] >= T and L < U */
        int M = (L + U) / 2;
        if (X[M] < T)
            L = M;
        else
            U = M;
    }
    assert(L+1 == U && (L == -1 || X[L] < T) && (U >= N || X[U] >= T));
    int P = U;
    if (P >= N || X[P] != T)
        P = -1;
    return P;
}

int BinSearch_C(int N, const int X[N], int T)
{
    int L = -1;
    int U = N;
    while (L + 1 != U)
    {
        /* Invariant: X[L] <= T and X[U] > T and L < U */
        int M = (L + U) / 2;
        if (X[M] <= T)
            L = M;
        else
            U = M;
    }
    assert(L+1 == U && (L == -1 || X[L] <= T) && (U >= N || X[U] > T));
    int P = L;
    if (P < 0 || X[P] != T)
        P = -1;
    return P;
}

/*
** If value is found in the array (elements array[0]..array[a_size-1]),
** then the returned Pair identifies the lowest index at which value is
** found (in lo) and the highest value (in hi).  The lo and hi values
** will be the same if there's only one entry that matches.
**
** If the value is not found in the array, the pair (-1, -1) is returned.
**
** -- If the values in the array are unique, then this is not the binary
**    search to use.
** -- If it doesn't matter which one of multiple identical keys are
**    returned, this is not the binary search to use.
**
** ------------------------------------------------------------------------
**
** Another way to look at this is:
** -- Lower bound is largest  index lo such that a[lo] < value and a[lo+1] >= value
** -- Upper bound is smallest index hi such that a[hi] > value and a[hi-1] <= value
** -- Range is then lo+1..hi-1.
** -- If the values is not found, the value (-1, -1) is returned.
**
** Let's review:
** == Data: 2, 3, 3, 3, 5, 5, 6, 8 (N = 8)
** Searches and results:
** == 1 .. lo = -1, hi = -1  R = (-1, -1) not found
** == 2 .. lo = -1, hi =  1  R = ( 0,  0) found
** == 3 .. lo =  0, hi =  4  R = ( 1,  3) found
** == 4 .. lo = -1, hi = -1  R = (-1, -1) not found
** == 5 .. lo =  3, hi =  6  R = ( 4,  5) found
** == 6 .. lo =  5, hi =  7  R = ( 6,  6) found
** == 7 .. lo = -1, hi = -1  R = (-1, -1) not found
** == 8 .. lo =  6, hi =  8  R = ( 7,  7) found
** == 9 .. lo = -1, hi = -1  R = (-1, -1) not found
**
** Code created by combining BinSearch_B() and BinSearch_C() into a
** single function.  The two separate ranges of values to be searched
** (L_lo:L_hi vs U_lo:U_hi) are almost unavoidable as if there are
** repeats in the data, the values diverge.
*/

Pair BinSearch_D(int N, const int X[N], int T)
{
    int L_lo = -1;
    int L_hi = N;
    int U_lo = -1;
    int U_hi = N;

    while (L_lo + 1 != L_hi || U_lo + 1 != U_hi)
    {
        if (L_lo + 1 != L_hi)
        {
            /* Invariant: X[L_lo] < T and X[L_hi] >= T and L_lo < L_hi */
            int L_md = (L_lo + L_hi) / 2;
            if (X[L_md] < T)
                L_lo = L_md;
            else
                L_hi = L_md;
        }
        if (U_lo + 1 != U_hi)
        {
            /* Invariant: X[U_lo] <= T and X[U_hi] > T and U_lo < U_hi */
            int U_md = (U_lo + U_hi) / 2;
            if (X[U_md] <= T)
                U_lo = U_md;
            else
                U_hi = U_md;
        }
    }

    assert(L_lo+1 == L_hi && (L_lo == -1 || X[L_lo] < T) && (L_hi >= N || X[L_hi] >= T));
    int L = L_hi;
    if (L >= N || X[L] != T)
        L = -1;
    assert(U_lo+1 == U_hi && (U_lo == -1 || X[U_lo] <= T) && (U_hi >= N || X[U_hi] > T));
    int U = U_lo;
    if (U < 0 || X[U] != T)
        U = -1;

    return (Pair) { .lo = L, .hi = U };
}

/* Test support code */

static void check_sorted(const char *a_name, int size, const int array[size])
{
    int ok = 1;
    for (int i = 1; i < size; i++)
    {
        if (array[i-1] > array[i])
        {
            fprintf(stderr, "Out of order: %s[%d] = %d, %s[%d] = %d\n",
                    a_name, i-1, array[i-1], a_name, i, array[i]);
            ok = 0;
        }
    }
    if (!ok)
        exit(1);
}

static void dump_array(const char *a_name, int size, const int array[size])
{
    printf("%s Data: ", a_name);
    for (int i = 0; i < size; i++)
        printf(" %2d", array[i]);
    putchar('\n');
}

/* This interface could be used instead of the Pair returned by BinSearch_D() */
static void linear_search(int size, const int array[size], int value, int *lo, int *hi)
{
    *lo = *hi = -1;
    for (int i = 0; i < size; i++)
    {
        if (array[i] == value)
        {
            if (*lo == -1)
                *lo = i;
            *hi = i;
        }
        else if (array[i] > value)
            break;
    }
}

static void test_abc_search(const char *a_name, int size, const int array[size])
{
    dump_array(a_name, size, array);
    check_sorted(a_name, size, array);

    for (int i = array[0] - 1; i < array[size - 1] + 2; i++)
    {
        int a_idx = BinSearch_A(size, array, i);
        int b_idx = BinSearch_B(size, array, i);
        int c_idx = BinSearch_C(size, array, i);
        printf("T %2d:  A %2d,  B %2d,  C %2d\n", i, a_idx, b_idx, c_idx);
        assert(a_idx != -1 || (b_idx == -1 && c_idx == -1));
        assert(b_idx != -1 || (c_idx == -1 && a_idx == -1));
        assert(c_idx != -1 || (a_idx == -1 && b_idx == -1));
        assert(a_idx == -1 || (array[a_idx] == i && array[b_idx] == i && array[c_idx] == i));
        assert(a_idx == -1 || (a_idx >= b_idx && a_idx <= c_idx));
        int lo;
        int hi;
        linear_search(size, array, i, &lo, &hi);
        assert(lo == b_idx && hi == c_idx);
    }
}

static void test_d_search(const char *a_name, int size, const int array[size], const Pair results[])
{
    dump_array(a_name, size, array);
    check_sorted(a_name, size, array);

    for (int i = array[0] - 1, j = 0; i < array[size - 1] + 2; i++, j++)
    {
        Pair result = BinSearch_D(size, array, i);
        printf("%2d: (%d, %d) vs (%d, %d)\n", i, result.lo, result.hi, results[j].lo, results[j].hi);
        int lo;
        int hi;
        linear_search(size, array, i, &lo, &hi);
        assert(lo == result.lo && hi == result.hi);
    }
}

int main(void)
{
    int A[] = { 1, 2, 2, 4, 5, 5, 5, 5, 5, 6, 8, 8, 9, 10 };
    enum { A_SIZE = sizeof(A) / sizeof(A[0]) };

    test_abc_search("A", A_SIZE, A);

    int B[] = { 10, 12, 12, 12, 14, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 18, 18, 18, 19, 19, 19, 19, };
    enum { B_SIZE = sizeof(B) / sizeof(B[0]) };

    test_abc_search("B", B_SIZE, B);

    int C[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, };
    enum { C_SIZE = sizeof(C) / sizeof(C[0]) };

    test_abc_search("C", C_SIZE, C);

    /* Results for BinSearch_D() on array A */
    static const Pair results_A[] =
    {
        { -1, -1 }, {  0,  0 }, {  1,  2 },
        { -1, -1 }, {  3,  3 }, {  4,  8 },
        {  9,  9 }, { -1, -1 }, { 10, 11 },
        { 12, 12 }, { 13, 13 }, { -1, -1 },
    };

    test_d_search("A", A_SIZE, A, results_A);

    int D[] = { 2, 3, 3, 3, 5, 5, 6, 8 };
    enum { D_SIZE = sizeof(D) / sizeof(D[0]) };
    static const Pair results_D[] =
    {
        { -1, -1 }, {  0,  0 }, {  1,  3 },
        { -1, -1 }, {  4,  5 }, {  6,  6 },
        { -1, -1 }, {  7,  7 }, { -1, -1 },
    };

    test_d_search("D", D_SIZE, D, results_D);

    return 0;
}

Example output:

A Data:   1  2  2  4  5  5  5  5  5  6  8  8  9 10
T  0:  A -1,  B -1,  C -1
T  1:  A  0,  B  0,  C  0
T  2:  A  2,  B  1,  C  2
T  3:  A -1,  B -1,  C -1
T  4:  A  3,  B  3,  C  3
T  5:  A  6,  B  4,  C  8
T  6:  A  9,  B  9,  C  9
T  7:  A -1,  B -1,  C -1
T  8:  A 10,  B 10,  C 11
T  9:  A 12,  B 12,  C 12
T 10:  A 13,  B 13,  C 13
T 11:  A -1,  B -1,  C -1
B Data:  10 12 12 12 14 15 15 15 15 15 15 15 16 16 16 18 18 18 19 19 19 19
T  9:  A -1,  B -1,  C -1
T 10:  A  0,  B  0,  C  0
T 11:  A -1,  B -1,  C -1
T 12:  A  1,  B  1,  C  3
T 13:  A -1,  B -1,  C -1
T 14:  A  4,  B  4,  C  4
T 15:  A 10,  B  5,  C 11
T 16:  A 13,  B 12,  C 14
T 17:  A -1,  B -1,  C -1
T 18:  A 16,  B 15,  C 17
T 19:  A 19,  B 18,  C 21
T 20:  A -1,  B -1,  C -1
C Data:   1  2  3  4  5  6  7  8  9 10 11 12
T  0:  A -1,  B -1,  C -1
T  1:  A  0,  B  0,  C  0
T  2:  A  1,  B  1,  C  1
T  3:  A  2,  B  2,  C  2
T  4:  A  3,  B  3,  C  3
T  5:  A  4,  B  4,  C  4
T  6:  A  5,  B  5,  C  5
T  7:  A  6,  B  6,  C  6
T  8:  A  7,  B  7,  C  7
T  9:  A  8,  B  8,  C  8
T 10:  A  9,  B  9,  C  9
T 11:  A 10,  B 10,  C 10
T 12:  A 11,  B 11,  C 11
T 13:  A -1,  B -1,  C -1
A Data:   1  2  2  4  5  5  5  5  5  6  8  8  9 10
 0: (-1, -1) vs (-1, -1)
 1: (0, 0) vs (0, 0)
 2: (1, 2) vs (1, 2)
 3: (-1, -1) vs (-1, -1)
 4: (3, 3) vs (3, 3)
 5: (4, 8) vs (4, 8)
 6: (9, 9) vs (9, 9)
 7: (-1, -1) vs (-1, -1)
 8: (10, 11) vs (10, 11)
 9: (12, 12) vs (12, 12)
10: (13, 13) vs (13, 13)
11: (-1, -1) vs (-1, -1)
D Data:   2  3  3  3  5  5  6  8
 1: (-1, -1) vs (-1, -1)
 2: (0, 0) vs (0, 0)
 3: (1, 3) vs (1, 3)
 4: (-1, -1) vs (-1, -1)
 5: (4, 5) vs (4, 5)
 6: (6, 6) vs (6, 6)
 7: (-1, -1) vs (-1, -1)
 8: (7, 7) vs (7, 7)
 9: (-1, -1) vs (-1, -1)
like image 35
Jonathan Leffler Avatar answered Oct 11 '22 14:10

Jonathan Leffler