Quickselect with array of structs has non-linear running time

Tags: c, optimization

Updated, original question below the line:

I need to compute a median, and would like to use the O(N) quickselect algorithm. It turns out, however, that when the array is no longer a flat array of doubles but rather an array of structs (one member of which supplies the value for the median computation), the run time no longer scales as O(N).

The following flat array version has approximately linear runtime:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;

double quickselect(unsigned long k, unsigned long n, double *arr)
{
  unsigned long i, ir, j, l, mid;
  double a, temp;

  l=0;
  ir=n-1;
  for (;;) {
    if (ir <= l+1) {
      if (ir == l+1 && arr[ir] < arr[l]) {
        SWAP(arr[l],arr[ir])
      }
      return arr[k];
    } else {
      mid=(l+ir) >> 1;
      SWAP(arr[mid],arr[l+1])
      if (arr[l] > arr[ir]) {
        SWAP(arr[l],arr[ir])
      }
      if (arr[l+1] > arr[ir]) {
        SWAP(arr[l+1],arr[ir])
      }
      if (arr[l] > arr[l+1]) {
        SWAP(arr[l],arr[l+1])
      }
      i=l+1;
      j=ir;
      a=arr[l+1];
      for (;;) {
        do i++; while (arr[i] < a);
        do j--; while (arr[j] > a);
        if (j < i) break;
        SWAP(arr[i],arr[j])
      }
      arr[l+1]=arr[j];
      arr[j]=a;
      if (j >= k) ir=j-1;
      if (j <= k) l=i;
    }
  }
}

int main()
{
  unsigned long i, j, k;
  unsigned long ntest = 1e2;
  unsigned long N[5] = {1e3, 1e4, 1e5, 1e6, 1e7};
  clock_t start, diff;

  int seed = 215342512; //time(NULL);
  srand(seed);

  double *arr = (double*) malloc(N[4] * sizeof(double));

  for (i=0; i<5; i++)
  {
    start = clock();

    for (j=0; j<ntest; j++)
    {
      for (k=0; k<N[i]; k++)
      {
        arr[k] = (double) rand() / (double) RAND_MAX;
      }

      quickselect(N[i] / 2, N[i], arr);
    }

    diff = clock() - start;

    printf("%lu %.5f\n", N[i], (double) diff / CLOCKS_PER_SEC);
  }
}

Gives:

1000 0.00228
10000 0.02014
100000 0.19868
1000000 2.01272
10000000 20.41286

However, the following version with structs has non-linear runtime:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;

typedef struct {
  double x;
  double y;
  double z;
  int id;
} point_t;

point_t* quickselect(unsigned long k, unsigned long n, point_t **arr)
{
  unsigned long i, ir, j, l, mid;
  point_t *a, *temp;

  l=0;
  ir=n-1;
  for (;;) {
    if (ir <= l+1) {
      if (ir == l+1 && arr[ir]->x < arr[l]->x) {
        SWAP(arr[l],arr[ir])
      }
      return arr[k];
    } else {
      mid=(l+ir) >> 1;
      SWAP(arr[mid],arr[l+1])
      if (arr[l]->x > arr[ir]->x) {
        SWAP(arr[l],arr[ir])
      }
      if (arr[l+1]->x > arr[ir]->x) {
        SWAP(arr[l+1],arr[ir])
      }
      if (arr[l]->x > arr[l+1]->x) {
        SWAP(arr[l],arr[l+1])
      }
      i=l+1;
      j=ir;
      a=arr[l+1];
      for (;;) {
        do i++; while (arr[i]->x < a->x);
        do j--; while (arr[j]->x > a->x);
        if (j < i) break;
        SWAP(arr[i],arr[j])
      }
      arr[l+1]=arr[j];
      arr[j]=a;
      if (j >= k) ir=j-1;
      if (j <= k) l=i;
    }
  }
}

int main()
{
  unsigned long i, j, k;
  unsigned long ntest = 1e2;
  unsigned long N[5] = {1e3, 1e4, 1e5, 1e6, 1e7};
  clock_t start, diff;

  int seed = 215342512; //time(NULL);
  srand(seed);

  point_t **ap, *a;

  ap = (point_t**) malloc(N[4] * sizeof(point_t*));

  if (ap == NULL) printf("Error in ap\n");

  a = (point_t*) malloc(N[4] * sizeof(point_t));

  if (a == NULL) printf("Error in a\n");

  for (i=0; i<N[4]; i++)
  {
    ap[i] = a+i;
  }

  for (i=0; i<5; i++)
  {
    start = clock();

    for (j=0; j<ntest; j++)
    {
      for (k=0; k<N[i]; k++)
      {
        ap[k]->x = (double) rand() / (double) RAND_MAX;
      }

      quickselect(N[i] / 2, N[i], ap);
    }

    diff = clock() - start;

    printf("%lu %.5f\n", N[i], (double) diff / CLOCKS_PER_SEC);
  }
}

Gives:

1000 0.00224
10000 0.02587
100000 0.37574
1000000 7.18962
10000000 96.34863

Both versions were compiled with gcc -O2 (but -O0 gives the same scaling).

Where does this change in scaling come from and how can it be fixed?

Note that while I can change the struct, I cannot simply take the median of the key values on their own, because I need the other members corresponding to the median point as well. Additionally, I need the quickselect partitioning behaviour for the resulting array (i.e. a.y <= m.y for all a left of m and b.y > m.y for all b right of m).
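To make that property concrete, here is a small sanity check against the updated pointer-array version above (check_partition is a hypothetical helper, not part of the program; with duplicate keys the right-hand side is >= rather than strictly >):

#include <assert.h>

/* Hypothetical helper: after quickselect(k, n, arr) has run, the element
   at index k is the k-th smallest by the .x key, everything left of it
   compares <= and everything right of it compares >=. */
void check_partition(point_t **arr, unsigned long k, unsigned long n)
{
  unsigned long i;
  for (i = 0; i < k; i++) assert(arr[i]->x <= arr[k]->x);
  for (i = k + 1; i < n; i++) assert(arr[i]->x >= arr[k]->x);
}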


I need to compute a median, and would like to use the O(N) quickselect algorithm. It turns out, however, that when the array is no longer a flat array of doubles but rather an array of structs (one member of which supplies the value for the median computation), the run time no longer scales as O(N).

I use the following implementation:

#define SWAP(a,b) temp=(a); (a)=(b); (b)=temp;

typedef struct point_t point_t;

struct point_t {
  double y;

//  unsigned long something;
//
//  double *something_else;
//
//  double yet_another_thing;
//
//  point_t* again_something;
};

point_t* median(point_t *arr, unsigned long n)
{
  unsigned long k = n / 2;
  unsigned long i, ir, j, l, mid;
  point_t a, temp;

  l=0;
  ir=n-1;
  for (;;)
  {
    if (ir <= l+1)
    {
      if (ir == l+1 && arr[ir].y < arr[l].y)
      {
        SWAP(arr[l], arr[ir])
      }
      return arr + k;
    }
    else
    {
      mid = (l + ir) >> 1;
      SWAP(arr[mid], arr[l+1])
      if (arr[l].y > arr[ir].y)
      {
        SWAP(arr[l], arr[ir])
      }
      if (arr[l+1].y > arr[ir].y)
      {
        SWAP(arr[l+1], arr[ir])
      }
      if (arr[l].y > arr[l+1].y)
      {
        SWAP(arr[l], arr[l+1])
      }
      i = l+1;
      j = ir;
      a = arr[l+1];
      for (;;)
      {
        do i++; while (arr[i].y < a.y);
        do j--; while (arr[j].y > a.y);
        if (j < i) break;
        SWAP(arr[i],  arr[j])
      }
      arr[l+1] = arr[j];
      arr[j] = a;
      if (j >= k) ir = j-1;
      if (j <= k) l = i;
    }
  }
}

With -O2 the struct is optimized away (I think; at least the scaling looks the same as with a plain array) and the scaling is linear. However, when the other members of the struct are uncommented, the scaling is no longer linear. How can this be, and how can it be fixed?

Note that while I can change the struct, I cannot simply take the median of the key values on their own, because I need the other members corresponding to the median point as well. Additionally, I need the quickselect partitioning behaviour for the resulting array (i.e. a.y <= m.y for all a left of m and b.y > m.y for all b right of m).

Asked Dec 12 '11 by Pim Schellart

2 Answers

I think memory cache misses explain the non-linear growth of the execution time. On my x86_64 PC (Linux + gcc), sizeof(double) is 8 and sizeof(point_t) is 32, so fewer elements fit in the cache. But a bigger reason for the non-linear growth is that the accesses to the point_t structures through the pointer array in your code quickly become highly randomized: the swaps rearrange the pointers but not the structs they point to, so after a few partitioning passes neighbouring array slots point to far-apart structs, and more and more cache misses occur because of this...

I changed the code as follows:

--- test2.c
+++ test3.c
@@ -80,14 +80,12 @@
     if (a == NULL)
         printf("Error in a\n");

-    for (i = 0; i < N[4]; i++) {
-        ap[i] = a + i;
-    }
- 
     for (i = 0; i < 5; i++) {
         start = clock();

         for (j = 0; j < ntest; j++) {
            for (k = 0; k < N[i]; k++) {
+                ap[k] = a + k;
                 ap[k]->x = (double) rand() / (double) RAND_MAX;
             }

and the growth of the execution time is more nearly linear.
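In plain form, the patch just restores the pointers to consecutive order before each test run (a sketch using the question's variable names):

for (j = 0; j < ntest; j++)
{
  for (k = 0; k < N[i]; k++)
  {
    ap[k] = a + k;  /* undo the shuffling left behind by the previous run */
    ap[k]->x = (double) rand() / (double) RAND_MAX;
  }

  quickselect(N[i] / 2, N[i], ap);
}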

Original quickselect() with double array:

    1000 0.00000
   10000 0.04000
  100000 0.22000
 1000000 1.98000
10000000 20.73000

Original quickselect() with point_t * array:

    1000 0.01000
   10000 0.02000
  100000 0.71000
 1000000 8.64000
10000000 157.77000

Exactly the same quickselect() with point_t * array as above, but making sure the pointers in the array are in consecutive order before calling quickselect(), by applying the above patch:

    1000 0.00000
   10000 0.02000
  100000 0.40000
 1000000 4.71000
10000000 49.80000

Note that even though the modified version does the extra work of re-initializing the pointer array inside the timing loop, it is still faster.

I am running a 3.2 GHz Pentium(R) Dual-Core CPU E6700, 64-bit Linux, gcc 4.6, optimization -O2. (My machine is not idle, so my benchmark figures fluctuate somewhat. Also, for more serious benchmarks on a Linux system I would use clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) for better accuracy, since it excludes the time when the kernel has scheduled the benchmarked process out.)
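For instance, a minimal sketch of such a timer (POSIX clock_gettime; on older glibc it needs linking with -lrt):

#include <time.h>

/* CPU time consumed by this process so far, in seconds. */
double cpu_seconds(void)
{
  struct timespec ts;
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
  return (double) ts.tv_sec + (double) ts.tv_nsec / 1e9;
}

Timing a run is then a matter of taking the difference of two cpu_seconds() calls around the benchmarked code.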

UPDATE: for example, valgrind (if supported by your platform) can be used to analyse the impact of the cache misses. I modified the program to take two arguments: the array size (corresponding to an element of the array N[]) and the test count (corresponding to ntest). Execution times without valgrind, where test2 is essentially the unmodified program listed in the question and test4 is the modified version which rearranges the ap[] array before calling the quickselect() function:

bash$ ./test2 10000000 100
Array of 10000000 elements, 100 times: 154.40000 seconds
bash$ ./test4 10000000 100
Array of 10000000 elements, 100 times: 48.45000 seconds

Here's the result of running valgrind using cachegrind tool:

bash$ valgrind --tool=cachegrind ./test2 10000000 100
==23563== Cachegrind, a cache and branch-prediction profiler
==23563== Copyright (C) 2002-2010, and GNU GPL'd, by Nicholas Nethercote et al.
==23563== Using Valgrind-3.6.1-Debian and LibVEX; rerun with -h for copyright info
==23563== Command: ./test2 10000000 100
==23563== 
Array of 10000000 elements, 100 times: 1190.24000 seconds
==23563== 
==23563== I   refs:      80,075,384,594
==23563== I1  misses:             1,091
==23563== LLi misses:             1,087
==23563== I1  miss rate:           0.00%
==23563== LLi miss rate:           0.00%
==23563== 
==23563== D   refs:      36,670,292,139  (25,550,518,762 rd   + 11,119,773,377 wr)
==23563== D1  misses:     4,218,722,190  ( 3,223,975,942 rd   +    994,746,248 wr)
==23563== LLd misses:     4,190,889,241  ( 3,198,934,125 rd   +    991,955,116 wr)
==23563== D1  miss rate:           11.5% (          12.6%     +            8.9%  )
==23563== LLd miss rate:           11.4% (          12.5%     +            8.9%  )
==23563== 
==23563== LL refs:        4,218,723,281  ( 3,223,977,033 rd   +    994,746,248 wr)
==23563== LL misses:      4,190,890,328  ( 3,198,935,212 rd   +    991,955,116 wr)
==23563== LL miss rate:             3.5% (           3.0%     +            8.9%  )

and

bash$ valgrind --tool=cachegrind ./test4 10000000 100
==24436== Cachegrind, a cache and branch-prediction profiler
==24436== Copyright (C) 2002-2010, and GNU GPL'd, by Nicholas Nethercote et al.
==24436== Using Valgrind-3.6.1-Debian and LibVEX; rerun with -h for copyright info
==24436== Command: ./test4 10000000 100
==24436== 
Array of 10000000 elements, 100 times: 862.89000 seconds
==24436== 
==24436== I   refs:      82,985,384,485
==24436== I1  misses:             1,093
==24436== LLi misses:             1,089
==24436== I1  miss rate:           0.00%
==24436== LLi miss rate:           0.00%
==24436== 
==24436== D   refs:      36,640,292,192  (24,530,518,829 rd   + 12,109,773,363 wr)
==24436== D1  misses:     2,814,232,350  ( 2,189,229,679 rd   +    625,002,671 wr)
==24436== LLd misses:     2,796,287,872  ( 2,171,294,250 rd   +    624,993,622 wr)
==24436== D1  miss rate:            7.6% (           8.9%     +            5.1%  )
==24436== LLd miss rate:            7.6% (           8.8%     +            5.1%  )
==24436== 
==24436== LL refs:        2,814,233,443  ( 2,189,230,772 rd   +    625,002,671 wr)
==24436== LL misses:      2,796,288,961  ( 2,171,295,339 rd   +    624,993,622 wr)
==24436== LL miss rate:             2.3% (           2.0%     +            5.1%  )

See the Valgrind manual for how to read these statistics. An essential point is: "On a modern machine, an L1 miss will typically cost around 10 cycles, an LL miss can cost as much as 200 cycles". Calculating the LLd (last-level data cache) miss cost difference between the two cases (the difference in miss counts, times the assumed 200 cycles each, divided by 3.2e9 cycles/second for the 3.2 GHz CPU) gives

bash$ echo $(( (4190889241 - 2796287872) * 200 / 3200000000 )) seconds
87 seconds

The D1 misses contribute comparatively little here (a total of about 91 seconds if the D1 miss cost is counted on top of the LLd miss cost); with all our inaccuracies (most notably about the actual cost of an LLd miss on this machine), the D1 misses can simply be ignored.

The run-time difference between test2 and test4 comes to about 106 seconds, which is reasonably close to the 87 seconds estimated above. This could all be made more accurate, but it already seems to demonstrate the effect of cache misses in the test arrangement.

P.S. valgrind writes a log file, in which I could verify that it detected the L1 and L2 cache sizes and types correctly.

Answered Oct 21 '22 by FooF


It looks like you're sorting and swapping the actual structs, which means you're moving around a lot more data. You should try working with an array of pointers to the structs instead, and swapping those.

So your median function would have a prototype of:

point_t* median(point_t **arr, unsigned long n);

You'd then have to allocate the structs and put their pointers in the array before calling it (and free them afterwards), rather than populating the array's elements with the structs directly.
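A minimal sketch of that setup, assuming the structs come from one block allocation (names like points and m are illustrative, and median() is assumed to have the pointer-based prototype above):

#include <stdlib.h>

point_t *points = malloc(n * sizeof *points);  /* the structs themselves */
point_t **arr = malloc(n * sizeof *arr);       /* pointers handed to median() */
unsigned long i;

for (i = 0; i < n; i++)
  arr[i] = &points[i];

/* ... fill points[i].y ... */

point_t *m = median(arr, n);  /* the function now swaps pointers, not structs */

free(arr);
free(points);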

Answered Oct 21 '22 by Dan Fego