I need to find the largest element in an array that contains exactly 16 integers. I'm considering two possible implementations. First, the sensible implementation:
/* Linear scan: take the first element as the running maximum, then
   compare each remaining element against it. Exactly n-1 comparisons. */
int largest = array[0];
for (int i = 1; i < 16; i++) {
const int val = array[i];
if (val > largest) {
largest = val; /* found a new maximum */
}
}
And then there's the slightly insane implementation that takes advantage of the fact that the size of the array is known:
/* Tournament-style reduction: pairwise max over all 16 elements,
   log2(16) = 4 rounds deep.
   NOTE: the original snippet was missing the commas after
   max(array[8], array[9]) and max(array[12], array[13]), which made it
   a syntax error; they are restored here (matching the compilable
   version used in the answer below). */
const int max_value =
    max(
        max(
            max(
                max(array[0], array[1]),
                max(array[2], array[3])),
            max(
                max(array[4], array[5]),
                max(array[6], array[7]))),
        max(
            max(
                max(array[8], array[9]),
                max(array[10], array[11])),
            max(
                max(array[12], array[13]),
                max(array[14], array[15]))));
Which is the better implementation? Is max
typically implemented in hardware?
One way to find the largest element of an array is to sort it in ascending order: after sorting, the first element is the smallest, the next is the second smallest, and so on, with the last element being the largest. Note, however, that sorting costs O(n log n) comparisons, so it is overkill when a single O(n) scan is enough.
The standard linear-scan approach: initialize max with the first element, then traverse the array from the second element to the end; for each element, compare it with max and, if it is greater, replace max with the current element.
(As an aside, some languages provide this in their standard libraries — e.g. Java's java.util.Collections.max() returns the maximum element of a collection according to the natural ordering of its elements — but C's standard library offers no such function for integer arrays.)
Let's compile them and see what we get!
First of all, there is no integer "max" function/macro defined in the C standard (the fmax() family in &lt;math.h&gt; covers only floating point). So I added one as a GNU statement-expression macro — it looks convoluted because it avoids double evaluation of its arguments.
/*
 * Type-generic max() built on a GNU statement expression.
 * Each argument is captured exactly once into a const temporary, so a
 * side-effecting argument such as max(i++, j) is evaluated only once
 * (unlike the naive ((a) > (b) ? (a) : (b)) macro).
 */
#define max(a, b)                                        \
    ({                                                   \
        const __typeof__(a) _max_lhs = (a);              \
        const __typeof__(b) _max_rhs = (b);              \
        _max_lhs > _max_rhs ? _max_lhs : _max_rhs;       \
    })
/*
 * Linear-scan maximum over exactly 16 ints.
 * noinline keeps the call boundary intact so the benchmark measures
 * this loop rather than whatever the inliner would fold it into.
 */
int __attribute__ ((noinline)) test1(const int* array) {
    const int *cursor = array + 1;
    const int *const end = array + 16;
    int best = *array;
    while (cursor != end) {
        if (*cursor > best) {
            best = *cursor;
        }
        ++cursor;
    }
    return best;
}
/* Max of two ints; inlines the logic of the max() macro so this
 * function is self-contained. Semantics are identical for int inputs. */
static int max2(int x, int y) {
    return x > y ? x : y;
}

/*
 * Fully unrolled tournament maximum over exactly 16 ints: reduce
 * adjacent pairs, then pairs of pairs, and so on — four rounds total.
 * Same reduction tree as the nested-macro version, spelled out with
 * named intermediates per round.
 */
int __attribute__ ((noinline)) test2(const int* array) {
    /* round 1: 16 -> 8 */
    const int p01 = max2(array[0],  array[1]);
    const int p23 = max2(array[2],  array[3]);
    const int p45 = max2(array[4],  array[5]);
    const int p67 = max2(array[6],  array[7]);
    const int p89 = max2(array[8],  array[9]);
    const int pab = max2(array[10], array[11]);
    const int pcd = max2(array[12], array[13]);
    const int pef = max2(array[14], array[15]);
    /* round 2: 8 -> 4 */
    const int q0 = max2(p01, p23);
    const int q1 = max2(p45, p67);
    const int q2 = max2(p89, pab);
    const int q3 = max2(pcd, pef);
    /* rounds 3 and 4: 4 -> 2 -> 1 */
    return max2(max2(q0, q1), max2(q2, q3));
}
My gcc version, which is relevant when speaking about optimizations:
tmp$ gcc --version
gcc (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-O2
for optimizations, -S
to output assembly, -o -
to output to stdout.
tmp$ gcc -std=c99 -O2 -S test.c -o -
.file "test.c"
.text
.p2align 4,,15
.globl test1
.type test1, @function
test1:
.LFB0:
.cfi_startproc
movl (%rdi), %eax
xorl %edx, %edx
.p2align 4,,10
.p2align 3
.L3:
movl 4(%rdi,%rdx), %ecx
cmpl %ecx, %eax
cmovl %ecx, %eax
addq $4, %rdx
cmpq $60, %rdx
jne .L3
rep ret
.cfi_endproc
.LFE0:
.size test1, .-test1
.p2align 4,,15
.globl test2
.type test2, @function
test2:
.LFB1:
.cfi_startproc
movl (%rdi), %edx
cmpl %edx, 4(%rdi)
cmovge 4(%rdi), %edx
movl 8(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 12(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 16(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 20(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 24(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 28(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 32(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 36(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 40(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 44(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 48(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 52(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 56(%rdi), %eax
cmpl %eax, %edx
cmovl %eax, %edx
movl 60(%rdi), %eax
cmpl %eax, %edx
cmovge %edx, %eax
ret
.cfi_endproc
.LFE1:
.size test2, .-test2
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4"
.section .note.GNU-stack,"",@progbits
Okay, so test2()
certainly looks much longer. However, it doesn't branch at all. And it's only ~3 instructions (memory load, comparison, conditional move) per element. test1()
has 6 instructions (memory load, comparison, conditional move, loop counter increment, loop counter comparison, conditional branch). Lots of branches in test1
, which can be bothersome (depending on how good your architecture's branch prediction is). On the other hand, test2
increases code size, which will necessarily push something else out of instruction cache. And there are lots of data hazards in test2
(well, and test1
...) -- maybe we could rewrite it to use a few extra registers to reduce the number of pipeline stalls?
So, as you can probably see by now, this is not an easy question to answer.
The only real way to know is to measure it. And even then, it'll vary depending on internal implementation/optimizations/cache size of each CPU model.
So I wrote a small benchmark:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <stdint.h>
#define N (1000000)
/*
 * Benchmark driver: times test1() vs test2() over N arrays of 16
 * random ints each, repeated for 10 attempts. The checksums
 * (sum1/sum2) both defeat dead-code elimination and let us verify
 * that the two implementations agree.
 *
 * Fixes over the original:
 *  - malloc() result is checked (dereferencing NULL is UB),
 *  - data is freed before exit,
 *  - the nanosecond math uses 64-bit literals (1000000000L overflows
 *    a 32-bit long once the interval exceeds ~2 s),
 *  - printf arguments are cast so the format specifiers are portable
 *    (uint64_t is not necessarily unsigned long).
 */
int main() {
    printf(" %12s %12s %12s %12s\n", "test1 time", "test2 time", "test1 out", "test2 out");
    int* data = malloc(N * 16 * sizeof(int));
    if (data == NULL) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    srand(1); /* fixed seed: every run sees the same inputs */
    for (int i=0; i<16*N; ++i) {
        data[i] = rand();
    }
    const int* a;
    struct timespec t1, t2, t3;
    for (int attempt=0; attempt<10; ++attempt) {
        uint32_t sum1 = 0;
        uint32_t sum2 = 0;
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t1);
        a = data;
        for (int i=0; i<N; ++i) {
            sum1 += test1(a);
            a += 16;
        }
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t2);
        a = data;
        for (int i=0; i<N; ++i) {
            sum2 += test2(a);
            a += 16;
        }
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t3);
        /* tv_nsec differences may be negative; the seconds term
           compensates, so the signed sum is always >= 0. */
        uint64_t nanos1 = (uint64_t)((int64_t)(t2.tv_sec - t1.tv_sec) * 1000000000LL + (t2.tv_nsec - t1.tv_nsec));
        uint64_t nanos2 = (uint64_t)((int64_t)(t3.tv_sec - t2.tv_sec) * 1000000000LL + (t3.tv_nsec - t2.tv_nsec));
        printf("%2d: %12llu %12llu %12u %12u\n", attempt+1,
               (unsigned long long)nanos1, (unsigned long long)nanos2,
               (unsigned)sum1, (unsigned)sum2);
    }
    free(data);
    return 0;
}
And the results:
tmp$ gcc -std=gnu99 -O2 test.c -o test
tmp$ ./test
test1 time test2 time test1 out test2 out
1: 16251659 10431322 4190722540 4190722540
2: 16796884 10639081 4190722540 4190722540
3: 16443265 10314624 4190722540 4190722540
4: 17194795 10337678 4190722540 4190722540
5: 16966405 10380047 4190722540 4190722540
6: 16803840 10556222 4190722540 4190722540
7: 16795989 10871508 4190722540 4190722540
8: 16389862 11511950 4190722540 4190722540
9: 16304850 11704787 4190722540 4190722540
10: 16309371 11269446 4190722540 4190722540
tmp$ gcc -std=gnu99 -O3 test.c -o test
tmp$ ./test
test1 time test2 time test1 out test2 out
1: 9090364 8813462 4190722540 4190722540
2: 8745093 9394730 4190722540 4190722540
3: 8942015 9839356 4190722540 4190722540
4: 8849960 8834056 4190722540 4190722540
5: 9567597 9195950 4190722540 4190722540
6: 9130245 9115883 4190722540 4190722540
7: 9680596 8930225 4190722540 4190722540
8: 9268440 9998824 4190722540 4190722540
9: 8851503 8960392 4190722540 4190722540
10: 9767021 8875165 4190722540 4190722540
tmp$ gcc -std=gnu99 -Os test.c -o test
tmp$ ./test
test1 time test2 time test1 out test2 out
1: 17569606 10447512 4190722540 4190722540
2: 17755450 10811861 4190722540 4190722540
3: 17718714 10372411 4190722540 4190722540
4: 17743248 10378728 4190722540 4190722540
5: 18747440 10306748 4190722540 4190722540
6: 17877105 10782263 4190722540 4190722540
7: 17787171 10522498 4190722540 4190722540
8: 17771172 10445461 4190722540 4190722540
9: 17683935 10430900 4190722540 4190722540
10: 17670540 10543926 4190722540 4190722540
tmp$ gcc -std=gnu99 -O2 -funroll-loops test.c -o test
tmp$ ./test
test1 time test2 time test1 out test2 out
1: 9840366 10008656 4190722540 4190722540
2: 9826522 10529205 4190722540 4190722540
3: 10208039 10363219 4190722540 4190722540
4: 9863467 10284608 4190722540 4190722540
5: 10473329 10054511 4190722540 4190722540
6: 10298968 10520570 4190722540 4190722540
7: 9846157 10595723 4190722540 4190722540
8: 10340026 10041021 4190722540 4190722540
9: 10434750 10404669 4190722540 4190722540
10: 9982403 10592842 4190722540 4190722540
Conclusion: the max() version is faster on my Intel Core i7-3517U with 4 MB cache (and I wouldn't claim any more than that because, again, the results may vary with microarchitecture).
Also, -funroll-loops
or the extra-aggressive (and less safe) optimizations enabled by -O3
really make a huge difference for the test1
case, essentially making it equal in time to test2
-- perhaps even slightly better with -funroll-loops
, but close enough that we can't draw a confident conclusion from the numbers I got. It would probably be interesting to look at the assembly for test1
there, but I'll leave that as an exercise for the reader. ;)
So, I guess the answer is "it depends".
The first one is clearly the most straightforward implementation.
Nonetheless, this problem is related to the concept of Sorting Networks, which is a surprisingly complex theory on sorting fixed-size sets of data.
If you found this useful, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!
Donate to us with