
_mm_sad_epu8 faster than _mm_sad_pu8

Tags: c, sse, intrinsics

In a benchmark test, the 128-bit intrinsic performs faster than the 64-bit intrinsic:

_mm_sad_epu8(__m128i, __m128i) //Clocks: 0.0300
_mm_sad_pu8(__m64, __m64)      //Clocks: 0.0491

From what I understand, the Intel Reference Manual states that PSADBW has a latency of 5 and a throughput of 1 on xmm registers, but it does not state the performance for mm registers.

Should they not be equally fast, and is this generally the case for intrinsic functions taking 128-bit arguments?
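For reference, both intrinsics map to the PSADBW instruction, which sums the absolute differences of eight unsigned bytes; the xmm form produces one sum per 64-bit half, the mm form a single sum. A minimal sketch of what each call computes (purely illustrative, values made up; __m64 and the MMX intrinsics are only available in 32-bit builds with MSVC):

#include <emmintrin.h>  // _mm_sad_epu8 (SSE2, xmm registers)
#include <xmmintrin.h>  // _mm_sad_pu8 (SSE, mm registers)
#include <cstdio>

int main() {
    __m128i a = _mm_set1_epi8(5);                           // sixteen bytes of 5
    __m128i sad128 = _mm_sad_epu8(a, _mm_setzero_si128());  // each 64-bit half: 8*|5-0| = 40
    std::printf("xmm, low half: %d\n", _mm_cvtsi128_si32(sad128));

    __m64 b = _mm_set1_pi8(5);                              // eight bytes of 5
    __m64 sad64 = _mm_sad_pu8(b, _mm_setzero_si64());       // 8*|5-0| = 40
    int r = _mm_cvtsi64_si32(sad64);
    _mm_empty();                                            // leave MMX state before CRT calls
    std::printf("mm: %d\n", r);
    return 0;
}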

asked Oct 18 '22 by sigvardsen

1 Answer

My measurement program (see below) reveals that on a Core i5-3450 (Ivy Bridge) the performance of _mm_sad_epu8 equals that of _mm_sad_pu8; the latter is even slightly faster.

The output of my program is:

warmup:              1.918 sec total
measure_mm_sad_epu8: 1.904 sec total, 0.372 nsec per operation
measure_mm_sad_pu8:  1.872 sec total, 0.366 nsec per operation

The turbo clock frequency of my processor is 3.5 GHz (single thread), and the throughput of _mm_sad_epu8 should be 1 clock cycle according to the Intrinsics Guide. Thus, each operation should take at least 0.286 nsec. So my measurement program reached about 77% of the max performance.
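To spell out that arithmetic, here is a trivial sketch using only the figures quoted above:

// Back-of-the-envelope check of the figures above.
#include <cstdio>

int main() {
    const double turbo_ghz   = 3.5;              // single-thread turbo clock
    const double measured_ns = 0.372;            // measured cost per _mm_sad_epu8
    const double min_ns      = 1.0 / turbo_ghz;  // 1 cycle at 3.5 GHz is about 0.286 ns
    std::printf("minimum:    %.3f ns/op\n", min_ns);
    std::printf("efficiency: %.0f%%\n", 100.0 * min_ns / measured_ns);  // about 77%
    return 0;
}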

I used Visual Studio C++ 2010 Express and created a new Win32 console application. The program was compiled with the standard "Release" settings. This is the code of the .cpp file:

#include "stdafx.h"
#include <cassert>
#include <ctime>
#include <iostream>
#include <iomanip>

extern "C" {
  #include <emmintrin.h>
}

float measure_mm_sad_epu8(int n, int repeat) {
    assert(n % 16 == 0);
    // Didn't get an aligned "new" to work :-(
    __m128i *input  = (__m128i *) _aligned_malloc(n * sizeof *input,  16);
    __m128i *output = (__m128i *) _aligned_malloc(n * sizeof *output, 16);
    if(!input || !output) exit(1);
    __m128i zero = _mm_setzero_si128();

    for(int i=0; i < n; i++) {
        input[i].m128i_i64[0] = 0x0123456789abcdef;
        input[i].m128i_i64[1] = 0xfedcba9876543210;
    }

    clock_t startTime = clock();
    for(int r = 0; r < repeat; r++) {
        for(int i = 0; i < n; i+=16) { // loop unrolled
            output[i  ] = _mm_sad_epu8(input[i  ], zero);
            output[i+1] = _mm_sad_epu8(input[i+1], zero);
            output[i+2] = _mm_sad_epu8(input[i+2], zero);
            output[i+3] = _mm_sad_epu8(input[i+3], zero);
            output[i+4] = _mm_sad_epu8(input[i+4], zero);
            output[i+5] = _mm_sad_epu8(input[i+5], zero);
            output[i+6] = _mm_sad_epu8(input[i+6], zero);
            output[i+7] = _mm_sad_epu8(input[i+7], zero);
            output[i+8] = _mm_sad_epu8(input[i+8], zero);
            output[i+9] = _mm_sad_epu8(input[i+9], zero);
            output[i+10] = _mm_sad_epu8(input[i+10], zero);
            output[i+11] = _mm_sad_epu8(input[i+11], zero);
            output[i+12] = _mm_sad_epu8(input[i+12], zero);
            output[i+13] = _mm_sad_epu8(input[i+13], zero);
            output[i+14] = _mm_sad_epu8(input[i+14], zero);
            output[i+15] = _mm_sad_epu8(input[i+15], zero);
        }
    }
    _mm_empty();
    clock_t endTime = clock();

    _aligned_free(input);
    _aligned_free(output);
    return (endTime-startTime)/(float)CLOCKS_PER_SEC;
}

float measure_mm_sad_pu8(int n, int repeat) {
    assert(n % 16 == 0);
    // Didn't get an aligned "new" to work :-(
    __m64 *input  = (__m64 *) _aligned_malloc(n * sizeof *input,  16);
    __m64 *output = (__m64 *) _aligned_malloc(n * sizeof *output, 16);
    if(!input || !output) exit(1);
    __m64 zero = _mm_setzero_si64();

    for(int i=0; i < n; i+=2) {
        input[i  ].m64_i64 = 0x0123456789abcdef;
        input[i+1].m64_i64 = 0xfedcba9876543210;
    }

    clock_t startTime = clock();
    for(int r = 0; r < repeat; r++) {
        for(int i = 0; i < n; i+=16) { // loop unrolled
            output[i  ] = _mm_sad_pu8(input[i  ], zero);
            output[i+1] = _mm_sad_pu8(input[i+1], zero);
            output[i+2] = _mm_sad_pu8(input[i+2], zero);
            output[i+3] = _mm_sad_pu8(input[i+3], zero);
            output[i+4] = _mm_sad_pu8(input[i+4], zero);
            output[i+5] = _mm_sad_pu8(input[i+5], zero);
            output[i+6] = _mm_sad_pu8(input[i+6], zero);
            output[i+7] = _mm_sad_pu8(input[i+7], zero);
            output[i+8] = _mm_sad_pu8(input[i+8], zero);
            output[i+9] = _mm_sad_pu8(input[i+9], zero);
            output[i+10] = _mm_sad_pu8(input[i+10], zero);
            output[i+11] = _mm_sad_pu8(input[i+11], zero);
            output[i+12] = _mm_sad_pu8(input[i+12], zero);
            output[i+13] = _mm_sad_pu8(input[i+13], zero);
            output[i+14] = _mm_sad_pu8(input[i+14], zero);
            output[i+15] = _mm_sad_pu8(input[i+15], zero);
        }
    }
    _mm_empty();
    clock_t endTime = clock();

    _aligned_free(input);
    _aligned_free(output);
    return (endTime-startTime)/(float)CLOCKS_PER_SEC;
}

int _tmain(int argc, _TCHAR* argv[])
{
    int n = 256, repeat = 20000000;
    float time;

    std::cout << std::setprecision(3) << std::fixed;

    time = measure_mm_sad_epu8(n,repeat);
    std::cout << "warmup:              " << time << " sec total" << std::endl;
    time = measure_mm_sad_epu8(n,repeat);
    std::cout << "measure_mm_sad_epu8: " << time << " sec total, " << time/n/repeat*1e9 << " nsec per operation" << std::endl;

    n*=2;      // same memory footprint
    repeat/=2; // but with same amount of calculations
    time = measure_mm_sad_pu8(n,repeat);
    std::cout << "measure_mm_sad_pu8:  " << time << " sec total, " << time/n/repeat*1e9 << " nsec per operation" << std::endl;
    return 0;
}

And this is the unmodified "stdafx.h":

#pragma once
#include "targetver.h"
#include <stdio.h>
#include <tchar.h>

EDIT: For each operation output[i] = _mm_sad_epu8(input[i], zero); in the unrolled loop, the compiler generates a vector load, a psadbw and a vector store like this (just with differing pointer arithmetic):

013410D0  movdqa      xmm1,xmmword ptr [ecx-30h]  
013410D5  psadbw      xmm1,xmm0  
013410D9  movdqa      xmmword ptr [eax-10h],xmm1  

013410DE  ...

Ivy Bridge has enough pipeline ports to execute these three instructions at the "same" time. The generated code only uses the xmm0 and xmm1 registers, so it relies on the register renaming of the processor.

EDIT2: Due to the changing address arithmetic, the code length per operation varies from 13 to 20 bytes. Thus, the code may suffer from a decoder bottleneck, because Ivy Bridge can only decode 16 bytes per clock cycle (and at most 4 instructions). On the other hand, it has a loop cache to handle this.
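For example, in the listing above one operation spans 0x013410DE - 0x013410D0 = 14 bytes (a 5-byte movdqa load, a 4-byte psadbw and a 5-byte movdqa store), which lies within that 13 to 20 byte range.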

The generated code for the MMX version is almost the same:

013412D4  movq        mm1,mmword ptr [ecx-18h]  
013412D8  psadbw      mm1,mm0  
013412DB  movq        mmword ptr [eax-8],mm1  

013412DF  ...

The memory footprint is 2*4 KiB for both versions, because I doubled the number of elements in the MMX version (see main).

answered Oct 26 '22 by Martin Zabel