Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Accurate vectorizable implementation of acosf()

Simple implementations of acosf() can easily achieve an error bound of 1.5 ulp with respect to the infinitely precise (mathematical) result, if the platform has support for fused multiply-add (FMA). This means results never differ by more than one ulp from the correctly rounded result in round-to-nearest-or-even mode.

However, such an implementation usually comprises two major code branches that split the primary approximation interval [0,1] roughly in half, as in the exemplary code below. This branchiness inhibits automatic vectorization by compilers when targeting SIMD architectures.

Is there an alternative algorithmic approach that lends itself more readily to automatic vectorization, while maintaining the same error bound of 1.5 ulps? Platform support for FMA can be assumed.

/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}

/* arccos(a), maximum error = 1.45667 ulp */
float my_acosf (float a)
{
    float r;

    /* fold the argument into the non-positive half-plane; the explicit
       comparison (rather than -fabsf) avoids modifying the "sign" of NaNs */
    if (a > 0.0f) {
        r = -a;
    } else {
        r = a;
    }
    if (r > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x); pi/2 is formed as the product of
           two constants inside the fmaf for extra effective precision */
        r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (r));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, r, 0.5f)));
    }
    /* reflect results for arguments in [-1, 0]; the negated compare again
       avoids modifying the "sign" of NaNs */
    if (!(a > 0.0f) && (a >= -1.0f)) {
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
    return r;
}
like image 539
njuffa Avatar asked Mar 03 '18 00:03

njuffa


3 Answers

The closest I have come to a satisfactory solution is based on an idea from a news posting by Robert Harley, in which he observed that for x in [0,1], acos(x) ≈ √(2*(1-x)), and that a polynomial can provide the scale factor necessary to make this an accurate approximation across the entire interval. As can be seen in the code below, this approach results in straight-line code with just two uses of the ternary operator to handle arguments in the negative half-plane.

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

#define VECTORIZABLE 1
#define ARR_LEN      (1 << 24)
#define MAX_ULP      1 /* deviation from correctly rounded result */

#if VECTORIZABLE  
/* 
 Compute arccos(a) with a maximum error of 1.496766 ulp 
 This uses an idea from Robert Harley's posting in comp.arch.arithmetic on 1996/07/12
 https://groups.google.com/forum/#!original/comp.arch.arithmetic/wqCPkCCXqWs/T9qCkHtGE2YJ
*/
float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
    r = (a < 0.0f) ? t : r;
    return r;
}

#else // VECTORIZABLE

/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}

/* maximum error = 1.45667 ulp */
float my_acosf (float a)
{
    /* m = -|a| for ordinary values; the compare form leaves NaN sign bits
       untouched, unlike -fabsf(a) */
    float m = (a > 0.0f) ? (-a) : a;
    float r;

    if (m > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x) */
        r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (m));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, m, 0.5f)));
    }
    /* mirror the result for arguments in [-1, 0] (NaN-safe negated compare) */
    if (!(a > 0.0f) && (a >= -1.0f)) {
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
    return r;
}
#endif // VECTORIZABLE

/*
 * Exhaustive test driver: sweeps all 2^32 float bit patterns in chunks of
 * ARR_LEN, compares my_acosf() against the double-precision acos() rounded
 * to float, and fails if any result deviates by more than MAX_ULP.
 */
int main (void)
{
    double darg, dref;
    float ref, *a, *b;
    uint32_t argi, resi, refi;

    printf ("%svectorizable implementation of acos\n", 
            VECTORIZABLE ? "" : "non-");

    a = (float *)malloc (sizeof(a[0]) * ARR_LEN);
    b = (float *)malloc (sizeof(b[0]) * ARR_LEN);
    if ((a == NULL) || (b == NULL)) {  /* avoid dereferencing NULL below */
        fprintf (stderr, "memory allocation failed\n");
        free (a);
        free (b);
        return EXIT_FAILURE;
    }

    argi = 0x00000000;
    do {
        /* materialize the next ARR_LEN consecutive bit patterns as floats */
        for (int i = 0; i < ARR_LEN; i++) {
            memcpy (&a[i], &argi, sizeof(a[i]));
            argi++;
        }

        for (int i = 0; i < ARR_LEN; i++) {
            b[i] = my_acosf (a[i]);
        }

        /* ulp distance equals the difference of the integer encodings for
           finite same-sign values; NaN results compare bit-exact vs. ref */
        for (int i = 0; i < ARR_LEN; i++) {
            darg = (double)a[i];
            dref = acos (darg);
            ref = (float)dref;
            memcpy (&refi, &ref, sizeof(refi));
            memcpy (&resi, &b[i], sizeof(resi));
            if (llabs ((long long int)resi - (long long int)refi) > MAX_ULP) {
                printf ("error > %d ulp a[i]=% 14.6a  b[i]=% 14.6a  ref=% 14.6a  dref=% 21.13a\n", 
                        MAX_ULP, a[i], b[i], ref, dref);
                printf ("test FAILED\n");
                free (a);   /* release buffers on the failure path, too */
                free (b);
                return EXIT_FAILURE;
            }
        }

        printf ("^^^^ argi = %08x\n", argi);
    } while (argi);  /* argi wraps to 0 after the last chunk */

    printf ("test PASSED\n");

    free (a);
    free (b);

    return EXIT_SUCCESS;
}

Although the structure of this code appears conducive to auto-vectorization, I have not had much luck when targeting AVX2 with the compilers offered by Compiler Explorer. The only compiler that appears to be able to vectorize this code in the context of the inner loop of my test app above is Clang. But Clang seems to be able to do this only if I specify -ffast-math, which, however, has the undesired side-effect of turning the sqrtf() invocation into an approximate square root computed via rsqrt. I experimented with some less intrusive switches, e.g. -fno-honor-nans, -fno-math-errno, -fno-trapping-math, but my_acosf() did not vectorize even when I used those in combination.

For now I have resorted to manual translation of the above code into AVX2 + FMA intrinsics, like so:

#include "immintrin.h"

/* maximum error = 1.496766 ulp */
/*
 * AVX2 + FMA translation of the branchless Harley-style acos approximation
 * above: acos(x) ~= sqrt(2*(1-|x|)) scaled by a polynomial, with a final
 * blend implementing acos(x) = PI - acos(-x) for negative lanes.
 * Maximum error = 1.496766 ulp (per the scalar reference version).
 * NOTE(review): requires an AVX2+FMA target (e.g. -mavx2 -mfma) to compile.
 */
__m256 _mm256_acos_ps (__m256 x)
{
    const __m256 zero= _mm256_set1_ps ( 0.0f);
    const __m256 two = _mm256_set1_ps ( 2.0f);
    const __m256 mtwo= _mm256_set1_ps (-2.0f);
    const __m256 c0  = _mm256_set1_ps ( 0x1.c86000p-22f); //  4.25032340e-7
    const __m256 c1  = _mm256_set1_ps (-0x1.0258fap-19f); // -1.92483935e-6
    const __m256 c2  = _mm256_set1_ps ( 0x1.90c5c4p-18f); //  5.97197595e-6
    const __m256 c3  = _mm256_set1_ps (-0x1.55668cp-19f); // -2.54363249e-6
    const __m256 c4  = _mm256_set1_ps ( 0x1.c3f78ap-16f); //  2.69393295e-5
    const __m256 c5  = _mm256_set1_ps ( 0x1.e8f446p-14f); //  1.16575764e-4
    const __m256 c6  = _mm256_set1_ps ( 0x1.6df072p-11f); //  6.97973708e-4
    const __m256 c7  = _mm256_set1_ps ( 0x1.3332a6p-8f);  //  4.68746712e-3
    const __m256 c8  = _mm256_set1_ps ( 0x1.555550p-5f);  //  4.16666567e-2
    const __m256 pi0 = _mm256_set1_ps ( 0x1.ddcb02p+0f);  //  1.86637890e+0
    const __m256 pi1 = _mm256_set1_ps ( 0x1.aee9d6p+0f);  //  1.68325555e+0
    __m256 s, r, t, m;

    s = two;
    t = mtwo;
    m = _mm256_cmp_ps (x, zero, _CMP_LT_OQ); // mask of lanes where x < 0
    t = _mm256_blendv_ps (t, s, m);          // t = (x < 0) ? 2 : -2
    t = _mm256_fmadd_ps (x, t, s);           // t = 2 - 2*|x| (non-NaN lanes)
    s = _mm256_sqrt_ps (t);                  // s = sqrt(2*(1-|x|))
    r = c0;                                  // Horner evaluation of the
    r = _mm256_fmadd_ps (r, t, c1);          // correction polynomial in t
    r = _mm256_fmadd_ps (r, t, c2);
    r = _mm256_fmadd_ps (r, t, c3);
    r = _mm256_fmadd_ps (r, t, c4);
    r = _mm256_fmadd_ps (r, t, c5);
    r = _mm256_fmadd_ps (r, t, c6);
    r = _mm256_fmadd_ps (r, t, c7);
    r = _mm256_fmadd_ps (r, t, c8);
    r = _mm256_mul_ps (r, t);
    r = _mm256_fmadd_ps (r, s, s);           // r = s + s*t*P(t) ~= acos(|x|)
    t = _mm256_sub_ps (zero, r);
    t = _mm256_fmadd_ps (pi0, pi1, t);       // t = PI - r (pi0*pi1 ~= PI)
    r = _mm256_blendv_ps (r, t, m);          // negative lanes: PI - acos(-x)
    return r;
}
like image 81
njuffa Avatar answered Sep 30 '22 14:09

njuffa


A branchless version of the code in the question is possible (with hardly any redundant work, just some compare/blends to create constants for the FMAs), but IDK if compilers will auto-vectorize it.

The main extra work is a useless sqrt / fma if all the elements had -|a| > -0.5625f, unfortunately on the critical path.


The arg to asinf_core is (r > -0.5625f) ? r : sqrtf (fmaf (0.5f, r, 0.5f)).

In parallel with that, you (or the compiler) can be blending the coefficients for the FMA on the output.

If you sacrifice the accuracy of the pi/2 constant by putting it into one float instead of creating it with 2 constant multiplicands to fmaf, you can

fmaf( condition?-1:2,  asinf_core_result,  condition ? pi/2 : 0)

So you select between two constants, or andps a constant with a SIMD compare result to conditionally zero it (x86 SSE for example).


The final fixup is based on a range-check of the original input, so again there's instruction-level parallelism between FP blends and the FMA work of asinf_core.

In fact, we can optimize this into the previous FMA on the output of asinf_core, by blending the constant inputs to that on a second condition. We want asinf_core as one of the multiplicands for it, so we can negate or not by negating the constant. (A SIMD implementation might do a_cmp = andnot( a>0.0f, a>=-1.0f), and then multiplier ^ (-0.0f & a_cmp), where multiplier was conditionally done before.

The additive constant for that FMA on the output is 0, pi/2, pi, or pi + pi/2. Given the two compare results (on a and on r=-|a| for the non-NaN case), we could maybe combine that into a 2-bit integer and use it as a shuffle control to select an FP constant from a vector of all 4 constants, e.g. using AVX vpermilps (fast in-lane shuffle with a variable control). i.e. instead of blending 4 different ways, use a shuffle as a 2-bit LUT!

If we're doing that, we should also do it for the multiplicative constant, because creating the constant is the main cost. Variable blends are more expensive than shuffles on x86 (usually 2 uops vs. 1). On Skylake, variable blends (like vblendvps) can use any port (while shuffles only run on port 5). There's enough ILP that this probably bottlenecks on overall uop throughput, or overall ALU ports, not port 5. (Variable blend on Haswell is 2 uops for port 5, so it's strictly worse than vpermilps ymm,ymm,ymm).

We'd be selecting from -1, 1, -2, and 2.


Scalar with ternary operators, auto-vectorizes (with 8 vblendvps) with gcc7.3 -O3 -march=skylake -ffast-math. fast-math required for autovectorization :/ And unfortunately, gcc still uses rsqrtps + a Newton iteration (without FMA?!?), even with -mrecip=none, which I thought was supposed to disable this.

Autovectorizes with only 5 vblendvps with clang5.0 (with the same options). See both on the Godbolt compiler explorer. This compiles, and looks like probably the right amount of instructions, but otherwise untested.

// I think this is far more than enough digits for float precision, but wouldn't hurt to use a standard constant instead of what I typed from memory.
static const float pi_2 = 3.1415926535897932384626433 / 2;
static const float pi = 3.1415926535897932384626433;
//static const float pi_plus_pi_2 = 3.1415926535897932384626433 * 3.0 / 2;

/* maximum error UNKNOWN, completely UNTESTED */
/* NOTE(review): uses bool, so the including file needs <stdbool.h> (pre-C23). */
float my_acosf_branchless (float a)
{
    float r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    bool a_in_range = !(a > 0.0f) && (a >= -1.0f);

    bool rsmall = (r > -0.5625f);
    // select the asinf_core argument branchlessly; the sqrt/fma pair is
    // wasted work when rsmall holds, but keeps the flow branch-free
    float asinf_arg = rsmall ? r : sqrtf (fmaf (0.5f, r, 0.5f));

    float asinf_res = asinf_core(asinf_arg);

#if 0  /* reference formulation with a separate fixup branch, kept for comparison */
    r = fmaf( rsmall?-1.0f:2.0f,  asinf_res,  rsmall ? pi_2 : 0);
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
#else
    // fold both conditions into the multiplier and addend of one final FMA
    float fma_mul = rsmall? -1.0f:2.0f;
    fma_mul = a_in_range ? -fma_mul : fma_mul;
    float fma_add = rsmall ? pi_2 : 0;
    fma_add = a_in_range ? fma_add + pi : fma_add;
    // to vectorize, turn the 2 conditions into a 2-bit integer.
    // Use vpermilps as a 2-bit LUT of float constants

    // clang doesn't see the LUT trick, but otherwise appears non-terrible at this blending.

    r = fmaf(asinf_res, fma_mul, fma_add);
#endif
    return r;
}

Auto-vectorization tested with a loop that runs it over an array of 1024 aligned float elements; see the Godbolt link.

TODO: intrinsics version.

like image 32
Peter Cordes Avatar answered Sep 30 '22 14:09

Peter Cordes


This is not quite an alternative algorithmic approach, but nevertheless this extended remark might be of interest to you.

It seems that, with gcc, the function copysignf() is easier to vectorize than the ternary operator. In the following code I have rewritten your scalar solution with copysignf() instead of ternary operators.

The code vectorizes even with the fairly old gcc 4.9 compiler, with the options gcc -std=c99 -O3 -m64 -Wall -march=haswell -fno-math-errno. The sqrtf() function is vectorized to a vsqrtps instruction. The Godbolt link is here.

#include <stdio.h>
#include <immintrin.h>
#include <math.h>

float acosf_cpsgn (float a)
{
    float r, s, t, pi2;
/*    s = (a < 0.0f) ? 2.0f : (-2.0f); */
    s = copysignf(2.0f, -a);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
/*    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r      */
/*    r = (a < 0.0f) ? t : r;                                           */
    r = copysignf(r, a);
    pi2 = 0x1.ddcb02p+0f * 0.5f;                   /* no rounding here  */
    pi2 = pi2 - copysignf(pi2, a);                 /* no rounding here  */
    t = fmaf (pi2, 0x1.aee9d6p+0f, r);   // PI-r
    return t;
}



float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
    r = (a < 0.0f) ? t : r;
    return r;
}


/* The code in the next 2 functions is copied from the Godbolt link in Peter Cordes's */
/* answer https://stackoverflow.com/a/49091530/2439725 and modified                   */
/* Apply my_acosf over 1024 aligned floats; used to probe auto-vectorization. */
int autovec_test_a (float *__restrict dst, float *__restrict src) {
    float *d = __builtin_assume_aligned(dst,32);
    float *s = __builtin_assume_aligned(src,32);
    for (int k = 0; k < 1024; k++) {
        d[k] = my_acosf(s[k]);
    }
    return 0;
}

/* Apply acosf_cpsgn over 1024 aligned floats; auto-vectorization probe. */
int autovec_test_b (float *__restrict dst, float *__restrict src) {
    float *d = __builtin_assume_aligned(dst,32);
    float *s = __builtin_assume_aligned(src,32);
    for (int k = 0; k < 1024; k++) {
        d[k] = acosf_cpsgn(s[k]);
    }
    return 0;
}
like image 39
wim Avatar answered Sep 30 '22 14:09

wim