Simple implementations of acosf() can easily achieve an error bound of 1.5 ulp with respect to the infinitely precise (mathematical) result, provided the platform has support for fused multiply-add (FMA). This means results never differ by more than one ulp from the correctly rounded result in round-to-nearest-or-even mode.
However, such an implementation usually comprises two major code branches that split the primary approximation interval [0,1] roughly in half, as in the example code below. This branchiness inhibits automatic vectorization by compilers when targeting SIMD architectures.
Is there an alternative algorithmic approach that lends itself more readily to automatic vectorization, while maintaining the same error bound of 1.5 ulps? Platform support for FMA can be assumed.
/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core (float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}
/* maximum error = 1.45667 ulp */
float my_acosf (float a)
{
    float r;
    r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    if (r > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x) */
        r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (r));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, r, 0.5f)));
    }
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
    return r;
}
The closest I have come to a satisfactory solution is based on an idea from a newsgroup posting by Robert Harley, in which he observed that for x in [0,1], acos(x) ≈ √(2*(1-x)), and that a polynomial can provide the scale factor necessary to make this an accurate approximation across the entire interval. As can be seen in the code below, this approach results in straight-line code with just two uses of the ternary operator to handle arguments in the negative half of the domain.
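For reference, the series that Harley's scale factor approximates is the standard expansion (writing z = 1-x):

    acos(1-z) = √(2z) * (1 + z/12 + 3z²/160 + 5z³/896 + ...)

Substituting t = 2z = 2*(1-x) turns the scale factor into 1 + t/24 + 3t²/640 + 5t³/7168 + ..., and the fitted minimax coefficients in the code below land close to these Taylor values (e.g. the innermost coefficient 4.16666567e-2 ≈ 1/24).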
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#define VECTORIZABLE 1
#define ARR_LEN (1 << 24)
#define MAX_ULP 1 /* deviation from correctly rounded result */
#if VECTORIZABLE
/*
  Compute arccos(a) with a maximum error of 1.496766 ulp.
  This uses an idea from Robert Harley's posting in comp.arch.arithmetic on 1996/07/12:
  https://groups.google.com/forum/#!original/comp.arch.arithmetic/wqCPkCCXqWs/T9qCkHtGE2YJ
*/
float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI - r
    r = (a < 0.0f) ? t : r;
    return r;
}
#else // VECTORIZABLE
/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core (float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}

/* maximum error = 1.45667 ulp */
float my_acosf (float a)
{
    float r;
    r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    if (r > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x) */
        r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (r));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, r, 0.5f)));
    }
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
    return r;
}
#endif // VECTORIZABLE
int main (void)
{
    double darg, dref;
    float ref, *a, *b;
    uint32_t argi, resi, refi;
    printf ("%svectorizable implementation of acos\n",
            VECTORIZABLE ? "" : "non-");
    a = (float *)malloc (sizeof(a[0]) * ARR_LEN);
    b = (float *)malloc (sizeof(b[0]) * ARR_LEN);
    argi = 0x00000000;
    do {
        for (int i = 0; i < ARR_LEN; i++) {
            memcpy (&a[i], &argi, sizeof(a[i]));
            argi++;
        }
        for (int i = 0; i < ARR_LEN; i++) {
            b[i] = my_acosf (a[i]);
        }
        for (int i = 0; i < ARR_LEN; i++) {
            darg = (double)a[i];
            dref = acos (darg);
            ref = (float)dref;
            memcpy (&refi, &ref, sizeof(refi));
            memcpy (&resi, &b[i], sizeof(resi));
            if (llabs ((long long int)resi - (long long int)refi) > MAX_ULP) {
                printf ("error > 1 ulp a[i]=% 14.6a b[i]=% 14.6a ref=% 14.6a dref=% 21.13a\n",
                        a[i], b[i], ref, dref);
                printf ("test FAILED\n");
                return EXIT_FAILURE;
            }
        }
        printf ("^^^^ argi = %08x\n", argi);
    } while (argi);
    printf ("test PASSED\n");
    free (a);
    free (b);
    return EXIT_SUCCESS;
}
Although the structure of this code appears conducive to auto-vectorization, I have not had much luck when targeting AVX2 with the compilers offered by Compiler Explorer. The only compiler that appears to be able to vectorize this code in the context of the inner loop of my test app above is Clang, but only if I specify -ffast-math, which has the undesired side effect of turning the sqrtf() invocation into an approximate square root computed via rsqrt. I experimented with some less intrusive switches, e.g. -fno-honor-nans, -fno-math-errno, -fno-trapping-math, but my_acosf() did not vectorize even when I used those in combination.
For now I have resorted to manually translating the above code into AVX2 + FMA intrinsics, like so:
#include "immintrin.h"
/* maximum error = 1.496766 ulp */
__m256 _mm256_acos_ps (__m256 x)
{
const __m256 zero= _mm256_set1_ps ( 0.0f);
const __m256 two = _mm256_set1_ps ( 2.0f);
const __m256 mtwo= _mm256_set1_ps (-2.0f);
const __m256 c0 = _mm256_set1_ps ( 0x1.c86000p-22f); // 4.25032340e-7
const __m256 c1 = _mm256_set1_ps (-0x1.0258fap-19f); // -1.92483935e-6
const __m256 c2 = _mm256_set1_ps ( 0x1.90c5c4p-18f); // 5.97197595e-6
const __m256 c3 = _mm256_set1_ps (-0x1.55668cp-19f); // -2.54363249e-6
const __m256 c4 = _mm256_set1_ps ( 0x1.c3f78ap-16f); // 2.69393295e-5
const __m256 c5 = _mm256_set1_ps ( 0x1.e8f446p-14f); // 1.16575764e-4
const __m256 c6 = _mm256_set1_ps ( 0x1.6df072p-11f); // 6.97973708e-4
const __m256 c7 = _mm256_set1_ps ( 0x1.3332a6p-8f); // 4.68746712e-3
const __m256 c8 = _mm256_set1_ps ( 0x1.555550p-5f); // 4.16666567e-2
const __m256 pi0 = _mm256_set1_ps ( 0x1.ddcb02p+0f); // 1.86637890e+0
const __m256 pi1 = _mm256_set1_ps ( 0x1.aee9d6p+0f); // 1.68325555e+0
__m256 s, r, t, m;
s = two;
t = mtwo;
m = _mm256_cmp_ps (x, zero, _CMP_LT_OQ);
t = _mm256_blendv_ps (t, s, m);
t = _mm256_fmadd_ps (x, t, s);
s = _mm256_sqrt_ps (t);
r = c0;
r = _mm256_fmadd_ps (r, t, c1);
r = _mm256_fmadd_ps (r, t, c2);
r = _mm256_fmadd_ps (r, t, c3);
r = _mm256_fmadd_ps (r, t, c4);
r = _mm256_fmadd_ps (r, t, c5);
r = _mm256_fmadd_ps (r, t, c6);
r = _mm256_fmadd_ps (r, t, c7);
r = _mm256_fmadd_ps (r, t, c8);
r = _mm256_mul_ps (r, t);
r = _mm256_fmadd_ps (r, s, s);
t = _mm256_sub_ps (zero, r);
t = _mm256_fmadd_ps (pi0, pi1, t);
r = _mm256_blendv_ps (r, t, m);
return r;
}
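For reference, a minimal driver sketch showing how such an intrinsic version might be applied over an array; acos_array is an illustrative helper, not part of the code above, and unaligned loads/stores are used so no alignment assumption is needed:

#include <immintrin.h>
#include <stddef.h>

/* illustrative driver: apply _mm256_acos_ps 8 floats at a time,
   with a scalar tail loop using my_acosf() from above */
void acos_array (float *dst, const float *src, size_t n)
{
    size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        _mm256_storeu_ps (dst + i, _mm256_acos_ps (_mm256_loadu_ps (src + i)));
    }
    for (; i < n; i++) {
        dst[i] = my_acosf (src[i]);
    }
}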
A branchless version of the code in the question is possible (with hardly any redundant work, just some compare/blends to create constants for the FMAs), but IDK if compilers will auto-vectorize it.
The main extra work is a useless sqrt/fma if all the elements had -|a| > -0.5625f, unfortunately on the critical path. The arg to asinf_core is (r > -0.5625f) ? r : sqrtf (fmaf (0.5f, r, 0.5f)).
In parallel with that, you (or the compiler) can be blending the coefficients for the FMA on the output.
If you sacrifice the accuracy of the pi/2 constant by putting it into one float instead of creating it with 2 constant multiplicands to fmaf, you can do fmaf(condition ? -1 : 2, asinf_core_result, condition ? pi/2 : 0). So you select between two constants, or andps a constant with a SIMD compare result to conditionally zero it (x86 SSE for example), as in the sketch below.
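For instance, a minimal SSE sketch of that conditional-zero idiom (illustrative only; the mask can come from whatever compare the final code uses):

#include <xmmintrin.h>

/* sketch: addend = condition ? pi/2 : 0.0f, with no blend needed;
   the compare yields an all-ones/all-zeros mask per element */
static __m128 pi2_or_zero (__m128 r)
{
    const __m128 pi_2 = _mm_set1_ps (1.57079637f);
    __m128 cond = _mm_cmpgt_ps (r, _mm_set1_ps (-0.5625f));
    return _mm_and_ps (pi_2, cond); /* andps: pi/2 where true, 0 where false */
}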
The final fixup is based on a range-check of the original input, so again there's instruction-level parallelism between FP blends and the FMA work of asinf_core.
In fact, we can optimize this into the previous FMA on the output of asinf_core, by blending the constant inputs to that FMA on a second condition. We want asinf_core's result as one of the multiplicands, so we can negate or not by negating the constant. (A SIMD implementation might do a_cmp = andnot(a > 0.0f, a >= -1.0f), and then multiplier ^ (-0.0f & a_cmp), where multiplier was conditionally set earlier.)
The additive constant for that FMA on the output is 0, pi/2, pi, or pi + pi/2. Given the two compare results (on a and on r = -|a| for the non-NaN case), we could maybe combine them into a 2-bit integer and use it as a shuffle control to select an FP constant from a vector of all 4 constants, e.g. using AVX vpermilps (fast in-lane shuffle with a variable control). I.e. instead of blending 4 different ways, use a shuffle as a 2-bit LUT!
If we're doing that, we should also do it for the multiplicative constant, because creating the constant is the main cost. Variable blends are more expensive than shuffles on x86 (usually 2 uops vs. 1). On Skylake, variable blends (like vblendvps) can use any port (while shuffles only run on port 5). There's enough ILP that this probably bottlenecks on overall uop throughput, or overall ALU ports, not port 5. (Variable blend on Haswell is 2 uops for port 5, so it's strictly worse than vpermilps ymm,ymm,ymm.)
We'd be selecting from -1, 1, -2, and 2.
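A sketch of how the 2-bit LUT could look with AVX2 intrinsics (untested, illustrative; the LUT shown holds the additive constants 0, pi/2, pi, pi + pi/2, and the same pattern works for the multiplicative constants):

#include <immintrin.h>

/* sketch: combine two all-ones/all-zeros compare masks into a 2-bit index
   per element, then use vpermilps (_mm256_permutevar_ps) as a 4-entry LUT */
static __m256 lut_select (__m256 cond0, __m256 cond1)
{
    /* entries replicated in both 128-bit lanes: vpermilps shuffles in-lane */
    const __m256 lut = _mm256_setr_ps (0.0f, 1.57079637f, 3.14159274f, 4.71238899f,
                                       0.0f, 1.57079637f, 3.14159274f, 4.71238899f);
    __m256i idx = _mm256_or_si256 (
        _mm256_and_si256 (_mm256_castps_si256 (cond0), _mm256_set1_epi32 (1)),
        _mm256_and_si256 (_mm256_castps_si256 (cond1), _mm256_set1_epi32 (2)));
    return _mm256_permutevar_ps (lut, idx); /* low 2 bits of idx pick the entry */
}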
Scalar with ternary operators auto-vectorizes (with 8 vblendvps) with gcc7.3 -O3 -march=skylake -ffast-math. fast-math required for autovectorization :/ And unfortunately, gcc still uses rsqrtps + a Newton iteration (without FMA?!?), even with -mrecip=none, which I thought was supposed to disable this.
Autovectorizes with only 5 vblendvps with clang5.0 (with the same options). See both on the Godbolt compiler explorer. The code below compiles, and looks like probably the right amount of instructions, but is otherwise untested.
#include <math.h>    /* fmaf, sqrtf */
#include <stdbool.h> /* bool */

float asinf_core (float a); /* as defined in the question */

// I think this is far more than enough digits for float precision, but wouldn't
// hurt to use a standard constant instead of what I typed from memory.
static const float pi_2 = 3.1415926535897932384626433 / 2;
static const float pi   = 3.1415926535897932384626433;
//static const float pi_plus_pi_2 = 3.1415926535897932384626433 * 3.0 / 2;

/* maximum error UNKNOWN, completely UNTESTED */
float my_acosf_branchless (float a)
{
    float r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    bool a_in_range = !(a > 0.0f) && (a >= -1.0f);
    bool rsmall = (r > -0.5625f);
    float asinf_arg = rsmall ? r : sqrtf (fmaf (0.5f, r, 0.5f));
    float asinf_res = asinf_core (asinf_arg);
#if 0
    r = fmaf (rsmall ? -1.0f : 2.0f, asinf_res, rsmall ? pi_2 : 0);
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
#else
    float fma_mul = rsmall ? -1.0f : 2.0f;
    fma_mul = a_in_range ? -fma_mul : fma_mul;
    float fma_add = rsmall ? pi_2 : 0;
    fma_add = a_in_range ? fma_add + pi : fma_add;
    // to vectorize, turn the 2 conditions into a 2-bit integer.
    // Use vpermilps as a 2-bit LUT of float constants.
    // clang doesn't see the LUT trick, but otherwise appears non-terrible at this blending.
    r = fmaf (asinf_res, fma_mul, fma_add);
#endif
    return r;
}
Auto-vectorization tested with a loop that runs it over an array of 1024 aligned float elements; see the Godbolt link.
TODO: intrinsics version.
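As a possible starting point for that TODO, here is one untested sketch of the blend skeleton in AVX2 + FMA intrinsics; asinf_core_ps is an assumed (hypothetical) vectorization of the question's asinf_core, and the plain vblendvps form is used rather than the LUT trick discussed above:

#include <immintrin.h>

__m256 asinf_core_ps (__m256 a); /* assumed: vectorized asinf_core */

/* untested sketch mirroring my_acosf_branchless above */
__m256 my_acos_branchless_ps (__m256 a)
{
    const __m256 zero  = _mm256_setzero_ps ();
    const __m256 half  = _mm256_set1_ps (0.5f);
    const __m256 vpi   = _mm256_set1_ps (3.14159274f);
    const __m256 vpi_2 = _mm256_set1_ps (1.57079637f);
    __m256 a_gt0 = _mm256_cmp_ps (a, zero, _CMP_GT_OQ);
    __m256 r = _mm256_blendv_ps (a, _mm256_sub_ps (zero, a), a_gt0); /* -|a|, NaNs pass through */
    __m256 in_range = _mm256_andnot_ps (a_gt0,
        _mm256_cmp_ps (a, _mm256_set1_ps (-1.0f), _CMP_GE_OQ)); /* !(a>0) && (a>=-1) */
    __m256 rsmall = _mm256_cmp_ps (r, _mm256_set1_ps (-0.5625f), _CMP_GT_OQ);
    __m256 big = _mm256_sqrt_ps (_mm256_fmadd_ps (half, r, half));
    __m256 core = asinf_core_ps (_mm256_blendv_ps (big, r, rsmall));
    __m256 mul = _mm256_blendv_ps (_mm256_set1_ps (2.0f), _mm256_set1_ps (-1.0f), rsmall);
    mul = _mm256_xor_ps (mul, _mm256_and_ps (in_range, _mm256_set1_ps (-0.0f))); /* negate if in range */
    __m256 add = _mm256_and_ps (vpi_2, rsmall);               /* rsmall ? pi/2 : 0 */
    add = _mm256_add_ps (add, _mm256_and_ps (vpi, in_range)); /* + pi if in range */
    return _mm256_fmadd_ps (core, mul, add);
}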
This is not quite an alternative algorithmic approach, but nevertheless this extended remark might be of interest to you.
It seems that, with gcc, the function copysignf() is easier to vectorize than the ternary operator. In the following code I have rewritten your scalar solution with copysignf() instead of ternary operators.
The code vectorizes even with the fairly old gcc 4.9 compiler, with the options gcc -std=c99 -O3 -m64 -Wall -march=haswell -fno-math-errno. The sqrtf() function is vectorized to a vsqrtps instruction. The Godbolt link is here.
#include <stdio.h>
#include <immintrin.h>
#include <math.h>

float acosf_cpsgn (float a)
{
    float r, s, t, pi2;
    /* s = (a < 0.0f) ? 2.0f : (-2.0f); */
    s = copysignf (2.0f, -a);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    /* t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r */
    /* r = (a < 0.0f) ? t : r; */
    r = copysignf (r, a);
    pi2 = 0x1.ddcb02p+0f * 0.5f;    /* no rounding here */
    pi2 = pi2 - copysignf (pi2, a); /* no rounding here */
    t = fmaf (pi2, 0x1.aee9d6p+0f, r); // PI - r
    return t;
}

float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI - r
    r = (a < 0.0f) ? t : r;
    return r;
}
/* The code in the next 2 functions is copied from the Godbolt link in Peter Cordes's */
/* answer https://stackoverflow.com/a/49091530/2439725 and modified */
int autovec_test_a (float *__restrict dst, float *__restrict src) {
    dst = __builtin_assume_aligned (dst, 32);
    src = __builtin_assume_aligned (src, 32);
    for (int i = 0; i < 1024; i++) {
        dst[i] = my_acosf (src[i]);
    }
    return 0;
}

int autovec_test_b (float *__restrict dst, float *__restrict src) {
    dst = __builtin_assume_aligned (dst, 32);
    src = __builtin_assume_aligned (src, 32);
    for (int i = 0; i < 1024; i++) {
        dst[i] = acosf_cpsgn (src[i]);
    }
    return 0;
}
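For intuition on why copysignf() vectorizes so readily: it is pure bit manipulation (magnitude bits from one operand, sign bit from the other), so it maps directly to vandps/vorps instead of a compare-and-blend. A minimal scalar sketch of that lowering (illustrative, not the compiler's actual output):

#include <stdint.h>
#include <string.h>

/* sketch: the bitwise operation copysignf(mag, sgn) performs */
static float copysignf_bits (float mag, float sgn)
{
    uint32_t m, s;
    memcpy (&m, &mag, sizeof m); /* type-pun safely via memcpy */
    memcpy (&s, &sgn, sizeof s);
    m = (m & 0x7fffffffu) | (s & 0x80000000u); /* keep magnitude, take sign */
    memcpy (&mag, &m, sizeof m);
    return mag;
}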