Say I have a float X. I want to find the largest number that is less than X and can be losslessly stored in a float.
IIRC the IEEE standard says you can do this by converting the float's bits to an int representation, subtracting one, and convert back to float.
(edit: this is true for positive numbers that are not NaN or inf. For negative numbers, you must add. See Rawling's answer for more info.)
To change between representations, I only know of C#'s (cast) operator, which truncates. That's not what I want.
Is there a way to do this in C#?
Here is how you can simply turn a float
into an int
, change it, and then turn it back into a float
:
float myFloat = 10.3f;
// Get the bytes making up the float
byte[] bytes = BitConverter.GetBytes(myFloat);
// Make an int out of them
int myInt = BitConverter.ToInt32(bytes, 0);
// Change it
myInt--;
// Get the bytes making up the int
bytes = BitConverter.GetBytes(myInt);
// Make a float out of them
myFloat = BitConverter.ToSingle(bytes, 0);
// gives 10.2999992 or so
BitConverter
even has this built in for the 64-bit equivalent:
double myDouble = 10.3;
long myLong = BitConverter.DoubleToInt64Bits(myDouble);
myLong--;
myDouble = BitConverter.Int64BitsToDouble(myLong); // gives 10.2999999...
However, as Peter Ruderman points out, a simple decrement of the underlying int
doesn't reliably give you the next-smallest float
.
In particular, for negative numbers, you need to increment the integer in order to make the float more negative. For float
zero, the next smallest int
actually corresponds to NaN
, so you need a special case there.
Here are a couple of functions I've knocked together that should cope with these cases in general; it also looks like it sensibly travels between large numbers and positive/negative infinity, too! I've used unsafe conversions to reduce code length, but you can stick to the byte conversions above if you wish:
static unsafe float Increment(float f)
{
int val = *(int*)&f;
if (f > 0)
val++;
else if (f < 0)
val--;
else if (f == 0)
return float.Epsilon;
return *(float*)&val;
}
static unsafe float Decrement(float f)
{
int val = *(int*)&f;
if (f > 0)
val--;
else if (f < 0)
val++;
else if (f == 0)
return -float.Epsilon; // thanks to Sebastian Negraszus
return *(float*)&val;
}
As Jeppe points out, you probably also want to
if (float.IsNaN(f)) return f;
, so you don't accidentally increment or decrement a NaN
and give something which is a number.float.PositiveInfinity
or .NegativeInfinity
, since, mathematically speaking, those should probably stay constant under increment/decrement.There is a library routine for this, nexttowardf(x, -INFINITY);
.
If you want to do it with your own code, you can do it natively (in IEEE 754 floating-point operations, without accessing the floating-point encoding or components) as shown below (in C).
Two versions are provided, one that uses a subnormal value in each case (might be slow on some processors) and one that uses a subnormal only if the input is small (but it has a branch). +INFINITY is not supported as an input, although support could be added with a simple test. This is written for double
, but the changes for float
are straightforward.
If CompileMain
is defined, it also includes a test program.
#include <float.h>
#include <math.h>
/* Return the next floating-point value before the finite value q.
This was inspired by Algorithm 3.5 in Siegfried M. Rump, Takeshi Ogita, and
Shin'ichi Oishi, "Accurate Floating-Point Summation", _Technical Report
05.12_, Faculty for Information and Communication Sciences, Hamburg
University of Technology, November 13, 2005.
*/
double NextBefore(double q)
{
// SmallestPositive is the smallest positive floating-point number.
static const double SmallestPositive = DBL_EPSILON * DBL_MIN;
/* Scale is .625 ULP, so multiplying it by any significand in [1, 2)
yields something in [.625 ULP, 1.25 ULP].
*/
static const double Scale = 0.625 * DBL_EPSILON;
#if 0
/* This version has a branch but uses subnormal values only if q is so
small that q * Scale is subnormal.
*/
double increment = fabs(q)*Scale;
if (0. == increment)
return q - SmallestPositive;
return q - increment;
#else
/* This version uses a subnormal, SmallestPositive, in each case.
This might cause poor performance on some processors.
*/
return q - fmax(SmallestPositive, fabs(q)*Scale);
#endif
}
#if defined CompileMain
#include <stdio.h>
#include <stdlib.h>
#define NumberOf(a) (sizeof (a) / sizeof *(a))
int main(void)
{
int status = EXIT_SUCCESS;
static const struct { double in, out; } cases[] =
{
{ -INFINITY, -INFINITY },
{ -0x1.fffffffffffffp1023, -INFINITY },
{ -0x1.ffffffffffffep1023, -0x1.fffffffffffffp1023 },
{ -0x1.ffffffffffffdp1023, -0x1.ffffffffffffep1023 },
{ -0x1.ffffffffffffcp1023, -0x1.ffffffffffffdp1023 },
{ -0x1.0000000000003p1023, -0x1.0000000000004p1023 },
{ -0x1.0000000000002p1023, -0x1.0000000000003p1023 },
{ -0x1.0000000000001p1023, -0x1.0000000000002p1023 },
{ -0x1.0000000000000p1023, -0x1.0000000000001p1023 },
{ -0x1.fffffffffffffp1022, -0x1.0000000000000p1023 },
{ -0x1.fffffffffffffp1, -0x1.0000000000000p2 },
{ -0x1.ffffffffffffep1, -0x1.fffffffffffffp1 },
{ -0x1.ffffffffffffdp1, -0x1.ffffffffffffep1 },
{ -0x1.ffffffffffffcp1, -0x1.ffffffffffffdp1 },
{ -0x1.0000000000003p1, -0x1.0000000000004p1 },
{ -0x1.0000000000002p1, -0x1.0000000000003p1 },
{ -0x1.0000000000001p1, -0x1.0000000000002p1 },
{ -0x1.0000000000000p1, -0x1.0000000000001p1 },
{ -0x1.fffffffffffffp-1022, -0x1.0000000000000p-1021 },
{ -0x1.ffffffffffffep-1022, -0x1.fffffffffffffp-1022 },
{ -0x1.ffffffffffffdp-1022, -0x1.ffffffffffffep-1022 },
{ -0x1.ffffffffffffcp-1022, -0x1.ffffffffffffdp-1022 },
{ -0x1.0000000000003p-1022, -0x1.0000000000004p-1022 },
{ -0x1.0000000000002p-1022, -0x1.0000000000003p-1022 },
{ -0x1.0000000000001p-1022, -0x1.0000000000002p-1022 },
{ -0x1.0000000000000p-1022, -0x1.0000000000001p-1022 },
{ -0x0.fffffffffffffp-1022, -0x1.0000000000000p-1022 },
{ -0x0.ffffffffffffep-1022, -0x0.fffffffffffffp-1022 },
{ -0x0.ffffffffffffdp-1022, -0x0.ffffffffffffep-1022 },
{ -0x0.ffffffffffffcp-1022, -0x0.ffffffffffffdp-1022 },
{ -0x0.0000000000003p-1022, -0x0.0000000000004p-1022 },
{ -0x0.0000000000002p-1022, -0x0.0000000000003p-1022 },
{ -0x0.0000000000001p-1022, -0x0.0000000000002p-1022 },
{ -0x0.0000000000000p-1022, -0x0.0000000000001p-1022 },
{ +0x1.fffffffffffffp1023, +0x1.ffffffffffffep1023 },
{ +0x1.ffffffffffffep1023, +0x1.ffffffffffffdp1023 },
{ +0x1.ffffffffffffdp1023, +0x1.ffffffffffffcp1023 },
{ +0x1.0000000000004p1023, +0x1.0000000000003p1023 },
{ +0x1.0000000000003p1023, +0x1.0000000000002p1023 },
{ +0x1.0000000000002p1023, +0x1.0000000000001p1023 },
{ +0x1.0000000000001p1023, +0x1.0000000000000p1023 },
{ +0x1.0000000000000p1023, +0x1.fffffffffffffp1022 },
{ +0x1.0000000000000p2, +0x1.fffffffffffffp1 },
{ +0x1.fffffffffffffp1, +0x1.ffffffffffffep1 },
{ +0x1.ffffffffffffep1, +0x1.ffffffffffffdp1 },
{ +0x1.ffffffffffffdp1, +0x1.ffffffffffffcp1 },
{ +0x1.0000000000004p1, +0x1.0000000000003p1 },
{ +0x1.0000000000003p1, +0x1.0000000000002p1 },
{ +0x1.0000000000002p1, +0x1.0000000000001p1 },
{ +0x1.0000000000001p1, +0x1.0000000000000p1 },
{ +0x1.0000000000000p-1021, +0x1.fffffffffffffp-1022 },
{ +0x1.fffffffffffffp-1022, +0x1.ffffffffffffep-1022 },
{ +0x1.ffffffffffffep-1022, +0x1.ffffffffffffdp-1022 },
{ +0x1.ffffffffffffdp-1022, +0x1.ffffffffffffcp-1022 },
{ +0x1.0000000000004p-1022, +0x1.0000000000003p-1022 },
{ +0x1.0000000000003p-1022, +0x1.0000000000002p-1022 },
{ +0x1.0000000000002p-1022, +0x1.0000000000001p-1022 },
{ +0x1.0000000000001p-1022, +0x1.0000000000000p-1022 },
{ +0x1.0000000000000p-1022, +0x0.fffffffffffffp-1022 },
{ +0x0.fffffffffffffp-1022, +0x0.ffffffffffffep-1022 },
{ +0x0.ffffffffffffep-1022, +0x0.ffffffffffffdp-1022 },
{ +0x0.ffffffffffffdp-1022, +0x0.ffffffffffffcp-1022 },
{ +0x0.0000000000004p-1022, +0x0.0000000000003p-1022 },
{ +0x0.0000000000003p-1022, +0x0.0000000000002p-1022 },
{ +0x0.0000000000002p-1022, +0x0.0000000000001p-1022 },
{ +0x0.0000000000001p-1022, +0x0.0000000000000p-1022 },
};
for (int i = 0; i < NumberOf(cases); ++i)
{
double in = cases[i].in, expected = cases[i].out;
double observed = NextBefore(in);
printf("NextBefore(%a) = %a.\n", in, observed);
if (! (observed == expected))
{
printf("\tError, expected %a.\n", expected);
status = EXIT_FAILURE;
}
}
return status;
}
#endif // defined CompileMain
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With