cast 32bit-float to 64bit-double on system where sizeof double == sizeof float == 4

Question

I am trying to serialize a float according to the BSON spec, which only supports 64-bit doubles, so I need to convert my float to a double.

On a system where sizeof(double) == 8 I would just do

// Case where sizeof(double) == 8: an ordinary cast to double is all that is needed.
float f = 3.14;
serialize((double)f);

but since sizeof(double) == 4 on my target system, I have to do something like

// Case where sizeof(double) == 4: build the 64-bit representation in a
// uint64_t and serialize that integer instead of a double.
float f = 3.14;
uint64_t d;
float32_to_float64(f, &d);
serialize(d);

I have written some test code (on a machine where sizeof(double) == 8) that tries to convert the float32 to a float64 and store the result as a uint64_t, but I am not getting the expected result.

#include <stdio.h>
#include <stdint.h>

#define FLOAT_FRACTION_MSK  0xFFFFFF

#define DOUBLE_FRACTION_S   52 // Fraction is 52 bits
#define DOUBLE_EXPONENT_S   11 // Exponent is 11 bits

#define FLOAT_FRACTION_S    23 // Fraction is 23 bits
#define FLOAT_EXPONENT_S    8  // Exponent is  8 bits

int main(void) {
    // float af = 3.14;
    float af = 0.15625;

    double bd = 0;
    //uint8_t buff[sizeof(int64_t)] = {0};

    *(uint64_t*)&bd |= (*(uint32_t*)&af & (1UL << 31)) << 32; // check sign bit


    uint8_t exponent32 = (*(uint32_t*)&af & 0x7F800000) >> (FLOAT_FRACTION_S+1);
    if (exponent32 == 0xFF) return 1; // Error (infiniti if fraction is zero,
                                      // Nan ortherwise)


    printf("exponent32=%.4x
", exponent32);
    int64_t temp = *(uint64_t*)&bd;
    *(uint64_t*)&bd |= ((uint64_t)exponent32 << (DOUBLE_FRACTION_S+4)); //& 0x7FF0000000000000; // (33); // 28
    printf("exponent64=%llx, %d
", *(uint64_t*)&bd, (DOUBLE_FRACTION_S+4));

// Do the fraction
{
    printf("fraction64=%#.8llx
", (
        (uint64_t)(
            (*(uint32_t*)&af & FLOAT_FRACTION_MSK) // + ((exponent32 != 0) ? (1<<24) : 0)
        ) << (DOUBLE_FRACTION_S-FLOAT_FRACTION_S-4)//((52-22)-1) // 33
    ) );

    *(uint64_t*)&bd |= (
        (uint64_t)(
            (*(uint32_t*)&af & FLOAT_FRACTION_MSK) // + ((exponent32 != 0) ? (1<<24) : 0)
        ) << (DOUBLE_FRACTION_S-FLOAT_FRACTION_S)
    ) ;
}


    double expected = af;
    printf("Original float=%#.4x, converted double=%#.8llx expected=%.8llx,
", *(uint32_t*)&af, *(uint64_t*)&bd, *(uint64_t*)&expected);
    printf("Original float=%f, converted double=%lf

", af, bd);

    *(uint64_t*)&bd = temp;

    return 0;
}

The output of this is: Original float=0x3e200000, converted double=0x3e04000000000000 expected=3fc4000000000000,

So it seems I am missing something when converting the exponent, but I am at a loss as to what that is.

Anonymous · Accepted Answer

Fixed to handle denormals, infinities & NaNs.

// Float2Double: return the IEEE-754 binary64 bit pattern that represents the
// same value as the binary32 float `v`, without ever using the `double` type.
//
// Handles +/-0, subnormals (renormalised, since every binary32 subnormal is a
// normal binary64 value), normal numbers, +/-infinity and NaN (the fraction
// bits are carried over unchanged, left-aligned in the 52-bit field).
//
// The result is returned as `unsigned long long`, the standard spelling of
// the 64-bit unsigned type that the compiler-specific `unsigned __int64`
// stood for.
//
// Assumes `float` is IEEE-754 binary32 and `unsigned int` is 32 bits wide.
unsigned long long Float2Double(float v)
{
    // Reinterpret the float's bytes through a union; reading it through a
    // casted pointer (*(unsigned int*)&v) would violate strict aliasing.
    union
    {
        float value;
        unsigned int bits;
    } pun;
    pun.value = v;

    unsigned int f = pun.bits;
    unsigned long long s = f >> 31;       // sign bit
    unsigned int e = (f >> 23) & 0xffu;   // biased binary32 exponent (0..255)
    unsigned int m = f & 0x007fffffu;     // 23-bit fraction

    if (e == 0xffu)
    {
        // +/-infinity or NaN: exponent field becomes all ones
        e = 0x7ffu;
    }
    else if (e != 0)
    {
        // normal number: swap the binary32 bias (127) for the binary64 one (1023)
        e += 1023 - 127;
    }
    else if (m != 0)
    {
        // subnormal: value is m * 2^-149; shift the leading 1 up to bit 23,
        // lowering the exponent once per shift, then drop that implicit 1
        e = 1023 - 127 + 1;
        while ( !(m & 0x00800000u) )
        {
            m <<= 1;
            e--;
        }
        m &= 0x007fffffu;
    }
    // else: +/-0.0, exponent and fraction are already zero

    // sign in bit 63, exponent in bits 52..62, fraction in the top 23 bits
    // of the 52-bit field (the low 29 bits stay zero)
    return (s << 63) | ((unsigned long long)e << 52) | ((unsigned long long)m << (52 - 23));
}

chux - Reinstate Monica · Answer

A post-accept answer that works with all float values.

Tested successfully with all float values, including typical normal finite numbers, subnormals, +/- zero, +/- infinity and NaN.

#include <assert.h>
#include <math.h>
#include <stdint.h>

// Field layout of the 32-bit source format: 1 sign, 8 exponent, 23 fraction bits.
#define F_SIGN_SHIFT (31)
#define F_EXPO_MAX (0xFF)
#define F_EXPO_SHIFT (23)
#define F_EXPO_MASK ((uint32_t) F_EXPO_MAX << F_EXPO_SHIFT)
#define F_EXPO_BIAS (127)
#define F_SFCT_MASK (0x7FFFFF)
#define F_SFCT_IMPLIEDBIT (F_SFCT_MASK + 1)

// Field layout of the 64-bit target format: 1 sign, 11 exponent, 52 fraction bits.
#define D_SIGN_SHIFT (63)
#define D_EXPO_MAX (0x7FF)
#define D_EXPO_SHIFT (52)
#define D_EXPO_MASK ((uint64_t) D_EXPO_MAX << D_EXPO_SHIFT)
#define D_EXPO_BIAS (1023)

// Widen the 32-bit pattern of `f` into the equivalent 64-bit pattern, using
// integer arithmetic only, and return it as a uint64_t.
// Zero, subnormal, normal, infinite and NaN inputs each map to their
// counterpart; subnormal inputs are renormalised in the process.
uint64_t IEEEbinary32float_to_IEEEbinary64int(float f) {
  assert(sizeof f == sizeof(uint32_t));
  union {
    float f;
    uint32_t u;
  } pun = { f };

  const uint32_t bits = pun.u;
  const uint64_t sign_bit = (uint64_t) (bits >> F_SIGN_SHIFT) << D_SIGN_SHIFT;
  unsigned exponent = (bits & F_EXPO_MASK) >> F_EXPO_SHIFT;
  uint32_t fraction = bits & F_SFCT_MASK;

  if (exponent == F_EXPO_MAX) {
    // Infinity or NaN: exponent field becomes all ones, fraction is kept.
    exponent = D_EXPO_MAX;
  } else if (exponent != 0) {
    // Ordinary normal number: only the bias changes.
    exponent += D_EXPO_BIAS - F_EXPO_BIAS;
  } else if (fraction != 0) {
    // Subnormal: shift left until the implied bit appears, lowering the
    // exponent once per shift, then strip the implied bit again.
    exponent = D_EXPO_BIAS - F_EXPO_BIAS + 1;
    for (; (fraction & F_SFCT_IMPLIEDBIT) == 0; fraction <<= 1) {
      exponent--;
    }
    fraction &= F_SFCT_MASK;
  }
  // Remaining case is zero: exponent and fraction are both already 0.

  return sign_bit
       | ((uint64_t) exponent << D_EXPO_SHIFT)
       | ((uint64_t) fraction << (D_EXPO_SHIFT - F_EXPO_SHIFT));
}

cast 32bit-float to 64bit-double on system where sizeof double == sizeof float == 4

Tags:

c

floating-point-conversion

FnuGk

2 Answers

Anonymous

chux - Reinstate Monica

Recent Activity

Donate For Us

cast 32bit-float to 64bit-double on system where sizeof double == sizeof float == 4

Tags:

c

floating-point-conversion

FnuGk

2 Answers

Anonymous

chux - Reinstate Monica

Related questions

Recent Activity

Donate For Us