How to convert floating point algorithm to fixed point?

Tags:

After reading quite a bit about fixed-point arithmetic I think I can say I've understood the basics, unfortunately I don't know yet how to convert routines that use sin/cos/sqrt or any other fp function.

Consider this simple mcve:

#include <math.h>
#include <stdio.h>
#include <ctime>
#include <fstream>
#include <iostream>

typedef char S8;
typedef short S16;
typedef int S32;
typedef unsigned char U8;
typedef unsigned short U16;
typedef unsigned int U32;
typedef float F32;
typedef double F64;

// -------- Fixed point helpers QM.N(32bits) --------
typedef S32 FP32;

#define LUT_SIZE_BITS 9  // 0xffffffff>>(32-9)=511; 32-23=9; 2^9=512
#define LUT_SIZE 512

#define FRACT_BITS 28        // Number fractional bits
#define M (1 << FRACT_BITS)  // Scaling factor

inline F32 Q2F(FP32 X) { return ((F32)X / (F32)(M)); }
inline FP32 F2Q(F32 X) { return (FP32)(X * (M)); }

const F32 PI = 3.141592653589793f;
const F32 pi = 3.141592653589793f;
const U32 WIDTH = 256;
const U32 HEIGHT = 256;

FP32 cos_table[LUT_SIZE];
FP32 sin_table[LUT_SIZE];

void init_luts() {
    const F32 deg_to_rad = PI / 180.f;
    const F32 sample_to_deg = 1 / LUT_SIZE * 360.f;
    for (S32 i = 0; i < LUT_SIZE; i++) {
        F32 rad = ((F32)i * sample_to_deg) * deg_to_rad;
        F32 c = cos(rad);
        F32 s = sin(rad);
        cos_table[i] = F2Q(c);
        sin_table[i] = F2Q(s);
    }
}

// -------- Image processing --------
U8 clamp(F32 valor) { return valor > 255 ? 255 : (valor < 0 ? 0 : (U8)valor); }

struct Pbits {
    U32 width;
    U32 height;
    U8 *data;

    Pbits(U32 width, U32 height, U8 *data) {
        this->width = width;
        this->height = height;
        this->data = data;
    }

    Pbits(Pbits *src) {
        this->width = src->width;
        this->height = src->height;
        this->data = new U8[src->width * src->height * 3];
        memcpy(this->data, src->data, width * height * 3);
    }

    ~Pbits() { delete this->data; }

    void to_bgr() {
        U8 r, g, b;
        for (S32 y = 0; y < height; y++) {
            for (S32 x = 0; x < width; x++) {
                get_pixel(y, x, r, g, b);
                set_pixel(y, x, b, g, r);
            }
        }
    }

    void get_pixel(U32 y, U32 x, U8 &r, U8 &g, U8 &b) {
        U32 offset = (y * height * 3) + (x * 3);
        r = this->data[offset + 0];
        g = this->data[offset + 1];
        b = this->data[offset + 2];
    }

    void set_pixel(U32 y, U32 x, U8 c1, U8 c2, U8 c3) {
        U32 offset = (y * height * 3) + (x * 3);

        data[offset] = c1;
        data[offset + 1] = c2;
        data[offset + 2] = c3;
    }
};

void fx1_plasma(Pbits *dst, F32 t, F32 k1, F32 k2, F32 k3, F32 k4, F32 k5, F32 k6) {
    U32 height = dst->height;
    U32 width = dst->width;

    for (U32 y = 0; y < height; y++) {
        F32 uv_y = (F32)y / height;
        for (U32 x = 0; x < width; x++) {
            F32 uv_x = (F32)x / width;

            F32 v1 = sin(uv_x * k1 + t);
            F32 v2 = sin(k1 * (uv_x * sin(t) + uv_y * cos(t / k2)) + t);
            F32 cx = uv_x + sin(t / k1) * k1;
            F32 cy = uv_y + sin(t / k2) * k1;
            F32 v3 = sin(sqrt(k3 * (cx * cx + cy * cy)) + t);
            F32 vf = v1 + v2 + v3;

            U8 r = (U8)clamp(255 * cos(vf * pi));
            U8 g = (U8)clamp(255 * sin(vf * pi + k4 * pi / k2));
            U8 b = (U8)clamp(255 * cos(vf * pi + k5 * pi / k2));

            dst->set_pixel(y, x, r, g, b);
        }
    }
}

// -------- Image helpers --------
inline void _write_s32(U8 *dst, S32 offset, S32 v) {
    dst[offset] = (U8)(v);
    dst[offset + 1] = (U8)(v >> 8);
    dst[offset + 2] = (U8)(v >> 16);
    dst[offset + 3] = (U8)(v >> 24);
}

void write_bmp(Pbits *src, S8 *filename) {
    Pbits *dst = new Pbits(src);
    dst->to_bgr();

    S32 w = dst->width;
    S32 h = dst->height;
    U8 *img = dst->data;

    S32 filesize = 54 + 3 * w * h;

    U8 bmpfileheader[14] = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
    U8 bmpinfoheader[40] = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
    U8 bmppad[3] = {0, 0, 0};

    _write_s32(bmpfileheader, 2, filesize);
    _write_s32(bmpinfoheader, 4, w);
    _write_s32(bmpinfoheader, 8, h);

    FILE *f = fopen(filename, "wb");
    fwrite(bmpfileheader, 1, 14, f);
    fwrite(bmpinfoheader, 1, 40, f);
    for (S32 i = 0; i < h; i++) {
        fwrite(img + (w * (h - i - 1) * 3), 3, w, f);
        fwrite(bmppad, 1, (4 - (w * 3) % 4) % 4, f);
    }

    delete dst;
}

void write_ppm(Pbits *dst, S8 *filename) {
    std::ofstream file(filename, std::ofstream::trunc);
    if (!file.is_open()) {
        std::cout << "yep! file is not open" << std::endl;
    }
    file << "P3\n" << dst->width << " " << dst->height << "\n255\n";

    U8 r, g, b, a;
    for (U32 y = 0; y < dst->height; y++) {
        for (U32 x = 0; x < dst->width; x++) {
            dst->get_pixel(y, x, r, g, b);
            file << (S32)r << " " << (S32)g << " " << (S32)b << "\n";
        }
    }
}

S32 main() {
    Pbits *dst = new Pbits(WIDTH, HEIGHT, new U8[WIDTH * HEIGHT * 3]);

    init_luts();

    clock_t begin = clock();
    fx1_plasma(dst, 0, 8, 36, 54, 51, 48, 4);
    clock_t end = clock();
    double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;

    std::cout << "Generated plasma in " << elapsed_secs << "s" << std::endl;

    write_ppm(dst, "plasma.ppm");
    write_bmp(dst, "plasma.bmp");

    delete dst;
}

This code will generate this image:

enter image description here

QUESTION: How would you convert this floating point algorithm into a fast fixed-point one? Right now the basics of the floating point arithmetic is +/- clear to me, as in:

fa,fb=floating point values; a,b=fixed_point ones; M=scaling factor

fa = a*M
fb = b*M
fa+fb = (a+b)*M
fa-fb = (a-b)*M
fa*fb = (a*b)*M^2
fa/fb = (a/b)

but how to use sin/cos/sqrt et al in fixed-point is still eluding me. I've found this related thread but I still don't understand how to use the trigonometric luts with random fp values.

513

asked Aug 03 '19 00:08

BPL

3 Answers

The basic idea for a lookup table is simple -- you use the fixed point value as an index into an array to look up the value. The problem is if your fixed point values are large, your tables become huge. For a full table with a 32-bit FP type you need 4*2³² bytes (16GB) which is impractically large. So what you generally do is use a smaller table (smaller by a factor of N) and the linearly interpolate between two values in the table to do the lookup.

In your case, you appear to want to use a 2²³ reduction so you need a table with just 513 elements. To do the lookup, you then use the upper 9 bits as an index into the table and use the lower 23 bits to interpolate. eg:

FP32 cos_table[513] = { 268435456, ...
FP32 cosFP32(FP32 x) {
    int i = x >> 23;  // upper 9 bits to index the table
    int fract = x & 0x7fffff;  // lower 23 bits to interpolate
    return ((int64_t)cos_table[i] * ((1 << 23) - fract) + (int64_t)cos_table[i+1] * fract + (1 << 22)) >> 23;
}

Note that we have to do the multiplies in 64 bits to avoid overflows, same as any other multiplies of FP32 values.

Since cos is symmetric, you could use that symmetry to reduce the table size by another factor of 4, and use the same table for sin, but that is more work.

If your're using C++, you can define a class with overloading to encapsulate your fixed point type:

class fixed4_28 {
    int32_t  val;
    static const int64_t fract_val = 1 << 28;
 public:
    fixed4_28 operator+(fixed4_28 a) const { a.val = val + a.val; return a; }
    fixed4_28 operator-(fixed4_28 a) const { a.val = val - a.val; return a; }
    fixed4_28 operator*(fixed4_28 a) const { a.val = ((int64_t)val * a.val) >> 28; return a; }
    fixed4_28 operator/(fixed4_28 a) const { a.val = ((int64_t)val << 28) / a.val; return a; }

    fixed4_28(double v) : val(v * fract_val + 0.5) {}
    operator double() { return (double)val / fract_val; }

    friend fixed4_28 cos(fixed_4_28);
};

inline fixed4_28 cos(fixed4_28 x) {
    int i = x.val >> 23;  // upper 9 bits to index the table
    int fract = x.val & 0x7fffff;  // lower 23 bits to interpolate
    x.val = ((int64_t)cos_table[i] * ((1 << 23) - fract) + (int64_t)cos_table[i+1] * fract + (1 << 22)) >> 23;
    return x;
}

and then your code can use this type directly and you can write equations just as if you were using float or double

119

answered Oct 04 '22 16:10

Chris Dodd

For sin() and cos() the first step is range reduction, which looks like "angle = angle % degrees_in_a_circle". Sadly, these functions typically use radians, and radians are nasty because that range reduction becomes "angle = angle % (2 * PI)", which means that precision depends on the modulo of an irrational number (which is guaranteed to be "not fun").

With this in mind; you want to throw radians in the trash and invent a new "binary degrees" such that a circle is split into "powers of 2" pieces. This means that the range reduction becomes "angle = angle & MASK;" with no precision loss (and no expensive modulo). The rest of sin() and cos() (if you're using a table driven approach) is adequately described by existing answers so I won't repeat it in this answer.

The next step is to realize that "globally fixed point" is awful. Far better is what I'll call "moving point". To understand this, consider multiplication. For "globally fixed point" you might do "result_16_16 = (x_16_16 * y_16_16) >> 16" and throw away 16 bits of precision and have to worry about overflows. For "moving point" you might do "result_32_32 = x_16_16 * y_16_16" (where the decimal point is moved) and know that there is no precision loss, know that there can't be overflow, and make it faster by avoiding a shift.

For "moving point", you'd start with the actual requirements of inputs (e.g. for a number from 0.0 to 100.0 you might start with "7.4 fixed point" with 5 bits of a uint16_t unused) and explicitly manage precision and range throughput a calculation to arrive at a result that is guaranteed to be unaffected by overflow and has the best possible compromise between "number of bits" and precision at every step.

For example:

 uint16_t inputValue_7_4 = 50 << 4;                   // inputValue is actually 50.0
 uint16_t multiplier_1_1 = 3;                         // multiplier is actually 1.5
 uint16_t k_0_5 = 28;                                 // k is actually 0.875
 uint16_t divisor_2_5 = 123;                          // divisor is actually 3.84375

 uint16_t x_8_5 = inputValue_7_4 * multiplier_1_1;    // Guaranteed no overflow and no precision loss
 uint16_t y_9_5 = x_8_5 + k+0_5;                      // Guaranteed no overflow and no precision loss
 uint32_t result_9_23 = (y_9_5 << 23) / divisor_2_5;  // Guaranteed no overflow, max. possible precision kept

I'd like to do it as "mechanically" as possible

There is no reason why "moving point" can't be done purely mechanically, if you specify the characteristics of the inputs and provide a few other annotations (the desired precision of divisions, plus either any intentional precision losses or the total bits of results); given that the rules that determine the size of the result of any operation and where the point will be in that result are easily determined. However; I don't know of an existing tool that will do this mechanical conversion, so you'd have to invent your own language for "annotated expressions" and write your own tool that converts it into another language (e.g. C). It's likely to cost less developer time to just do the conversion by hand instead.

answered Oct 04 '22 16:10

Brendan

/*
very very fast
float sqrt2(float);

(-1) ^ s* (1 + n * 2 ^ -23)* (2 ^ (x - 127)) float
sxxxxxxxxnnnnnnnnnnnnnnnnnnnnnnn  float f
000000000000sxxxxxxxxnnnnnnnnnnn  int indis  20 bit
*/

#define LUT_SIZE2 0x000fffff   //1Mb  20 bit
float sqrt_tab[LUT_SIZE2];
#define sqrt2(f)     sqrt_tab[*(int*)&f>>12]  //float to int


int main()
{
    //init_luts();
    for (int i = 0; i < LUT_SIZE2; i++)
    {
        int ii = i << 12;        //i to float 
        sqrt_tab[i] = sqrt(*(float*)& ii);
    }

    float f=1234.5678;
    printf("test\n");
    printf(" sqrt(1234.5678)=%12.6f\n", sqrt(f));
    printf("sqrt2(1234.5678)=%12.6f\n", sqrt2(f));


    printf("\n\ntest mili second\n");
    int begin;
    int free;

    begin = clock();
    for (float f = 0; f < 10000000.f; f++)
        ;
    free = clock() - begin;
    printf("free        %4d\n", free);

    begin = clock();
    for (float f = 0; f < 10000000.f; f++)
        sqrt(f);
    printf("sqrt()      %4d\n", clock() - begin - free);


    begin = clock();
    for (float f = 0; f < 10000000.f; f++)
        sqrt2(f);
    printf("sqrt2()     %4d\n", clock() - begin - free);


    return 0;

}

/*
 sgrt(1234.5678)   35.136416
sgrt2(1234.5678)  35.135452

test mili second
free       73
sqrt()    146
sqrt2()    7
*/

answered Oct 04 '22 16:10

ismail akın çelik

Related questions
                            
                                How do I make CMake run a python script before building, in order to generate files for my project to be used in the build?
                            
                                Pybind11: Create and return numpy array from C++ side
                            
                                Is it safe to pass const reference to temporary/annonymous lambda into std::thread constructor?
                            
                                Shuffling by mask with Intel AVX
                            
                                std::set iterator with same key and different comparators
                            
                                template class specialization enable_if and default values
                            
                                c++ when will while(cin>>s) stop
                            
                                Is `reinterpret_cast`ing between hardware SIMD vector pointer and the corresponding type an undefined behavior?
                            
                                What does CString::GetBuffer() with no size parameter do?
                            
                                Function not returning desired string
                            
                                Compile error "error: expected ';' at end of declaration list" using Catch testing framework in C++
                            
                                disable template member function if return type is an array
                            
                                How do I suppress the "ISO C++ does not support ‘__int128’ warning?
                            
                                What is the calling convention for extern "C" in C++? [duplicate]
                            
                                Infinite loop vs infinite recursion. Are both undefined?
                            
                                const auto reference binding to (null) pointer - what is the actual type?
                            
                                Initialize array whose size is a compile-time constant to single value
                            
                                Removing class pointer from member function pointer type
                            
                                Is it bad practice to move an l-value reference parameter?
                            
                                Initialisation of aggregate with default constructor deleted in c++20

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

How to convert floating point algorithm to fixed point?

Tags:

c++

optimization

floating-point

fixed-point

BPL

People also ask

3 Answers

Chris Dodd

Brendan

ismail akın çelik

Recent Activity

Donate For Us