How can I optimize conversion from half-precision float16 to single-precision float32?

Question

I'm trying improve performance for my function. Profiler points to the code at inner loop. Can I improve perfomance of that code, maybe using SSE intrinsics?

void ConvertImageFrom_R16_FLOAT_To_R32_FLOAT(char* buffer, void* convertedData, DWORD width, DWORD height, UINT rowPitch)
{
    struct SINGLE_FLOAT
    {
        union {
            struct {
                unsigned __int32 R_m : 23;
                unsigned __int32 R_e : 8;
                unsigned __int32 R_s : 1;
            };
            struct {
                float r;
            };
        };
    };
    C_ASSERT(sizeof(SINGLE_FLOAT) == 4); // 4 bytes
    struct HALF_FLOAT
    {
        unsigned __int16 R_m : 10;
        unsigned __int16 R_e : 5;
        unsigned __int16 R_s : 1;
    };
    C_ASSERT(sizeof(HALF_FLOAT) == 2);
    SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
    for(DWORD j = 0; j< height; j++)
    {
        HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
        for(DWORD i = 0; i< width; i++)
        {
            d->R_s = s->R_s;
            d->R_e = s->R_e - 15 + 127;
            d->R_m = s->R_m << (23-10);
            d++;
            s++;
        }
    }
}

Update:

Disassembly

; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01 

    TITLE   Utils.cpp
    .686P
    .XMM
    include listing.inc
    .model  flat

INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES

PUBLIC  ?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT
; Function compile flags: /Ogtp
;   COMDAT ?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z
_TEXT   SEGMENT
_buffer$ = 8                        ; size = 4
tv83 = 12                       ; size = 4
_convertedData$ = 12                    ; size = 4
_width$ = 16                        ; size = 4
_height$ = 20                       ; size = 4
_rowPitch$ = 24                     ; size = 4
?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z PROC ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT, COMDAT

; 323  : {

    push    ebp
    mov ebp, esp

; 343  :    for(DWORD j = 0; j< height; j++)

    mov eax, DWORD PTR _height$[ebp]
    push    esi
    mov esi, DWORD PTR _convertedData$[ebp]
    test    eax, eax
    je  SHORT $LN4@ConvertIma

; 324  :    union SINGLE_FLOAT {
; 325  :        struct {
; 326  :            unsigned __int32 R_m : 23;
; 327  :            unsigned __int32 R_e : 8;
; 328  :            unsigned __int32 R_s : 1;
; 329  :        };
; 330  :        struct {
; 331  :            float r;
; 332  :        };
; 333  :    };
; 334  :    C_ASSERT(sizeof(SINGLE_FLOAT) == 4);
; 335  :    struct HALF_FLOAT
; 336  :    {
; 337  :        unsigned __int16 R_m : 10;
; 338  :        unsigned __int16 R_e : 5;
; 339  :        unsigned __int16 R_s : 1;
; 340  :    };
; 341  :    C_ASSERT(sizeof(HALF_FLOAT) == 2);
; 342  :    SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;

    push    ebx
    mov ebx, DWORD PTR _buffer$[ebp]
    push    edi
    mov DWORD PTR tv83[ebp], eax
$LL13@ConvertIma:

; 344  :    {
; 345  :        HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
; 346  :        for(DWORD i = 0; i< width; i++)

    mov edi, DWORD PTR _width$[ebp]
    mov edx, ebx
    test    edi, edi
    je  SHORT $LN5@ConvertIma
    npad    1
$LL3@ConvertIma:

; 347  :        {
; 348  :            d->R_s = s->R_s;

    movzx   ecx, WORD PTR [edx]
    movzx   eax, WORD PTR [edx]
    shl ecx, 16                 ; 00000010H
    xor ecx, DWORD PTR [esi]
    shl eax, 16                 ; 00000010H
    and ecx, 2147483647             ; 7fffffffH
    xor ecx, eax
    mov DWORD PTR [esi], ecx

; 349  :            d->R_e = s->R_e - 15 + 127;

    movzx   eax, WORD PTR [edx]
    shr eax, 10                 ; 0000000aH
    and eax, 31                 ; 0000001fH
    add eax, 112                ; 00000070H
    shl eax, 23                 ; 00000017H
    xor eax, ecx
    and eax, 2139095040             ; 7f800000H
    xor eax, ecx
    mov DWORD PTR [esi], eax

; 350  :            d->R_m = s->R_m << (23-10);

    movzx   ecx, WORD PTR [edx]
    and ecx, 1023               ; 000003ffH
    shl ecx, 13                 ; 0000000dH
    and eax, -8388608               ; ff800000H
    or  ecx, eax
    mov DWORD PTR [esi], ecx

; 351  :            d++;

    add esi, 4

; 352  :            s++;

    add edx, 2
    dec edi
    jne SHORT $LL3@ConvertIma
$LN5@ConvertIma:

; 343  :    for(DWORD j = 0; j< height; j++)

    add ebx, DWORD PTR _rowPitch$[ebp]
    dec DWORD PTR tv83[ebp]
    jne SHORT $LL13@ConvertIma
    pop edi
    pop ebx
$LN4@ConvertIma:
    pop esi

; 353  :        }
; 354  :    }
; 355  : }

    pop ebp
    ret 0
?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z ENDP ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT
_TEXT   ENDS

Peter Cordes · Accepted Answer

The x86 F16C instruction-set extension adds hardware support for converting single-precision float vectors to/from vectors of half-precision float.

The format is the same IEEE 754 half-precision binary16 that you describe. I didn't check that the endianness is the same as your struct, but that's easy to fix if needed (with a pshufb).

F16C is supported starting from Intel IvyBridge and AMD Piledriver. (And has its own CPUID feature bit, which your code should check for, otherwise fall back to SIMD integer shifts and shuffles).

The intrinsics for VCVTPS2PH are:

__m128i _mm_cvtps_ph ( __m128 m1, const int imm);
__m128i _mm256_cvtps_ph(__m256 m1, const int imm);

The immediate byte is a rounding control. The compiler can use it as a convert-and-store directly to memory (unlike most instructions that can optionally use a memory operand, where it's the source operand that can be memory instead of a register.)

VCVTPH2PS goes the other way, and is just like most other SSE instructions (can be used between registers or as a load).

__m128 _mm_cvtph_ps ( __m128i m1);
__m256 _mm256_cvtph_ps ( __m128i m1)

F16C is so efficient that you might want to consider leaving your image in half-precision format, and converting on the fly every time you need a vector of data from it. This is great for your cache footprint.

Lindydancer · Answer

Accessing bitfields in memory can be really tricky, depending on the architecture, of course.

You might achieve better performance if you would make a union of a float and a 32 bit integer, and simply perform all decomposition and composition using a local variables. That way the generated code could perform the entire operation using only processor registers.

How can I optimize conversion from half-precision float16 to single-precision float32?

Tags:

c++

performance

precision

sse

intrinsics

KindDragon

2 Answers

Peter Cordes

Lindydancer

Recent Activity

Donate For Us

How can I optimize conversion from half-precision float16 to single-precision float32?

Tags:

c++

performance

precision

sse

intrinsics

KindDragon

2 Answers

Peter Cordes

Lindydancer

Related questions

Recent Activity

Donate For Us