I'm trying improve performance for my function. Profiler points to the code at inner loop. Can I improve perfomance of that code, maybe using SSE intrinsics?
void ConvertImageFrom_R16_FLOAT_To_R32_FLOAT(char* buffer, void* convertedData, DWORD width, DWORD height, UINT rowPitch)
{
struct SINGLE_FLOAT
{
union {
struct {
unsigned __int32 R_m : 23;
unsigned __int32 R_e : 8;
unsigned __int32 R_s : 1;
};
struct {
float r;
};
};
};
C_ASSERT(sizeof(SINGLE_FLOAT) == 4); // 4 bytes
struct HALF_FLOAT
{
unsigned __int16 R_m : 10;
unsigned __int16 R_e : 5;
unsigned __int16 R_s : 1;
};
C_ASSERT(sizeof(HALF_FLOAT) == 2);
SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
for(DWORD j = 0; j< height; j++)
{
HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
for(DWORD i = 0; i< width; i++)
{
d->R_s = s->R_s;
d->R_e = s->R_e - 15 + 127;
d->R_m = s->R_m << (23-10);
d++;
s++;
}
}
}
Update:
Disassembly
; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01
TITLE Utils.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
PUBLIC ?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT
; Function compile flags: /Ogtp
; COMDAT ?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z
_TEXT SEGMENT
_buffer$ = 8 ; size = 4
tv83 = 12 ; size = 4
_convertedData$ = 12 ; size = 4
_width$ = 16 ; size = 4
_height$ = 20 ; size = 4
_rowPitch$ = 24 ; size = 4
?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z PROC ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT, COMDAT
; 323 : {
push ebp
mov ebp, esp
; 343 : for(DWORD j = 0; j< height; j++)
mov eax, DWORD PTR _height$[ebp]
push esi
mov esi, DWORD PTR _convertedData$[ebp]
test eax, eax
je SHORT $LN4@ConvertIma
; 324 : union SINGLE_FLOAT {
; 325 : struct {
; 326 : unsigned __int32 R_m : 23;
; 327 : unsigned __int32 R_e : 8;
; 328 : unsigned __int32 R_s : 1;
; 329 : };
; 330 : struct {
; 331 : float r;
; 332 : };
; 333 : };
; 334 : C_ASSERT(sizeof(SINGLE_FLOAT) == 4);
; 335 : struct HALF_FLOAT
; 336 : {
; 337 : unsigned __int16 R_m : 10;
; 338 : unsigned __int16 R_e : 5;
; 339 : unsigned __int16 R_s : 1;
; 340 : };
; 341 : C_ASSERT(sizeof(HALF_FLOAT) == 2);
; 342 : SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
push ebx
mov ebx, DWORD PTR _buffer$[ebp]
push edi
mov DWORD PTR tv83[ebp], eax
$LL13@ConvertIma:
; 344 : {
; 345 : HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
; 346 : for(DWORD i = 0; i< width; i++)
mov edi, DWORD PTR _width$[ebp]
mov edx, ebx
test edi, edi
je SHORT $LN5@ConvertIma
npad 1
$LL3@ConvertIma:
; 347 : {
; 348 : d->R_s = s->R_s;
movzx ecx, WORD PTR [edx]
movzx eax, WORD PTR [edx]
shl ecx, 16 ; 00000010H
xor ecx, DWORD PTR [esi]
shl eax, 16 ; 00000010H
and ecx, 2147483647 ; 7fffffffH
xor ecx, eax
mov DWORD PTR [esi], ecx
; 349 : d->R_e = s->R_e - 15 + 127;
movzx eax, WORD PTR [edx]
shr eax, 10 ; 0000000aH
and eax, 31 ; 0000001fH
add eax, 112 ; 00000070H
shl eax, 23 ; 00000017H
xor eax, ecx
and eax, 2139095040 ; 7f800000H
xor eax, ecx
mov DWORD PTR [esi], eax
; 350 : d->R_m = s->R_m << (23-10);
movzx ecx, WORD PTR [edx]
and ecx, 1023 ; 000003ffH
shl ecx, 13 ; 0000000dH
and eax, -8388608 ; ff800000H
or ecx, eax
mov DWORD PTR [esi], ecx
; 351 : d++;
add esi, 4
; 352 : s++;
add edx, 2
dec edi
jne SHORT $LL3@ConvertIma
$LN5@ConvertIma:
; 343 : for(DWORD j = 0; j< height; j++)
add ebx, DWORD PTR _rowPitch$[ebp]
dec DWORD PTR tv83[ebp]
jne SHORT $LL13@ConvertIma
pop edi
pop ebx
$LN4@ConvertIma:
pop esi
; 353 : }
; 354 : }
; 355 : }
pop ebp
ret 0
?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z ENDP ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT
_TEXT ENDS
The x86 F16C instruction-set extension adds hardware support for converting single-precision float vectors to/from vectors of half-precision float.
The format is the same IEEE 754 half-precision binary16 that you describe. I didn't check that the endianness is the same as your struct, but that's easy to fix if needed (with a pshufb).
F16C is supported starting from Intel IvyBridge and AMD Piledriver. (And has its own CPUID feature bit, which your code should check for, otherwise fall back to SIMD integer shifts and shuffles).
The intrinsics for VCVTPS2PH are:
__m128i _mm_cvtps_ph ( __m128 m1, const int imm);
__m128i _mm256_cvtps_ph(__m256 m1, const int imm);
The immediate byte is a rounding control. The compiler can use it as a convert-and-store directly to memory (unlike most instructions that can optionally use a memory operand, where it's the source operand that can be memory instead of a register.)
VCVTPH2PS goes the other way, and is just like most other SSE instructions (can be used between registers or as a load).
__m128 _mm_cvtph_ps ( __m128i m1);
__m256 _mm256_cvtph_ps ( __m128i m1)
F16C is so efficient that you might want to consider leaving your image in half-precision format, and converting on the fly every time you need a vector of data from it. This is great for your cache footprint.
Accessing bitfields in memory can be really tricky, depending on the architecture, of course.
You might achieve better performance if you would make a union of a float and a 32 bit integer, and simply perform all decomposition and composition using a local variables. That way the generated code could perform the entire operation using only processor registers.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With