What is the big difference between a global const pointer and a global reference for the VS2010 optimizer? Why isn't the access through the reference resolved to a direct access to the referenced variable?
typedef unsigned char byte_t;
typedef unsigned short word_t;
// Two consecutive bytes, used as the byte-wise view of a word_reg_t.
struct byte_reg_t
{
byte_t low; // first byte in memory
byte_t high; // second byte in memory
};
// Overlays a word_t with its individual bytes, so the same storage can be
// accessed either as one word or byte by byte.
union word_reg_t
{
word_t value; // whole-word view
byte_reg_t part; // byte-wise view; part.low starts at the union's own address
};
// The single global object that every access in main() targets.
word_reg_t r16;
// Global reference bound to the first byte of r16 -- the case under test.
byte_t& low_ref = r16.part.low;
// Global const pointer to the very same byte, for comparison with low_ref.
byte_t* const low_ptr = &r16.part.low;
// Emits one nop through MSVC inline assembly. Placed between statements in
// main() so each statement's generated code can be told apart in the listing.
#define SPLIT() _asm nop;
// Test driver: writes one byte through the global reference, then reads that
// same byte back in three different ways so the generated code for each access
// path can be compared. SPLIT() separates the statements in the output.
int main()
{
// Store through the global reference.
low_ref = 4;
SPLIT()
// Read 1: direct member access on the global object.
byte_t a = r16.part.low;
SPLIT()
// Read 2: through the global reference (the access the question is about).
byte_t b = low_ref;
SPLIT()
// Read 3: through the global const pointer.
byte_t c = *low_ptr;
SPLIT()
// All three reads alias the byte written above, so this yields 4+4+4 = 12.
// Returning the sum keeps a, b and c in use.
return a+b+c;
}
Compiling this in Release mode with assembly output produces these results:
;byte_t a = r16.part.low;
mov cl, BYTE PTR ?r16@@3Tword_reg_t@@A
;byte_t b = low_ref;
mov edx, DWORD PTR ?low_ref@@3AAEA ; low_ref
mov dl, BYTE PTR [edx]
;byte_t c = *low_ptr;
mov al, BYTE PTR ?r16@@3Tword_reg_t@@A
unmodified disassembly
.text:00401000 _main proc near ; CODE XREF: __tmainCRTStartup+11D
.text:00401000 mov eax, ?low_ref@@3AAEA ; uchar & low_ref
.text:00401005 mov byte ptr [eax], 4
.text:00401008 nop
.text:00401009 mov cl, ?r16@@3Tword_reg_t@@A ; word_reg_t r16
.text:0040100F nop
.text:00401010 mov edx, ?low_ref@@3AAEA ; uchar & low_ref
.text:00401016 mov dl, [edx]
.text:00401018 nop
.text:00401019 mov al, ?r16@@3Tword_reg_t@@A ; word_reg_t r16
.text:0040101E nop
.text:0040101F movzx eax, al
.text:00401022 movzx edx, dl
.text:00401025 movzx ecx, cl
.text:00401028 add eax, edx
.text:0040102A add eax, ecx
.text:0040102C retn
.text:0040102C _main endp
.data:00403374 ?r16@@3Tword_reg_t@@A db ? ; DATA XREF: _main+9
.data:00403374 ; _main+19
.data:00403375 align 4
.data:00403018 ; unsigned char & low_ref
.data:00403018 ?low_ref@@3AAEA dd offset ?r16@@3Tword_reg_t@@A ; DATA XREF: _main
.data:00403018 ; _main+10
.data:00403018 ; word_reg_t r16
I tested several variants (returning the value from a function, etc.) - the access is never resolved when low_ref is used.
UPDATE
It seems to be an uncommon case for optimization - thanks, Michael Burr.
It works if the reference is declared in function scope - or inside a class or struct instantiated in function scope (but it's still strange that the optimizer resolves the const pointer but not the reference - the two are 100% identical).
UPDATE 2
It's even stranger - if you switch from byte_t to int, both are resolved: the const pointer and the reference.
So the optimizer treats const pointers and references slightly differently, depending on the scope of the reference... and on the referenced type... sometimes :)
UPDATE 3
simpler testcode - checked with VS2010 and clang 3.1
// Reduced test case: one byte-sized and one dword-sized global, each reachable
// through a global reference and through a global const pointer.
typedef unsigned char byte_t;
typedef unsigned int dword_t;
// MSVC variant: two nops via inline assembly mark the boundary between statements.
#define SPLIT() _asm nop _asm nop;
// clang variant of the same marker (enable this line and disable the MSVC one).
//#define SPLIT() asm("nop"); asm("nop");
// The two globals every access in main() ultimately reads.
byte_t byte;
dword_t dword;
// Global references to the two globals.
byte_t& global_ref_byte = byte;
dword_t& global_ref_dword = dword;
// Global const pointers to the same two globals.
byte_t* const global_ptrc_byte = &byte;
dword_t* const global_ptrc_dword = &dword;
// Test driver: stores a value into both globals, then reads each global back
// through a global reference, a global const pointer and a function-local
// reference. SPLIT() separates the six reads in the generated code.
int main(int argc, char** argv)
{
// Function-local references to the same globals, for comparison with the
// global references.
byte_t& local_ref_byte = byte;
dword_t& local_ref_dword = dword;
// Uses argv's pointer value as the stored value -- presumably so the compiler
// cannot know it at build time and fold the loads below into constants.
dword_t random = (dword_t)argv;
// The cast keeps only the low 8 bits for the byte-sized global.
byte = (byte_t)random;
dword = (dword_t)random;
SPLIT()
// Read through the global byte reference.
byte_t a = global_ref_byte;
SPLIT()
// Read through the global dword reference.
dword_t b = global_ref_dword;
SPLIT()
// Read through the global const byte pointer.
byte_t c = *global_ptrc_byte;
SPLIT()
// Read through the global const dword pointer.
dword_t d = *global_ptrc_dword;
SPLIT()
// Read through the local byte reference.
byte_t e = local_ref_byte;
SPLIT()
// Read through the local dword reference.
dword_t f = local_ref_dword;
SPLIT()
// Summing all six values keeps every read in use.
dword_t result = a+b+c+d+e+f;
return result;
}
VS2010 disassembly
.text:00401000 ; int __cdecl main(int argc, const char **argv, const char **envp)
; VS2010 output for the reduced test. The nop pairs come from SPLIT() and
; delimit the six reads a..f in source order. byte_403374 and dword_403378
; receive the two initial stores, so they are the globals `byte` and `dword`.
.text:00401000 _main proc near ; CODE XREF: ___tmainCRTStartup+11D
.text:00401000
.text:00401000 argc = dword ptr 8
.text:00401000 argv = dword ptr 0Ch
.text:00401000 envp = dword ptr 10h
.text:00401000
.text:00401000 push ebp
.text:00401001 mov ebp, esp
; eax = argv, i.e. the source variable `random`
.text:00401003 mov eax, [ebp+argv]
; ebx/esi/edi are saved because the body below uses bl, esi and edi
.text:00401006 push ebx
.text:00401007 push esi
.text:00401008 push edi
; byte = (byte_t)random
.text:00401009 mov byte_403374, al
; dword = random
.text:0040100E mov dword_403378, eax
.text:00401013 nop
.text:00401014 nop
; a = global_ref_byte: NOT resolved -- an address is loaded from off_40301C
; (presumably the storage of the reference) and then dereferenced
.text:00401015 mov eax, off_40301C
.text:0040101A mov al, [eax]
.text:0040101C nop
.text:0040101D nop
; b = global_ref_dword: resolved to a direct load of `dword`
.text:0040101E mov ecx, dword_403378
.text:00401024 nop
.text:00401025 nop
; c = *global_ptrc_byte: resolved to a direct load of `byte`
.text:00401026 mov dl, byte_403374
.text:0040102C nop
.text:0040102D nop
; d = *global_ptrc_dword: direct load
.text:0040102E mov esi, dword_403378
.text:00401034 nop
.text:00401035 nop
; e = local_ref_byte: direct load
.text:00401036 mov bl, byte_403374
.text:0040103C nop
.text:0040103D nop
; f = local_ref_dword: direct load
.text:0040103E mov edi, dword_403378
.text:00401044 nop
.text:00401045 nop
; result = a+b+c+d+e+f: byte values are zero-extended, the sum is built in
; eax, interleaved with restoring the saved registers
.text:00401046 movzx edx, dl
.text:00401049 movzx ebx, bl
.text:0040104C add edx, edi
.text:0040104E movzx eax, al
.text:00401051 add edx, ebx
.text:00401053 add eax, edx
.text:00401055 pop edi
.text:00401056 add eax, esi
.text:00401058 pop esi
.text:00401059 add eax, ecx
.text:0040105B pop ebx
.text:0040105C pop ebp
.text:0040105D retn
.text:0040105D _main endp
clang 3.1 disassembly
.text:004012E0 sub_4012E0 proc near ; CODE XREF: sub_401020+91
; clang 3.1 output for the same reduced test (this is main; arg_4 at [ebp+0Ch]
; is argv). byte_402000 and dword_402004 receive the two initial stores, so
; they are the globals `byte` and `dword`. All six reads a..f are direct loads
; from those two globals; no access goes through a stored reference address.
.text:004012E0
.text:004012E0 arg_4 = dword ptr 0Ch
.text:004012E0
.text:004012E0 push ebp
.text:004012E1 mov ebp, esp
; call to a helper that is not identifiable from this listing
.text:004012E3 call sub_4014F0
; eax = argv, i.e. the source variable `random`
.text:004012E8 mov eax, [ebp+arg_4]
; byte = (byte_t)random
.text:004012EB mov byte_402000, al
; dword = random
.text:004012F0 mov dword_402004, eax
.text:004012F5 nop
.text:004012F6 nop
; a = global_ref_byte: direct zero-extending load
.text:004012F7 movzx eax, byte_402000
.text:004012FE nop
.text:004012FF nop
; b = global_ref_dword: added straight from memory into the running sum
.text:00401300 add eax, dword_402004
.text:00401306 nop
.text:00401307 nop
; c = *global_ptrc_byte, then folded into the running sum (now in ecx)
.text:00401308 movzx ecx, byte_402000
.text:0040130F add ecx, eax
.text:00401311 nop
.text:00401312 nop
; d = *global_ptrc_dword
.text:00401313 add ecx, dword_402004
.text:00401319 nop
.text:0040131A nop
; e = local_ref_byte, then folded into the running sum (back in eax)
.text:0040131B movzx eax, byte_402000
.text:00401322 add eax, ecx
.text:00401324 nop
.text:00401325 nop
; f = local_ref_dword; eax now holds the return value
.text:00401326 add eax, dword_402004
.text:0040132C nop
.text:0040132D nop
.text:0040132E pop ebp
.text:0040132F retn
.text:0040132F sub_4012E0 endp
Without the nops, both optimizers can produce better code - but clang's is still better.
VS2010 (more code because of the unresolved byte reference)
.text:00401003 mov eax, [ebp+argv]
.text:00401006 movzx ecx, al
.text:00401009 lea edx, [eax+eax*2]
.text:0040100C mov byte_403374, al
.text:00401011 mov dword_403378, eax
.text:00401016 lea eax, [edx+ecx*2]
.text:00401019 mov ecx, off_40301C
.text:0040101F movzx edx, byte ptr [ecx]
.text:00401022 add eax, edx
clang 3.1:
.text:004012E8 mov eax, [ebp+arg_4]
.text:004012EB mov byte_402000, al
.text:004012F0 mov dword_402004, eax
.text:004012F5 movzx ecx, al
.text:004012F8 add ecx, eax
.text:004012FA lea eax, [ecx+ecx*2]
The main use of references is as function formal parameters, to support pass-by-reference. If a reference is passed into a function, the function works on the original object (instead of on a copy, as in pass-by-value). Changes made inside the function are reflected outside the function.
Use references when you can, and pointers when you have to. References are usually preferred over pointers whenever you don't need “reseating”. This usually means that references are most useful in a class's public interface. References typically appear on the skin of an object, and pointers on the inside.
Here's what I believe is happening: the reference is being treated similarly to a non-const global pointer. You can see this if you remove the `const` from the `low_ptr` declaration.
You can also see that if you move the reference to be local to the function the compiler is able to optimize the access through it without problem.
I'd guess that since global references are quite rare (a 'statistic' I'll admit I just made up), little effort has been put into optimizing them.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With