Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Why is compiler generating 4-byte load instead of 1-byte load where the wider load may access unmapped data?

I have a byte buffer filled with variable-length records, whose length is determined by the first byte of the record. A reduced version of a C function to read a single record

void mach_parse_compressed(unsigned char* ptr, unsigned long int* val) {     if (ptr[0] < 0xC0U) {         *val = ptr[0] + ptr[1];         return;     }    *val = ((unsigned long int)(ptr[0]) << 24)       | ((unsigned long int)(ptr[1]) << 16)       | ((unsigned long int)(ptr[2]) << 8)       | ptr[3]; } 

generates assembly (GCC 5.4 -O2 -fPIC on x86_64) that loads four bytes at ptr first, compares the first byte with 0xC0, and then processes either two, either four bytes. The undefined bytes are thrown away correctly, but why does compiler think that it's safe to load four bytes in the first place? Since there is no e.g. alignment requirement for ptr, it may point to the last two bytes of a memory page that is next to an unmapped one for all we know, resulting in a crash.

Both -fPIC and -O2 or higher are required to reproduce.

Am I missing something here? Is compiler correct in doing this and how do I workaround this?

I can get the above show Valgrind/AddressSanitiser errors or a crash with mmap/mprotect:

//#define HEAP #define MMAP #ifdef MMAP #include <unistd.h> #include <sys/mman.h> #include <stdio.h> #elif HEAP #include <stdlib.h> #endif  void mach_parse_compressed(unsigned char* ptr, unsigned long int* val) {     if (ptr[0] < 0xC0U) {         *val = ptr[0] + ptr[1];         return;     }      *val = ((unsigned long int)(ptr[0]) << 24)         | ((unsigned long int)(ptr[1]) << 16)         | ((unsigned long int)(ptr[2]) << 8)         | ptr[3]; }  int main(void) {     unsigned long int val; #ifdef MMAP     int error;     long page_size = sysconf(_SC_PAGESIZE);     unsigned char *buf = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);     unsigned char *ptr = buf + page_size - 2;     if (buf == MAP_FAILED)     {         perror("mmap");         return 1;     }     error = mprotect(buf + page_size, page_size, PROT_NONE);     if (error != 0)     {         perror("mprotect");         return 2;     }     *ptr = 0xBF;     *(ptr + 1) = 0x10;     mach_parse_compressed(ptr, &val); #elif HEAP     unsigned char *buf = malloc(16384);     unsigned char *ptr = buf + 16382;     buf[16382] = 0xBF;     buf[16383] = 0x10; #else     unsigned char buf[2];     unsigned char *ptr = buf;     buf[0] = 0xBF;     buf[1] = 0x10; #endif     mach_parse_compressed(ptr, &val); } 

MMAP version:

Segmentation fault (core dumped) 

With Valgrind:

==3540== Process terminating with default action of signal 11 (SIGSEGV) ==3540==  Bad permissions for mapped region at address 0x4029000 ==3540==    at 0x400740: mach_parse_compressed (in /home/laurynas/gcc-too-wide-load/gcc-too-wide-load) ==3540==    by 0x40060A: main (in /home/laurynas/gcc-too-wide-load/gcc-too-wide-load) 

With ASan:

ASAN:SIGSEGV ================================================================= ==3548==ERROR: AddressSanitizer: SEGV on unknown address 0x7f8f4dc25000 (pc 0x000000400d8a bp 0x0fff884e56c6 sp 0x7ffc4272b620 T0)     #0 0x400d89 in mach_parse_compressed (/home/laurynas/gcc-too-wide-load/gcc-too-wide-load+0x400d89)     #1 0x400b92 in main (/home/laurynas/gcc-too-wide-load/gcc-too-wide-load+0x400b92)     #2 0x7f8f4c72082f in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2082f)     #3 0x400c58 in _start (/home/laurynas/gcc-too-wide-load/gcc-too-wide-load+0x400c58)  AddressSanitizer can not provide additional info. SUMMARY: AddressSanitizer: SEGV ??:0 mach_parse_compressed 

HEAP version with Valgrind:

==30498== Invalid read of size 4 ==30498==    at 0x400603: mach_parse_compressed (mach0data_reduced.c:9) ==30498==    by 0x4004DE: main (mach0data_reduced.c:34) ==30498==  Address 0x520703e is 16,382 bytes inside a block of size 16,384 alloc'd ==30498==    at 0x4C2DB8F: malloc (vg_replace_malloc.c:299) ==30498==    by 0x4004C0: main (mach0data_reduced.c:24) 

Stack version with ASan:

==30528==ERROR: AddressSanitizer: stack-buffer-overflow on address 0x7ffd50000440 at pc 0x000000400b63 bp 0x7ffd500003c0 sp 0x7ffd500003b0 READ of size 4 at 0x7ffd50000440 thread T0     #0 0x400b62 in mach_parse_compressed CMakeFiles/innobase.dir/mach/mach0data_reduced.c:15     #1 0x40087e in main CMakeFiles/innobase.dir/mach/mach0data_reduced.c:34     #2 0x7f3be2ce282f in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2082f)     #3 0x400948 in _start (/home/laurynas/obj-percona-5.5-release/storage/innobase/CMakeFiles/innobase.dir/mach/mach0data_test+0x400948) 

Thanks

EDIT: added MMAP version that actually crashes, clarified compiler options

EDIT 2: reported it as https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77673. For workaround, inserting a compiler memory barrier asm volatile("": : :"memory"); after the if statement resolves the issue. Thanks everyone!

like image 644
Laurynas Biveinis Avatar asked Aug 08 '16 04:08

Laurynas Biveinis


People also ask

Do modern compilers generate machine code?

If we’re talking about compilers that compile source code down to native code (e.g., C, C++, etc.), most modern compilers generate machine code directly, and will only generate assembly language source code if you specify that as an option. Object files, which are generated by the complier, contain machine code.

How is the size of a pointer determined in a compiler?

Consider a compiler where int takes 4 bytes, char takes 1 byte and pointer takes 4 bytes. Explanation: Size of an array is number of elements multiplied by the type of element, that is why we get sizeof arri as 12 and sizeof arrc as 3. Size of a pointer is fixed for a compiler.

How many bits does the processor Send to the cache?

The processor sends 32 bit addresses to the cache controller. Each cache tag directory entry contains, in addition to address tag, 2 valid bits, 1 modified bit and 1 replacement bit.


2 Answers

Congratulations! You found a genuine compiler bug!

You can use http://gcc.godbolt.org to explore assembly output from different compilers and options.

With gcc version 6.2 for x86 64-bit linux, using gcc -fPIC -O2, your function does compile to incorrect code:

mach_parse_compressed(unsigned char*, unsigned long*):     movzbl  (%rdi), %edx     movl    (%rdi), %eax   ; potentially incorrect load of 4 bytes     bswap   %eax     cmpb    $-65, %dl     jbe     .L5     movl    %eax, %eax     movq    %rax, (%rsi)     ret .L5:     movzbl  1(%rdi), %eax     addl    %eax, %edx     movslq  %edx, %rdx     movq    %rdx, (%rsi)     ret 

You correctly diagnosed the problem and the mmap example provides a good regression test. gcc is trying too hard to optimize this function and the resulting code is definitely incorrect: reading 4 bytes from an unaligned address is OK for most X86 operating environments, but reading past the end of an array is not.

The compiler could assume that reads past the end of an array are OK if they do not cross a 32 bit or even 64 bit boundary, but this assumption is incorrect for your example. You might be able to get a crash for a block allocated with malloc if you make it large enough. malloc uses mmap for very large blocks (>= 128KB by default IRCC).

Note that this bug was introduced with version 5.1 of the compiler.

clang on the other hand does not have this problem, but the code seems less efficient in the general case:

#    @mach_parse_compressed(unsigned char*, unsigned long*) mach_parse_compressed(unsigned char*, unsigned long*):              movzbl  (%rdi), %ecx     cmpq    $191, %rcx     movzbl  1(%rdi), %eax     ja      .LBB0_2     addq    %rcx, %rax     movq    %rax, (%rsi)     retq .LBB0_2:     shlq    $24, %rcx     shlq    $16, %rax     orq     %rcx, %rax     movzbl  2(%rdi), %ecx     shlq    $8, %rcx     orq     %rax, %rcx     movzbl  3(%rdi), %eax     orq     %rcx, %rax     movq    %rax, (%rsi)     retq 
like image 126
chqrlie Avatar answered Oct 09 '22 02:10

chqrlie


It seems compiler optimize access to ptr. It is possible to disable optimization to access to ptr just adding keyword volatile. In this case there is no crash for MMAP variant.

//#define HEAP #define MMAP #ifdef MMAP #include <unistd.h> #include <sys/mman.h> #include <stdio.h> #elif HEAP #include <stdlib.h> #endif  void mach_parse_compressed(volatile unsigned char* ptr, unsigned long int* val) {     if (ptr[0] < 0xC0U) {         *val = ptr[0] + ptr[1];         return;     }      *val = ((unsigned long int)(ptr[0]) << 24)         | ((unsigned long int)(ptr[1]) << 16)         | ((unsigned long int)(ptr[2]) << 8)         | ptr[3]; }  int main(void) {     unsigned long int val; #ifdef MMAP     int error;     long page_size = sysconf(_SC_PAGESIZE);     unsigned char *buf = (unsigned char *) mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);     unsigned char *ptr = buf + page_size - 2;     if (buf == MAP_FAILED)     {         perror("mmap");         return 1;     }     error = mprotect(buf + page_size, page_size, PROT_NONE);     if (error != 0)     {         perror("mprotect");         return 2;     }     *ptr = 0xBF;     *(ptr + 1) = 0x10;     mach_parse_compressed(ptr, &val); #elif HEAP     unsigned char *buf = malloc(16384);     unsigned char *ptr = buf + 16382;     buf[16382] = 0xBF;     buf[16383] = 0x10; #else     unsigned char buf[2];     unsigned char *ptr = buf;     buf[0] = 0xBF;     buf[1] = 0x10; #endif     mach_parse_compressed(ptr, &val); } 
like image 20
brovko Avatar answered Oct 09 '22 02:10

brovko