After reading a question about the performance of sin/cos (Why is std::sin() and std::cos() slower than sin() and cos()?), I ran some tests with its code and found something odd: if I call sin/cos with a float value, it is much slower than with a double when compiled with optimization.
#include <cmath>
#include <cstdio>  // nothing from <cstdio> is used in this snippet

// Side length of the two square lookup tables.
const int N = 4000;

// Result tables written by main(); declared at file scope.
float cosine[N][N];
float sine[N][N];

// Benchmark driver: fills cosine[i][j] and sine[i][j] with the cosine and sine
// of the angle i*j*2*pi/N for every i, j in [0, N).
int main() {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            // i*j*2 is integer arithmetic; multiplying by M_PI turns the rest
            // of the expression into double, and the result is then narrowed
            // to float by the initialization.
            float ang = i*j*2*M_PI/N;
            // Unqualified cos/sin (not std::cos/std::sin) called with a float
            // argument. NOTE(review): whether this resolves to the C double
            // function or to a float overload depends on what <cmath> declares
            // in the global namespace -- confirm for your toolchain.
            cosine[i][j] = cos(ang);
            sine[i][j] = sin(ang);
        }
    }
}
With the above code I get:
With -O0: 2.402s
With -O1: 9.004s
With -O2: 9.013s
With -O3: 9.001s
Now if I change
float ang = i*j*2*M_PI/N;
To
double ang = i*j*2*M_PI/N;
I get:
With -O0: 2.362s
With -O1: 1.188s
With -O2: 1.197s
With -O3: 1.197s
How can the first test be so much faster without optimizations?
I'm using g++ (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2, 64 bits.
EDIT: Changed the title to better describe the problem.
EDIT: Added assembly code
Assembly for first test with O0:
# Unoptimized (-O0) compiler output for the float version of the benchmark.
# Key point: the float angle is widened to double and the double-precision
# cos and sin are called separately; each result is narrowed back to float.
    .file "main.cpp"
# cosine and sine tables: 64000000 zero-initialized bytes each
# (4000 * 4000 elements * 4 bytes), placed in .bss.
    .globl cosine
    .bss
    .align 32
    .type cosine, @object
    .size cosine, 64000000
cosine:
    .zero 64000000
    .globl sine
    .align 32
    .type sine, @object
    .size sine, 64000000
sine:
    .zero 64000000
    .text
    .globl main
    .type main, @function
main:
.LFB87:
    .cfi_startproc
# Prologue: set up the frame pointer and reserve 16 bytes of locals.
# Stack slots as used below: -4(%rbp) = outer counter (i),
# -8(%rbp) = inner counter (j), -12(%rbp) = the float angle (ang).
    pushq %rbp
    .cfi_def_cfa_offset 16
    movq %rsp, %rbp
    .cfi_offset 6, -16
    .cfi_def_cfa_register 6
    subq $16, %rsp
    movl $0, -4(%rbp)
    jmp .L2
.L5:
# Start of one outer iteration: reset the inner counter.
    movl $0, -8(%rbp)
    jmp .L3
.L4:
# Inner loop body. eax = i * j * 2, computed in 32-bit integer arithmetic.
    movl -4(%rbp), %eax
    imull -8(%rbp), %eax
    addl %eax, %eax
# Convert to double, multiply by .LC0 (pi) and divide by .LC1 (4000.0).
    cvtsi2sd %eax, %xmm0
    movsd .LC0(%rip), %xmm1
    mulsd %xmm1, %xmm0
    movsd .LC1(%rip), %xmm1
    divsd %xmm1, %xmm0
# Narrow the double result to float and store it as ang.
    unpcklpd %xmm0, %xmm0
    cvtpd2ps %xmm0, %xmm0
    movss %xmm0, -12(%rbp)
# Reload ang, widen it back to double and call the double-precision cos.
    movss -12(%rbp), %xmm0
    cvtps2pd %xmm0, %xmm0
    call cos
# Narrow the double return value to float.
    unpcklpd %xmm0, %xmm0
    cvtpd2ps %xmm0, %xmm0
# rax = i * 4000 + j (element index); store the float into cosine[i][j].
    movl -8(%rbp), %eax
    cltq
    movl -4(%rbp), %edx
    movslq %edx, %rdx
    imulq $4000, %rdx, %rdx
    leaq (%rdx,%rax), %rax
    movss %xmm0, cosine(,%rax,4)
# Same sequence for sin: widen ang, call the double-precision sin, narrow,
# recompute the element index and store into sine[i][j].
    movss -12(%rbp), %xmm0
    cvtps2pd %xmm0, %xmm0
    call sin
    unpcklpd %xmm0, %xmm0
    cvtpd2ps %xmm0, %xmm0
    movl -8(%rbp), %eax
    cltq
    movl -4(%rbp), %edx
    movslq %edx, %rdx
    imulq $4000, %rdx, %rdx
    leaq (%rdx,%rax), %rax
    movss %xmm0, sine(,%rax,4)
# Increment the inner counter.
    addl $1, -8(%rbp)
.L3:
# Inner loop test: continue while the inner counter <= 3999.
    cmpl $3999, -8(%rbp)
    setle %al
    testb %al, %al
    jne .L4
# Increment the outer counter.
    addl $1, -4(%rbp)
.L2:
# Outer loop test: continue while the outer counter <= 3999.
    cmpl $3999, -4(%rbp)
    setle %al
    testb %al, %al
    jne .L5
# Return 0 from main.
    movl $0, %eax
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE87:
    .size main, .-main
    .section .rodata
    .align 4
    .type _ZL1N, @object
    .size _ZL1N, 4
# The constant N from the source (mangled name _ZL1N).
_ZL1N:
    .long 4000
    .align 8
# .LC0: low and high 32-bit words of the double 0x400921FB54442D18 (pi).
.LC0:
    .long 1413754136
    .long 1074340347
    .align 8
# .LC1: low and high 32-bit words of the double 0x40AF400000000000 (4000.0).
.LC1:
    .long 0
    .long 1085227008
    .ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
    .section .note.GNU-stack,"",@progbits
Assembly for first test with O3:
# Optimized (-O3) compiler output for the float version of the benchmark.
# Key point: the separate cos/sin calls have been merged into a single call
# to the single-precision sincosf, which takes the float angle directly.
    .file "main.cpp"
    .text
    .p2align 4,,15
    .globl main
    .type main, @function
main:
.LFB121:
    .cfi_startproc
# Prologue: save callee-saved registers and initialise loop state.
# r15d = outer counter (i), r13d = 2 * i (stepped by 2 per outer iteration),
# r14 = address one past the end of the current cosine row.
    pushq %r15
    .cfi_def_cfa_offset 16
    xorl %r15d, %r15d
    .cfi_offset 15, -16
    pushq %r14
    .cfi_def_cfa_offset 24
    movl $cosine+16000, %r14d
    .cfi_offset 14, -24
    pushq %r13
    .cfi_def_cfa_offset 32
    xorl %r13d, %r13d
    .cfi_offset 13, -32
    pushq %r12
    .cfi_def_cfa_offset 40
    pushq %rbp
    .cfi_def_cfa_offset 48
    pushq %rbx
    .cfi_def_cfa_offset 56
# 24 bytes of stack; 8(%rsp) and 12(%rsp) receive the sincosf results.
    subq $24, %rsp
    .cfi_def_cfa_offset 80
    .p2align 4,,10
    .p2align 3
.L2:
# Start of one outer iteration. rbp = i * 16000 is the byte offset of row i
# (4000 floats * 4 bytes); rbx then points into cosine and rbp into sine.
    movslq %r15d, %rbp
    .cfi_offset 3, -56
    .cfi_offset 6, -48
    .cfi_offset 12, -40
# r12d starts at 2 * i; at .L3 it always holds 2 * i * (j + 1).
    movl %r13d, %r12d
# The first column (angle 0) needs no call: edx = 0x3f800000, the bit
# pattern of 1.0f (the cosine), and eax = 0, i.e. 0.0f (the sine).
    movl $0x3f800000, %edx
    imulq $16000, %rbp, %rbp
    xorl %eax, %eax
    leaq cosine(%rbp), %rbx
    addq $sine, %rbp
    jmp .L5
    .p2align 4,,10
    .p2align 3
.L3:
# Inner loop body. rsi = &8(%rsp) and rdi = &12(%rsp) are the two output
# pointers passed to sincosf; eax = r12d - r13d = i * j * 2.
    movl %r12d, %eax
    leaq 8(%rsp), %rsi
    leaq 12(%rsp), %rdi
    subl %r13d, %eax
# Convert to double, multiply by .LC2 (pi), divide by .LC3 (4000.0), then
# narrow to float: this is the float angle passed in xmm0.
    cvtsi2sd %eax, %xmm0
    mulsd .LC2(%rip), %xmm0
    divsd .LC3(%rip), %xmm0
    unpcklpd %xmm0, %xmm0
    cvtpd2ps %xmm0, %xmm0
# One single-precision call produces both results.
    call sincosf
# Load the two results as raw 32-bit values: edx is stored into the cosine
# row and eax into the sine row below.
    movl 8(%rsp), %edx
    movl 12(%rsp), %eax
.L5:
# Store both values, advance both row pointers by one float, and step the
# running product by 2 * i.
    movl %edx, (%rbx)
    addq $4, %rbx
    movl %eax, 0(%rbp)
    addl %r13d, %r12d
    addq $4, %rbp
# Inner loop test: continue until rbx reaches the end of the cosine row.
    cmpq %r14, %rbx
    jne .L3
# Next row: i += 1, 2 * i += 2, and the row end moves 16000 bytes further.
    addl $1, %r15d
    addl $2, %r13d
    leaq 16000(%rbx), %r14
# Outer loop test: continue until i == 4000.
    cmpl $4000, %r15d
    jne .L2
# Epilogue: release the stack, return 0, restore callee-saved registers.
    addq $24, %rsp
    .cfi_def_cfa_offset 56
    xorl %eax, %eax
    popq %rbx
    .cfi_def_cfa_offset 48
    popq %rbp
    .cfi_def_cfa_offset 40
    popq %r12
    .cfi_def_cfa_offset 32
    popq %r13
    .cfi_def_cfa_offset 24
    popq %r14
    .cfi_def_cfa_offset 16
    popq %r15
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE121:
    .size main, .-main
# cosine and sine tables: 64000000 zero-initialized bytes each
# (4000 * 4000 elements * 4 bytes), placed in .bss.
    .globl cosine
    .bss
    .align 32
    .type cosine, @object
    .size cosine, 64000000
cosine:
    .zero 64000000
    .globl sine
    .align 32
    .type sine, @object
    .size sine, 64000000
sine:
    .zero 64000000
    .section .rodata.cst8,"aM",@progbits,8
    .align 8
# .LC2: low and high 32-bit words of the double 0x400921FB54442D18 (pi).
.LC2:
    .long 1413754136
    .long 1074340347
    .align 8
# .LC3: low and high 32-bit words of the double 0x40AF400000000000 (4000.0).
.LC3:
    .long 0
    .long 1085227008
    .ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
    .section .note.GNU-stack,"",@progbits
Here's a possibility:
In C, `cos` is double precision and `cosf` is single precision. In C++, `std::cos` has overloads for both double and single precision.
You aren't calling `std::cos`. If `<cmath>` doesn't also overload `::cos` (as far as I know, it is not required to), then you are just calling the C double-precision function. If this is the case, then you're paying the cost of converting from float to double and back.
Now, some standard libraries implement `cos(float x)` as `(float)cos((double)x)`, so even if you are calling the float function it might still be doing conversions behind the scenes.
This shouldn't account for a 9x performance difference, though.
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With