I wrote a simple program to compare Rust and C performance.
The Rust version:
use std::time::Instant;

const STREAM_ARRAY_SIZE: usize = 10000000;

static mut A: [f64; STREAM_ARRAY_SIZE] = [1.0; STREAM_ARRAY_SIZE];

fn main() {
    let now = Instant::now();
    unsafe {
        for i in 0..STREAM_ARRAY_SIZE {
            A[i] = 2.0E0 * A[i];
        }
    }
    let duration = now.elapsed();
    println!("{} us.", (duration.as_secs() * 1_000_000_000 + duration.subsec_nanos() as u64) / 1000);
}
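(A small aside, assuming Rust 1.33 or newer: the manual nanosecond arithmetic can be replaced with Duration::as_micros.)

let duration = now.elapsed();
println!("{} us.", duration.as_micros());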
Run it in debug & release mode:
$ ./target/debug/calc
472046 us.
$ ./target/release/calc
62860 us.
The release version has a drastic performance gain compared to debug.
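For reference, the two binaries come from the standard Cargo profiles (assuming the crate is named calc, as the paths above suggest):

$ cargo build
$ cargo build --release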
The C version does the same thing and runs on the same server:
#include <sys/time.h>
#include <stdio.h>

#define STREAM_ARRAY_SIZE 10000000

static double A[STREAM_ARRAY_SIZE];

/* Wall-clock time in microseconds. */
long mysecond(void)
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return tp.tv_sec * 1000000L + tp.tv_usec;
}

int main(void)
{
    int j;
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
    {
        A[j] = 1.0;
    }
    long t = mysecond();
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
    {
        A[j] = 2.0E0 * A[j];
    }
    printf("%ld us.\n", mysecond() - t);
    return 0;
}
Compile and run it with -O0 (the default) and -O2:
$ gcc test.c
$ ./a.out
41626 us.
$ gcc -O2 test.c
$ ./a.out
13499 us.
The optimized Rust build is only comparable to gcc -O0, and falls far behind gcc -O2. Is this reasonable? How can I improve the performance of the Rust version?
Rust compiles the loop to:
.LBB0_1:
movupd xmm0, xmmword ptr [rcx + 8*rax - 48]
movupd xmm1, xmmword ptr [rcx + 8*rax - 32]
addpd xmm0, xmm0
addpd xmm1, xmm1
movupd xmmword ptr [rcx + 8*rax - 48], xmm0
movupd xmmword ptr [rcx + 8*rax - 32], xmm1
movupd xmm0, xmmword ptr [rcx + 8*rax - 16]
movupd xmm1, xmmword ptr [rcx + 8*rax]
addpd xmm0, xmm0
addpd xmm1, xmm1
movupd xmmword ptr [rcx + 8*rax - 16], xmm0
movupd xmmword ptr [rcx + 8*rax], xmm1
add rax, 8
cmp rax, 10000006
jne .LBB0_1
While GCC 7.1.0 compiles to:
L6:
movsd (%rbx), %xmm0
addq $8, %rbx
addsd %xmm0, %xmm0
movsd %xmm0, -8(%rbx)
cmpq %rbp, %rbx
jne L6
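(Listings like these can be reproduced with commands along these lines; the exact invocations are not from the original post:)

$ cargo rustc --release -- --emit asm   # the .s file lands under target/release/deps/
$ gcc -O2 -S test.c                     # writes test.s in AT&T syntax, as above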
Rust puts the statically initialized array into the binary's data section, while the C code writes it at run time (the first loop is effectively a memset with a pattern). For the Rust binary, the OS most likely just maps that range of the executable and relies on virtual memory to do the right thing, so each page of A is only faulted in when the timed loop first touches it, and those page faults are included in the measurement. The C version has already touched every page before the timer starts.
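(One way to check this, not from the original answer: count page faults with perf, if it is available on the server.)

$ perf stat -e page-faults ./target/release/calc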
If you change the Rust code to run the same loop once before the measurement, the measured time drops considerably; on my machine it is actually faster than the C version (possibly thanks to the loop unrolling shown above).
unsafe {
    for i in 0..STREAM_ARRAY_SIZE {
        A[i] = 2.0E0 * A[i];
    }
}

let now = Instant::now();
unsafe {
    for i in 0..STREAM_ARRAY_SIZE {
        A[i] = 2.0E0 * A[i];
    }
}
let duration = now.elapsed();
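An alternative sketch, not part of the original answer: avoid the static mut entirely and allocate the buffer at run time with vec!, which also touches every page before the timed loop. It assumes Rust 1.33+ for Duration::as_micros and Rust 1.66+ for std::hint::black_box.

use std::hint::black_box;
use std::time::Instant;

const STREAM_ARRAY_SIZE: usize = 10_000_000;

fn main() {
    // Heap-allocate and write 1.0 into every element up front, so all
    // pages are already resident when the timed loop starts.
    let mut a = vec![1.0f64; STREAM_ARRAY_SIZE];

    let now = Instant::now();
    for x in a.iter_mut() {
        *x *= 2.0;
    }
    // Keep the optimizer from discarding the otherwise-unused stores.
    black_box(&a);
    let duration = now.elapsed();

    println!("{} us.", duration.as_micros());
}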