
Why don't I get performance improvement by using get_unchecked()?

I tried to use get_unchecked() instead of the [] index operator to get performance improvement in the s() function of my des crate.

However, it does not result in a visible performance improvement, even though the [] (or get_unchecked()) call is executed 4.8 billion times in my benchmark. I would have thought that calling get_unchecked() 4.8 billion times instead of [] would result in a time improvement of about 2 seconds on my Intel Core 2 Duo 2.4 GHz processor: saving even one cycle per call means saving 4.8 billion cycles, which is 2 seconds at 2.4 GHz.

I made this small benchmark to have a small code to show you:

#![feature(test)]

extern crate test;

fn main() {
}

pub fn s(box_id: usize, block: u64) -> u64 {
    const TABLES: [[[u64; 16]; 4]; 8] =
        [[[ 14,  4, 13, 1,  2, 15, 11,  8,  3, 10,  6, 12,  5,  9, 0, 7]
        , [  0, 15,  7, 4, 14,  2, 13,  1, 10,  6, 12, 11,  9,  5, 3, 8]
        , [  4,  1, 14, 8, 13,  6,  2, 11, 15, 12,  9,  7,  3, 10, 5, 0]
        , [ 15, 12,  8, 2,  4,  9,  1,  7,  5, 11,  3, 14, 10,  0, 6, 13]
        ],
        [ [ 15,  1,  8, 14,  6, 11,  3,  4,  9, 7,  2, 13, 12, 0,  5, 10]
        , [  3, 13,  4,  7, 15,  2,  8, 14, 12, 0,  1, 10,  6, 9, 11,  5]
        , [  0, 14,  7, 11, 10,  4, 13,  1,  5, 8, 12,  6,  9, 3,  2, 15]
        , [ 13,  8, 10,  1,  3, 15,  4,  2, 11, 6,  7, 12,  0, 5, 14,  9]
        ],
        [ [ 10,  0,  9, 14, 6,  3, 15,  5,  1, 13, 12,  7, 11,  4,  2,  8]
        , [ 13,  7,  0,  9, 3,  4,  6, 10,  2,  8,  5, 14, 12, 11, 15,  1]
        , [ 13,  6,  4,  9, 8, 15,  3,  0, 11,  1,  2, 12,  5, 10, 14,  7]
        , [  1, 10, 13,  0, 6,  9,  8,  7,  4, 15, 14,  3, 11,  5,  2, 12]
        ],
        [ [  7, 13, 14, 3,  0,  6,  9, 10,  1, 2, 8,  5, 11, 12,  4, 15]
        , [ 13,  8, 11, 5,  6, 15,  0,  3,  4, 7, 2, 12,  1, 10, 14,  9]
        , [ 10,  6,  9, 0, 12, 11,  7, 13, 15, 1, 3, 14,  5,  2,  8,  4]
        , [  3, 15,  0, 6, 10,  1, 13,  8,  9, 4, 5, 11, 12,  7,  2, 14]
        ],
        [ [  2, 12,  4,  1,  7, 10, 11,  6,  8,  5,  3, 15, 13, 0, 14,  9]
        , [ 14, 11,  2, 12,  4,  7, 13,  1,  5,  0, 15, 10,  3, 9,  8,  6]
        , [  4,  2,  1, 11, 10, 13,  7,  8, 15,  9, 12,  5,  6, 3,  0, 14]
        , [ 11,  8, 12,  7,  1, 14,  2, 13,  6, 15,  0,  9, 10, 4,  5,  3]
        ],
        [ [ 12,  1, 10, 15,  9,  2,  6,  8,  0, 13,  3,  4, 14,  7,  5, 11]
        , [ 10, 15,  4,  2,  7, 12,  9,  5,  6,  1, 13, 14,  0, 11,  3,  8]
        , [  9, 14, 15,  5,  2,  8, 12,  3,  7,  0,  4, 10,  1, 13, 11,  6]
        , [  4,  3,  2, 12,  9,  5, 15, 10, 11, 14,  1,  7,  6,  0,  8, 13]
        ],
        [ [  4, 11,  2, 14, 15, 0,  8, 13,  3, 12, 9,  7,  5, 10, 6,  1]
        , [ 13,  0, 11,  7,  4, 9,  1, 10, 14,  3, 5, 12,  2, 15, 8,  6]
        , [  1,  4, 11, 13, 12, 3,  7, 14, 10, 15, 6,  8,  0,  5, 9,  2]
        , [  6, 11, 13,  8,  1, 4, 10,  7,  9,  5, 0, 15, 14,  2, 3, 12]
        ],
        [ [ 13,  2,  8,  4,  6, 15, 11,  1, 10,  9,  3, 14,  5,  0, 12,  7]
        , [  1, 15, 13,  8, 10,  3,  7,  4, 12,  5,  6, 11,  0, 14,  9,  2]
        , [  7, 11,  4,  1,  9, 12, 14,  2,  0,  6, 10, 13, 15,  3,  5,  8]
        , [  2,  1, 14,  7,  4, 10,  8, 13, 15, 12,  9,  0,  3,  5,  6, 11]
        ]];
    // DES S-box addressing: the row index i is built from the two outer
    // bits of the 6-bit input (bit 5 and bit 0), and the column index j
    // from the middle four bits.
    let i = ((block & 0x20) >> 4 | (block & 1)) as usize;
    let j = ((block & 0x1E) >> 1) as usize;
    unsafe { *TABLES.get_unchecked(box_id).get_unchecked(i).get_unchecked(j) }
    //TABLES[box_id][i][j]
}

#[cfg(test)]
mod bench {
    use test::{Bencher, black_box};

    use super::s;

    #[bench]
    fn bench_s(bencher: &mut Bencher) {
        bencher.iter(|| {
            let box_id = black_box(7);
            (0 .. 40_000_000).fold(0, |acc, block| acc + s(box_id, block))
        });
    }
}

The first time I ran this benchmark, the version with [] actually took less time than the version with get_unchecked() (even though the get_unchecked() version seems a little bit faster on average). I am not sure this microbenchmark really reflects my real-life workload (which consists of encrypting a big file), but it gives an idea.

I checked the assembly to be sure that the compiler did not optimize away the bound checking.

Here is the version with get_unchecked():

0000000000009360 <_ZN3des7s_table17hbabdd9dee72331a5E>:
    9360:   48 89 f0                mov    %rsi,%rax
    9363:   48 c1 e8 04             shr    $0x4,%rax
    9367:   83 e0 02                and    $0x2,%eax
    936a:   89 f1                   mov    %esi,%ecx
    936c:   83 e1 01                and    $0x1,%ecx
    936f:   48 09 c1                or     %rax,%rcx
    9372:   48 c1 e7 09             shl    $0x9,%rdi
    9376:   48 8d 05 93 57 04 00    lea    0x45793(%rip),%rax        # 4eb10 <ref10404>
    937d:   48 01 f8                add    %rdi,%rax
    9380:   48 c1 e1 07             shl    $0x7,%rcx
    9384:   48 01 c1                add    %rax,%rcx
    9387:   83 e6 1e                and    $0x1e,%esi
    938a:   48 8b 04 b1             mov    (%rcx,%rsi,4),%rax
    938e:   c3                      retq   
    938f:   90                      nop

And here is the version with []:

0000000000009390 <_ZN3des7s_table17hbabdd9dee72331a5E>:
    9390:   50                      push   %rax
    9391:   48 89 f8                mov    %rdi,%rax
    9394:   48 83 f8 07             cmp    $0x7,%rax
    9398:   77 30                   ja     93ca <_ZN3des7s_table17hbabdd9dee72331a5E+0x3a>
    939a:   48 89 f1                mov    %rsi,%rcx
    939d:   48 c1 e9 04             shr    $0x4,%rcx
    93a1:   83 e1 02                and    $0x2,%ecx
    93a4:   89 f2                   mov    %esi,%edx
    93a6:   83 e2 01                and    $0x1,%edx
    93a9:   48 09 ca                or     %rcx,%rdx
    93ac:   48 c1 e0 09             shl    $0x9,%rax
    93b0:   48 8d 0d 99 57 04 00    lea    0x45799(%rip),%rcx        # 4eb50 <const10401>
    93b7:   48 01 c1                add    %rax,%rcx
    93ba:   48 c1 e2 07             shl    $0x7,%rdx
    93be:   48 01 ca                add    %rcx,%rdx
    93c1:   83 e6 1e                and    $0x1e,%esi
    93c4:   48 8b 04 b2             mov    (%rdx,%rsi,4),%rax
    93c8:   59                      pop    %rcx
    93c9:   c3                      retq   
    93ca:   48 8d 3d b7 a3 25 00    lea    0x25a3b7(%rip),%rdi        # 263788 <panic_bounds_check_loc10404>
    93d1:   ba 08 00 00 00          mov    $0x8,%edx
    93d6:   48 89 c6                mov    %rax,%rsi
    93d9:   e8 82 25 04 00          callq  4b960 <_ZN4core9panicking18panic_bounds_check17hb2d969c3cc11ed08E>
    93de:   66 90                   xchg   %ax,%ax

We can see that the get_unchecked() version is smaller and that the version with [] checks the bounds.

Both versions are compiled in release mode.

Why is the get_unchecked() version not faster? I think it should be at least a couple of seconds faster than the [] version when get_unchecked()/[] is called 4.8 billion times.

Edit: I profiled the code using valgrind.

The version with [] shows a cost of 10 for the line with the array indexing, while the version with get_unchecked() shows a cost of less than 1 (see the images below). However, the cost of the function as a whole (shown on the left of the images) stayed the same. This is strange. Does anybody have an explanation?

[Profiler screenshot: version with get_unchecked()]

[Profiler screenshot: version with []]

asked Aug 28 '16 by antoyo

1 Answer

I haven't learned much Rust (yet), but I think I can still answer the performance part of this.

First of all, are you sure your benchmark is actually running the stand-alone version of the function? The compiler might inline the function into the call site, where box_id is a compile-time constant. Besides removing the minor call/ret overhead, the asm for the table-index calculation would be significantly simplified. Also, the bounds check could be elided entirely if the indices are known at compile time to be in bounds.
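One way to rule that out (a sketch, untested; the names s_noinline and bench_s_noinline are made up here) is to benchmark through an #[inline(never)] wrapper inside the question's bench module, so the call really goes through a code path where box_id is a runtime value:

#[inline(never)]
fn s_noinline(box_id: usize, block: u64) -> u64 {
    // Forwarding through a never-inlined wrapper keeps box_id a runtime
    // value at the call site, so the bounds check (or its absence) is
    // actually exercised on every call.
    s(box_id, block)
}

#[bench]
fn bench_s_noinline(bencher: &mut Bencher) {
    bencher.iter(|| {
        let box_id = black_box(7);
        (0 .. 40_000_000).fold(0, |acc, block| acc + s_noinline(box_id, block))
    });
}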

If someone shows how to modify the OP's example so that it compiles to actual asm on the Godbolt compiler explorer, I'd be able to take a look and say more. Putting it up as-is on Godbolt gives empty asm output. I might decide to learn enough Rust to do this myself, but probably not soon.
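A guess at the kind of change that would help (an assumption, untested): compile it as a library rather than a binary, and mark the function #[no_mangle] so it isn't removed as dead code and is easy to find in the listing. A standalone toy illustrating the idea:

// Toy example: with no main() and an exported symbol, Godbolt (in release
// mode, e.g. -O) should show the function's asm instead of an empty
// listing. The real s() body from the question would replace this body.
#[no_mangle]
pub fn lookup_demo(box_id: usize, block: u64) -> u64 {
    const TABLE: [u64; 8] = [14, 4, 13, 1, 2, 15, 11, 8];
    TABLE[box_id & 7] ^ block // masked index: no bounds check emitted
}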


Having a look at the stand-alone version of the function:

The extra instructions in the checked version are just the first 4:

9390:   50                      push   %rax        # align the stack in case we make a function call (wasted work for the common case)
9391:   48 89 f8                mov    %rdi,%rax   # compiler is dumb, could have checked the value in %rdi
9394:   48 83 f8 07             cmp    $0x7,%rax
9398:   77 30                   ja     93ca <_ZN3des7s_table17hbabdd9dee72331a5E+0x3a>

And the final pop %rcx to add 8 to %rsp.

So that's 4 uops at the start of the function. (Core 2 can't macro-fuse cmp/ja in 64-bit mode; it can macro-fuse them in 32-bit code, though. See Agner Fog's microarch pdf.) If the compiler were smarter, it would be only 2 extra instructions/uops total on the fast path through the function (cmp/ja), with the stack alignment for another function call done only in the branch that actually makes the call.

You might think that having these 4 uops issue as the first group of the function would be a problem, because it would delay the CPU from getting to the instructions on the critical path. But that's not the case, because apparently your code doesn't bottleneck on the front-end. (You didn't show the asm for the code that calls this in a loop). So instructions are issued into the out-of-order core ahead of instructions that are actually executing.

Probably by the time the function input is ready in %rdi, the scheduler already has the critical-path instructions waiting for it. If the benchmark is actually running the standalone version, the 4 extra instructions at the beginning of the function aren't actually delaying the critical path at all. So presumably the critical path is bottlenecked on latency, not throughput. (Does the output of one call form the index for the next lookup? If so, that would prevent multiple invocations of the function from executing at the same time for different inputs.) L1 load-use latency is 3 cycles on Core 2 (according to Agner Fog's microarch pdf).
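To make that distinction concrete (a sketch, not part of the original benchmark): a variant where each call's output does feed the next lookup, so the loads form a single dependency chain and load latency, not uop throughput, dominates. The question's fold only serializes the cheap adds; this one serializes the table lookups themselves:

#[bench]
fn bench_s_serial(bencher: &mut Bencher) {
    bencher.iter(|| {
        let box_id = black_box(7);
        // Each iteration's table-lookup result becomes the next block,
        // so the CPU cannot overlap successive invocations of s().
        (0 .. 40_000_000).fold(0u64, |prev, _| s(box_id, prev))
    });
}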

There's quite a bit of instruction-level parallelism in the instructions that calculate the table index from the input, though. There are a couple of mov instructions that make a copy, then further instructions do different things to the copy and to the original, and the two args are already separate. I think there's probably enough parallelism to run 3 instructions in parallel a lot of the time, between when the input is ready and when the table index is ready. So if execution has to wait 3 cycles for a table lookup before there's another input, that's time for the extra uops to run. (The scheduler runs uops on a first-ready basis, though, not critical-path-first, so you'd expect them to sometimes lengthen the critical path through resource conflicts, stealing execution ports from the critical-path instructions.)

TL;DR: The L1 load-use latency could still easily be the bottleneck, not uop throughput, if the output of one call is the input to the next. Otherwise there must be some other bottleneck that gives the 5 extra uops time to run without delaying the "real work". Or else the check is being optimized away in the code that actually runs.
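On that last point, there is also a safe way to get the check optimized away deliberately (a suggestion, assuming TABLES is hoisted out of s() into a module-level const with the same data): mask each index so the compiler can prove it's in bounds.

// Sketch (assumes a module-level `const TABLES: [[[u64; 16]; 4]; 8]`):
// box_id & 7 is provably < 8, and the bit masks already bound i to 0..=3
// and j to 0..=15, so LLVM can drop all the bounds checks and the panic
// path without any unsafe code.
pub fn s_masked(box_id: usize, block: u64) -> u64 {
    let i = ((block & 0x20) >> 4 | (block & 1)) as usize; // always <= 3
    let j = ((block & 0x1E) >> 1) as usize;               // always <= 15
    TABLES[box_id & 7][i][j]
}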

answered Nov 06 '22 by Peter Cordes