When I was implementing ChaCha20 in JavaScript, I stumbled upon some strange behavior.
My first version was built like this (let's call it the "Encapsulated Version"):
function quarterRound(x, a, b, c, d) {
  x[a] += x[b]; x[d] = ((x[d] ^ x[a]) << 16) | ((x[d] ^ x[a]) >>> 16);
  x[c] += x[d]; x[b] = ((x[b] ^ x[c]) << 12) | ((x[b] ^ x[c]) >>> 20);
  x[a] += x[b]; x[d] = ((x[d] ^ x[a]) << 8) | ((x[d] ^ x[a]) >>> 24);
  x[c] += x[d]; x[b] = ((x[b] ^ x[c]) << 7) | ((x[b] ^ x[c]) >>> 25);
}
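(Aside: the double-shift expressions are just hand-inlined 32-bit rotate-lefts. Written with a named helper - illustration only, not the code I benchmarked - the quarter round reads like this:)

// Illustration only: the same quarter round with the rotate spelled out.
// Values stored back into the Uint32Array are wrapped to uint32 automatically,
// so it does not matter that the helper may return a negative int32.
function rotl32(v, n) {
  return (v << n) | (v >>> (32 - n));
}
function quarterRoundReadable(x, a, b, c, d) {
  x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
  x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
  x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a],  8);
  x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c],  7);
}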
function getBlock(buffer) {
  var x = new Uint32Array(16);
  // "input" is the 16-word ChaCha state living in an enclosing scope;
  // word 12 is the block counter.
  for (var i = 16; i--;) x[i] = input[i];
  for (var i = 20; i > 0; i -= 2) {
    // column rounds
    quarterRound(x, 0, 4,  8, 12);
    quarterRound(x, 1, 5,  9, 13);
    quarterRound(x, 2, 6, 10, 14);
    quarterRound(x, 3, 7, 11, 15);
    // diagonal rounds
    quarterRound(x, 0, 5, 10, 15);
    quarterRound(x, 1, 6, 11, 12);
    quarterRound(x, 2, 7,  8, 13);
    quarterRound(x, 3, 4,  9, 14);
  }
  for (i = 16; i--;) x[i] += input[i];               // add the original state
  for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]); // serialize little-endian
  input[12]++;                                       // advance the block counter
  return buffer;
}
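(U32TO8_LE is not shown above: it stores a 32-bit word into the output buffer in little-endian byte order. A minimal sketch of such a helper, assuming buffer is a Uint8Array - the version I actually benchmark may differ in details:)

// Hypothetical sketch of the helper: write the 32-bit word v
// into buf at byte offset o, least significant byte first.
function U32TO8_LE(buf, o, v) {
  buf[o    ] =  v         & 0xff;
  buf[o + 1] = (v >>>  8) & 0xff;
  buf[o + 2] = (v >>> 16) & 0xff;
  buf[o + 3] = (v >>> 24) & 0xff;
}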
To reduce unnecessary function calls (parameter-passing overhead etc.) I removed the quarterRound function and put its contents inline (it is correct; I verified it against some test vectors):
function getBlock(buffer) {
  var x = new Uint32Array(16);
  for (var i = 16; i--;) x[i] = input[i];
  for (var i = 20; i > 0; i -= 2) {
    x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) << 16) | ((x[12] ^ x[ 0]) >>> 16);
    x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) << 12) | ((x[ 4] ^ x[ 8]) >>> 20);
    x[ 0] += x[ 4]; x[12] = ((x[12] ^ x[ 0]) << 8) | ((x[12] ^ x[ 0]) >>> 24);
    x[ 8] += x[12]; x[ 4] = ((x[ 4] ^ x[ 8]) << 7) | ((x[ 4] ^ x[ 8]) >>> 25);
    x[ 1] += x[ 5]; x[13] = ((x[13] ^ x[ 1]) << 16) | ((x[13] ^ x[ 1]) >>> 16);
    x[ 9] += x[13]; x[ 5] = ((x[ 5] ^ x[ 9]) << 12) | ((x[ 5] ^ x[ 9]) >>> 20);
    x[ 1] += x[ 5]; x[13] = ((x[13] ^ x[ 1]) << 8) | ((x[13] ^ x[ 1]) >>> 24);
    x[ 9] += x[13]; x[ 5] = ((x[ 5] ^ x[ 9]) << 7) | ((x[ 5] ^ x[ 9]) >>> 25);
    x[ 2] += x[ 6]; x[14] = ((x[14] ^ x[ 2]) << 16) | ((x[14] ^ x[ 2]) >>> 16);
    x[10] += x[14]; x[ 6] = ((x[ 6] ^ x[10]) << 12) | ((x[ 6] ^ x[10]) >>> 20);
    x[ 2] += x[ 6]; x[14] = ((x[14] ^ x[ 2]) << 8) | ((x[14] ^ x[ 2]) >>> 24);
    x[10] += x[14]; x[ 6] = ((x[ 6] ^ x[10]) << 7) | ((x[ 6] ^ x[10]) >>> 25);
    x[ 3] += x[ 7]; x[15] = ((x[15] ^ x[ 3]) << 16) | ((x[15] ^ x[ 3]) >>> 16);
    x[11] += x[15]; x[ 7] = ((x[ 7] ^ x[11]) << 12) | ((x[ 7] ^ x[11]) >>> 20);
    x[ 3] += x[ 7]; x[15] = ((x[15] ^ x[ 3]) << 8) | ((x[15] ^ x[ 3]) >>> 24);
    x[11] += x[15]; x[ 7] = ((x[ 7] ^ x[11]) << 7) | ((x[ 7] ^ x[11]) >>> 25);
    x[ 0] += x[ 5]; x[15] = ((x[15] ^ x[ 0]) << 16) | ((x[15] ^ x[ 0]) >>> 16);
    x[10] += x[15]; x[ 5] = ((x[ 5] ^ x[10]) << 12) | ((x[ 5] ^ x[10]) >>> 20);
    x[ 0] += x[ 5]; x[15] = ((x[15] ^ x[ 0]) << 8) | ((x[15] ^ x[ 0]) >>> 24);
    x[10] += x[15]; x[ 5] = ((x[ 5] ^ x[10]) << 7) | ((x[ 5] ^ x[10]) >>> 25);
    x[ 1] += x[ 6]; x[12] = ((x[12] ^ x[ 1]) << 16) | ((x[12] ^ x[ 1]) >>> 16);
    x[11] += x[12]; x[ 6] = ((x[ 6] ^ x[11]) << 12) | ((x[ 6] ^ x[11]) >>> 20);
    x[ 1] += x[ 6]; x[12] = ((x[12] ^ x[ 1]) << 8) | ((x[12] ^ x[ 1]) >>> 24);
    x[11] += x[12]; x[ 6] = ((x[ 6] ^ x[11]) << 7) | ((x[ 6] ^ x[11]) >>> 25);
    x[ 2] += x[ 7]; x[13] = ((x[13] ^ x[ 2]) << 16) | ((x[13] ^ x[ 2]) >>> 16);
    x[ 8] += x[13]; x[ 7] = ((x[ 7] ^ x[ 8]) << 12) | ((x[ 7] ^ x[ 8]) >>> 20);
    x[ 2] += x[ 7]; x[13] = ((x[13] ^ x[ 2]) << 8) | ((x[13] ^ x[ 2]) >>> 24);
    x[ 8] += x[13]; x[ 7] = ((x[ 7] ^ x[ 8]) << 7) | ((x[ 7] ^ x[ 8]) >>> 25);
    x[ 3] += x[ 4]; x[14] = ((x[14] ^ x[ 3]) << 16) | ((x[14] ^ x[ 3]) >>> 16);
    x[ 9] += x[14]; x[ 4] = ((x[ 4] ^ x[ 9]) << 12) | ((x[ 4] ^ x[ 9]) >>> 20);
    x[ 3] += x[ 4]; x[14] = ((x[14] ^ x[ 3]) << 8) | ((x[14] ^ x[ 3]) >>> 24);
    x[ 9] += x[14]; x[ 4] = ((x[ 4] ^ x[ 9]) << 7) | ((x[ 4] ^ x[ 9]) >>> 25);
  }
  for (i = 16; i--;) x[i] += input[i];
  for (i = 16; i--;) U32TO8_LE(buffer, 4 * i, x[i]);
  input[12]++;
  return buffer;
}
But the performance result was not quite as expected:
[benchmark screenshots: encapsulated version vs. inlined version]
While the performance difference under Firefox and Safari is negligible or at least unimportant, the performance cut under Chrome is HUGE... Any ideas why this happens?
P.S.: If the images are too small, open them in a new tab :)
P.P.S.: Here are the links:
Inlined
Encapsulated
The regression happens because you are hitting a bug in one of the passes of V8's current optimizing compiler, Crankshaft.
If you look at what Crankshaft does to the slow "inlined" case, you will notice that the getBlock function constantly deoptimizes.
To see that, you can either pass the --trace-deopt flag to V8 (e.g. node --trace-deopt script.js) and read the output it dumps to the console, or use a tool called IRHydra.
I collected V8 output for both the inlined and uninlined cases, which you can explore in IRHydra:
Here is what it shows for the "inlined" case:
Each entry in the function list is a single optimization attempt. Red means that the optimized function later deoptimized because some assumption made by the optimizing compiler was violated.
This means getBlock is constantly optimized and deoptimized. There is nothing like this in the "encapsulated" case:
Here getBlock is optimized once and never deoptimizes.
If we look inside getBlock, we will see that it is the array load from the Uint32Array that deoptimizes, because the result of this load is a value that does not fit into an int32.
The reasons for this deopt are a bit convoluted. JavaScript's only number type is a double-precision floating-point number. Doing all calculations with doubles would be somewhat inefficient, so optimizing JITs usually try to keep integer values represented as actual integers within optimized code.
Crankshaft's widest integer representation is int32, and half of the uint32 values are not representable in it. To partially mitigate this limitation, Crankshaft performs an optimization pass called uint32 analysis. This pass tries to figure out whether it is safe to represent a uint32 value as an int32 value, which is done by looking at how this uint32 value is used: some operations, e.g. bitwise ones, do not care about the "sign" but only about individual bits; other operations (e.g. deoptimization or conversion from integer to double) can be taught to handle int32-that-is-actually-uint32 in a special way. If the analysis succeeds - that is, all uses of a uint32 value are safe - then this operation is marked in a special way; otherwise (some uses are found to be unsafe) the operation is not marked and will deopt if it produces a uint32 value that does not fit into the int32 range (anything above 0x7fffffff).
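You can see the int32/uint32 boundary from plain JavaScript itself; a small self-contained illustration of the ranges involved (not V8 internals):

var u32 = new Uint32Array([0xFFFFFFFF]);
var v = u32[0];              // 4294967295 - too big for a signed int32
console.log(v | 0);          // -1: an int32 bitwise op wraps the value
console.log(v >>> 0);        // 4294967295: ">>>" is the one operator specified to yield a uint32
console.log(v > 0x7FFFFFFF); // true: above the int32 range, hence "unsafe" to keep as int32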
In this case the analysis was not marking x[i] as a safe uint32 operation, so it was deoptimizing whenever the result of x[i] was outside of the int32 range. The reason for not marking x[i] as safe was that one of its uses, an artificial instruction created by the inliner when inlining U32TO8_LE, was considered unsafe. Here is a patch for V8 that fixes the problem; it also contains a small illustration of the issue:
var u32 = new Uint32Array(1);
u32[0] = 0xFFFFFFFF; // this uint32 value doesn't fit in int32

function tr(x) {
  return x|0;
  //     ^^^ - this use is uint32-safe
}

function ld() {
  return tr(u32[0]);
  //     ^  ^^^^^^ uint32 op, will deopt if its uses are not safe
  //     |
  //     \--- tr is inlined into ld, and a hidden artificial
  //          HArgumentsObject instruction was generated that
  //          captured the values of all parameters at entry (x).
  //          This instruction was considered uint32-unsafe
  //          by oversight.
}

while (...) ld();
You were not hitting this bug in the "encapsulated" version because Crankshaft's own inliner was running out of budget before it reached the U32TO8_LE call site. As you can see in IRHydra, only the first three calls to quarterRound are inlined:
You can work around this bug by changing U32TO8_LE(buffer, 4 * i, x[i]) to U32TO8_LE(buffer, 4 * i, x[i]|0), which makes the only use of the x[i] value uint32-safe and does not change the result.