3D Morton code computation utilizing carry-less multiplication

Question

This question arose specifically in the context of exploratory work for RISC-V platforms which may optionally support a carry-less multiplication instruction CLMUL. It would equally apply to other architectures offering such an instruction acting on general-purpose registers, but I am not aware of any others.

The computation of a 3D Morton code involves interleaving bits from three integers, x, y, and z such that r_3i+0 = x_i, r_3i+1 = y_i, r_3i+2 = z_i. This is trivial to do when the processor architecture offers a bit-deposit instruction, as is the case on x86-64 platforms supporting the BMI2 extension:

/* For x, y, z in [0, 1023], compute a 30-bit 3D Morton code by interleaving bits.
   For i in [0, 9]: r<3*i+0> = x<i>, r<3*i+1> = y<i>, r<3*i+2> = z<i>.
*/
uint32_t encode_morton3d_10_bits (uint32_t x, uint32_t y, uint32_t z)
{
    return (_pdep_u32 (x, 01111111111) |  // octal constants!
            _pdep_u32 (y, 02222222222) |
            _pdep_u32 (z, 04444444444));
}

On architectures currently lacking support for a bit-deposit operation, such as RISC-V, the functionality of depositing source bits at every third result bit must be emulated. Compared with the commonly used approach, the following algorithm using multiplies to spread bits reduces the length of dependency chains and increases instruction level parallelism, so for most architectures, it performs no worse on low-end CPU implementations but significantly better on high-end CPU implementations. The multiplication factors used are simple enough that compilers can (and do) replace them with cheaper instructions where appropriate for a given (micro-)architecture.

/* Spread least significant ten bits of argument x in [0, 1023] to every third 
   bit of result. For i in [0, 9]: r<3*i+0> = x<i>, r<3*i+1> = 0, r<3*i+2> = 0. 
*/
uint32_t deposit_every_third (uint32_t x)
{
    const uint32_t sm1  = 0x333ul;      // source mask: select bits 0,1,4,5,8,9
    const uint32_t sm2  = 0x0ccul;      // source mask: select bits 2,3,6,7
    const uint32_t ml1a = 0x50005ul;    // multiplier: spread to bits 0-11,16-27
    const uint32_t ml1b = 0x00500ul;    // multiplier: spread to bits 8-19
    const uint32_t ml2  = 0x05050ul;    // multiplier: spread to bits 6-13,14-21
    const uint32_t rm1  = 0x09009009ul; // result mask: bits 0,3,12,15,24,27
    const uint32_t rm2  = 0x00240240ul; // result mask: bits 6,9,18,21
    uint32_t t = x & sm1;
    return ((((t * ml1a) | (t * ml1b)) & rm1) | (((x & sm2) * ml2) & rm2));
}

For the purposes of computing a 3D Morton code one can obviously incorporate the shifts into the deposit functionality, resulting in the following exemplary implementation:

uint32_t deposit_every_third_shifted (uint32_t x, uint32_t s)
{
    const uint32_t sm1  = 0x333ul;           // source mask: select bits 0,1,4,5,8,9
    const uint32_t sm2  = 0x0ccul;           // source mask: select bits 2,3,6,7
    const uint32_t ml1a = 0x50005ul << s;    // multipliers
    const uint32_t ml1b = 0x00500ul << s;
    const uint32_t ml2  = 0x05050ul << s;
    const uint32_t rm1  = 0x09009009ul << s; // result masks
    const uint32_t rm2  = 0x00240240ul << s;
    uint32_t t = x & sm1;
    return ((((t * ml1a) | (t * ml1b)) & rm1) | (((x & sm2) * ml2) & rm2));
}

/* For x, y, z in [0, 1023], compute 30-bit 3D Morton code by interleaving bits.
   For i in [0, 9]: r<3*i+0> = x<i>, r<3*i+1> = y<i>, r<3*i+2> = z<i>.
*/
uint32_t encode_morton3d_10_bits (uint32_t x, uint32_t y, uint32_t z)
{
    return (deposit_every_third_shifted (x, 0) |
            deposit_every_third_shifted (y, 1) |
            deposit_every_third_shifted (z, 2));
}

Compiling the code above with full optimization (-O3) for a 32-bit RISC-V platform results in reasonably efficient code, with both clang 20.1 and gcc 15.1 producing linear instruction sequences of 58 instructions, although the two generated sequences differ quite a bit from each other. I note with interest how the two compilers tune the multiplies differently: With nine multiplies present in the source code, clang produces seven MUL instructions, but gcc only five.

Using ordinary integer multiplies, in the implementation of deposit_every_third_shifted() above, the first multiplier had to be broken up into two parts, ml1a and ml1b, to avoid incorrect results due to carry-propagation. This is trivial to address on RISC-V processors that support the ratified ISA extension Zbc, which offers a carry-less multiplication instruction CLMUL. FWIW, I could not find this being offered via a compiler intrinsic, so I resorted to inline assembler:

#if defined (PORTABLE)
uint32_t clmul (uint32_t rs1, uint32_t rs2)
{
    uint32_t x = 0;
    for (int i = 0; i < 32; i++)
        if ((rs2 >> i) & 1)
            x ^= rs1 << i;
    return x;
}
#else // !PORTABLE
long clmul (long a, long b) // caveat: this assumes sizeof (long) matches RISC-V register size
{
  long r;
  __asm__("clmul	%0,%1,%2" : "=r"(r) : "r"(a), "r"(b));
  return r;
}
#endif // PORTABLE

uint32_t deposit_every_third_clmul_shifted (uint32_t x, uint32_t s)
{
    const uint32_t sm1 = 0x333ul;           // source mask: select bits 0,1,4,5,8,9
    const uint32_t sm2 = 0x0ccul;           // source mask: select bits 2,3,6,7
    const uint32_t ml1 = 0x50505ul << s;    // multipliers
    const uint32_t ml2 = 0x05050ul << s;    
    const uint32_t rm1 = 0x09009009ul << s; // result masks
    const uint32_t rm2 = 0x00240240ul << s; 
    return (clmul (x & sm1, ml1) & rm1) | (clmul (x & sm2, ml2) & rm2);
}

uint32_t encode_morton3d_10_bits_clmul (uint32_t x, uint32_t y, uint32_t z)
{
    return (deposit_every_third_clmul_shifted (x, 0) |
            deposit_every_third_clmul_shifted (y, 1) |
            deposit_every_third_clmul_shifted (z, 2));
}

When compiling with -O3, the generated code comprises 47 instructions with both clang 20.1 and gcc 15.1, although the instruction sequences produced differ from each other significantly. As CLMUL should execute no slower than MUL but is likely faster (e.g. 2 cycle latency for CLMUL vs 4 cycle latency for MUL), this reduction in instruction count should result in a meaningful speedup for Morton code generation.

However, it is probably too small to justify the inclusion of the Zbc extension in a RISC-V processor if there are no significant other uses for it. When computing 3D Morton codes, is there a way to utilize CLMUL for even greater performance benefit? I am aware that clmul (x,x) interleaves one zero bit between consecutive source bits, ideal for a 2D Morton code, but I do not seem to be able to find a way to extend this idea to interleaving two zero bits between consecutive source bits as needed for a 3D Morton code.

Aki Suihkonen · Accepted Answer

One useful trick is that having bit sequence of x=0b'0aaa0bbb we can remove the zero bit by x + 0b'0000bbb -> 0b0aaabbb0 .

But there's more -- we can shift two bits x = 0baaa00bbb by x += (x & 0b111) * 3 and four bits by x += (x & mask) * 15 -- and this naturally works in SWAR style too.

What's even nicer is that since we know that there are plenty of zero bits, the masks become really simple too (and can be encoded as immediates in ARM64).

x = interleave(interleave(0,a), interleave(b,c)); // assumes `0abc 0abc`
x += x & 0x0F0F0F0F;        // 0abcabc0 0abcabc0 0abcabc0 0abcacb0 
x += (x & 0x00FF00FF) * 3;  // 0abcabca'bcabc000 0abcabca'bcabc000 
x += (x & 0x0000FFFF) * 15; // 0abcabcabcabc'abcabcabcabc'0000000
x >>= 7;

One can only interleave 8 bits (in uint32_t) this way, so the options are either move to uint64_t if available, split the computation to interleaving 5 bits at a time or pack 4 computations into 5 32-bit registers.

The interleave itself uses clmul as in

uint32_t interleave2(uint32_t a, uint32_t b) {
     return clmul(a,a) + 2 * clmul(b,b);
}

uint32_t interleave4(a,b,c) {
  auto ac = clmul(a, a + a) ^ clmul(c, c);
  auto ob = clmul(b, b);
  return = clmul(ac, ac) ^ clmul(ob, ob + ob);
}

The typical flow for interleaving two elements is clmul(a,a) + 2 * clmul(b,b); , however, one can as well premultiply one of the inputs for slightly different dependency chain between the instructions.

EDIT 3 - how to efficiently use the 8-bit only version for 10 bits

oa = clmul(a,a); bc = interleave(b,c);
// we compute the bottom 6/8 bits here
X = interleave(oa, bc) & 127;  // ...........................0ABC0abc
X = X + (x & 7);               // ...........................0ABCabc0
// and the TOP 24/32 bits here, as in the original implementation
x = interleave(oa >> 4, bc >> 4); // 0abc0abc 0abc0abc 0abc0abc 0abc0abc
x += x & 0x0F0F0F0F;              // 0abcabc0 0abcabc0 0abcabc0 0abcacb0
x += (x & 0x00FF00FF) * 3;        // 0abcabca'bcabc000 0abcabca'bcabc000
x += (x & 0x0000FFFF) * 15;       // 0abcabcabcabc'abcabcabcabc'0000000
return (x | X) >> 1;

thus, the 2 extra bits require only those statements with X or about 7 easy arithmetic operations

EDIT

One can also do a hybrid version clmul and the given concept to spread bits with easier magic constants and easier multipliers

uint32_t deposit_every_third_clmul_shifted(uint32_t x) {
  // just spread every bit once
  // - then we list the result after each stage
  //  and by how many bits each bit needs to be shifted left
  auto x = clmul(x,x);        //         0a0b0c0d0e0f0g0h0i0j
                              //          9 8 7 6 5 4 3 2 1 0
  x = x + 15 * (x & 0xfff00); //     0a0b0c0d0e0f00000g0h0i0j
                              //      5 4 3 2 1 0     3 2 1 0
  x = x + 15 * (x & 0xf0000); // 0a0b00000c0d0e0f00000g0h0i0j
                              //  1 0     3 2 1 0     3 2 1 0
  x = x +  2 * (x & 0xf00f0); // 0a0b000c0d000e0f000g0h000i0j
                              //  1 0   1 0   1 0   1 0   1 0
  x = x +    (x & 0x4104104);
 return x;
}

EDIT2 or 3

And yes, there definitely is a method to leverage clmul further.

                0 a 0 b 0 c 0 d 0 e 0 f 0 g 0 h 0 i 0 j clmul 0111
          0 a 0 b 0 c 0 d 0 e 0 f 0 g 0 h 0 i 0 j
    0 a 0 b 0 c 0 d 0 e 0 f 0 g 0 h 0 i 0 j
    - a - b - c---d-c---d---e---f---g---h---i-h-j-i---j <- what we have
a - - b - - c - - d - - e - - f - - g - - h - - i - - j <- what we want
3     2     1     0     2     1     0     2     1     0 <- left shift

Thus we need to

                  0 a 0 b 0 c 0 d 0 e 0 f 0 g 0 h 0 i 0 j 
            0 a 0 b 0 c 0 d 0 e 0 f 0 g 0 h 0 i 0 j
      0 a 0 b 0 c 0 d 0 e 0 f 0 g 0 h 0 i 0 j
      - A - B - c---------D---E---f---g---------H---i---j shift by 2
    A   b       C     D   e       F   g     h       I   j shift by 1 
  a - - b - - c - - d - - e - - f - - g - - h - - i - - j
  
  X = clmul(clmul(X,X), 0b1001001) & mask_0; // 0b1010100001010101000010101
  X += 3 * (X & mask_2); // 0b001010000001010000000010000
  X += 1 * (X & mask_1); // 0b100000100100000100000000100

----

There's one more method, which I've typically used for transposing matrices with dimensions of powers of two, i.e. repeated interleaving of left side and right side -- however this seems to work quite nicely with other shapes as well.

%octave
a=[0:29 -1 -1];for p = 1:5; a=a=[a(1:16);a(16:31)](:)';end, a
 0   10   20    1   11   21    2   12   22    3   13   23    4   14   24     5   15   25    6   16   26    7   17   27    8   18   28    9   19   29   10   20

With the two last bits being garbage, therefore

uint32_t interleave_repeated(uint32_t a, uint32_t b, uint32_t c) {
     a = a | (b << 10) | (c << 20);
     for (int i = 0; i < 5; i++) {
         b = a >> 15;
         a = clmul(a, a);
         b = clmul(b, b);
         a ^= b+b;
     }
     return (a << 2) >> 2; // the last two bits needs to be removed
}

which finally satisfies the requirements of utilising basically just clmul.

5 revs, 3 users 99%njuffa · Answer

In comments, @Falk Hüffner has suggested an interesting way of using CLMUL to compute 3D Morton codes. Since comments are to be considered ephemeral on this site and Falk Hüffner's idea has utility, I have decided to move it into a community wiki answer to record it permanently.

FWIW, when I wrote the question I had somehow forgotten just how much code bloat is caused by the construction of literal constants on RISC-V, with a high likelihood of a negative performance impact on lower-end narrow-issue CPUs. Therefore one should avoid incorporating shift factors into multipliers and masks (as I had done in my question) when writing code for computing 3D Morton codes on this architecture.

I have rectified this shortcoming of the question in the comparison test framework below. The following table records the instruction counts observed from Falk Hüffner's proposed algorithm, as well as Aki Suihkonen's algorithms and the two algorithms I used in the question. All algorithms provide ample instruction-level parallelism for advantageous performance scaling on wide-issue high-end CPUs.

-O3 -march=rv32gc_zbc | clang 20.1 | gcc 15.1
----------------------+------------+----------
HUEFFNER_CLMUL        |    39      |    39 
SUIHKONEN_CLMUL       |    38      |    38
SUIHKONEN_CLMUL_BIS   |    31      |    31
CLASSIC               |    40      |    46
CLASSIC_WITH_CLMUL    |    33      |    33

The full code of my test frame work follows.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define PORTABLE_CLMUL      (0)

#define CLASSIC             (1)
#define CLASSIC_WITH_CLMUL  (2)
#define HUEFFNER_CLMUL      (3)
#define SUIHKONEN_CLMUL     (4)
#define SUIHKONEN_CLMUL_BIS (5)
#define ALGO                (SUIHKONEN_CLMUL_BIS)

long clmul (long, long); // assumption: sizeof(long) tracks register width

// Based directly on a comment by Falk Hüffner on 4/28/2025
uint32_t deposit_every_third_hueffner_9bit (uint32_t x)
{
    uint32_t r;
    r = clmul (clmul (x & 0x1ff, 010101) & 07007007, 0b10101) & 0111111111;
    return r;
}

// Based directly on the answer by Ari Suihkonen on 4/30/2025 
uint32_t deposit_every_third_suihkonen (uint32_t x)
{
    const uint32_t mask_0 = 0b001010100001010101000010101;
    const uint32_t mask_2 = 0b001010000001010000000010000;
    const uint32_t mask_1 = 0b100000100100000100000000100;
    x = clmul (clmul (x, x), 0b1001001) & mask_0;
    x += 3 * (x & mask_2);
    x += 1 * (x & mask_1);
    return x;
}

// Classic method accelerated with CLMUL
uint32_t deposit_every_third_clmul (uint32_t x)
{
    const uint32_t sm1 = 0x333ul;  // source mask: select bits 0,1,4,5,8,9
    const uint32_t sm2 = 0x0ccul;  // source mask: select bits 2,3,6,7
    const uint32_t ml1 = 0x50505ul;// multiplier: spread to bits 0-11,8-19,16-27
    const uint32_t ml2 = 0x05050ul;// multiplier: spread to bits 6-13,14-21
    const uint32_t rm1 = 0x09009009ul; // result mask: bits 0,3,12,15,24,27
    const uint32_t rm2 = 0x00240240ul; // result mask: bits 6,9,18,21
    return (clmul (x & sm1, ml1) & rm1) | (clmul (x & sm2, ml2) & rm2);
}

// Classic method using regular integer multiplies
uint32_t deposit_every_third (uint32_t x)
{
    const uint32_t sm1  = 0x333ul;      // source mask: select bits 0,1,4,5,8,9
    const uint32_t sm2  = 0x0ccul;      // source mask: select bits 2,3,6,7
    const uint32_t ml1a = 0x50005ul;    // multiplier: spread to bits 0-11,16-27
    const uint32_t ml1b = 0x00500ul;    // multiplier: spread to bits 8-19
    const uint32_t ml2  = 0x05050ul;    // multiplier: spread to bits 6-13,14-21
    const uint32_t rm1  = 0x09009009ul; // result mask: bits 0,3,12,15,24,27
    const uint32_t rm2  = 0x00240240ul; // result mask: bits 6,9,18,21
    uint32_t t = x & sm1;
    return ((((t * ml1a) | (t * ml1b)) & rm1) | (((x & sm2) * ml2) & rm2));
}

/* For x, y, z in [0, 1023], compute 30-bit 3D Morton code by interleaving bits.
   For i in [0, 9]: r<3*i+0> = x<i>, r<3*i+1> = y<i>, r<3*i+2> = z<i>.
*/
uint32_t encode_morton3d_10_bits (uint32_t x, uint32_t y, uint32_t z)
{
    uint32_t r;
#if (ALGO == CLASSIC)
    r = ((deposit_every_third (x) << 0) |
         (deposit_every_third (y) << 1) |
         (deposit_every_third (z) << 2));
#elif (ALGO == CLASSIC_WITH_CLMUL)
    r = ((deposit_every_third_clmul (x) << 0) |
         (deposit_every_third_clmul (y) << 1) |
         (deposit_every_third_clmul (z) << 2));
#elif (ALGO == HUEFFNER_CLMUL)
    r = ((deposit_every_third_hueffner_9bit (x) << 0) |
         (deposit_every_third_hueffner_9bit (y) << 1) |
         (deposit_every_third_hueffner_9bit (z) << 2));
    // handle bit 9 separately
    r |= ((x & (1 << 9)) | 2 * (y & (1 << 9)) | 4 * (z & (1 << 9))) << (27 - 9);
#elif (ALGO == SUIHKONEN_CLMUL)
    r = ((deposit_every_third_suihkonen (x) << 0) |
         (deposit_every_third_suihkonen (y) << 1) |
         (deposit_every_third_suihkonen (z) << 2));
#elif (ALGO == SUIHKONEN_CLMUL_BIS)
    // based directly on Aki Suihkonen's updated answer 5/3/2025
    x = x | (y << 10) | (z << 20);
    for (int i = 0; i < 5; i++) {
        y = x >> 15;
        x = clmul (x, x);
        y = clmul (y, y);
        x ^= y + y;
    }
    r = (x << 2) >> 2;
#else // ALGO
#error unknown ALGO
#endif // ALGO
    return r;
}

#if (PORTABLE_CLMUL)
long clmul (long rs1, long rs2)
{
    uint32_t x = 0;
    for (int i = 0; i < 32; i++)
        if ((rs2 >> i) & 1)
            x ^= rs1 << i;
    return x;
}
#else // !PORTABLE_CLMUL
long clmul (long a, long b) // RISC-V with Zbc extension
{
  long r;
  __asm__("clmul	%0,%1,%2" : "=r"(r) : "r"(a),"r"(b));
  return r;
}
#endif // PORTABLE_CLMUL

/* For argument x in which only bits x<3*i> are utilized and the remaining
   bits must be zero, right-compact the used bits into a 10-bit integer, i.e.
   r<i> = a<3*i>.
*/
uint32_t gather_every_third (uint32_t x)
{
    const uint32_t group_mult = (1L <<  4) | (1L <<  2) | (1L << 0);
    const uint32_t group_mask = (7L << 22) | (7L << 13) | (7L << 4);
    const uint32_t concat_mul = (1L << 14) | (1L <<  8) | (1L << 2);
    const uint32_t concat_msk = (0x1ffL << 18);
    const uint32_t bit27_mask = (1L << 27);
    uint32_t t; 
    t = x * group_mult; // form 3-bit groups: t<4:6>,t<13:15>,t<22:24>
    t = t & group_mask; // isolate the 3-bit groups
    t = t * concat_mul; // concatenate groups, t<26:18>
    t = t & concat_msk; // isolate the concatenation of three 3-bit groups
    x = x & bit27_mask; // isolate ungrouped bit t<27>
    t = (t | x) >> 18 ; // merge ungrouped bit t<27>, move result to t<9:0>
    return t;
}

/* De-interleave a 30-bit 3D Morton code into units x, y, z each in [0,1023] 
   For i in [0, 9]: x<i> = a<3*i+0>, y<i> = a<3*i+1>, z<i> = a<3*i+1>
*/
void decode_morton3d_10_bits (uint32_t a, uint32_t *x, uint32_t *y, uint32_t *z)
{
    uint32_t mask = 01111111111; // octal constant!
    *x = gather_every_third ((a >> 0) & mask);
    *y = gather_every_third ((a >> 1) & mask);
    *z = gather_every_third ((a >> 2) & mask);
}

int main (void)
{
    for (uint32_t x = 0; x < 1024; x++) {
        for (uint32_t y = 0; y < 1024; y++) {
            for (uint32_t z = 0; z < 1024; z++) {
                uint32_t encoded = encode_morton3d_10_bits (x, y, z);
                uint32_t dx, dy, dz;
                decode_morton3d_10_bits (encoded, &dx, &dy, &dz);
                if ((x != dx) | (y != dy) | (z != dz)) {
                    printf ("error @ %03lx,%03lx,%03lx: encoded=%08lx decoded=%03lx,%03lx,%03lx
",
                            (unsigned long int)x, (unsigned long int)y, (unsigned long int)z,
                            (unsigned long int)encoded,
                            (unsigned long int)dx, (unsigned long int)dy, (unsigned long int)dz);
                    return EXIT_FAILURE;
                }
            }
        }
    }
    return EXIT_SUCCESS;
}

3D Morton code computation utilizing carry-less multiplication

Tags:

algorithm

bit-manipulation

assembly

micro-optimization

riscv

njuffa

2 Answers

Aki Suihkonen

5 revs, 3 users 99%njuffa

Recent Activity

Donate For Us

3D Morton code computation utilizing carry-less multiplication

Tags:

algorithm

bit-manipulation

assembly

micro-optimization

riscv

njuffa

2 Answers

Aki Suihkonen

5 revs, 3 users 99%njuffa

Related questions

Recent Activity

Donate For Us