
Optimizing portable 128-bit integer shifts on a 32-bit architecture

In my spare time I've been working on a utility library that, among other things, supports signed/unsigned 128-bit integers. The library uses CPU dispatching to take advantage of SIMD instructions where available, but it needs a portable fallback so it will run everywhere else. Most recently I implemented the portable fallback for 128-bit shifting. It works correctly and runs reasonably fast, but not as fast as I'd like, especially on 32-bit architectures.

Here's a stripped version with all relevant types and functionality (the 64-bit version is included for completeness):

#include <cstdint>

typedef uint32_t UInt32;
typedef int32_t Int32;
typedef uint64_t UInt64;
typedef int64_t Int64;

// Returns 0xFFFFFFFF if value != 0, otherwise returns 0.
UInt32 AllOrNothingMask32(Int32 value)
{
    return UInt32(-Int32(value != 0));
}

struct alignas(16) UInt128
{     
    // Ensure the layout matches the architecture.
    // LE = little endian
    // BE = big endian
#if CPU_TYPE == CPU_LE32
    UInt32 mLow;
    UInt32 mLowMid;
    UInt32 mHighMid;
    UInt32 mHigh;
#elif CPU_TYPE == CPU_BE32
    UInt32 mHigh;
    UInt32 mHighMid;
    UInt32 mLowMid;
    UInt32 mLow;
#elif CPU_TYPE == CPU_LE64
    UInt64 mLow;
    UInt64 mHigh;
#elif CPU_TYPE == CPU_BE64
    UInt64 mHigh;
    UInt64 mLow;
#endif

    UInt128() = default;

    UInt128& operator=(const UInt128& other) = default;

    inline
    UInt128(UInt32 high, UInt32 highMid, UInt32 lowMid, UInt32 low) :
#if CPU_SIZE == CPU_32BIT
        mLow(low),
        mLowMid(lowMid),
        mHighMid(highMid),
        mHigh(high) { }
#elif CPU_SIZE == CPU_64BIT
        mLow((UInt64(lowMid) << 32) | low),
        mHigh((UInt64(high) << 32) | highMid) { }
#endif

    inline
    UInt128(UInt64 high, UInt64 low) :
#if CPU_SIZE == CPU_32BIT
        mLow(UInt32(low)),
        mLowMid(UInt32(low >> 32)),
        mHighMid(UInt32(high)),
        mHigh(UInt32(high >> 32)) { }
#elif CPU_SIZE == CPU_64BIT
        mLow(low),
        mHigh(high) { }
#endif

    inline
    bool operator==(const UInt128& other) const noexcept
    {
#if CPU_SIZE == CPU_32BIT
        return mLow == other.mLow &&
               mLowMid == other.mLowMid &&
               mHighMid == other.mHighMid &&
               mHigh == other.mHigh;
#elif CPU_SIZE == CPU_64BIT
        return mLow == other.mLow &&
               mHigh == other.mHigh;
#endif
    }

    inline
    UInt128& operator<<=(Int32 shift) noexcept
    {
        // Shift is taken modulo 128, wrapping it into the range 0-127.
        shift &= 0x7F;
#if CPU_SIZE == CPU_32BIT
        auto low = mLow;
        auto lowMid = mLowMid;
        auto highMid = mHighMid;
        auto high = mHigh;

        if (shift == 0) {
            return *this;
        } else if (shift < 32) {
            auto rshift = 32 - shift;
            mLow = (low << shift);
            mLowMid = (lowMid << shift) | (low >> rshift);
            mHighMid = (highMid << shift) | (lowMid >> rshift);
            mHigh = (high << shift) | (highMid >> rshift);
        } else if (shift < 64) {
            auto lshift = (shift - 32);
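            // When shift is exactly 32, lshift is 0 and the complementary
            // right shift would be 32, which is undefined for a 32-bit
            // operand. The & 0x1F wraps that case to 0 and rshiftMask
            // zeroes its contribution instead.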
            auto rshift = (32 - lshift) & 0x1F;
            auto rshiftMask = AllOrNothingMask32(rshift);
            mLow = 0;
            mLowMid = (low << lshift);
            mHighMid = (lowMid << lshift) | ((low >> rshift) & rshiftMask);
            mHigh = (highMid << lshift) | ((lowMid >> rshift) & rshiftMask);
        } else if (shift < 96) {
            auto lshift = (shift - 64);
            auto rshift = (64 - lshift) & 0x1F;
            auto rshiftMask = AllOrNothingMask32(rshift);
            mLow = 0;
            mLowMid = 0;
            mHighMid = (low << lshift);
            mHigh = (lowMid << lshift) | ((low >> rshift) & rshiftMask);
        } else {
            mLow = 0;
            mLowMid = 0;
            mHighMid = 0;
            mHigh = (low << (shift - 96));
        }
#elif CPU_SIZE == CPU_64BIT
        auto low = mLow,
             high = mHigh;

        if (shift == 0) {
            return *this;
        } else if (shift < 64) {
            mLow = (low << shift);
            mHigh = (high << shift) | (low >> (64 - shift));
        } else {
            mLow = 0;
            mHigh = (low << (shift - 64));
        }
#endif
        return *this;
    }

    inline
    UInt128& operator>>=(Int32 shift) noexcept
    {
        // Shift is taken modulo 128, wrapping it into the range 0-127.
        shift &= 0x7F;
#if CPU_SIZE == CPU_32BIT
        auto low = mLow,
             lowMid = mLowMid,
             highMid = mHighMid,
             high = mHigh;

        if (shift == 0) {
            return *this;
        } else if (shift < 32) {
            auto rshift = 32 - shift;
            mLow = (low >> shift) | (lowMid << rshift);
            mLowMid = (lowMid >> shift) | (highMid << rshift);
            mHighMid = (highMid >> shift) | (high << rshift);
            mHigh = (high >> shift);
        } else if (shift < 64) {
            auto rshift = (shift - 32);
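            // Same trick as in operator<<=: the & 0x1F and lshiftMask keep
            // the complementary left-shift count in range when shift is an
            // exact multiple of 32.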
            auto lshift = (32 - rshift) & 0x1F;
            auto lshiftMask = AllOrNothingMask32(lshift);
            mLow = (lowMid >> rshift) | ((highMid << lshift) & lshiftMask);
            mLowMid = (highMid >> rshift) | ((high << lshift) & lshiftMask);
            mHighMid = (high >> rshift);
            mHigh = 0;
        } else if (shift < 96) {
            auto rshift = (shift - 64);
            auto lshift = (64 - rshift) & 0x1F;
            auto lshiftMask = AllOrNothingMask32(lshift);
            mLow = (highMid >> rshift) | ((high << lshift) & lshiftMask);
            mLowMid = (high >> rshift);
            mHighMid = 0;
            mHigh = 0;
        } else {
            mLow = (high >> (shift - 96));
            mLowMid = 0;
            mHighMid = 0;
            mHigh = 0;
        }
#elif CPU_SIZE == CPU_64BIT
        auto low = mLow,
             high = mHigh;

        if (shift == 0) {
            return *this;
        } else if (shift < 64) {
            mLow = (low >> shift) | (high << (64 - shift));
            mHigh = (high >> shift);
        } else {
            mLow = (high >> (shift - 64));
            mHigh = 0;
        }
#endif
        return *this;
    }
};

The relevant 32-bit assembly output is rather long, so I'm omitting it unless requested.

The main bottleneck is obviously the branching when the shift amount isn't known at compile time. What can be done to eliminate the branching, or, for that matter, what portable trickery can be used to speed this up?

Update 1

Added the copy assignment operator that was missing from the example above. For those interested, here are the unit tests. I'm using Catch for its simplicity.

// Left shift lookup table, from 0-127.
const UInt128 gLeftShiftLut128[] = {
    UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100),
    UInt128(0xFFDDBB9977553310, 0xEECCAA8866442200),
    UInt128(0xFFBB7732EEAA6621, 0xDD995510CC884400),
    UInt128(0xFF76EE65DD54CC43, 0xBB32AA2199108800),
    UInt128(0xFEEDDCCBBAA99887, 0x7665544332211000),
    UInt128(0xFDDBB9977553310E, 0xECCAA88664422000),
    UInt128(0xFBB7732EEAA6621D, 0xD995510CC8844000),
    UInt128(0xF76EE65DD54CC43B, 0xB32AA21991088000),
    UInt128(0xEEDDCCBBAA998877, 0x6655443322110000),
    UInt128(0xDDBB9977553310EE, 0xCCAA886644220000),
    UInt128(0xBB7732EEAA6621DD, 0x995510CC88440000),
    UInt128(0x76EE65DD54CC43BB, 0x32AA219910880000),
    UInt128(0xEDDCCBBAA9988776, 0x6554433221100000),
    UInt128(0xDBB9977553310EEC, 0xCAA8866442200000),
    UInt128(0xB7732EEAA6621DD9, 0x95510CC884400000),
    UInt128(0x6EE65DD54CC43BB3, 0x2AA2199108800000),
    UInt128(0xDDCCBBAA99887766, 0x5544332211000000),
    UInt128(0xBB9977553310EECC, 0xAA88664422000000),
    UInt128(0x7732EEAA6621DD99, 0x5510CC8844000000),
    UInt128(0xEE65DD54CC43BB32, 0xAA21991088000000),
    UInt128(0xDCCBBAA998877665, 0x5443322110000000),
    UInt128(0xB9977553310EECCA, 0xA886644220000000),
    UInt128(0x732EEAA6621DD995, 0x510CC88440000000),
    UInt128(0xE65DD54CC43BB32A, 0xA219910880000000),
    UInt128(0xCCBBAA9988776655, 0x4433221100000000),
    UInt128(0x9977553310EECCAA, 0x8866442200000000),
    UInt128(0x32EEAA6621DD9955, 0x10CC884400000000),
    UInt128(0x65DD54CC43BB32AA, 0x2199108800000000),
    UInt128(0xCBBAA99887766554, 0x4332211000000000),
    UInt128(0x977553310EECCAA8, 0x8664422000000000),
    UInt128(0x2EEAA6621DD99551, 0xCC8844000000000),
    UInt128(0x5DD54CC43BB32AA2, 0x1991088000000000),
    UInt128(0xBBAA998877665544, 0x3322110000000000),
    UInt128(0x77553310EECCAA88, 0x6644220000000000),
    UInt128(0xEEAA6621DD995510, 0xCC88440000000000),
    UInt128(0xDD54CC43BB32AA21, 0x9910880000000000),
    UInt128(0xBAA9988776655443, 0x3221100000000000),
    UInt128(0x7553310EECCAA886, 0x6442200000000000),
    UInt128(0xEAA6621DD995510C, 0xC884400000000000),
    UInt128(0xD54CC43BB32AA219, 0x9108800000000000),
    UInt128(0xAA99887766554433, 0x2211000000000000),
    UInt128(0x553310EECCAA8866, 0x4422000000000000),
    UInt128(0xAA6621DD995510CC, 0x8844000000000000),
    UInt128(0x54CC43BB32AA2199, 0x1088000000000000),
    UInt128(0xA998877665544332, 0x2110000000000000),
    UInt128(0x53310EECCAA88664, 0x4220000000000000),
    UInt128(0xA6621DD995510CC8, 0x8440000000000000),
    UInt128(0x4CC43BB32AA21991, 0x880000000000000),
    UInt128(0x9988776655443322, 0x1100000000000000),
    UInt128(0x3310EECCAA886644, 0x2200000000000000),
    UInt128(0x6621DD995510CC88, 0x4400000000000000),
    UInt128(0xCC43BB32AA219910, 0x8800000000000000),
    UInt128(0x9887766554433221, 0x1000000000000000),
    UInt128(0x310EECCAA8866442, 0x2000000000000000),
    UInt128(0x621DD995510CC884, 0x4000000000000000),
    UInt128(0xC43BB32AA2199108, 0x8000000000000000),
    UInt128(0x8877665544332211, 0x0),
    UInt128(0x10EECCAA88664422, 0x0),
    UInt128(0x21DD995510CC8844, 0x0),
    UInt128(0x43BB32AA21991088, 0x0),
    UInt128(0x8776655443322110, 0x0),
    UInt128(0xEECCAA886644220 , 0x0),
    UInt128(0x1DD995510CC88440, 0x0),
    UInt128(0x3BB32AA219910880, 0x0),
    UInt128(0x7766554433221100, 0x0),
    UInt128(0xEECCAA8866442200, 0x0),
    UInt128(0xDD995510CC884400, 0x0),
    UInt128(0xBB32AA2199108800, 0x0),
    UInt128(0x7665544332211000, 0x0),
    UInt128(0xECCAA88664422000, 0x0),
    UInt128(0xD995510CC8844000, 0x0),
    UInt128(0xB32AA21991088000, 0x0),
    UInt128(0x6655443322110000, 0x0),
    UInt128(0xCCAA886644220000, 0x0),
    UInt128(0x995510CC88440000, 0x0),
    UInt128(0x32AA219910880000, 0x0),
    UInt128(0x6554433221100000, 0x0),
    UInt128(0xCAA8866442200000, 0x0),
    UInt128(0x95510CC884400000, 0x0),
    UInt128(0x2AA2199108800000, 0x0),
    UInt128(0x5544332211000000, 0x0),
    UInt128(0xAA88664422000000, 0x0),
    UInt128(0x5510CC8844000000, 0x0),
    UInt128(0xAA21991088000000, 0x0),
    UInt128(0x5443322110000000, 0x0),
    UInt128(0xA886644220000000, 0x0),
    UInt128(0x510CC88440000000, 0x0),
    UInt128(0xA219910880000000, 0x0),
    UInt128(0x4433221100000000, 0x0),
    UInt128(0x8866442200000000, 0x0),
    UInt128(0x10CC884400000000, 0x0),
    UInt128(0x2199108800000000, 0x0),
    UInt128(0x4332211000000000, 0x0),
    UInt128(0x8664422000000000, 0x0),
    UInt128(0xCC8844000000000 , 0x0),
    UInt128(0x1991088000000000, 0x0),
    UInt128(0x3322110000000000, 0x0),
    UInt128(0x6644220000000000, 0x0),
    UInt128(0xCC88440000000000, 0x0),
    UInt128(0x9910880000000000, 0x0),
    UInt128(0x3221100000000000, 0x0),
    UInt128(0x6442200000000000, 0x0),
    UInt128(0xC884400000000000, 0x0),
    UInt128(0x9108800000000000, 0x0),
    UInt128(0x2211000000000000, 0x0),
    UInt128(0x4422000000000000, 0x0),
    UInt128(0x8844000000000000, 0x0),
    UInt128(0x1088000000000000, 0x0),
    UInt128(0x2110000000000000, 0x0),
    UInt128(0x4220000000000000, 0x0),
    UInt128(0x8440000000000000, 0x0),
    UInt128(0x880000000000000 , 0x0),
    UInt128(0x1100000000000000, 0x0),
    UInt128(0x2200000000000000, 0x0),
    UInt128(0x4400000000000000, 0x0),
    UInt128(0x8800000000000000, 0x0),
    UInt128(0x1000000000000000, 0x0),
    UInt128(0x2000000000000000, 0x0),
    UInt128(0x4000000000000000, 0x0),
    UInt128(0x8000000000000000, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0)
};
// Right shift lookup table, from 0-127.
const UInt128 gRightShiftLut128[] = {
    UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100),
    UInt128(0x7FF76EE65DD54CC4, 0x3BB32AA219910880),
    UInt128(0x3FFBB7732EEAA662, 0x1DD995510CC88440),
    UInt128(0x1FFDDBB997755331, 0xEECCAA886644220),
    UInt128(0xFFEEDDCCBBAA998, 0x8776655443322110),
    UInt128(0x7FF76EE65DD54CC, 0x43BB32AA21991088),
    UInt128(0x3FFBB7732EEAA66, 0x21DD995510CC8844),
    UInt128(0x1FFDDBB99775533, 0x10EECCAA88664422),
    UInt128(0xFFEEDDCCBBAA99, 0x8877665544332211),
    UInt128(0x7FF76EE65DD54C, 0xC43BB32AA2199108),
    UInt128(0x3FFBB7732EEAA6, 0x621DD995510CC884),
    UInt128(0x1FFDDBB9977553, 0x310EECCAA8866442),
    UInt128(0xFFEEDDCCBBAA9, 0x9887766554433221),
    UInt128(0x7FF76EE65DD54, 0xCC43BB32AA219910),
    UInt128(0x3FFBB7732EEAA, 0x6621DD995510CC88),
    UInt128(0x1FFDDBB997755, 0x3310EECCAA886644),
    UInt128(0xFFEEDDCCBBAA, 0x9988776655443322),
    UInt128(0x7FF76EE65DD5, 0x4CC43BB32AA21991),
    UInt128(0x3FFBB7732EEA, 0xA6621DD995510CC8),
    UInt128(0x1FFDDBB99775, 0x53310EECCAA88664),
    UInt128(0xFFEEDDCCBBA, 0xA998877665544332),
    UInt128(0x7FF76EE65DD, 0x54CC43BB32AA2199),
    UInt128(0x3FFBB7732EE, 0xAA6621DD995510CC),
    UInt128(0x1FFDDBB9977, 0x553310EECCAA8866),
    UInt128(0xFFEEDDCCBB, 0xAA99887766554433),
    UInt128(0x7FF76EE65D, 0xD54CC43BB32AA219),
    UInt128(0x3FFBB7732E, 0xEAA6621DD995510C),
    UInt128(0x1FFDDBB997, 0x7553310EECCAA886),
    UInt128(0xFFEEDDCCB, 0xBAA9988776655443),
    UInt128(0x7FF76EE65, 0xDD54CC43BB32AA21),
    UInt128(0x3FFBB7732, 0xEEAA6621DD995510),
    UInt128(0x1FFDDBB99, 0x77553310EECCAA88),
    UInt128(0xFFEEDDCC, 0xBBAA998877665544),
    UInt128(0x7FF76EE6, 0x5DD54CC43BB32AA2),
    UInt128(0x3FFBB773, 0x2EEAA6621DD99551),
    UInt128(0x1FFDDBB9, 0x977553310EECCAA8),
    UInt128(0xFFEEDDC, 0xCBBAA99887766554),
    UInt128(0x7FF76EE, 0x65DD54CC43BB32AA),
    UInt128(0x3FFBB77, 0x32EEAA6621DD9955),
    UInt128(0x1FFDDBB, 0x9977553310EECCAA),
    UInt128(0xFFEEDD, 0xCCBBAA9988776655),
    UInt128(0x7FF76E, 0xE65DD54CC43BB32A),
    UInt128(0x3FFBB7, 0x732EEAA6621DD995),
    UInt128(0x1FFDDB, 0xB9977553310EECCA),
    UInt128(0xFFEED, 0xDCCBBAA998877665),
    UInt128(0x7FF76, 0xEE65DD54CC43BB32),
    UInt128(0x3FFBB, 0x7732EEAA6621DD99),
    UInt128(0x1FFDD, 0xBB9977553310EECC),
    UInt128(0xFFEE, 0xDDCCBBAA99887766),
    UInt128(0x7FF7, 0x6EE65DD54CC43BB3),
    UInt128(0x3FFB, 0xB7732EEAA6621DD9),
    UInt128(0x1FFD, 0xDBB9977553310EEC),
    UInt128(0xFFE, 0xEDDCCBBAA9988776),
    UInt128(0x7FF, 0x76EE65DD54CC43BB),
    UInt128(0x3FF, 0xBB7732EEAA6621DD),
    UInt128(0x1FF, 0xDDBB9977553310EE),
    UInt128(0xFF, 0xEEDDCCBBAA998877),
    UInt128(0x7F, 0xF76EE65DD54CC43B),
    UInt128(0x3F, 0xFBB7732EEAA6621D),
    UInt128(0x1F, 0xFDDBB9977553310E),
    UInt128(0xF, 0xFEEDDCCBBAA99887),
    UInt128(0x7, 0xFF76EE65DD54CC43),
    UInt128(0x3, 0xFFBB7732EEAA6621),
    UInt128(0x1, 0xFFDDBB9977553310),
    UInt128(0x0, 0xFFEEDDCCBBAA9988),
    UInt128(0x0, 0x7FF76EE65DD54CC4),
    UInt128(0x0, 0x3FFBB7732EEAA662),
    UInt128(0x0, 0x1FFDDBB997755331),
    UInt128(0x0, 0xFFEEDDCCBBAA998),
    UInt128(0x0, 0x7FF76EE65DD54CC),
    UInt128(0x0, 0x3FFBB7732EEAA66),
    UInt128(0x0, 0x1FFDDBB99775533),
    UInt128(0x0, 0xFFEEDDCCBBAA99),
    UInt128(0x0, 0x7FF76EE65DD54C),
    UInt128(0x0, 0x3FFBB7732EEAA6),
    UInt128(0x0, 0x1FFDDBB9977553),
    UInt128(0x0, 0xFFEEDDCCBBAA9),
    UInt128(0x0, 0x7FF76EE65DD54),
    UInt128(0x0, 0x3FFBB7732EEAA),
    UInt128(0x0, 0x1FFDDBB997755),
    UInt128(0x0, 0xFFEEDDCCBBAA),
    UInt128(0x0, 0x7FF76EE65DD5),
    UInt128(0x0, 0x3FFBB7732EEA),
    UInt128(0x0, 0x1FFDDBB99775),
    UInt128(0x0, 0xFFEEDDCCBBA),
    UInt128(0x0, 0x7FF76EE65DD),
    UInt128(0x0, 0x3FFBB7732EE),
    UInt128(0x0, 0x1FFDDBB9977),
    UInt128(0x0, 0xFFEEDDCCBB),
    UInt128(0x0, 0x7FF76EE65D),
    UInt128(0x0, 0x3FFBB7732E),
    UInt128(0x0, 0x1FFDDBB997),
    UInt128(0x0, 0xFFEEDDCCB),
    UInt128(0x0, 0x7FF76EE65),
    UInt128(0x0, 0x3FFBB7732),
    UInt128(0x0, 0x1FFDDBB99),
    UInt128(0x0, 0xFFEEDDCC),
    UInt128(0x0, 0x7FF76EE6),
    UInt128(0x0, 0x3FFBB773),
    UInt128(0x0, 0x1FFDDBB9),
    UInt128(0x0, 0xFFEEDDC),
    UInt128(0x0, 0x7FF76EE),
    UInt128(0x0, 0x3FFBB77),
    UInt128(0x0, 0x1FFDDBB),
    UInt128(0x0, 0xFFEEDD),
    UInt128(0x0, 0x7FF76E),
    UInt128(0x0, 0x3FFBB7),
    UInt128(0x0, 0x1FFDDB),
    UInt128(0x0, 0xFFEED),
    UInt128(0x0, 0x7FF76),
    UInt128(0x0, 0x3FFBB),
    UInt128(0x0, 0x1FFDD),
    UInt128(0x0, 0xFFEE),
    UInt128(0x0, 0x7FF7),
    UInt128(0x0, 0x3FFB),
    UInt128(0x0, 0x1FFD),
    UInt128(0x0, 0xFFE),
    UInt128(0x0, 0x7FF),
    UInt128(0x0, 0x3FF),
    UInt128(0x0, 0x1FF),
    UInt128(0x0, 0xFF),
    UInt128(0x0, 0x7F),
    UInt128(0x0, 0x3F),
    UInt128(0x0, 0x1F),
    UInt128(0x0, 0xF),
    UInt128(0x0, 0x7),
    UInt128(0x0, 0x3),
    UInt128(0x0, 0x1)
};

TEST_CASE("UInt128 left shift produces correct results.") {
    auto base = UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100);

    for (auto i = 1; i <= 127; i++) {
        auto sample = base;
        sample <<= i;

        INFO("i = " << i);
        REQUIRE(sample == gLeftShiftLut128[i]);
    }
}

TEST_CASE("UInt128 right shift produces correct results.") {
    auto base = UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100);

    for (auto i = 0; i <= 127; i++) {
        auto sample = base;
        sample >>= i;

        INFO("i = " << i);
        REQUIRE(sample == gRightShiftLut128[i]);
    }
}
Asked Oct 29 '22 by Koby Duck


1 Answer

I haven't benchmarked it, but something like this is branchless:

inline
UInt128& UInt128::operator<<=(Int32 shift) noexcept
{
    auto lshift = shift & 31;
    auto rshift = 31 - lshift;
    UInt32 parts[8] = {
#if CPU_TYPE == CPU_LE32
        0, 0, 0, 0,
        mLow << lshift,
        mLowMid << lshift | mLow >> 1 >> rshift,
        mHighMid << lshift | mLowMid >> 1 >> rshift,
        mHigh << lshift | mHighMid >> 1 >> rshift
#elif CPU_TYPE == CPU_BE32
        mHigh << lshift | mHighMid >> 1 >> rshift,
        mHighMid << lshift | mLowMid >> 1 >> rshift,
        mLowMid << lshift | mLow >> 1 >> rshift,
        mLow << lshift,
        0, 0, 0, 0
#endif
    };
    memcpy(this, &parts[
#if CPU_TYPE == CPU_LE32
        4 -
#endif
        (shift >> 5 & 3)], 16);
    return *this;
}

inline
UInt128& UInt128::operator>>=(Int32 shift) noexcept
{
    auto rshift = shift & 31;
    auto lshift = 31 - rshift;
    UInt32 parts[8] = {
#if CPU_TYPE == CPU_LE32
        mLow >> rshift | mLowMid << lshift << 1,
        mLowMid >> rshift | mHighMid << lshift << 1,
        mHighMid >> rshift | mHigh << lshift << 1,
        mHigh >> rshift,
        0, 0, 0, 0
#elif CPU_TYPE == CPU_BE32
        0, 0, 0, 0,
        mHigh >> rshift,
        mHighMid >> rshift | mHigh << lshift << 1,
        mLowMid >> rshift | mHighMid << lshift << 1,
        mLow >> rshift | mLowMid << lshift << 1
#endif
    };
    memcpy(this, &parts[
#if CPU_TYPE == CPU_BE32
        4 -
#endif
        (shift >> 5 & 3)], 16);
    return *this;
}
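
Two details worth noting: writing the complementary shift as x >> 1 >> rshift with rshift = 31 - lshift keeps every individual shift count in the 0-31 range, so when the sub-shift is zero the complementary contribution simply becomes 0 instead of invoking undefined behaviour; and the whole-word portion of the shift is handled by indexing the zero-padded parts array with (shift >> 5 & 3) before the 16-byte memcpy (which requires <cstring>). As a rough sanity check, the lookup tables from the question can be reused; a sketch along these lines, using the same Catch setup as the question's tests:

// Sanity-check sketch: run the branchless operators against the
// question's lookup tables for every shift amount.
TEST_CASE("Branchless UInt128 shifts match the lookup tables.") {
    const auto base = UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100);

    for (auto i = 0; i <= 127; i++) {
        auto left = base;
        auto right = base;
        left <<= i;
        right >>= i;

        INFO("i = " << i);
        REQUIRE(left == gLeftShiftLut128[i]);
        REQUIRE(right == gRightShiftLut128[i]);
    }
}
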
Answered Nov 15 '22 by ephemient