Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to use XACQUIRE, XRELEASE Hardware Lock Elision (HLE) prefix hints?

Just for the sake of learning this, I'm trying to grasp how to use HLE prefixes XACQUIRE and XRELEASE. After reading the Intel documentation, my understanding was that after executing an instruction with the XACQUIRE prefix the CPU enters into some sort of a write lock until the instruction with the XRELEASE prefix. So I wrote the following test code to see if I'm correct. Well, there's still something that I don't understand because my code sample fails.

So can someone tell me what am I missing with those HLE prefixes?

Two fails:

  1. The xtest instruction reports that HLE was not enabled, and

  2. Because my assumed "mutex-ed" code doesn't run as a mutex, it fails concurrency.

Next is the Windows C++ project, compiled with VS 2017 with x64 .asm file as follows:

.code

testCPUID PROC
    push rbx

    ; CPUID.07h.EBX.HLE[bit 4]==1

    mov eax, 7h
    xor ecx, ecx
    cpuid
    and rbx, 1 shl 4

    mov rax, rbx
    pop rbx
    ret
testCPUID ENDP



testHLEWrite PROC
    ; RCX = pointer to TST91 struct:
    ;       void* pPtrToNextWrite;
    ;       int nNextValue;
    ;       void* pCutoffPtr;
    ;       void* pBeginPtr;

    xor edx, edx
    xacquire xchg [rcx], rdx        ; I'm assuming that this will work as a mutex ...

    xtest                           ; Sanity check to see if HLE got enabled?
    jnz lbl_00                      ; If HLE is on => ZF=0
    int 3                           ; we get here if HLE did not get enabled
lbl_00:

    ; Do some nonsensical stuff
    ; The idea is to write sequential values into a shared array
    ; to see if the lock above holds
    ; Format:
    ;       > --16 sequential bytes-- <

    mov r8d, dword ptr [rcx + 8]

    mov byte ptr [rdx], '>'
    inc rdx

    ; Write 16 sequential bytes

    mov rax, 10h
lbl_01:
    mov byte ptr [rdx], r8b
    inc r8
    inc rdx
    dec rax
    jnz lbl_01

    mov byte ptr [rdx], '<'
    inc rdx

    cmp rdx, [rcx + 10h]            ; check if reached the end of buffer
    jb lbl_02
    mov rdx, [rcx + 18h]            ; reset ptr to the beginning of buffer
lbl_02:

    mov dword ptr [rcx + 8], r8d
    xrelease mov [rcx], rdx         ; this will release the mutex

    ret
testHLEWrite ENDP





testHLEForCorrectness PROC
    ; RCX = pointer to TST91 struct:
    ;       void* pPtrToNextWrite;
    ;       int nNextValue;
    ;       void* pCutoffPtr;
    ;       void* pBeginPtr;

    xor edx, edx
    xacquire xchg [rcx], rdx        ; I'm assuming that this will work as a mutex ...

    xtest                           ; Sanity check to see if HLE got enabled?
    jnz lbl_00                      ; If HLE is on => ZF=0
    int 3                           ; we get here if HLE did not get enabled
lbl_00:

    mov r9, [rcx + 18h]

lbl_repeat:
    cmp r9, rdx
    jae lbl_out

    cmp byte ptr [r9], '>'
    jnz lbl_bad
    cmp byte ptr [r9 + 1 + 10h], '<'
    jnz lbl_bad

    mov r8b, byte ptr [r9 + 1]
    sub eax, eax
lbl_01:
    cmp [r9 + rax + 1], r8b
    jnz lbl_bad
    inc rax
    inc r8
    cmp rax, 10h
    jb lbl_01

    add r9, 2 + 10h
    jmp lbl_repeat

lbl_out:

    xrelease mov [rcx], rdx         ; this will release the mutex

    ret

lbl_bad:
    ; Verification failed
    int 3

testHLEForCorrectness ENDP

END

And this is how it's called from the user-mode C++ project:

#include <assert.h>
#include <Windows.h>

struct TST91{
    BYTE* pNextWrite;
    int nNextValue;
    BYTE* pCutoffPtr;
    BYTE* pBeginPtr;
};

extern "C" {
    BOOL testCPUID(void);
    void testHLEWrite(TST91* p);
    void testHLEForCorrectness(TST91* p);
};

DWORD WINAPI ThreadProc01(LPVOID lpParameter);

TST91* gpStruct = NULL;
BYTE* gpMem = NULL;             //Its size is 'gszcbMemSize' BYTEs
const size_t gszcbMemSize = 0x1000 * 8;

int main()
{
    if(testCPUID())
    {
        gpStruct = new TST91;
        gpMem = new BYTE[gszcbMemSize];

        gpStruct->pNextWrite = gpMem;
        gpStruct->nNextValue = 1;
        gpStruct->pBeginPtr = gpMem;
        gpStruct->pCutoffPtr = gpMem + gszcbMemSize - 0x100;

        for(int t = 0; t < 5; t++)
        {
            CloseThread(CreateThread(NULL, 0, 
                ThreadProc01, (VOID*)(1LL << t), 0, NULL));
        }

        _gettch();

        delete gpStruct;
        delete[] gpMem;
    }
    else
        _tprintf(L"Your CPU doesn't support HLE\n");

   return 0;
}

DWORD WINAPI ThreadProc01(LPVOID lpParameter)
{
    if(!SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)lpParameter))
    {
        assert(NULL);
    }

    for(;;)
    {
        testHLEWrite(gpStruct);
        testHLEForCorrectness(gpStruct);
    }

    return 0;
}
like image 812
MikeF Avatar asked Jan 03 '23 02:01

MikeF


1 Answers

You can answer your own questions, can't you?

Anyway. I think I got it. I'll try to stick with plain English, or go with how I understand it. Feel free to edit it out if I make an incorrect statement. (By the way, Hardware Lock Elision, what a cool name. Sounds like some Matt Damon movie. I even had to Google word "elision" to understand what it means... and I still don't remember it.)

So this HLE concept is nothing more than a hint for the CPU to treat the lock prefix in a more optimized way. The lock prefix by itself is somewhat "expensive" for the modern processors to execute in an efficient way. So when the CPU that supports it sees the HLE prefix it will initially not acquire the lock, but will do so only if there is a read/write conflict. In that case the CPU will issue an HLE abort, that in turn will require a later conventional lock.

Morever, the HLE prefix for XACQUIRE is F2, and for XRELEASE is F3, which is nothing more than the old-school REPNE and REP prefixes, that are simply ignored when used with a lock-able instruction by the older CPUs that don't support HLE. What all this means is that to use HLE one doesn't need to check with CPUID instruction for its support and can safely use them as-is. The older CPUs will ignore them and treat the accompanying lock prefix as a lock, while newer CPUs will take them as an optimization hint. In other words, using those XACQUIRE and XRELEASE prefixes will not hurt anything if you add them into your own implementation of a mutex, semaphore, you name it.

So having said that, I had to rewrite my original test code sample as such (just the relevant concurrency parts for a very basic mutex-type lock).

ASM code to enter the lock:

testHLEWrite PROC
    ; RCX = pointer to TST91 struct:
    ;       void* pPtrToNextWrite;
    ;       int nNextValue;
    ;       void* pCutoffPtr;
    ;       void* pBeginPtr;
    ;       size_t lock;          <-- new member

lbl_retry:
    xacquire lock bts qword ptr [rcx + 20h], 1      ; Try to acquire lock (use HLE hint prefix)
    jnc lbl_locked
    pause                       ; Will issue an implicit HLE abort
    jmp lbl_retry


lbl_locked:

and then to leave the lock:

(Note here that XRELEASE prefix differs from the lock prefix in that it supports a mov instruction that has a memory destination operand.)

    xrelease mov qword ptr [rcx + 20h], 0       ; Release the lock (use HLE prefix hint)

    ret
testHLEWrite ENDP

Also if you want to write it in C with the use of (Visual Studio's) intrinsics:

//Some variable to hold the lock
volatile long lock = 0;

and then the code itself:

//Acquire the lock
while(_interlockedbittestandset_HLEAcquire((long *)&lock, 1))
{
    _mm_pause();
}

and then:

//Leave the lock
_Store_HLERelease(&lock, 0);

Lastly, I want to point out that I haven't done any timing/benchmark tests on the performance of the code with and without the HLE prefixes. So if someone wants to do it (and see the validity of the HLE concept) you're welcome to it. I'll be glad to learn it as well.

like image 138
MikeF Avatar answered Jan 05 '23 18:01

MikeF