I'm working on the irq_lock() / irq_unlock() implementation for an RTOS and found an issue. We want to keep the time the CPU spends with interrupts locked to an absolute minimum. Right now, our irq_lock() inline function for x86 uses a "memory" clobber:
static ALWAYS_INLINE unsigned int _do_irq_lock(void)
{
	unsigned int key;

	__asm__ volatile (
		"pushfl;\n\t"
		"cli;\n\t"
		"popl %0;\n\t"
		: "=g" (key)
		:
		: "memory"
		);

	return key;
}
The problem is that the compiler will still reorder potentially expensive operations into the critical section if they only touch registers and not memory. Here is a specific example from our kernel's sleep function:
void k_sleep(int32_t duration)
{
	__ASSERT(!_is_in_isr(), "");
	__ASSERT(duration != K_FOREVER, "");

	K_DEBUG("thread %p for %d ns\n", _current, duration);

	/* wait of 0 ns is treated as a 'yield' */
	if (duration == 0) {
		k_yield();
		return;
	}

	int32_t ticks = _TICK_ALIGN + _ms_to_ticks(duration);
	int key = irq_lock();

	_remove_thread_from_ready_q(_current);
	_add_thread_timeout(_current, NULL, ticks);

	_Swap(key);
}
The 'ticks' calculation, which does expensive 64-bit math, gets reordered into the region where interrupts are locked, so we end up calling __divdi3 with interrupts disabled, which is exactly what we want to avoid:
Dump of assembler code for function k_sleep:
0x0010197a <+0>: push %ebp
0x0010197b <+1>: mov %esp,%ebp
0x0010197d <+3>: push %edi
0x0010197e <+4>: push %esi
0x0010197f <+5>: push %ebx
0x00101980 <+6>: mov 0x8(%ebp),%edi
0x00101983 <+9>: test %edi,%edi
0x00101985 <+11>: jne 0x101993 <k_sleep+25>
0x00101987 <+13>: lea -0xc(%ebp),%esp
0x0010198a <+16>: pop %ebx
0x0010198b <+17>: pop %esi
0x0010198c <+18>: pop %edi
0x0010198d <+19>: pop %ebp
0x0010198e <+20>: jmp 0x101944 <k_yield>
0x00101993 <+25>: pushf
0x00101994 <+26>: cli
0x00101995 <+27>: pop %esi
0x00101996 <+28>: pushl 0x104608
0x0010199c <+34>: call 0x101726 <_remove_thread_from_ready_q>
0x001019a1 <+39>: mov $0x64,%eax
0x001019a6 <+44>: imul %edi
0x001019a8 <+46>: mov 0x104608,%ebx
0x001019ae <+52>: add $0x3e7,%eax
0x001019b3 <+57>: adc $0x0,%edx
0x001019b6 <+60>: mov %ebx,0x20(%ebx)
0x001019b9 <+63>: movl $0x0,(%esp)
0x001019c0 <+70>: push $0x3e8
0x001019c5 <+75>: push %edx
0x001019c6 <+76>: push %eax
0x001019c7 <+77>: call 0x1000a0 <__divdi3>
0x001019cc <+82>: add $0x10,%esp
0x001019cf <+85>: inc %eax
0x001019d0 <+86>: mov %eax,0x28(%ebx)
0x001019d3 <+89>: movl $0x0,0x24(%ebx)
0x001019da <+96>: lea 0x18(%ebx),%edx
0x001019dd <+99>: mov $0x10460c,%eax
0x001019e2 <+104>: add $0x28,%ebx
0x001019e5 <+107>: mov $0x10162b,%ecx
0x001019ea <+112>: push %ebx
0x001019eb <+113>: call 0x101667 <sys_dlist_insert_at>
0x001019f0 <+118>: mov %esi,0x8(%ebp)
0x001019f3 <+121>: pop %eax
0x001019f4 <+122>: lea -0xc(%ebp),%esp
0x001019f7 <+125>: pop %ebx
0x001019f8 <+126>: pop %esi
0x001019f9 <+127>: pop %edi
0x001019fa <+128>: pop %ebp
0x001019fb <+129>: jmp 0x100f77 <_Swap>
End of assembler dump.
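For reference, the expensive math is the 64-bit multiply and round-up divide in the millisecond-to-tick conversion. Reconstructed from the constants in the dump above (0x64 = 100 ticks per second, 0x3e8 = 1000 ms per second, 0x3e7 for rounding up), it is roughly the following; this is a sketch inferred from the disassembly, not necessarily the exact source of _ms_to_ticks():

static inline int32_t _ms_to_ticks(int32_t ms)
{
	/* Hypothetical reconstruction: the 64-bit divide below is what gets
	 * lowered to the libgcc __divdi3 call on 32-bit x86. */
	int64_t ms_ticks = (int64_t)ms * 100;       /* ticks per second */

	return (int32_t)((ms_ticks + 999) / 1000);  /* round up to a whole tick */
}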
We discovered that we can get the ordering we want by declaring 'ticks' volatile:
Dump of assembler code for function k_sleep:
0x0010197a <+0>: push %ebp
0x0010197b <+1>: mov %esp,%ebp
0x0010197d <+3>: push %ebx
0x0010197e <+4>: push %edx
0x0010197f <+5>: mov 0x8(%ebp),%edx
0x00101982 <+8>: test %edx,%edx
0x00101984 <+10>: jne 0x10198d <k_sleep+19>
0x00101986 <+12>: call 0x101944 <k_yield>
0x0010198b <+17>: jmp 0x1019f5 <k_sleep+123>
0x0010198d <+19>: mov $0x64,%eax
0x00101992 <+24>: push $0x0
0x00101994 <+26>: imul %edx
0x00101996 <+28>: add $0x3e7,%eax
0x0010199b <+33>: push $0x3e8
0x001019a0 <+38>: adc $0x0,%edx
0x001019a3 <+41>: push %edx
0x001019a4 <+42>: push %eax
0x001019a5 <+43>: call 0x1000a0 <__divdi3>
0x001019aa <+48>: add $0x10,%esp
0x001019ad <+51>: inc %eax
0x001019ae <+52>: mov %eax,-0x8(%ebp)
0x001019b1 <+55>: pushf
0x001019b2 <+56>: cli
0x001019b3 <+57>: pop %ebx
0x001019b4 <+58>: pushl 0x1045e8
0x001019ba <+64>: call 0x101726 <_remove_thread_from_ready_q>
0x001019bf <+69>: mov 0x1045e8,%eax
0x001019c4 <+74>: mov -0x8(%ebp),%edx
0x001019c7 <+77>: movl $0x0,0x24(%eax)
0x001019ce <+84>: mov %edx,0x28(%eax)
0x001019d1 <+87>: mov %eax,0x20(%eax)
0x001019d4 <+90>: lea 0x18(%eax),%edx
0x001019d7 <+93>: add $0x28,%eax
0x001019da <+96>: mov %eax,(%esp)
0x001019dd <+99>: mov $0x10162b,%ecx
0x001019e2 <+104>: mov $0x1045ec,%eax
0x001019e7 <+109>: call 0x101667 <sys_dlist_insert_at>
0x001019ec <+114>: mov %ebx,(%esp)
0x001019ef <+117>: call 0x100f77 <_Swap>
0x001019f4 <+122>: pop %eax
0x001019f5 <+123>: mov -0x4(%ebp),%ebx
0x001019f8 <+126>: leave
0x001019f9 <+127>: ret
End of assembler dump.
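For completeness, the only source change behind the second dump is the declaration of 'ticks' itself; everything else in k_sleep() stays as shown above:

	/* Workaround: forcing 'ticks' into memory keeps the __divdi3 call
	 * outside the interrupt-locked region. */
	volatile int32_t ticks = _TICK_ALIGN + _ms_to_ticks(duration);

	int key = irq_lock();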
However, this only fixes it in one spot. We really need a way to modify the irq_lock() implementation so that it does the right thing everywhere, and right now the "memory" clobber is not sufficient.
Since your architecture is x86 anyway, try using __sync_synchronize()
instead of the memory clobber. It is a full hardware memory barrier supported on x86.
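A sketch of what that might look like in the lock path (one reading of the suggestion; whether the barrier by itself keeps register-only computations out of the locked region is something you would still want to verify against the generated code):

static ALWAYS_INLINE unsigned int _do_irq_lock(void)
{
	unsigned int key;

	__asm__ volatile (
		"pushfl;\n\t"
		"cli;\n\t"
		"popl %0;\n\t"
		: "=g" (key)
		);

	/* __sync_synchronize() is a GCC/Clang built-in full memory barrier:
	 * it emits a hardware barrier (mfence on x86) and also acts as a
	 * compiler-level barrier, taking the place of the "memory" clobber. */
	__sync_synchronize();

	return key;
}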