Note: I'm running on FreeBSD, but I've also included Linux as a tag since the problem is somewhat general and Linux-specific solutions are of interest to me.
Edit: To confirm that the issue was not FreeBSD-specific, I ported the module to Linux and got exactly the same behavior. The code for the Linux version of the module is given below; it is essentially the same, the only major difference being that the IDT is evidently mapped read-only in Linux, so I had to temporarily clear the Write Protect (WP) bit in `cr0` for the code to work.
I'm learning a bit about kernel development on the x86-64 architecture, and at present have been reading about interrupt handling in the Intel developers' manual. As practice I'm trying to write a small kernel module that hooks entries in the IDT, but I have been running into a problem. My general question is this: how do you ensure that the code for your hook (or the data for your new IDT, if you're using `lidt` to change the entire `idtr` rather than just overwriting individual IDT entries) is always present in RAM? The issue I've been running into is that I change an IDT entry, trigger the corresponding interrupt, and then get a double fault because the code for my hook isn't mapped into RAM. In general, are there ways of avoiding this problem?
For the specifics of my situation, the following is the code of a FreeBSD LKM I've written. It simply overwrites the handler address stored in the IDT entry for zero-divisor faults with the address of `asm_hook`, which at present just unconditionally `jmp`s back into the original interrupt handler. (In the future I'll of course add more functionality.)
#include <sys/types.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <sys/sysent.h>
#include <sys/kernel.h>
#include <sys/syscall.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
/*
 * One 16-byte IDT entry. The handler address is split across the three
 * offset_* fields; the struct is packed so the fields sit back to back with
 * no padding.
 */
struct idte_t {
	unsigned short offset_0_15;       /* handler address, bits 0..15 */
	unsigned short segment_selector;
	unsigned char  ist;               /* interrupt stack table */
	unsigned char  type : 4;
	unsigned char  zero_12 : 1;
	unsigned char  dpl : 2;           /* descriptor privilege level */
	unsigned char  p : 1;             /* present flag */
	unsigned short offset_16_31;      /* handler address, bits 16..31 */
	unsigned int   offset_32_63;      /* handler address, bits 32..63 */
	unsigned int   rsv;
} __attribute__((packed));

/* The IDT entry being hooked; assigned in init(). */
struct idte_t *zd_idte;
/* Index of the IDT entry that gets hooked (the zero-divisor fault). */
#define ZD_INT 0x00

/* Absolute address of the original interrupt handler, saved before hooking. */
unsigned long idte_offset;

/*
 * Image of the IDT register as stored by sidt: a 16-bit limit immediately
 * followed by the table's base address (packed, so no padding in between).
 */
struct idtr_t {
	unsigned short lim_val;
	struct idte_t *addr;
} __attribute__((packed));

struct idtr_t idtr;
/*
 * Replacement entry point whose address init() writes into the hooked IDT
 * entry. It does no work of its own: it jumps indirectly through idte_offset,
 * i.e. straight into the original handler whose address init() saved there.
 */
__asm__(
".text;"
".global asm_hook;"
"asm_hook:;"
"jmp *(idte_offset);");
/* C-visible declaration so that &asm_hook can be taken. */
extern void asm_hook(void);
/*
 * Module load routine: reads the IDTR, saves the current handler address of
 * IDT entry ZD_INT in idte_offset, and rewrites that entry's offset fields so
 * that it points at asm_hook. The IDTR and the entry (before and after) are
 * dumped with uprintf().
 *
 * Returns 0 on success, or ENOSYS (leaving the IDT untouched) when the entry
 * is not marked present.
 */
static int
init(void) {
	const unsigned long hook = (unsigned long)&asm_hook;

	/*
	 * sidt stores INTO its operand, so idtr must be an output ("=m").
	 * Passing it as an input tells the compiler the asm leaves idtr
	 * unchanged, which permits it to read idtr.addr before the asm runs.
	 */
	__asm__ __volatile__ (
	    "cli;"
	    "sidt %0;"
	    "sti;"
	    : "=m"(idtr));
	uprintf("[*] idtr dump\n"
	    "[**] address:\t%p\n"
	    "[**] lim val:\t0x%x\n"
	    "[*] end dump\n\n",
	    idtr.addr, idtr.lim_val);

	zd_idte = idtr.addr + ZD_INT;
	/*
	 * Assemble the 64-bit handler address with unsigned arithmetic:
	 * shifting a signed long so that bits land in the sign bit is
	 * undefined behavior.
	 */
	idte_offset = (unsigned long)zd_idte->offset_0_15
	    | ((unsigned long)zd_idte->offset_16_31 << 16)
	    | ((unsigned long)zd_idte->offset_32_63 << 32);
	uprintf("[*] old idt entry %d:\n"
	    "[**] addr:\t%p\n"
	    "[**] segment:\t0x%x\n"
	    "[**] ist:\t%d\n"
	    "[**] type:\t%d\n"
	    "[**] dpl:\t%d\n"
	    "[**] p:\t\t%d\n"
	    "[*] end dump\n\n",
	    ZD_INT, (void *)idte_offset, zd_idte->segment_selector,
	    zd_idte->ist, zd_idte->type, zd_idte->dpl, zd_idte->p);
	if (!zd_idte->p) {
		uprintf("[*] fatal: handler segment not present\n");
		return ENOSYS;
	}

	/*
	 * The "memory" clobbers keep the compiler from moving the three
	 * stores outside the interrupts-off window.
	 * NOTE(review): cli only masks interrupts on the executing CPU; another
	 * CPU taking this fault mid-update could observe a half-written
	 * address -- confirm whether that matters for the target setup.
	 */
	__asm__ __volatile__("cli" ::: "memory");
	zd_idte->offset_0_15 = hook & 0xffff;
	zd_idte->offset_16_31 = (hook >> 16) & 0xffff;
	zd_idte->offset_32_63 = (hook >> 32) & 0xffffffff;
	__asm__ __volatile__("sti" ::: "memory");

	uprintf("[*] new idt entry %d:\n"
	    "[**] addr:\t%p\n"
	    "[**] segment:\t0x%x\n"
	    "[**] ist:\t%d\n"
	    "[**] type:\t%d\n"
	    "[**] dpl:\t%d\n"
	    "[**] p:\t\t%d\n"
	    "[*] end dump\n\n",
	    ZD_INT, (void *)(
	    (unsigned long)zd_idte->offset_0_15
	    | ((unsigned long)zd_idte->offset_16_31 << 16)
	    | ((unsigned long)zd_idte->offset_32_63 << 32)),
	    zd_idte->segment_selector, zd_idte->ist, zd_idte->type,
	    zd_idte->dpl, zd_idte->p);
	return 0;
}
/*
 * Module unload routine: writes the original handler address saved in
 * idte_offset back into the hooked IDT entry, undoing init().
 */
static void
fini(void) {
	/*
	 * The "memory" clobbers stop the compiler from sinking or hoisting the
	 * stores past cli/sti; without them the interrupts-off window gives no
	 * ordering guarantee for the writes it is meant to protect.
	 */
	__asm__ __volatile__("cli" ::: "memory");
	zd_idte->offset_0_15 = idte_offset & 0xffff;
	zd_idte->offset_16_31 = (idte_offset >> 16) & 0xffff;
	zd_idte->offset_32_63 = (idte_offset >> 32) & 0xffffffff;
	__asm__ __volatile__("sti" ::: "memory");
}
/*
 * Module event handler: installs the hook on MOD_LOAD, removes it on
 * MOD_UNLOAD, and rejects every other event with EOPNOTSUPP.
 * Returns init()'s result for MOD_LOAD and 0 for MOD_UNLOAD.
 */
static int
load(struct module *module, int cmd, void *arg) {
	if (cmd == MOD_LOAD)
		return init();

	if (cmd == MOD_UNLOAD) {
		fini();
		return 0;
	}

	return EOPNOTSUPP;
}
static moduledata_t idt_hook_mod = {
"idt_hook",
load,
NULL };
DECLARE_MODULE(idt_hook, idt_hook_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
(I have also written another LKM that builds an entirely new IDT using `malloc(9)` and uses `lidt` to load that table into `idtr`, but that seems to me an inferior approach, as it only alters the IDT on the particular CPU core it runs on and hence won't work reliably on multiprocessor systems. Unless there's something I'm missing, is this an accurate assessment?)
Anyway, compiling the code and loading the kernel module cause no issues:
# kldload ./idt_hook.ko
[*] idtr dump
[**] address: 0xffffffff81fb2c40
[**] lim val: 0xfff
[*] end dump
[*] old idt entry 0:
[**] addr: 0xffffffff81080f90
[**] segment: 0x20
[**] ist: 0
[**] type: 14
[**] dpl: 0
[**] p: 1
[*] end dump
[*] new idt entry 0:
[**] addr: 0xffffffff8281d000
[**] segment: 0x20
[**] ist: 0
[**] type: 14
[**] dpl: 0
[**] p: 1
[*] end dump
However, when I test the hook with the following, the kernel hangs:
#include <stdio.h>
/*
 * Test program: deliberately divides by zero to raise the divide-error
 * exception and so exercise the hooked IDT entry.
 */
int main(void) {
	/*
	 * volatile forces a real divide instruction at run time; with plain
	 * constants the division is undefined behavior the compiler may fold
	 * away, in which case the fault would never be raised.
	 */
	volatile int x = 1, y = 0;

	printf("x/y=%d\n", x / y);
	return 0;
}
To understand what's going on I spun up the VirtualBox built-in debugger and set a breakpoint on the IDT's double-fault exception handler (entry 8). Debugging showed that my LKM alters the IDT correctly, but running the zero-divisor code above triggers a double fault. I realized the reason for this when I tried to access the memory at `0xffffffff8281d000` (the address of my `asm_hook` code), which triggered a `VERR_PAGE_TABLE_NOT_PRESENT` error in the VirtualBox debugger. So, unless I'm misunderstanding something, the issue is evidently that my `asm_hook` gets removed from memory at some point. Any ideas on how to address this problem? For instance, is there a way to tell the FreeBSD kernel that a particular page should never be unmapped from RAM?
Edit: Nate Eldredge in the comments below helped me find some errors in my code (now corrected), but unfortunately the problem still persists. To give greater debugging detail: first I load the kernel module, and then I set a breakpoint on the listed address of my asm_hook
code (0xffffffff8281d000
) in the VirtualBox debugger. I've confirmed by disassembling memory at that address that it does indeed contain the code of asm_hook
. (Although, as Nate points out, it's slightly odd that it's placed exactly on a page boundary – anyone have any ideas why this might be?)
In any case, when I trigger the zero-divisor interrupt, the breakpoint is unfortunately never hit, and, once I'm inside the double fault interrupt handler, when I try to access the memory at 0xffffffff8281d000
the VERR_PAGE_TABLE_NOT_PRESENT
error still flags up.
It is true that it would be an unusual (?) feature of the FreeBSD's design to swap out/unmap portions of its kernel from RAM, so maybe a better question is "what's causing this page fault?"
Edit: Here is a version of the module ported to Linux:
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <asm/io.h>
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Hooks the zero divisor IDT entry");
MODULE_VERSION("0.01");
/*
 * One 16-byte IDT entry. The handler address is split across the three
 * offset_* fields; the struct is packed so the fields sit back to back with
 * no padding.
 */
struct idte_t {
	unsigned short offset_0_15;       /* handler address, bits 0..15 */
	unsigned short segment_selector;
	unsigned char  ist;               /* interrupt stack table */
	unsigned char  type : 4;
	unsigned char  zero_12 : 1;
	unsigned char  dpl : 2;           /* descriptor privilege level */
	unsigned char  p : 1;             /* present flag */
	unsigned short offset_16_31;      /* handler address, bits 16..31 */
	unsigned int   offset_32_63;      /* handler address, bits 32..63 */
	unsigned int   rsv;
} __attribute__((packed));

/* The IDT entry being hooked; assigned in idt_init(). */
struct idte_t *zd_idte;
/* Index of the IDT entry that gets hooked (the zero-divisor fault). */
#define ZD_INT 0x00

/* Absolute address of the original interrupt handler, saved before hooking. */
unsigned long idte_offset;

/*
 * Image of the IDT register as stored by sidt: a 16-bit limit immediately
 * followed by the table's base address (packed, so no padding in between).
 */
struct idtr_t {
	unsigned short lim_val;
	struct idte_t *addr;
} __attribute__((packed));

struct idtr_t idtr;
/*
 * Replacement entry point whose address idt_init() writes into the hooked IDT
 * entry. It does no work of its own: it jumps indirectly through idte_offset,
 * i.e. straight into the original handler whose address idt_init() saved
 * there.
 */
__asm__(
".text;"
".global asm_hook;"
"asm_hook:;"
"jmp *(idte_offset);");
/* C-visible declaration so that &asm_hook can be taken. */
extern void asm_hook(void);
/*
 * Module entry point: reads the IDTR, saves the current handler address of
 * IDT entry ZD_INT in idte_offset, and rewrites that entry's offset fields so
 * that it points at asm_hook. CR0.WP is cleared around the write because the
 * IDT is write-protected. The IDTR and the entry (before and after) are
 * dumped with printk().
 *
 * Returns 0 on success, or -ENOSYS (leaving the IDT untouched) when the entry
 * is not marked present.
 */
static int __init
idt_init(void) {
	const unsigned long hook = (unsigned long)&asm_hook;
	unsigned long cr0;

	/*
	 * sidt stores INTO its operand, so idtr must be an output ("=m").
	 * Passing it as an input tells the compiler the asm leaves idtr
	 * unchanged, which permits it to read idtr.addr before the asm runs.
	 */
	__asm__ __volatile__ (
		"cli;"
		"sidt %0;"
		"sti;"
		: "=m"(idtr));
	printk("[*] idtr dump\n"
	       "[**] address:\t%px\n"
	       "[**] lim val:\t0x%x\n"
	       "[*] end dump\n\n",
	       idtr.addr, idtr.lim_val);

	zd_idte = idtr.addr + ZD_INT;
	/*
	 * Assemble the 64-bit handler address with unsigned arithmetic:
	 * shifting a signed long so that bits land in the sign bit is
	 * undefined behavior.
	 */
	idte_offset = (unsigned long)zd_idte->offset_0_15
		| ((unsigned long)zd_idte->offset_16_31 << 16)
		| ((unsigned long)zd_idte->offset_32_63 << 32);
	printk("[*] old idt entry %d:\n"
	       "[**] addr:\t%px\n"
	       "[**] segment:\t0x%x\n"
	       "[**] ist:\t%d\n"
	       "[**] type:\t%d\n"
	       "[**] dpl:\t%d\n"
	       "[**] p:\t\t%d\n"
	       "[*] end dump\n\n",
	       ZD_INT, (void *)idte_offset, zd_idte->segment_selector,
	       zd_idte->ist, zd_idte->type, zd_idte->dpl, zd_idte->p);
	if (!zd_idte->p) {
		printk("[*] fatal: handler segment not present\n");
		/* Linux init routines report failure as a NEGATIVE errno. */
		return -ENOSYS;
	}

	/*
	 * Interrupts go off BEFORE CR0.WP is cleared and come back on only
	 * AFTER it is restored. CR0 is per-CPU state, so being interrupted
	 * (and possibly rescheduled) between the CR0 write and the IDT stores
	 * could leave the stores running with WP still set, or leave WP clear
	 * behind. The "memory" clobbers keep the stores inside this window.
	 */
	__asm__ __volatile__("cli" ::: "memory");
	__asm__ __volatile__("mov %%cr0, %0" : "=r"(cr0));
	cr0 &= ~0x10000UL;	/* clear WP (bit 16) */
	__asm__ __volatile__("mov %0, %%cr0" :: "r"(cr0) : "memory");

	zd_idte->offset_0_15 = hook & 0xffff;
	zd_idte->offset_16_31 = (hook >> 16) & 0xffff;
	zd_idte->offset_32_63 = (hook >> 32) & 0xffffffff;

	cr0 |= 0x10000UL;	/* set WP again */
	__asm__ __volatile__("mov %0, %%cr0" :: "r"(cr0) : "memory");
	__asm__ __volatile__("sti" ::: "memory");

	printk("[*] new idt entry %d:\n"
	       "[**] addr:\t%px\n"
	       "[**] segment:\t0x%x\n"
	       "[**] ist:\t%d\n"
	       "[**] type:\t%d\n"
	       "[**] dpl:\t%d\n"
	       "[**] p:\t\t%d\n"
	       "[*] end dump\n\n",
	       ZD_INT, (void *)(
	       (unsigned long)zd_idte->offset_0_15
	       | ((unsigned long)zd_idte->offset_16_31 << 16)
	       | ((unsigned long)zd_idte->offset_32_63 << 32)),
	       zd_idte->segment_selector, zd_idte->ist, zd_idte->type,
	       zd_idte->dpl, zd_idte->p);
	return 0;
}
/*
 * Module exit point: writes the original handler address saved in idte_offset
 * back into the hooked IDT entry, clearing CR0.WP around the write because
 * the IDT is write-protected.
 */
static void __exit
idt_fini(void) {
	unsigned long cr0;

	/*
	 * Interrupts go off BEFORE CR0.WP is cleared and come back on only
	 * AFTER it is restored, so the whole WP-clear / store / WP-set
	 * sequence runs uninterrupted on one CPU (CR0 is per-CPU state).
	 * The "memory" clobbers keep the stores inside that window.
	 */
	__asm__ __volatile__("cli" ::: "memory");
	__asm__ __volatile__("mov %%cr0, %0" : "=r"(cr0));
	cr0 &= ~0x10000UL;	/* clear WP (bit 16) */
	__asm__ __volatile__("mov %0, %%cr0" :: "r"(cr0) : "memory");

	zd_idte->offset_0_15 = idte_offset & 0xffff;
	zd_idte->offset_16_31 = (idte_offset >> 16) & 0xffff;
	zd_idte->offset_32_63 = (idte_offset >> 32) & 0xffffffff;

	cr0 |= 0x10000UL;	/* set WP again */
	__asm__ __volatile__("mov %0, %%cr0" :: "r"(cr0) : "memory");
	__asm__ __volatile__("sti" ::: "memory");
}
/* Register idt_init() as the routine run when the module is loaded. */
module_init(idt_init);
/* Register idt_fini() as the routine run when the module is removed. */
module_exit(idt_fini);
A page fault occurs when a program attempts to access data or code that is in its address space but is not currently located in system RAM. When a page fault occurs, the following sequence of events takes place:
Hard page faults happen when the page is located inside the page file on the hard disk. Soft page faults occur when the page is found somewhere else in memory. Preventions to reduce high page fault rate: One method to prevent page faults is using a memory allocator that knows about allocating memory and is utilized only on the same pages.
If the page-fault rate is too high, it indicates that the process has too few frames allocated to it. Frames can be taken away from the process if its page-fault rate falls below the lower limit. Similarly, if the page-fault rate exceeds the upper limit, more frames can be allocated to the process.
Once virtual address caused page fault is known, system checks to see if address is valid and checks if there is no protection access problem. If the virtual address is valid, the system checks to see if a page frame is free.
EDIT 07/18/20: Sorry to resurrect a dead post, but in fact there was more to the story. In short, the problem was actually not with VirtualBox, but with my code failing to account for meltdown mitigation techniques, and in particular Kernel Page Table Isolation. Evidently Qemu does not enable KPTI by default, which is why it appeared that the problem was hypervisor specific. However, enabling use of OS X's "Hypervisor Framework" with Qemu (which does enable KPTI by default) caused the module to fail again. After a lot of investigation I finally realized that the issue was the KPTI; apparently loadable kernel modules – like much kernel code – are not included in userspace page tables.
To fix this I had to write a new module that overwrites the code of the kernel's existing IRQ handler (which is included in the userspace page tables) with a snippet that changes `cr3` to a value that includes my kernel module's page entries. (This is `stub` in the code below.) The snippet then calls `asm_hook` – which is now mapped – which increments my `counter` variable; after that it restores the old value of `cr3` and jumps to an existing kernel IRQ handler. (Since the division-error handler is overwritten, I jump to the soft-breakpoint handler instead.) The code is below, and can be tested with the same division-by-zero program.
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/kallsyms.h>
#include <asm/io.h>
#include "utilities.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Atticus Stonestrom");
MODULE_DESCRIPTION("Hooks the zero divisor IDT entry");
/* Start of the IDT; assigned from the IDTR base in idt_init(). */
struct idte_t *idte;

/* IDT indices used by this module. */
#define ZD_INT 0x00
#define BP_INT 0x03

/* Absolute address of the division error IRQ handler. */
unsigned long zd_handler;
/* Absolute address of the soft breakpoint IRQ handler. */
unsigned long bp_handler;

/*
 * Number of bytes of stub copied over the division error handler. The last
 * 8 bytes are the data slot that idt_init() fills with a cr3 value.
 */
#define STUB_SIZE 0x2b

/* Original bytes of the division error IRQ handler, kept for restoration. */
unsigned char orig_bytes[STUB_SIZE];

/* Base address and limit value of the IDT. */
struct idtr_t idtr;

/* Incremented by asm_hook; printed by idt_fini(). */
int counter = 0;
/*
 * Hook body reached from stub through an indirect call: it increments
 * counter, loads the saved breakpoint-handler address into %rax and returns
 * to stub. %rax is overwritten on purpose -- stub has already pushed the
 * incoming %rax and uses the value returned here as its final jump target.
 */
__asm__(
".text;"
".global asm_hook;"
"asm_hook:;"
"incl counter;"
"movq (bp_handler), %rax;"
"ret;");
/* C-visible declaration of the hook's address. */
extern void asm_hook(void);
/*
 * Trampoline that idt_init() copies (STUB_SIZE bytes, including the trailing
 * .CR3 slot) over the start of the division error handler. It saves three
 * registers, switches cr3 to the value stored in .CR3, calls asm_hook,
 * switches cr3 back, restores the registers and finally "returns" into the
 * address asm_hook left in %rax (the breakpoint handler).
 */
__asm__(
".text;"
".global stub;"
"stub:;"
"push %rax;" //incoming %rax; this stack slot later receives bp_handler
"push %rbx;" //scratch: new cr3, then &asm_hook
"push %rdx;" //scratch: cr3 value found on entry
"mov %cr3, %rdx;"
"mov .CR3(%rip), %rbx;" //rip-relative, so it still finds the slot in the copied stub
"mov %rbx, %cr3;"
"mov $asm_hook, %rbx;" //absolute address: asm_hook itself is not copied
"call *%rbx;" //asm_hook comes back with bp_handler in %rax
"mov %rdx, %cr3;" //put back the cr3 value found on entry
"pop %rdx;"
"pop %rbx;"
"xchg %rax, (%rsp);" //restore incoming %rax, leave bp_handler on top of the stack
"ret;" //pops bp_handler and jumps to it
".CR3:;"
//will be filled with a valid value of cr3 during module initialization
".quad 0xdeadbeefdeadbeef;");
/* C-visible declaration so that idt_init() can memcpy from &stub. */
extern void stub(void);
/*
 * Module entry point: reads the IDTR, records the division-error and
 * soft-breakpoint handler addresses from the IDT, saves the first STUB_SIZE
 * bytes of the division-error handler in orig_bytes, and overwrites them
 * with stub. The stub's trailing .CR3 slot is filled with the cr3 value in
 * effect while this function runs.
 *
 * Returns 0 on success, or -ENOSYS (changing nothing) when the
 * division-error IDT entry is not marked present.
 */
static int __init
idt_init(void) {
	unsigned long cr3;

	READ_IDT(idtr)
	printk("[*] idtr dump\n"
	       "[**] address:\t0x%px\n"
	       "[**] lim val:\t0x%x\n"
	       "[*] end dump\n\n",
	       idtr.addr, idtr.lim_val);
	idte = (idtr.addr);

	zd_handler = 0
		| ((long)((idte+ZD_INT)->offset_0_15))
		| ((long)((idte+ZD_INT)->offset_16_31)<<16)
		| ((long)((idte+ZD_INT)->offset_32_63)<<32);
	printk("[*] idt entry %d:\n"
	       "[**] addr:\t0x%px\n"
	       "[**] segment:\t0x%x\n"
	       "[**] ist:\t%d\n"
	       "[**] type:\t%d\n"
	       "[**] dpl:\t%d\n"
	       "[**] p:\t\t%d\n"
	       "[*] end dump\n\n",
	       ZD_INT, (void *)zd_handler, (idte+ZD_INT)->segment_selector,
	       (idte+ZD_INT)->ist, (idte+ZD_INT)->type, (idte+ZD_INT)->dpl, (idte+ZD_INT)->p);
	if (!(idte+ZD_INT)->p) {
		printk("[*] fatal: handler segment not present\n");
		/* Linux init routines report failure as a NEGATIVE errno. */
		return -ENOSYS;
	}

	bp_handler = 0
		| ((long)((idte+BP_INT)->offset_0_15))
		| ((long)((idte+BP_INT)->offset_16_31)<<16)
		| ((long)((idte+BP_INT)->offset_32_63)<<32);
	printk("[*] breakpoint handler:\t0x%lx\n\n", bp_handler);

	/*
	 * NOTE(review): this is the cr3 of whatever context runs idt_init();
	 * nothing in this file shows that it stays valid after that context
	 * goes away -- confirm before relying on it long-term.
	 */
	__asm__ __volatile__("mov %%cr3, %0":"=r"(cr3)::"memory");
	printk("[*] cr3:\t0x%lx\n\n", cr3);

	memcpy(orig_bytes, (void *)zd_handler, STUB_SIZE);

	/*
	 * Interrupts go off BEFORE write protection is lifted and come back on
	 * only AFTER it is restored, so the whole sequence runs uninterrupted
	 * on one CPU (CR0 is per-CPU state). The empty asm statements are
	 * compiler barriers that keep the patching stores between the two CR0
	 * updates.
	 */
	__asm__ __volatile__("cli":::"memory");
	DISABLE_RW_PROTECTION
	__asm__ __volatile__("":::"memory");
	memcpy((void *)zd_handler, &stub, STUB_SIZE);
	/* fills the .CR3 data slot (last 8 bytes of the copied stub) with a value of cr3 that has the code of asm_hook mapped */
	*(unsigned long *)(zd_handler+STUB_SIZE-8) = cr3;
	__asm__ __volatile__("":::"memory");
	ENABLE_RW_PROTECTION
	__asm__ __volatile__("sti":::"memory");
	return 0;
}
/*
 * Module exit point: prints how many times asm_hook ran and restores the
 * original first STUB_SIZE bytes of the division-error handler from
 * orig_bytes.
 */
static void __exit
idt_fini(void) {
	printk("[*] counter: %d\n\n", counter);

	/*
	 * Interrupts go off BEFORE write protection is lifted and come back on
	 * only AFTER it is restored, so the whole sequence runs uninterrupted
	 * on one CPU (CR0 is per-CPU state). The empty asm statements are
	 * compiler barriers that keep the copy between the two CR0 updates.
	 */
	__asm__ __volatile__("cli":::"memory");
	DISABLE_RW_PROTECTION
	__asm__ __volatile__("":::"memory");
	memcpy((void *)zd_handler, orig_bytes, STUB_SIZE);
	__asm__ __volatile__("":::"memory");
	ENABLE_RW_PROTECTION
	__asm__ __volatile__("sti":::"memory");
}
/* Register idt_init() as the routine run when the module is loaded. */
module_init(idt_init);
/* Register idt_fini() as the routine run when the module is removed. */
module_exit(idt_fini);
`utilities.h` just contains some relevant IDT macros and structs, e.g. the following:
/*
 * Clears bit 16 (write protect) of CR0 on the executing CPU, using %rax as
 * scratch. The "memory" clobber makes this a compiler barrier, so stores
 * meant to happen after protection is lifted cannot be moved in front of
 * it; "cc" records that `and` changes the flags.
 * The expansion ends in ';' because call sites use the macro without one.
 */
#define DISABLE_RW_PROTECTION \
__asm__ __volatile__( \
"mov %%cr0, %%rax;" \
"and $0xfffffffffffeffff, %%rax;" \
"mov %%rax, %%cr0;" \
:::"rax", "cc", "memory");
/*
 * Sets bit 16 (write protect) of CR0 on the executing CPU, using %rax as
 * scratch. The "memory" clobber makes this a compiler barrier, so stores
 * meant to happen while protection is lifted cannot be moved past it;
 * "cc" records that `or` changes the flags.
 * The expansion ends in ';' because call sites use the macro without one.
 */
#define ENABLE_RW_PROTECTION \
__asm__ __volatile__( \
"mov %%cr0, %%rax;" \
"or $0x10000, %%rax;" \
"mov %%rax, %%cr0;" \
:::"rax", "cc", "memory");
/*
 * One 16-byte IDT entry. The handler address is split across the three
 * offset_* fields; the struct is packed so the fields sit back to back with
 * no padding.
 */
struct idte_t {
	unsigned short offset_0_15;       /* handler address, bits 0..15 */
	unsigned short segment_selector;
	unsigned char  ist;               /* interrupt stack table */
	unsigned char  type : 4;
	unsigned char  zero_12 : 1;
	unsigned char  dpl : 2;           /* descriptor privilege level */
	unsigned char  p : 1;             /* present flag */
	unsigned short offset_16_31;      /* handler address, bits 16..31 */
	unsigned int   offset_32_63;      /* handler address, bits 32..63 */
	unsigned int   rsv;
} __attribute__((packed));
/*
 * Image of the IDT register as used by sidt/lidt: a 16-bit limit immediately
 * followed by the table's base address (packed, so no padding in between).
 */
struct idtr_t {
	unsigned short lim_val;   /* limit value of the IDT */
	struct idte_t *addr;      /* base address of the IDT */
} __attribute__((packed));
/*
 * Stores the current IDT register into `dst` (a struct idtr_t lvalue) with
 * sidt, with interrupts disabled around the instruction.
 * sidt WRITES its operand, so `dst` is declared as an output ("=m"); as an
 * input operand the compiler would be entitled to assume `dst` is unchanged.
 * The expansion ends in ';' because call sites use the macro without one.
 */
#define READ_IDT(dst) \
__asm__ __volatile__( \
"cli;" \
"sidt %0;" \
"sti;" \
: "=m"(dst) \
: \
: "memory");
/*
 * Loads the IDT register from `src` (a struct idtr_t lvalue) with lidt, with
 * interrupts disabled around the instruction. `src` is only read, hence the
 * input "m" constraint. The expansion ends in ';', so no semicolon is needed
 * at the call site.
 */
#define WRITE_IDT(src) \
__asm__ __volatile__( \
"cli;" \
"lidt %0;" \
"sti;" \
:: "m"(src) \
: "memory");
Upon removal of the module, `dmesg` will display the number of times the division-error handler was invoked, indicating success.
*Evidently the problem is not to do with my code, but with VirtualBox. While playing around in the VirtualBox debugger, I realized that, once inside IDT/IRQ handlers, trying to access certain areas of even kernel memory flags a VERR_PAGE_TABLE_NOT_PRESENT
error, so it looks like something in VirtualBox's implementation must periodically swap out areas of kernel memory. It seems very strange to me, but unfortunately VirtualBox does not have much documentation as far as I can see; if anyone has any insight into what's going on here I'd be interested to hear.
In any case, I switched over to qemu
, and the kernel module worked flawlessly there. For posterity, to confirm that it's working, make the following modifications to the module code (I changed the linux one, in particular):
/* Number of times asm_hook has run; printed by idt_fini() on unload. */
int counter=0;
/*
 * asm_hook with a counter increment added in front of the indirect jump
 * through idte_offset (the saved original handler address).
 */
__asm__(
".text;"
".global asm_hook;"
"asm_hook:;"
"incl counter;"
"jmp *(idte_offset);");
...
static void __exit
idt_fini(void) {
printk("[*] counter:\t%d\n\n", counter);
...
Once the kernel module is loaded in, run the division-by-zero program several times, and then unload the module and check dmesg
to confirm it's working as desired.
So, in summary, the issue was not with the code but with VirtualBox itself; nonetheless thanks to everyone who tried to help out.*
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With