Hi, I am writing a vmx hypervisor and encountered a pretty strange problem.
I have a user program that invokes an IOCTL to run a VM, passing a user buffer to the IOCTL for return info. In the IOCTL handler I create another thread that saves register and non-register state, executes VMLAUNCH, and on the first I/O operation traps and exits the VM. After exiting the VM I restore the state I saved, then I exit the thread (terminating it in the process). The main IOCTL thread waits for the vmlaunch thread to terminate and then tries to write information to the user buffer with IoCompleteRequest - but this raises a page fault on non-mapped memory instead.
For a more detailed explanation:
- I save the state using the VMCS:
static BOOLEAN InitHostVMCS() {
DESCRIPTOR_TABLE_REGISTER gdtr, idtr;
GetGdtr(&gdtr);
GetIdtr(&idtr);
TSS64_DESCRIPTOR* trDesc = (TSS64_DESCRIPTOR*)(gdtr.Base + readReg(REG_TR) & (~0b111));
// Reconstruct base from descriptor fields
UINT64 trBase = ((UINT64)trDesc->BaseLow) | ((UINT64)trDesc->BaseMiddle << 16) | ((UINT64)trDesc->BaseHigh << 24) | ((UINT64)trDesc->BaseUpper << 32);
UINT8 type = trDesc->Flags1 & 0x0F;
if (type != 0x9 && type != 0xB) {
DebugLog("[!] TSS Section type is incorrect.\n");
return FALSE;
}
PVOID vcpuStack = ExAllocatePool2(POOL_FLAG_NON_PAGED, PAGE_SIZE*1000, ALLOCATION_TAG); // TODO: Don't use magic number
if (vcpuStack == NULL) {
DebugLog("[!] Failed to allocatte VCPU stack!\n");
return FALSE;
}
// VMCS Host Registers
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_CR0, readReg(REG_CR0)), "VMCS_HOST_CR0", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_CR3, readReg(REG_CR3)), "VMCS_HOST_CR3", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_CR4, readReg(REG_CR4)), "VMCS_HOST_CR4", TRUE)) return FALSE;
// VMCS Host Segment Selectors
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_CS_SELECTOR, readReg(REG_CS)), "VMCS_CS", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_SS_SELECTOR, readReg(REG_SS)), "VMCS_SS", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_DS_SELECTOR, 0), "VMCS_DS", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_ES_SELECTOR, 0), "VMCS_ES", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_FS_SELECTOR, 0), "VMCS_FS", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_GS_SELECTOR, 0), "VMCS_GS", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_TR_SELECTOR, readReg(REG_TR)), "VMCS_TR", TRUE)) return FALSE;
// VMCS Host Segment Bases
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_FS_BASE, __readmsr(IA32_FS_BASE)), "VMCS_FS_BASE", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_GS_BASE, __readmsr(IA32_GS_BASE)), "VMCS_GS_BASE", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_TR_BASE, trBase), "VMCS_TR_BASE", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_GDTR_BASE, gdtr.Base), "VMCS_GDTR", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_IDTR_BASE, idtr.Base), "VMCS_IDTR", TRUE)) return FALSE;
// VMCS Host MSRs
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_IA32_SYSENTER_CS, __readmsr(IA32_SYSENTER_CS)), "VMCS_HOST_SYSENTER_CS", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_IA32_SYSENTER_ESP, __readmsr(IA32_SYSENTER_ESP)), "VMCS_HOST_SYSENTER_ESP", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_IA32_SYSENTER_EIP, __readmsr(IA32_SYSENTER_EIP)), "VMCS_HOST_SYSENTER_EIP", TRUE)) return FALSE;
if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_IA32_EFER, __readmsr(IA32_EFER)), "IA32_EFER", TRUE)) return FALSE;
return TRUE;
}
- When actually launching the VM I also save the registers themselves:
;-----------------------------------------------------------------------
; runVm - enter the guest described by the current VMCS and return on
;         the next VM exit (or immediately if the entry fails).
;
; Syntax: MASM (ml64), Intel operand order.   ABI: Microsoft x64.
;
; In:    rcx = pointer to the guest general-purpose register context
;              (15 qwords: rax, rbx, rcx, rdx, rsi, rdi, rbp, r8..r15)
;        rdx = entry selector: non-zero -> VMLAUNCH, zero -> VMRESUME
; Out:   rax = RUNVM_OK          (0) VM exit happened, context updated
;              RUNVM_ERR_VALID   (1) VMX instruction failed with ZF=1
;              RUNVM_ERR_INVALID (2) VMX instruction failed with CF=1
; Clobb: rcx, rdx, r8-r11, flags (all volatile). Non-volatile GPRs and
;        CR8 are restored to their values at entry.
;
; The guest context is written back ONLY on the VM-exit path. On any
; failure path it is left untouched, so a failed VMWRITE can no longer
; overwrite guest state with host register values (including host RSP).
;
; On the VM-exit path the processor loads RFLAGS with every bit clear
; except bit 1, so this routine returns with interrupts disabled (IF=0);
; the caller is responsible for re-enabling them.
;
; NOTE(review): xmm6-xmm15 (non-volatile in the Microsoft x64 ABI) are
; not saved here; if the guest touches SSE state they come back
; clobbered. Confirm whether the caller handles FP/SIMD state.
;-----------------------------------------------------------------------

; VMCS field encodings written by this routine.
RUNVM_VMCS_HOST_RSP     EQU 6C14h
RUNVM_VMCS_HOST_RIP     EQU 6C16h

; Return codes.
RUNVM_OK                EQU 0
RUNVM_ERR_VALID         EQU 1
RUNVM_ERR_INVALID       EQU 2

; Local frame layout (offsets from RSP after the prologue).
RUNVM_SLOT_LAUNCH       EQU 0           ; copy of RDX argument
RUNVM_SLOT_CTX          EQU 8           ; copy of RCX argument (context ptr)
RUNVM_SLOT_GUEST_RCX    EQU 16          ; guest RCX parked here after VM exit
RUNVM_SLOT_HOST_CR8     EQU 24          ; host CR8 at entry
RUNVM_FRAME_SIZE        EQU 40          ; 4 slots + 8 pad: RSP is 16-aligned

; Offsets into the guest register context.
RUNVM_CTX_RAX           EQU 0
RUNVM_CTX_RBX           EQU 8
RUNVM_CTX_RCX           EQU 16
RUNVM_CTX_RDX           EQU 24
RUNVM_CTX_RSI           EQU 32
RUNVM_CTX_RDI           EQU 40
RUNVM_CTX_RBP           EQU 48
RUNVM_CTX_R8            EQU 56
RUNVM_CTX_R9            EQU 64
RUNVM_CTX_R10           EQU 72
RUNVM_CTX_R11           EQU 80
RUNVM_CTX_R12           EQU 88
RUNVM_CTX_R13           EQU 96
RUNVM_CTX_R14           EQU 104
RUNVM_CTX_R15           EQU 112

runVm PROC FRAME
        ; Prologue: save every non-volatile GPR, then allocate locals.
        ; Entry RSP is 8 mod 16; 8 pushes keep that, and the 40-byte
        ; frame brings RSP to 16-byte alignment.
        push    rbx
        .pushreg rbx
        push    rbp
        .pushreg rbp
        push    rdi
        .pushreg rdi
        push    rsi
        .pushreg rsi
        push    r12
        .pushreg r12
        push    r13
        .pushreg r13
        push    r14
        .pushreg r14
        push    r15
        .pushreg r15
        sub     rsp, RUNVM_FRAME_SIZE
        .allocstack RUNVM_FRAME_SIZE
        .endprolog

        ; Stash the arguments and the host task priority.
        mov     [rsp + RUNVM_SLOT_CTX], rcx
        mov     [rsp + RUNVM_SLOT_LAUNCH], rdx
        mov     rax, cr8
        mov     [rsp + RUNVM_SLOT_HOST_CR8], rax

        ; Host RSP = this frame, so the VM exit lands with our locals
        ; addressable at the same offsets.
        mov     rax, rsp
        mov     rdx, RUNVM_VMCS_HOST_RSP
        vmwrite rdx, rax
        jc      fail_invalid            ; CF=1: no current VMCS
        jz      fail_valid              ; ZF=1: failed, error in VMCS

        ; Host RIP = vm_exit label below.
        lea     rax, vm_exit
        mov     rdx, RUNVM_VMCS_HOST_RIP
        vmwrite rdx, rax
        jc      fail_invalid
        jz      fail_valid

        ; Load the guest GPRs. RCX is the context pointer, so it is
        ; loaded last.
        mov     rax, [rcx + RUNVM_CTX_RAX]
        mov     rbx, [rcx + RUNVM_CTX_RBX]
        mov     rdx, [rcx + RUNVM_CTX_RDX]
        mov     rsi, [rcx + RUNVM_CTX_RSI]
        mov     rdi, [rcx + RUNVM_CTX_RDI]
        mov     rbp, [rcx + RUNVM_CTX_RBP]
        mov     r8,  [rcx + RUNVM_CTX_R8]
        mov     r9,  [rcx + RUNVM_CTX_R9]
        mov     r10, [rcx + RUNVM_CTX_R10]
        mov     r11, [rcx + RUNVM_CTX_R11]
        mov     r12, [rcx + RUNVM_CTX_R12]
        mov     r13, [rcx + RUNVM_CTX_R13]
        mov     r14, [rcx + RUNVM_CTX_R14]
        mov     r15, [rcx + RUNVM_CTX_R15]
        mov     rcx, [rcx + RUNVM_CTX_RCX]

        ; Enter the guest. Guest RFLAGS comes from the VMCS, so the CMP
        ; below does not leak into guest state.
        cmp     qword ptr [rsp + RUNVM_SLOT_LAUNCH], 0
        je      do_resume
        vmlaunch
        jmp     entry_failed            ; only reached if VMLAUNCH failed
do_resume:
        vmresume
entry_failed:
        ; Reaching this point means the VM entry did not happen. JMP does
        ; not alter flags, so CF/ZF still describe the failure.
        jc      fail_invalid
        ; anything else is reported as a "valid" failure

fail_valid:
        mov     eax, RUNVM_ERR_VALID
        jmp     restore_host

fail_invalid:
        mov     eax, RUNVM_ERR_INVALID
        jmp     restore_host

vm_exit:
        ; VM exit: RSP was reloaded from the VMCS, all other GPRs still
        ; hold guest values. Park guest RCX, then store everything.
        mov     [rsp + RUNVM_SLOT_GUEST_RCX], rcx
        mov     rcx, [rsp + RUNVM_SLOT_CTX]
        mov     [rcx + RUNVM_CTX_RAX], rax
        mov     [rcx + RUNVM_CTX_RBX], rbx
        mov     [rcx + RUNVM_CTX_RDX], rdx
        mov     [rcx + RUNVM_CTX_RSI], rsi
        mov     [rcx + RUNVM_CTX_RDI], rdi
        mov     [rcx + RUNVM_CTX_RBP], rbp
        mov     [rcx + RUNVM_CTX_R8],  r8
        mov     [rcx + RUNVM_CTX_R9],  r9
        mov     [rcx + RUNVM_CTX_R10], r10
        mov     [rcx + RUNVM_CTX_R11], r11
        mov     [rcx + RUNVM_CTX_R12], r12
        mov     [rcx + RUNVM_CTX_R13], r13
        mov     [rcx + RUNVM_CTX_R14], r14
        mov     [rcx + RUNVM_CTX_R15], r15
        mov     rax, [rsp + RUNVM_SLOT_GUEST_RCX]
        mov     [rcx + RUNVM_CTX_RCX], rax
        xor     eax, eax                ; RUNVM_OK

restore_host:
        ; Put back the host task priority saved at entry.
        mov     rcx, [rsp + RUNVM_SLOT_HOST_CR8]
        mov     cr8, rcx

        ; Canonical epilogue: free locals, pop in reverse order, return.
        add     rsp, RUNVM_FRAME_SIZE
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rsi
        pop     rdi
        pop     rbp
        pop     rbx
        ret
runVm ENDP
After VM-Exit I set IF to 1, call VMCLEAR, VMX_OFF, then terminate the thread (by returning from the thread's main function).
- The IOCTL itself returns successfully if no info is written to the user's buffer. But if I don't set Irp->UserIosb to NULL I get the same page fault. I am pretty sure that if I wait a second the page fault does not occur (probably because the state settles by then). I also tried calling ProbeForRead and MmIsAddressValid on the user buffer address [Irp->UserBuffer] after attaching to the user process, and both indicated the address is valid (ProbeForRead raised no error and MmIsAddressValid returned TRUE) - but actually writing to the buffer caused a page fault because the page wasn't found.
I tried to flush the cache (re-write CR3) and nothing helps. (I disabled SMAP).
I'd really appreciate it if someone could help me understand what the problem could be or how to debug it further. Thanks for reading.