Page fault when writing to user memory after vmlaunch

Hi, I am writing a vmx hypervisor and encountered a pretty strange problem.
I have a user program that invokes an IOCTL to run a VM, passing a user buffer to the IOCTL for the returned info. In the IOCTL handler I create another thread that saves the register and non-register state, executes VMLAUNCH, and on the guest's first I/O operation traps and exits the VM. After the VM exit I restore the state I saved and then let the thread exit (terminating it in the process). The main IOCTL thread waits for the VMLAUNCH thread to terminate and then tries to write information to the user buffer with IoCompleteRequest - but instead it takes a page fault on non-mapped memory.

For a more detailed explanation:

  1. I save the state using the VMCS:
static BOOLEAN InitHostVMCS() {
    DESCRIPTOR_TABLE_REGISTER gdtr, idtr;
    GetGdtr(&gdtr);
    GetIdtr(&idtr);
    TSS64_DESCRIPTOR* trDesc = (TSS64_DESCRIPTOR*)(gdtr.Base + readReg(REG_TR) & (~0b111));

    // Reconstruct base from descriptor fields
    UINT64 trBase = ((UINT64)trDesc->BaseLow) | ((UINT64)trDesc->BaseMiddle << 16) | ((UINT64)trDesc->BaseHigh << 24) | ((UINT64)trDesc->BaseUpper << 32);
    UINT8 type = trDesc->Flags1 & 0x0F;
    if (type != 0x9 && type != 0xB) {
        DebugLog("[!] TSS Section type is incorrect.\n");
        return FALSE;
    }

    PVOID vcpuStack = ExAllocatePool2(POOL_FLAG_NON_PAGED, PAGE_SIZE*1000, ALLOCATION_TAG); // TODO: Don't use magic number
    if (vcpuStack == NULL) {
        DebugLog("[!] Failed to allocatte VCPU stack!\n");
        return FALSE;
    }

    // VMCS Host Registers
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_CR0, readReg(REG_CR0)), "VMCS_HOST_CR0", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_CR3, readReg(REG_CR3)), "VMCS_HOST_CR3", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_CR4, readReg(REG_CR4)), "VMCS_HOST_CR4", TRUE)) return FALSE;

    // VMCS Host Segment Selectors
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_CS_SELECTOR, readReg(REG_CS)), "VMCS_CS", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_SS_SELECTOR, readReg(REG_SS)), "VMCS_SS", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_DS_SELECTOR, 0), "VMCS_DS", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_ES_SELECTOR, 0), "VMCS_ES", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_FS_SELECTOR, 0), "VMCS_FS", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_GS_SELECTOR, 0), "VMCS_GS", TRUE)) return FALSE;

    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_TR_SELECTOR, readReg(REG_TR)), "VMCS_TR", TRUE)) return FALSE;

    // VMCS Host Segment Bases
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_FS_BASE, __readmsr(IA32_FS_BASE)), "VMCS_FS_BASE", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_GS_BASE, __readmsr(IA32_GS_BASE)), "VMCS_GS_BASE", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_TR_BASE, trBase), "VMCS_TR_BASE", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_GDTR_BASE, gdtr.Base), "VMCS_GDTR", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_IDTR_BASE, idtr.Base), "VMCS_IDTR", TRUE)) return FALSE;

    // VMCS Host MSRs
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_IA32_SYSENTER_CS, __readmsr(IA32_SYSENTER_CS)), "VMCS_HOST_SYSENTER_CS", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_IA32_SYSENTER_ESP, __readmsr(IA32_SYSENTER_ESP)), "VMCS_HOST_SYSENTER_ESP", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_IA32_SYSENTER_EIP, __readmsr(IA32_SYSENTER_EIP)), "VMCS_HOST_SYSENTER_EIP", TRUE)) return FALSE;
    if (!vmxSucceeded(__vmx_vmwrite(VMCS_HOST_IA32_EFER, __readmsr(IA32_EFER)), "IA32_EFER", TRUE)) return FALSE;
    return TRUE;
}
  1. When actually launching the VM I also save the registers themselves:
; ---------------------------------------------------------------------------
; runVm - load the guest general-purpose registers, enter the guest with
;         VMLAUNCH or VMRESUME, and return after the next VM exit (or
;         immediately if a VMX instruction fails).
;
; Syntax: MASM (ml64).   ABI: Microsoft x64.
;
; In:   RCX = pointer to the guest register block, 8 bytes per register:
;             +0 RAX, +8 RBX, +16 RCX, +24 RDX, +32 RSI, +40 RDI, +48 RBP,
;             +56 R8, +64 R9, +72 R10, +80 R11, +88 R12, +96 R13,
;             +104 R14, +112 R15
;       RDX = entry selector: 0 -> VMRESUME, non-zero -> VMLAUNCH
; Out:  RAX = RUNVM_OK (0)           a VM exit occurred; the register block
;                                    now holds the guest registers at exit
;             RUNVM_FAIL_VALID (1)   a VMX instruction failed with ZF=1
;             RUNVM_FAIL_INVALID (2) a VMX instruction failed with CF=1
;       On failure the guest register block is left untouched.
;
; Stack frame below the saved registers (RSP is 16-byte aligned here):
;       [rsp +  0] RDX argument (entry selector)
;       [rsp +  8] RCX argument (guest register block)
;       [rsp + 16] guest RCX, parked while RCX addresses the register block
;       [rsp + 24] unused (keeps the frame at 32 bytes)
;
; NOTE(review): after a VM exit the CPU loads RFLAGS with only bit 1 set, so
; the success path returns with interrupts disabled; the caller has to
; re-enable them.
; NOTE(review): XMM6-XMM15 and the x87/SSE state are not saved here, so
; whatever the guest leaves in them is visible to the caller - confirm that
; this is handled elsewhere.
; ---------------------------------------------------------------------------

RUNVM_VMCS_HOST_RSP     EQU 6C14h       ; VMCS field encoding: host RSP
RUNVM_VMCS_HOST_RIP     EQU 6C16h       ; VMCS field encoding: host RIP

RUNVM_OK                EQU 0
RUNVM_FAIL_VALID        EQU 1
RUNVM_FAIL_INVALID      EQU 2

RUNVM_SLOT_SELECTOR     EQU 0
RUNVM_SLOT_CONTEXT      EQU 8
RUNVM_SLOT_GUEST_RCX    EQU 16
RUNVM_LOCALS            EQU 32

runVm PROC
        sub     rsp, 8                          ; entry RSP is 8 mod 16; realign

        ; Callee-saved registers: all of them are replaced by guest values.
        push    rbx
        push    rbp
        push    rdi
        push    rsi
        push    r12
        push    r13
        push    r14
        push    r15

        ; Keep the caller's CR8 so it can be written back on the way out.
        mov     rbx, cr8
        push    rbx

        sub     rsp, 8 + RUNVM_LOCALS           ; 8 bytes padding + local slots

        mov     [rsp + RUNVM_SLOT_CONTEXT], rcx
        mov     [rsp + RUNVM_SLOT_SELECTOR], rdx

        ; Host RSP = this frame, so the slots above are addressable again
        ; right after the VM exit.
        mov     rax, rsp
        mov     rdx, RUNVM_VMCS_HOST_RSP
        vmwrite rdx, rax
        jc      fail_invalid                    ; CF=1 -> no valid current VMCS
        jz      fail_valid                      ; ZF=1 -> error with valid VMCS

        ; Host RIP = guest_exited: a VM exit resumes execution there.
        lea     rax, guest_exited
        mov     rdx, RUNVM_VMCS_HOST_RIP
        vmwrite rdx, rax
        jc      fail_invalid
        jz      fail_valid

        ; Load the guest registers; RCX goes last because it is the pointer.
        mov     rax, [rcx]
        mov     rbx, [rcx + 8]
        mov     rdx, [rcx + 24]
        mov     rsi, [rcx + 32]
        mov     rdi, [rcx + 40]
        mov     rbp, [rcx + 48]
        mov     r8,  [rcx + 56]
        mov     r9,  [rcx + 64]
        mov     r10, [rcx + 72]
        mov     r11, [rcx + 80]
        mov     r12, [rcx + 88]
        mov     r13, [rcx + 96]
        mov     r14, [rcx + 104]
        mov     r15, [rcx + 112]
        mov     rcx, [rcx + 16]

        ; Enter the guest. Neither instruction returns here on success.
        cmp     qword ptr [rsp + RUNVM_SLOT_SELECTOR], 0
        je      resume_guest

        vmlaunch
        jmp     entry_failed                    ; never fall into VMRESUME

resume_guest:
        vmresume

entry_failed:
        ; Only reached when VMLAUNCH/VMRESUME failed; JMP left the flags intact.
        jc      fail_invalid
        ; Anything else is reported as a failure with a valid VMCS.

fail_valid:
        mov     eax, RUNVM_FAIL_VALID
        jmp     restore_host

fail_invalid:
        mov     eax, RUNVM_FAIL_INVALID
        jmp     restore_host

guest_exited:
        ; VM exit: RSP is the frame written to the VMCS, every other
        ; general-purpose register still holds the guest's value.
        mov     [rsp + RUNVM_SLOT_GUEST_RCX], rcx
        mov     rcx, [rsp + RUNVM_SLOT_CONTEXT]

        mov     [rcx], rax
        mov     [rcx + 8], rbx
        mov     [rcx + 24], rdx
        mov     [rcx + 32], rsi
        mov     [rcx + 40], rdi
        mov     [rcx + 48], rbp
        mov     [rcx + 56], r8
        mov     [rcx + 64], r9
        mov     [rcx + 72], r10
        mov     [rcx + 80], r11
        mov     [rcx + 88], r12
        mov     [rcx + 96], r13
        mov     [rcx + 104], r14
        mov     [rcx + 112], r15

        mov     rax, [rsp + RUNVM_SLOT_GUEST_RCX]
        mov     [rcx + 16], rax                 ; guest RCX

        xor     eax, eax                        ; RUNVM_OK

restore_host:
        ; RAX holds the return value from here on.
        add     rsp, 8 + RUNVM_LOCALS

        pop     rbx
        mov     cr8, rbx                        ; caller's CR8

        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rsi
        pop     rdi
        pop     rbp
        pop     rbx

        add     rsp, 8                          ; undo the entry realignment
        ret
runVm ENDP

After VM-Exit I set IF to 1, call VMCLEAR, VMX_OFF, then terminate the thread (by returning from the thread's main function).

  1. The IOCTL itself returns successfully if no info is written to the user's buffer, but if I don't set Irp->UserIosb to NULL I get the same page fault. I am pretty sure that if I wait a second the page fault does not occur (probably because the state settles by then). I also tried calling ProbeForRead and MmIsAddressValid on the user buffer address [Irp->UserBuffer] after attaching to the user process, and both said the address is valid (ProbeForRead raised no exception and MmIsAddressValid returned TRUE) - but actually writing to the buffer caused a page fault because the page wasn't found.
    I tried flushing the TLB (rewriting CR3) and it didn't help. (I have SMAP disabled.)

I'd really appreciate it if someone could help me understand what the problem could be or how to debug it further. Thanks for reading. :slight_smile: