win10: Get virtual address of contiguous buffer allocated in kernel driver (KMDF 1.9)

  1. Hello,

Long time ago I was advised here how to get the virtual address of a buffer allocated in a kernel driver.

The 32MB buffer is allocated successfully with the following code:

NTSTATUS AllocateContinuousPhysicalMemory(IN PDEVICE_EXTENSION devExt, int Channel, int BufferSize)
{
WDF_DMA_ENABLER_CONFIG dmaConfig;
NTSTATUS status;

//
// Configure the DMA  object
//

WDF_DMA_ENABLER_CONFIG_INIT(&dmaConfig,
	WdfDmaProfilePacket,
	COMMON_BUFFER_SIZE);

status = WdfDmaEnablerCreate(devExt->Device,
	&dmaConfig,
	WDF_NO_OBJECT_ATTRIBUTES,
	&devExt->DmaEnabler[Channel]);
if (!NT_SUCCESS(status))
{
	KdPrint(("WdfDmaEnblerCreate failed: %08X\n", status));
	return status;
}

status = WdfCommonBufferCreate(devExt->DmaEnabler[Channel],
	COMMON_BUFFER_SIZE,
	WDF_NO_OBJECT_ATTRIBUTES,
	&devExt->CommonBuffer[Channel]);

if (!NT_SUCCESS(status))
{
	KdPrint(("WdfCommonBufferCreate failed: %08X\n", status));
	return status;
}

devExt->KernelCommonBuffer[Channel] = WdfCommonBufferGetAlignedVirtualAddress(devExt->CommonBuffer[Channel]);
devExt->PhysicalKernelCommonBuffer[Channel] = WdfCommonBufferGetAlignedLogicalAddress(devExt->CommonBuffer[Channel]);

RtlFillMemory(devExt->KernelCommonBuffer[Channel], BufferSize, 0x0);

devExt->CommonBufferMdl[Channel] = IoAllocateMdl(devExt->KernelCommonBuffer[Channel], BufferSize, FALSE, FALSE, NULL);

if (!devExt->CommonBufferMdl)
{
	KdPrint(("IoAllocateMdl failed.\n"));
	return STATUS_INSUFFICIENT_RESOURCES;
}

MmBuildMdlForNonPagedPool(devExt->CommonBufferMdl[Channel]);

KdPrint(("Channel %d: PhysicalDataAddressLow=%08x\n", Channel, devExt->PhysicalKernelCommonBuffer[Channel].LowPart));
KdPrint(("Channel %d: PhysicalDataAddressHigh=%08x\n", Channel, devExt->PhysicalKernelCommonBuffer[Channel].HighPart));

return STATUS_SUCCESS;

}

I get the virtual address of the buffer with the following code:


VOID DeviceEvtIoInCallerContext(__in WDFDEVICE Device,
__in WDFREQUEST Request)
/*++
Routine Description:

Responds to EvtIoInCallerContext events from KMDF
It calls different functions to process different type of IOCTLs.

Arguments:

Device - handle to a WDF Device object

Request - handle to the incoming WDF Request object

Return Value:

VOID.

--*/

{
NTSTATUS status = STATUS_SUCCESS;
PDEVICE_EXTENSION devExt = NULL;
WDF_REQUEST_PARAMETERS requestParameters;
BOOLEAN processed = FALSE;

WdfRequestGetParameters(Request, &requestParameters);

devExt = PLxGetDeviceContext(Device);

// get the request parameters
WDF_REQUEST_PARAMETERS_INIT(&requestParameters);
WdfRequestGetParameters(Request, &requestParameters);

if (requestParameters.Type == WdfRequestTypeDeviceControl)
{
	// 1. Requests that should be processed only in the context of the app's process
	if (!processed)
	{
		processed = RequestDispatchToSequentialQueue(Device, Request, requestParameters);
	}
	if (!processed)
	{
		status = WdfDeviceEnqueueRequest(Device, Request);
		if (!NT_SUCCESS(status))
		{
			KdPrint(("WdfDeviceEnqueueRequest failed\n"));
		}
	}
}

}

BOOLEAN
RequestDispatchToSequentialQueue(
__in WDFDEVICE Device,
__in WDFREQUEST Request,
__in WDF_REQUEST_PARAMETERS RequestParameters
)
/*++
Routine Description:

These requests can be processed in a non-serialized manner, most of them don’t need to access device.

Arguments:

Device - handle to a WDF Device object

Request - handle to the incoming WDF Request object

RequestParameters - request parameters

Return Value:

BOOLEAN - TRUE (request processed); FALSE (request is not processed in this function).

–*/
{
NTSTATUS status = STATUS_SUCCESS;
PDEVICE_EXTENSION devExt = NULL;

ALLOCATE_COMMON_BUFFER		*pAllocateRequest;
ALLOCATE_COMMON_BUFFER_REPLY	*pAllocateReply;

size_t        bytesReturned = 0;

void *pInBuffer;
void *pOutBuffer;
int Channel;
size_t Length;

//-->Legacy
MG_SETTING *pMgSettingsRequest;
MG_DESCRIPTOR *pMgDescriptorReply;
HANDLE EventHandle[N_CHANNELS];
MG_BOARD_STAT *pMgBoardStat;
//<--Legacy

ULONG	IoControlCode = RequestParameters.Parameters.DeviceIoControl.IoControlCode;
devExt = PLxGetDeviceContext(Device);

KdPrint(("320: IoControlCode=0x%x\n", IoControlCode));
switch (IoControlCode)
{
case ALLOCATE_COMMON_BUFFER_CODE:
	KdPrint(("PsGetCurrentProcessId=%x\n", PsGetCurrentProcessId()));

	status = WdfRequestRetrieveInputBuffer(Request, sizeof(ALLOCATE_COMMON_BUFFER),
		&pInBuffer, &Length);
	pAllocateRequest = (ALLOCATE_COMMON_BUFFER *)pInBuffer;
	Channel = pAllocateRequest->Channel;

	status = WdfRequestRetrieveOutputBuffer(Request, sizeof(READ_WORD32_REPLY),
		&pOutBuffer, &Length);

	pAllocateReply = (ALLOCATE_COMMON_BUFFER_REPLY *)pOutBuffer;

	bytesReturned = sizeof(ALLOCATE_COMMON_BUFFER_REPLY);

	//Map common buffer to user space. This can be done only upon IOCTL request 
	__try
	{
		devExt->UserSpaceCommonBuffer[Channel] = MmMapLockedPagesSpecifyCache(devExt->CommonBufferMdl[Channel],
			UserMode,
			MmCached,
			NULL,
			FALSE,
			NormalPagePriority);

		if (!devExt->UserSpaceCommonBuffer)
		{
			KdPrint(("MmMapLockedPagesSpecifyCache failed.\n"));
		}
	}
	__except (EXCEPTION_EXECUTE_HANDLER)
	{
		devExt->UserSpaceCommonBuffer[Channel] = NULL;
		KdPrint(("MmMapLockedPagesSpecifyCache caused exception: %x\n", GetExceptionCode()));
	}

	pAllocateReply->VirtualAddress = (UINT64)devExt->UserSpaceCommonBuffer[Channel];

	WdfRequestCompleteWithInformation(Request, STATUS_SUCCESS, bytesReturned);
	break;
  }
  return TRUE;

}

**Problem **: under win10 (or server 2016) , Sometimes, after sending the IOCTL to get back the virtual address, the application hangs.
Same code after PC restart works well, and after few times the application runs, it hangs again.

On another PC running windows 7 embedded, the code runs OK without any hang.

Of course, for win10, the driver code was compiled for win10.

Thank you,
Zvika

Why do you call this RequestDispatchToSequentialQueue? There is no queueing involved here at all.

You don’t do any error checking at all here. Your WdfRequestRetrieveOutputBuffer call is asking for sizeof(READ_WORD32_REPLY) instead of sizeof(ALLOCATE_COMMON_BUFFER_REPLY). If the sizes don’t match, you’d get an error that you aren’t checking for. If the MmMapLockedPagesSpecifyCache call fails, you issue a message, but you just keep on with the processing without returning an error. And if you return null, you complete the request as though it was all a complete success. If the app doesn’t check for a 0, that would crash. Do some error handling, please.

You should get rid of remnants from the sample that you don’t need. All of that “Legacy” stuff is nonsense for you.

@zvivered said:
… snip …

**Problem **: under win10 (or server 2016) , Sometimes, after sending the IOCTL to get back the virtual address, the application hangs.
Same code after PC restart works well, and after few times the application runs, it hangs again.

On another PC running windows 7 embedded, the code runs OK without any hang.

Of course, for win10, the driver code was compiled for win10.

Thank you,
Zvika

32MB is a pretty large chunk to be getting in a contiguous region, and the OS may be having some problems satisfying that request (especially after it’s been running for awhile and memory has gotten fragmented) … and as @Tim mentioned, if the MmMapLockedPagesSpecifyCache call fails due to that you aren’t handling it and aren’t going to have good results. Win10 has tightened things up in a variety of areas over Win7, I would not be surprised if this is not one of them …

In your shoes I would be doing three things:

  • Run this driver under the KMDF verifier, checking all of the boxes, with a kernel mode debugger attached to it. Verifier is really your friend, it will catch a plethora of problems and should be running all of the time
  • Install the driver as initially disabled, so that you can enable/ disable it with Device Manager. Enable the driver right after the machine has started a few times and see what happens, then enable the driver after the machine has been running for awhile (say 30 min) a few times and see what happens … this will help see if the problem is MmMapLocked… failing from sandbars
  • Reduce the size of the continguous area you’re trying to map to something like 4MB or 8MB and redo the experiments

As an aside, there are very few reasons to get that large of a contiguous region in any case; definitely not for something being passed to usermode. You can easily get an allocation in kernel space of that size, take the MDL from that allocation and convert it into a process space buffer to pass back to the usermode. Usermode will still see it as a big 32MB chunk of memory, you can still access it from kernel mode and all will be well …

What specific problem are you solving that your solution is to allocate a 32MB buffer in kernel space that is being passed to usermode?

Hi Tim,

Regarding WdfRequestRetrieveOutputBuffer, very basic error. Sorry abut this.

Regarding RequestDispatchToSequentialQueue: For some reason I thought ALLOCATE_COMMON_BUFFER_CODE (used to get the virtual address of the contiguous bufffer) should be handled in a “different” IOCTL handler compared to the regular one that handles RD\WR to hardware (for example).

I will collect the error I got and update soon.

Thank you very much !
Zvika

You ARE handling that differently (as you need to do), in that you’re handling it in the “InCallerContext” callback. However, you’re not dispatching it to a serial queue. You’re just handling the ioctl and completing it, in the InCallerContext handler. I’m saying the handling appears approximately correct, but the routine is named deceptively. There is no sequential queue, and it’s not being dispatched – it’s being handled.

@craig_howard said:

@zvivered said:
… snip …
As an aside, there are very few reasons to get that large of a contiguous region in any case; definitely not for something being passed to usermode. You can easily get an allocation in kernel space of that size, take the MDL from that allocation and convert it into a process space buffer to pass back to the usermode. Usermode will still see it as a big 32MB chunk of memory, you can still access it from kernel mode and all will be well …

I second that

What specific problem are you solving that your solution is to allocate a 32MB buffer in kernel space that is being passed to usermode?

While I am not the one who asked this question, but I am the one who can tell about the solution one may need to solve for this case.
Imagine a radar controlled by FPGA and all radio layer is handled there. FPGA then exposes 32 to 256 MB of its memory to a user mode DLL which then processes these acoustic data presented in FPGA buffer either on GPU of an another accelerator silicon. There may be other approaches but given the fixed limited purpose hw the options are a few.

Sergey

@rusakov2 said:

@craig_howard said:

@zvivered said:
… snip …
As an aside, there are very few reasons to get that large of a contiguous region in any case; definitely not for something being passed to usermode. You can easily get an allocation in kernel space of that size, take the MDL from that allocation and convert it into a process space buffer to pass back to the usermode. Usermode will still see it as a big 32MB chunk of memory, you can still access it from kernel mode and all will be well …

I second that

What specific problem are you solving that your solution is to allocate a 32MB buffer in kernel space that is being passed to usermode?

While I am not the one who asked this question, but I am the one who can tell about the solution one may need to solve for this case.
Imagine a radar controlled by FPGA and all radio layer is handled there. FPGA then exposes 32 to 256 MB of its memory to a user mode DLL which then processes these acoustic data presented in FPGA buffer either on GPU of an another accelerator silicon. There may be other approaches but given the fixed limited purpose hw the options are a few.

Sergey

Correct; MmMapIoSpace will attempt to find a 32 to 256MB contiguous chunk to map the FPGA memory into system space … and fail if it can’t. This is something that can’t be fixed with allocating and mapping, you either have the contiguous xMB free or you don’t …

And, as noted in the threads, doing this introduces all kinds of potential for security and synchronization drama …

Hi Tim,

I changed to sizeof(ALLOCATE_COMMON_BUFFER_REPLY) and handled all IOCTL requests in RequestDispatchToSequentialQueue.
Now it works OK.

Thank you very much,
Zvika

1 Like

@craig_howard said:

What specific problem are you solving that your solution is to allocate a 32MB buffer in kernel space that is being passed to usermode?

@rusakov2 said:
While I am not the one who asked this question, but I am the one who can tell about the solution one may need to solve for this case.
Imagine a radar controlled by FPGA and all radio layer is handled there. FPGA then exposes 32 to 256 MB of its memory to a user mode DLL which then processes these acoustic data presented in FPGA buffer either on GPU of an another accelerator silicon. There may be other approaches but given the fixed limited purpose hw the options are a few.

Correct; MmMapIoSpace will attempt to find a 32 to 256MB contiguous chunk to map the FPGA memory into system space … and fail if it can’t. This is something that can’t be fixed with allocating and mapping, you either have the contiguous xMB free or you don’t .

With hw be healthy it always does, there are rare cases where FPGA needs to expose 2 to 6 GB of its RAM to Windows 10, then there are other challenges to address to force mapping to areas above 4 GB.
Real challenge comes when part of FPGA memory temporarily overheats, and is reset, while FPGA continues to run and MmapIoSpace isn’t aware of memory issue, we deal with those corner cases :frowning:

And, as noted in the threads, doing this introduces all kinds of potential for security and synchronization drama …

Those systems aren’t Windows 10 friendly by design, and either go with compromise to force fit into Windows 10 realm or go with another OS…