Is this a bug that occurs in the Windows operating system kernel?

[TEST OS VERSION]: Windows 11 Pro / 23H2

[How it happens]: When dynamically allocating memory in the CmRegisterCallbackEx handler and attempting to free it in a different thread.

[Problem]: When attempting to free data allocated from NonPagedPool within the handler using a different thread created by the PsCreateSystemThread function, the NonPagedPool size increases by 20MB instead of decreasing.

[Attempt 1]: When freeing the allocated NonPagedPool data before returning from the handler, there is no problem.

After loading the problematic kernel driver into the operating system and right-clicking about 20 times, the NonPagedPool size quickly balloons to over 1GB.

[PROCESSING]:

  1. Call to Registry Handler
  2. Allocate the NonPagedPool by ExAllocatePoolWithTag()
  3. Call PsCreateSystemThread() and create a new thread
  4. Return from the Registry Handler
  5. Attempt to free the allocated NonPagedPool in the thread from step 3 using ExFreePoolWithTag()
  6. Pool size increases instead

(In the actual handler implemented, the maximum allocation size is approximately 312 bytes, used for string data.)

Video Reference

[SOLUTION]:
I solved it using a WORK ITEM + Memory Mapping I/O

But why is this happening?

Since the registry callback handler runs at PASSIVE_LEVEL, it didn’t even occur to me to use a WORK_ITEM. Why is this happening?

Without seeing the code it's hard to say what is going on but as a first step you should run the code with DriverVerifier enabled and monitor it with WinDbg. This should rule out common mistakes with the allocation.

[Completely solved]:
[My approach, step by step]

  1. A handler is called (CmRegisterCallbackEx, minifilter, NDIS (any of the callouts), and so on).
  2. Allocate the memory (in my case with ExAllocatePoolWithTag, from NonPagedPool (IRQL <= DISPATCH_LEVEL) or from PagedPool (IRQL == PASSIVE_LEVEL)).
  3. Start the mapping.
  4. If the memory was allocated from NonPagedPool: allocated memory (kernel virtual address) -> call MmGetPhysicalAddress() -> call MmMapIoSpace().
  5. If the memory was allocated from PagedPool: allocated memory (kernel virtual address) -> call IoAllocateMdl() -> call MmProbeAndLockPages() ('IoWriteAccess' recommended) -> call MmGetPhysicalAddress() -> call MmMapIoSpace().
  6. Initialize the KEVENT.
  7. Pass the mapped address from (4) or (5) to another thread and start (create) that thread with PsCreateSystemThread().
  8. Wait on the KEVENT.
  9. In the thread from (7), copy the data from the mapped address.
  10. Signal (release) the KEVENT from the thread in (7).
  11. Free all allocated memory.




[SIMPLE_CODE]

#include "notifyroutine.h"

/* Suppressed MSVC warnings for this sample:
 *   C4189 - local variable is initialized but not referenced
 *   C4996 - use of a declaration marked deprecated
 *   C4100 - unreferenced formal parameter
 */
#pragma warning(disable:4189)
#pragma warning(disable:4996)
#pragma warning(disable:4100)
/* Receives the registration cookie written by CmRegisterCallbackEx in
 * Registry_Event_Loader. Presumably passed to CmUnRegisterCallback when the
 * driver unloads -- the unload path is not shown here; confirm. */
LARGE_INTEGER Cookie_for_unload = { 0, };
/*
 * Registers ExRegistryCallback_for_monitor as a registry callback.
 *
 * Input_Driver_Obj - driver object handed to CmRegisterCallbackEx as the
 *                    owning driver.
 *
 * Returns the NTSTATUS produced by CmRegisterCallbackEx. The registration
 * cookie is written to the global Cookie_for_unload.
 */
NTSTATUS Registry_Event_Loader(PDRIVER_OBJECT Input_Driver_Obj) {
	NTSTATUS registrationStatus;
	UNICODE_STRING callbackAltitude;

	/* Altitude string under which the callback is registered. */
	RtlInitUnicodeString(&callbackAltitude, L"390596");

	registrationStatus = CmRegisterCallbackEx(
		ExRegistryCallback_for_monitor,
		&callbackAltitude,
		Input_Driver_Obj,
		NULL,                 /* no callback context */
		&Cookie_for_unload,
		NULL);                /* reserved argument */

	return registrationStatus;
}

/*
 * Per-invocation context shared between the registry callback and the
 * worker thread it creates (the `test` routine).
 */
typedef struct data {
    KEVENT event;                   /* signalled by the worker thread; the callback waits on it */
    PHYSICAL_ADDRESS data_phy_data; /* physical address of `data`, from MmGetPhysicalAddress */
    PUCHAR data;                    /* pool buffer allocated by the callback */
    PVOID mappedAddress; // address returned by MmMapIoSpace for the buffer
    SIZE_T mappingSize;   // size in bytes of the mapping (passed to MmMapIoSpace / MmUnmapIoSpace)
} data, * Pdata;

/*
 * Worker-thread routine started by the registry callback.
 *
 * Copies 999 bytes from data->mappedAddress (when it is non-NULL) into a
 * temporary NonPagedPool buffer, releases that buffer, and finally signals
 * data->event so the waiting callback can continue.
 *
 * data - shared context; it is not touched after the event is signalled.
 */
VOID test(Pdata data) {
    /* Scratch copy buffer taken from NonPagedPool. */
    PUCHAR scratch = ExAllocatePoolWithTag(NonPagedPool, 999, 'REG3');

    if (scratch != NULL) {
        if (data->mappedAddress != NULL) {
            RtlCopyMemory(scratch, data->mappedAddress, 999);
        }

        /* ... use scratch here ... */

        ExFreePoolWithTag(scratch, 'REG3');
    }

    /* Signal the event even when the allocation failed, so the waiter is always released. */
    KeSetEvent(&data->event, IO_NO_INCREMENT, FALSE);
}

/*
 * Registry callback (registered through CmRegisterCallbackEx).
 *
 * For every notification it:
 *   1. allocates a shared context and a 999-byte NonPagedPool buffer,
 *   2. maps the buffer's physical address with MmMapIoSpace,
 *   3. starts a system thread running `test` with the context,
 *   4. waits until that thread signals the context's event,
 *   5. unmaps and frees everything.
 *
 * CallbackContext, Argument1, Argument2 - unused in this sample.
 *
 * Returns STATUS_SUCCESS, STATUS_INSUFFICIENT_RESOURCES when an allocation
 * or the mapping fails, or the failure status of PsCreateSystemThread.
 */
NTSTATUS ExRegistryCallback_for_monitor(PVOID CallbackContext, PVOID Argument1, PVOID Argument2) {
    UNREFERENCED_PARAMETER(CallbackContext);
    UNREFERENCED_PARAMETER(Argument1);
    UNREFERENCED_PARAMETER(Argument2);

    Pdata alloc = NULL;
    PUCHAR DATA = NULL;
    HANDLE handle = NULL;
    OBJECT_ATTRIBUTES threadAttributes;
    NTSTATUS status = STATUS_SUCCESS;

    alloc = ExAllocatePoolWithTag(NonPagedPool, sizeof(data), 'REG');
    if (!alloc) {
        status = STATUS_INSUFFICIENT_RESOURCES;
        goto Exit;
    }
    /* Pool memory is not zeroed. Clear the context so the cleanup path never
     * sees a garbage mappedAddress when a later step fails. */
    RtlZeroMemory(alloc, sizeof(*alloc));

    DATA = ExAllocatePoolWithTag(NonPagedPool, 999, 'REG2');
    if (!DATA) {
        status = STATUS_INSUFFICIENT_RESOURCES;
        goto Exit;
    }
    /* The worker copies this buffer; do not hand it stale pool contents. */
    RtlZeroMemory(DATA, 999);

    alloc->data_phy_data = MmGetPhysicalAddress(DATA);
    alloc->data = DATA;
    alloc->mappingSize = 999;

    /* Map the physical address range into virtual address space.
     * NOTE(review): this only creates a second (non-cached) alias of pool
     * memory that is already mapped; the worker could read alloc->data
     * directly. It also assumes the 999 bytes are physically contiguous,
     * which a pool allocation that crosses a page boundary does not
     * guarantee -- consider removing the mapping. */
    alloc->mappedAddress = MmMapIoSpace(alloc->data_phy_data, alloc->mappingSize, MmNonCached);
    if (!alloc->mappedAddress) {
        status = STATUS_INSUFFICIENT_RESOURCES;
        goto Exit;
    }

    KeInitializeEvent(&alloc->event, SynchronizationEvent, FALSE);

    /* Registry callbacks run in the context of the thread performing the
     * registry operation, so the thread handle must be a kernel handle;
     * otherwise it is created in that process's handle table. */
    InitializeObjectAttributes(&threadAttributes, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);

    status = PsCreateSystemThread(&handle, THREAD_ALL_ACCESS, &threadAttributes, NULL, NULL, (PKSTART_ROUTINE)test, alloc);
    if (!NT_SUCCESS(status)) {
        goto Exit;
    }
    ZwClose(handle);

    /* Wait until the worker signals that it is done with the buffer. */
    KeWaitForSingleObject(&alloc->event, Executive, KernelMode, FALSE, NULL);

Exit:
    /* Release resources. */
    if (alloc && alloc->mappedAddress) {
        MmUnmapIoSpace(alloc->mappedAddress, alloc->mappingSize);
    }
    if (DATA) {
        ExFreePoolWithTag(DATA, 'REG2');
    }
    if (alloc) {
        ExFreePoolWithTag(alloc, 'REG');
    }

    return status;
}






This is a method for transferring dynamically allocated data from the callback function to external threads, which uses the "physical memory mapping" function to solve the exponential growth problem of anomalous "NonPagedPools".

I think you are not telling the full story.

I guess the first implementation was asynchronous: it didn't wait for the data to be processed in a thread and returned from the callback immediately. This might have created hundreds or thousands of kernel-mode threads, overwhelming the system with allocations from the callback and with the structures needed to manage the new threads (ETHREAD etc.). The system might have struggled to schedule and complete all these threads, or there was a bug that prevented the threads from progressing.

The second implementation is synchronous, it slows down a thread in which context the callback is called, waiting for data being processed in a new thread and the allocated memory is released.

MmGetPhysicalAddress and MmMapIoSpace are redundant and can be removed, the allocated memory can be accessed by alloc->data. MmMapIoSpace just creates a second virtual address mapping (i.e. an alias) for pages which are resident and already mapped.