* [PATCH 1/3] drm/ttm: use apply_page_range instead of vmf_insert_pfn_prot
[not found] <20250820143739.3422-1-christian.koenig@amd.com>
@ 2025-08-20 14:33 ` Christian König
2025-08-20 14:33 ` [PATCH 2/3] drm/ttm: reapply "increase ttm pre-fault value to PMD size" Christian König
` (2 subsequent siblings)
3 siblings, 0 replies; 28+ messages in thread
From: Christian König @ 2025-08-20 14:33 UTC (permalink / raw)
To: intel-xe, intel-gfx, dri-devel, amd-gfx, x86
Cc: airlied, thomas.hellstrom, matthew.brost, david, dave.hansen,
luto, peterz
Thomas pointed out that i915 is using apply_page_range instead of
vmf_insert_pfn_prot to circumvent the PAT lookup and generally speed up
the page fault handling.
I thought I'd give it a try and measure how much this can improve
things, and it turned out that mapping a 1GiB buffer is now more than 4x
faster than before.
Signed-off-by: Christian König <christian.koenig@amd.com>
---
drivers/gpu/drm/ttm/ttm_bo_vm.c | 130 ++++++++++++++++----------------
1 file changed, 64 insertions(+), 66 deletions(-)
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index a194db83421d..93764b166678 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -160,6 +160,38 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
}
EXPORT_SYMBOL(ttm_bo_vm_reserve);
+/* State bag for calls to ttm_bo_vm_apply_cb */
+struct ttm_bo_vm_bag {
+ struct mm_struct *mm;
+ struct ttm_buffer_object *bo;
+ struct ttm_tt *ttm;
+ unsigned long page_offset;
+ pgprot_t prot;
+};
+
+/* Callback to fill in a specific PTE */
+static int ttm_bo_vm_apply_cb(pte_t *pte, unsigned long addr, void *data)
+{
+ struct ttm_bo_vm_bag *bag = data;
+ struct ttm_buffer_object *bo = bag->bo;
+ unsigned long pfn;
+
+ if (bo->resource->bus.is_iomem) {
+ pfn = ttm_bo_io_mem_pfn(bo, bag->page_offset);
+ } else {
+ struct page *page = bag->ttm->pages[bag->page_offset];
+
+ if (unlikely(!page))
+ return -ENOMEM;
+ pfn = page_to_pfn(page);
+ }
+
+ /* Special PTEs are not associated with any struct page */
+ set_pte_at(bag->mm, addr, pte, pte_mkspecial(pfn_pte(pfn, bag->prot)));
+ bag->page_offset++;
+ return 0;
+}
+
/**
* ttm_bo_vm_fault_reserved - TTM fault helper
* @vmf: The struct vm_fault given as argument to the fault callback
@@ -183,101 +215,67 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
pgoff_t num_prefault)
{
struct vm_area_struct *vma = vmf->vma;
- struct ttm_buffer_object *bo = vma->vm_private_data;
- struct ttm_device *bdev = bo->bdev;
- unsigned long page_offset;
- unsigned long page_last;
- unsigned long pfn;
- struct ttm_tt *ttm = NULL;
- struct page *page;
+ struct ttm_bo_vm_bag bag = {
+ .mm = vma->vm_mm,
+ .bo = vma->vm_private_data
+ };
+ unsigned long size;
+ vm_fault_t ret;
int err;
- pgoff_t i;
- vm_fault_t ret = VM_FAULT_NOPAGE;
- unsigned long address = vmf->address;
/*
* Wait for buffer data in transit, due to a pipelined
* move.
*/
- ret = ttm_bo_vm_fault_idle(bo, vmf);
+ ret = ttm_bo_vm_fault_idle(bag.bo, vmf);
if (unlikely(ret != 0))
return ret;
- err = ttm_mem_io_reserve(bdev, bo->resource);
+ err = ttm_mem_io_reserve(bag.bo->bdev, bag.bo->resource);
if (unlikely(err != 0))
return VM_FAULT_SIGBUS;
- page_offset = ((address - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff - drm_vma_node_start(&bo->base.vma_node);
- page_last = vma_pages(vma) + vma->vm_pgoff -
- drm_vma_node_start(&bo->base.vma_node);
-
- if (unlikely(page_offset >= PFN_UP(bo->base.size)))
+ bag.page_offset = ((vmf->address - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff - drm_vma_node_start(&bag.bo->base.vma_node);
+ if (unlikely(bag.page_offset >= PFN_UP(bag.bo->base.size)))
return VM_FAULT_SIGBUS;
- prot = ttm_io_prot(bo, bo->resource, prot);
- if (!bo->resource->bus.is_iomem) {
+ prot = ttm_io_prot(bag.bo, bag.bo->resource, prot);
+ if (!bag.bo->resource->bus.is_iomem) {
struct ttm_operation_ctx ctx = {
.interruptible = true,
.no_wait_gpu = false,
.force_alloc = true
};
- ttm = bo->ttm;
- err = ttm_bo_populate(bo, &ctx);
- if (err) {
- if (err == -EINTR || err == -ERESTARTSYS ||
- err == -EAGAIN)
- return VM_FAULT_NOPAGE;
-
- pr_debug("TTM fault hit %pe.\n", ERR_PTR(err));
- return VM_FAULT_SIGBUS;
- }
+ bag.ttm = bag.bo->ttm;
+ err = ttm_bo_populate(bag.bo, &ctx);
+ if (err)
+ goto error;
} else {
/* Iomem should not be marked encrypted */
prot = pgprot_decrypted(prot);
}
+ bag.prot = prot;
- /*
- * Speculatively prefault a number of pages. Only error on
- * first page.
- */
- for (i = 0; i < num_prefault; ++i) {
- if (bo->resource->bus.is_iomem) {
- pfn = ttm_bo_io_mem_pfn(bo, page_offset);
- } else {
- page = ttm->pages[page_offset];
- if (unlikely(!page && i == 0)) {
- return VM_FAULT_OOM;
- } else if (unlikely(!page)) {
- break;
- }
- pfn = page_to_pfn(page);
- }
+ /* Speculatively prefault a number of pages. */
+ size = min(num_prefault << PAGE_SHIFT, vma->vm_end - vmf->address);
+ err = apply_to_page_range(vma->vm_mm, vmf->address, size,
+ ttm_bo_vm_apply_cb, &bag);
- /*
- * Note that the value of @prot at this point may differ from
- * the value of @vma->vm_page_prot in the caching- and
- * encryption bits. This is because the exact location of the
- * data may not be known at mmap() time and may also change
- * at arbitrary times while the data is mmap'ed.
- * See vmf_insert_pfn_prot() for a discussion.
- */
- ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
+error:
+ if (err == -EINTR || err == -ERESTARTSYS || err == -EAGAIN)
+ return VM_FAULT_NOPAGE;
- /* Never error on prefaulted PTEs */
- if (unlikely((ret & VM_FAULT_ERROR))) {
- if (i == 0)
- return VM_FAULT_NOPAGE;
- else
- break;
- }
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
- address += PAGE_SIZE;
- if (unlikely(++page_offset >= page_last))
- break;
+ if (err) {
+ pr_debug("TTM fault hit %pe.\n", ERR_PTR(err));
+ return VM_FAULT_SIGBUS;
}
- return ret;
+
+ return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL(ttm_bo_vm_fault_reserved);
--
2.43.0
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 2/3] drm/ttm: reapply "increase ttm pre-fault value to PMD size"
[not found] <20250820143739.3422-1-christian.koenig@amd.com>
2025-08-20 14:33 ` [PATCH 1/3] drm/ttm: use apply_page_range instead of vmf_insert_pfn_prot Christian König
@ 2025-08-20 14:33 ` Christian König
2025-08-20 14:33 ` [PATCH 3/3] drm/ttm: disable changing the global caching flags on newer AMD CPUs v2 Christian König
2025-08-20 15:23 ` David Hildenbrand
3 siblings, 0 replies; 28+ messages in thread
From: Christian König @ 2025-08-20 14:33 UTC (permalink / raw)
To: intel-xe, intel-gfx, dri-devel, amd-gfx, x86
Cc: airlied, thomas.hellstrom, matthew.brost, david, dave.hansen,
luto, peterz
Now that we have improved the handling, faulting in a full PMD only
increases the overhead on my test system from 21us to 29us if only a
single page is requested, but massively improves the performance for
all other use cases.
So re-apply that change to improve the fault handling for large
allocations, bringing the total improvement close to a factor of 10.
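For reference, on x86-64 with 4 KiB pages this works out to
TTM_BO_VM_NUM_PREFAULT = 1 << (PMD_SHIFT - PAGE_SHIFT) = 1 << (21 - 12) =
512 pages, i.e. one full 2 MiB PMD, instead of the previous 16 pages.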
This reverts commit c358a809cb58af944d496944391a240e02f5837a.
Signed-off-by: Christian König <christian.koenig@amd.com>
---
include/drm/ttm/ttm_bo.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
index 903cd1030110..e96477606207 100644
--- a/include/drm/ttm/ttm_bo.h
+++ b/include/drm/ttm/ttm_bo.h
@@ -39,7 +39,11 @@
#include "ttm_device.h"
/* Default number of pre-faulted pages in the TTM fault handler */
+#if CONFIG_PGTABLE_LEVELS > 2
+#define TTM_BO_VM_NUM_PREFAULT (1 << (PMD_SHIFT - PAGE_SHIFT))
+#else
#define TTM_BO_VM_NUM_PREFAULT 16
+#endif
struct iosys_map;
--
2.43.0
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 3/3] drm/ttm: disable changing the global caching flags on newer AMD CPUs v2
[not found] <20250820143739.3422-1-christian.koenig@amd.com>
2025-08-20 14:33 ` [PATCH 1/3] drm/ttm: use apply_page_range instead of vmf_insert_pfn_prot Christian König
2025-08-20 14:33 ` [PATCH 2/3] drm/ttm: reapply "increase ttm pre-fault value to PMD size" Christian König
@ 2025-08-20 14:33 ` Christian König
2025-08-20 15:12 ` Borislav Petkov
2025-08-20 15:23 ` David Hildenbrand
3 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2025-08-20 14:33 UTC (permalink / raw)
To: intel-xe, intel-gfx, dri-devel, amd-gfx, x86
Cc: airlied, thomas.hellstrom, matthew.brost, david, dave.hansen,
luto, peterz
On some x86 systems (old AMD Athlons, Intel Lunar Lake) we have the problem
that changing the caching flags of system memory requires changing the
global MTRR/PAT tables since those CPUs can't handle aliasing caching
attributes.
But on most modern x86 systems (e.g. AMD CPUs after 2004) we actually
don't need that any more and can update the caching flags directly in the
PTEs of the userspace and kernel mappings.
We have already been doing this with encryption on x86 64-bit for quite a
while, and all other supported platforms (Sparc, PowerPC, ARM, MIPS,
LoongArch) as well as the i915 driver have never done anything different
either.
So stop changing the global caching flags on CPUs which don't need it
and just insert a clflush to be on the safe side so that we never
return memory with dirty cache lines.
Testing on a Ryzen 5 and 7 shows that the clflush has absolutely no
performance impact, but I'm still waiting for CI systems to confirm
functional correctness.
v2: drop the pool only on AMD CPUs for now
Signed-off-by: Christian König <christian.koenig@amd.com>
---
drivers/gpu/drm/ttm/ttm_pool.c | 37 +++++++++++++++++++++++++---------
1 file changed, 28 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index 83b10706ba89..3f830fb2aea5 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -45,6 +45,7 @@
#include <drm/ttm/ttm_pool.h>
#include <drm/ttm/ttm_tt.h>
#include <drm/ttm/ttm_bo.h>
+#include <drm/drm_cache.h>
#include "ttm_module.h"
@@ -119,6 +120,8 @@ module_param(page_pool_size, ulong, 0644);
static atomic_long_t allocated_pages;
+static bool skip_caching_adjustment;
+
static struct ttm_pool_type global_write_combined[NR_PAGE_ORDERS];
static struct ttm_pool_type global_uncached[NR_PAGE_ORDERS];
@@ -195,7 +198,8 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching,
/* We don't care that set_pages_wb is inefficient here. This is only
* used when we have to shrink and CPU overhead is irrelevant then.
*/
- if (caching != ttm_cached && !PageHighMem(p))
+ if (!skip_caching_adjustment &&
+ caching != ttm_cached && !PageHighMem(p))
set_pages_wb(p, 1 << order);
#endif
@@ -223,13 +227,19 @@ static int ttm_pool_apply_caching(struct ttm_pool_alloc_state *alloc)
if (!num_pages)
return 0;
- switch (alloc->tt_caching) {
- case ttm_cached:
- break;
- case ttm_write_combined:
- return set_pages_array_wc(alloc->caching_divide, num_pages);
- case ttm_uncached:
- return set_pages_array_uc(alloc->caching_divide, num_pages);
+ if (skip_caching_adjustment) {
+ drm_clflush_pages(alloc->caching_divide, num_pages);
+ } else {
+ switch (alloc->tt_caching) {
+ case ttm_cached:
+ break;
+ case ttm_write_combined:
+ return set_pages_array_wc(alloc->caching_divide,
+ num_pages);
+ case ttm_uncached:
+ return set_pages_array_uc(alloc->caching_divide,
+ num_pages);
+ }
}
#endif
alloc->caching_divide = alloc->pages;
@@ -342,6 +352,9 @@ static struct ttm_pool_type *ttm_pool_select_type(struct ttm_pool *pool,
return &pool->caching[caching].orders[order];
#ifdef CONFIG_X86
+ if (skip_caching_adjustment)
+ return NULL;
+
switch (caching) {
case ttm_write_combined:
if (pool->nid != NUMA_NO_NODE)
@@ -981,7 +994,7 @@ long ttm_pool_backup(struct ttm_pool *pool, struct ttm_tt *tt,
#ifdef CONFIG_X86
/* Anything returned to the system needs to be cached. */
- if (tt->caching != ttm_cached)
+ if (!skip_caching_adjustment && tt->caching != ttm_cached)
set_pages_array_wb(tt->pages, tt->num_pages);
#endif
@@ -1296,6 +1309,12 @@ int ttm_pool_mgr_init(unsigned long num_pages)
spin_lock_init(&shrinker_lock);
INIT_LIST_HEAD(&shrinker_list);
+#ifdef CONFIG_X86
+ skip_caching_adjustment =
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+ static_cpu_has(X86_FEATURE_CLFLUSH);
+#endif
+
for (i = 0; i < NR_PAGE_ORDERS; ++i) {
ttm_pool_type_init(&global_write_combined[i], NULL,
ttm_write_combined, i);
--
2.43.0
^ permalink raw reply related [flat|nested] 28+ messages in thread
* Re: [PATCH 3/3] drm/ttm: disable changing the global caching flags on newer AMD CPUs v2
2025-08-20 14:33 ` [PATCH 3/3] drm/ttm: disable changing the global caching flags on newer AMD CPUs v2 Christian König
@ 2025-08-20 15:12 ` Borislav Petkov
0 siblings, 0 replies; 28+ messages in thread
From: Borislav Petkov @ 2025-08-20 15:12 UTC (permalink / raw)
To: Christian König
Cc: intel-xe, intel-gfx, dri-devel, amd-gfx, x86, airlied,
thomas.hellstrom, matthew.brost, david, dave.hansen, luto, peterz
On Wed, Aug 20, 2025 at 04:33:13PM +0200, Christian König wrote:
> +#ifdef CONFIG_X86
> + skip_caching_adjustment =
> + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
> + static_cpu_has(X86_FEATURE_CLFLUSH);
cpu_feature_enabled()
> +#endif
I'd prefer if this would call a function in arch/x86/ which tests those
instead of using x86 artifacts in drivers.
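E.g. something along these lines (the helper doesn't exist yet, its name
and placement are just for illustration):
	/* arch/x86/..., purely illustrative sketch */
	bool x86_skip_global_caching_adjustment(void)
	{
		return boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		       cpu_feature_enabled(X86_FEATURE_CLFLUSH);
	}
so the driver only calls the helper and the CPU details stay in arch code.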
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re:
[not found] <20250820143739.3422-1-christian.koenig@amd.com>
` (2 preceding siblings ...)
2025-08-20 14:33 ` [PATCH 3/3] drm/ttm: disable changing the global caching flags on newer AMD CPUs v2 Christian König
@ 2025-08-20 15:23 ` David Hildenbrand
2025-08-21 8:10 ` Re: Christian König
[not found] ` <7db27720-8cfd-457c-8133-5a7a1094004c@lucifer.local>
3 siblings, 2 replies; 28+ messages in thread
From: David Hildenbrand @ 2025-08-20 15:23 UTC (permalink / raw)
To: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
CCing Lorenzo
On 20.08.25 16:33, Christian König wrote:
> Hi everyone,
>
> sorry for CCing so many people, but that rabbit hole turned out to be
> deeper than originally thought.
>
> TTM always had problems with UC/WC mappings on 32bit systems and drivers
> often had to revert to hacks like using GFP_DMA32 to get things working
> while having no rational explanation why that helped (see the TTM AGP,
> radeon and nouveau driver code for that).
>
> It turned out that the PAT implementation we use on x86 not only enforces
> the same caching attributes for pages in the linear kernel mapping, but
> also for highmem pages through a separate R/B tree.
>
> That was unexpected and TTM never updated that R/B tree for highmem pages,
> so the function pgprot_set_cachemode() just overwrote the caching
> attributes drivers passed in to vmf_insert_pfn_prot() and that essentially
> caused all kind of random trouble.
>
> An R/B tree is potentially not a good data structure to hold thousands if
> not millions of different attributes for each page, so updating that is
> probably not the way to solve this issue.
>
> Thomas pointed out that the i915 driver is using apply_page_range()
> instead of vmf_insert_pfn_prot() to circumvent the PAT implementation and
> just fill in the page tables with what the driver thinks is the right
> caching attribute.
I assume you mean apply_to_page_range() -- same issue in patch subjects.
Oh this sounds horrible. Why oh why do we have these hacks in core-mm
and have drivers abuse them :(
Honestly, apply_to_pte_range() is just the entry in doing all kinds of
weird crap to page tables because "you know better".
All the sanity checks from vmf_insert_pfn(), gone.
Can we please fix the underlying issue properly?
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re:
2025-08-20 15:23 ` David Hildenbrand
@ 2025-08-21 8:10 ` Christian König
2025-08-25 19:10 ` Re: David Hildenbrand
[not found] ` <7db27720-8cfd-457c-8133-5a7a1094004c@lucifer.local>
1 sibling, 1 reply; 28+ messages in thread
From: Christian König @ 2025-08-21 8:10 UTC (permalink / raw)
To: David Hildenbrand, intel-xe, intel-gfx, dri-devel, amd-gfx, x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 20.08.25 17:23, David Hildenbrand wrote:
> CCing Lorenzo
>
> On 20.08.25 16:33, Christian König wrote:
>> Hi everyone,
>>
>> sorry for CCing so many people, but that rabbit hole turned out to be
>> deeper than originally thought.
>>
>> TTM always had problems with UC/WC mappings on 32bit systems and drivers
>> often had to revert to hacks like using GFP_DMA32 to get things working
>> while having no rational explanation why that helped (see the TTM AGP,
>> radeon and nouveau driver code for that).
>>
>> It turned out that the PAT implementation we use on x86 not only enforces
>> the same caching attributes for pages in the linear kernel mapping, but
>> also for highmem pages through a separate R/B tree.
>>
>> That was unexpected and TTM never updated that R/B tree for highmem pages,
>> so the function pgprot_set_cachemode() just overwrote the caching
>> attributes drivers passed in to vmf_insert_pfn_prot() and that essentially
>> caused all kind of random trouble.
>>
>> An R/B tree is potentially not a good data structure to hold thousands if
>> not millions of different attributes for each page, so updating that is
>> probably not the way to solve this issue.
>>
>> Thomas pointed out that the i915 driver is using apply_page_range()
>> instead of vmf_insert_pfn_prot() to circumvent the PAT implementation and
>> just fill in the page tables with what the driver thinks is the right
>> caching attribute.
>
> I assume you mean apply_to_page_range() -- same issue in patch subjects.
Oh yes, of course. Sorry.
> Oh this sounds horrible. Why oh why do we have these hacks in core-mm and have drivers abuse them :(
Yeah I was also a bit hesitant to use that, but the performance advantage is so high that we probably can't avoid the general approach.
> Honestly, apply_to_pte_range() is just the entry in doing all kinds of weird crap to page tables because "you know better".
Exactly, that's the problem I'm pointing out, drivers *do* know it better. The core memory management has applied incorrect values which caused all kinds of trouble.
The problem is not a bug in PAT nor TTM/drivers but rather how they interact with each other.
What I don't understand is why do we have the PAT in the first place? No other architecture does it this way.
Is that because of the x86 CPUs which have problems when different page tables contain different caching attributes for the same physical memory?
> All the sanity checks from vmf_insert_pfn(), gone.
>
> Can we please fix the underlying issue properly?
I'm happy to implement anything advised, my question is how we should solve this issue?
Regards,
Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: your mail
[not found] ` <7db27720-8cfd-457c-8133-5a7a1094004c@lucifer.local>
@ 2025-08-21 9:30 ` David Hildenbrand
[not found] ` <f6f85c73-2a1e-438a-82c9-f3392d91020c@lucifer.local>
0 siblings, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2025-08-21 9:30 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86, airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Liam Howlett
> I will add this xen/apply_to_page_range() thing to my TODOs, which atm
> would involve changing these drivers to use vmf_insert_pfn_prot() instead.
>
Busy today (want to reply to Christian) but
a) Re: performance, we would want something like
vmf_insert_pfns_prot(), similar to vm_insert_pages(), to bulk-insert
multiple PFNs.
b) Re: PAT, we'll have to figure out why PAT information is wrong here
(was there no previous PAT reservation from the driver?), but IF we
really have to override, we'd want a way to tell
vmf_insert_pfn_prot() to force the selected caching mode.
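For a), very roughly (hypothetical, the name and signature are just for
illustration, nothing like this exists yet):
	vm_fault_t vmf_insert_pfns_prot(struct vm_area_struct *vma,
					unsigned long addr,
					const unsigned long *pfns,
					unsigned long nr_pfns, pgprot_t prot);
so a driver can fill a whole range with one call instead of one
vmf_insert_pfn_prot() per page.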
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: your mail
[not found] ` <f6f85c73-2a1e-438a-82c9-f3392d91020c@lucifer.local>
@ 2025-08-21 10:16 ` David Hildenbrand
2025-08-25 18:35 ` Christian König
1 sibling, 0 replies; 28+ messages in thread
From: David Hildenbrand @ 2025-08-21 10:16 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86, airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Liam Howlett
On 21.08.25 12:05, Lorenzo Stoakes wrote:
> On Thu, Aug 21, 2025 at 11:30:43AM +0200, David Hildenbrand wrote:
>>> I will add this xen/apply_to_page_range() thing to my TODOs, which atm
>>> would involve changing these drivers to use vmf_insert_pfn_prot() instead.
>>>
>>
>> Busy today (want to reply to Christian) but
>>
>> a) Re: performance, we would want something like
>> vmf_insert_pfns_prot(), similar to vm_insert_pages(), to bulk-insert
>> multiple PFNs.
>>
>> b) Re: PAT, we'll have to figure out why PAT information is wrong here
>> (was there no previous PAT reservation from the driver?), but IF we
>> really have to override, we'd want a way to tell
>> vmf_insert_pfn_prot() to force the selected caching mode.
>>
>
> Ack, ok good that we have a feasible way forward.
>
> FYI, spoke to Peter off-list and he mentioned he had a more general series
> to get rid of this kind of [ab]use of apply_to_page_range() (see [0]), I
> gather he hasn't the time to resurrect but perhaps one of us can at some
> point?
>
> Perhaps we need a shorter term fix to _this_ issue (which involves not
> using this interface), and then follow it up with an adaptation of the
> below?
We need to understand why PAT would be wrong and why it would even be ok
to ignore it. Not hacking around it.
FWIW, I just recently documented:
+/**
+ * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range
+ * @pfn: the start of the pfn range
+ * @size: the size of the pfn range in bytes
+ * @prot: the pgprot to modify
+ *
+ * Lookup the cachemode for the pfn range starting at @pfn with the size
+ * @size and store it in @prot, leaving other data in @prot unchanged.
+ *
+ * This allows for a hardware implementation to have fine-grained control of
+ * memory cache behavior at page level granularity. Without a hardware
+ * implementation, this function does nothing.
+ *
+ * Currently there is only one implementation for this - x86 Page Attribute
+ * Table (PAT). See Documentation/arch/x86/pat.rst for more details.
+ *
+ * This function can fail if the pfn range spans pfns that require differing
+ * cachemodes. If the pfn range was previously verified to have a single
+ * cachemode, it is sufficient to query only a single pfn. The assumption is
+ * that this is the case for drivers using the vmf_insert_pfn*() interface.
+ *
+ * Returns 0 on success and -EINVAL on error.
+ */
+int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size,
+ pgprot_t *prot);
extern int track_pfn_copy(struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma, unsigned long *pfn);
extern void untrack_pfn_copy(struct vm_area_struct *dst_vma,
@@ -1563,6 +1584,21 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
extern void untrack_pfn_clear(struct vm_area_struct *vma);
#endif
+/**
+ * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn
+ * @pfn: the pfn
+ * @prot: the pgprot to modify
+ *
+ * Lookup the cachemode for @pfn and store it in @prot, leaving other
+ * data in @prot unchanged.
+ *
+ * See pfnmap_setup_cachemode() for details.
+ */
+static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot)
+{
+ pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot);
+}
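I.e. a caller does something like (simplified):
	/* prot is whatever the caller wants to use for the mapping */
	pfnmap_setup_cachemode_pfn(pfn, &prot);
	/* prot now carries whatever cachemode PAT has on record for the pfn */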
There is certainly something missing whereby the driver would have
previously verified that the cachemode is as expected.
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: your mail
[not found] ` <f6f85c73-2a1e-438a-82c9-f3392d91020c@lucifer.local>
2025-08-21 10:16 ` David Hildenbrand
@ 2025-08-25 18:35 ` Christian König
2025-08-25 19:20 ` David Hildenbrand
1 sibling, 1 reply; 28+ messages in thread
From: Christian König @ 2025-08-25 18:35 UTC (permalink / raw)
To: Lorenzo Stoakes, David Hildenbrand
Cc: intel-xe, intel-gfx, dri-devel, amd-gfx, x86, airlied,
thomas.hellstrom, matthew.brost, dave.hansen, luto, peterz,
Liam Howlett
On 21.08.25 12:05, Lorenzo Stoakes wrote:
> On Thu, Aug 21, 2025 at 11:30:43AM +0200, David Hildenbrand wrote:
>>> I will add this xen/apply_to_page_range() thing to my TODOs, which atm
>> would involve changing these drivers to use vmf_insert_pfn_prot() instead.
>>>
>>
>> Busy today (want to reply to Christian) but
>>
>> a) Re: performance, we would want something like
>> vmf_insert_pfns_prot(), similar to vm_insert_pages(), to bulk-insert
>> multiple PFNs.
Yes, exactly that. Ideally something like an iterator/callback-like interface.
I've seen at least four or five different representations of the PFNs in drivers.
>> b) Re: PAT, we'll have to figure out why PAT information is wrong here
>> (was there no previous PAT reservation from the driver?), but IF we
>> really have to override, we'd want a way to tell
>> vmf_insert_pfn_prot() to force the selected caching mode.
>>
Well the difference between vmf_insert_pfn() and vmf_insert_pfn_prot() is that the driver actually wants to specify the caching modes.
That this is overridden by the PAT even for pages which are not part of the linear mapping is really surprising.
As far as I can see there is no technical necessity for that. Even for pages in the linear mapping only a handful of x86 CPUs actually need that. See Intel's i915 GPU driver for reference.
Intel has used that approach for ages and for AMD CPUs the only reference I could find where the kernel needs it are Athlons produced between 1996 and 2004.
Maybe we should disable the PAT on CPUs which actually don't need it?
> Ack, ok good that we have a feasible way forward.
>
> FYI, spoke to Peter off-list and he mentioned he had a more general series
> to get rid of this kind of [ab]use of apply_to_page_range() (see [0]), I
> gather he hasn't the time to resurrect but perhaps one of us can at some
> point?
>
> Perhaps we need a shorter term fix to _this_ issue (which involves not
> using this interface), and then follow it up with an adaptation of the
> below?
Sounds like a plan to me.
Regards,
Christian.
>
> Cheers, Lorenzo
>
> [0]:https://lore.kernel.org/all/20210412080012.357146277@infradead.org/
>
>
>
>> --
>> Cheers
>>
>> David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re:
2025-08-21 8:10 ` Re: Christian König
@ 2025-08-25 19:10 ` David Hildenbrand
2025-08-26 8:38 ` Re: Christian König
0 siblings, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2025-08-25 19:10 UTC (permalink / raw)
To: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 21.08.25 10:10, Christian König wrote:
> On 20.08.25 17:23, David Hildenbrand wrote:
>> CCing Lorenzo
>>
>> On 20.08.25 16:33, Christian König wrote:
>>> Hi everyone,
>>>
>>> sorry for CCing so many people, but that rabbit hole turned out to be
>>> deeper than originally thought.
>>>
>>> TTM always had problems with UC/WC mappings on 32bit systems and drivers
>>> often had to revert to hacks like using GFP_DMA32 to get things working
>>> while having no rational explanation why that helped (see the TTM AGP,
>>> radeon and nouveau driver code for that).
>>>
>>> It turned out that the PAT implementation we use on x86 not only enforces
>>> the same caching attributes for pages in the linear kernel mapping, but
>>> also for highmem pages through a separate R/B tree.
>>>
>>> That was unexpected and TTM never updated that R/B tree for highmem pages,
>>> so the function pgprot_set_cachemode() just overwrote the caching
>>> attributes drivers passed in to vmf_insert_pfn_prot() and that essentially
>>> caused all kind of random trouble.
>>>
>>> An R/B tree is potentially not a good data structure to hold thousands if
>>> not millions of different attributes for each page, so updating that is
>>> probably not the way to solve this issue.
>>>
>>> Thomas pointed out that the i915 driver is using apply_page_range()
>>> instead of vmf_insert_pfn_prot() to circumvent the PAT implementation and
>>>> just fill in the page tables with what the driver thinks is the right
>>> caching attribute.
>>
>> I assume you mean apply_to_page_range() -- same issue in patch subjects.
>
> Oh yes, of course. Sorry.
>
>> Oh this sounds horrible. Why oh why do we have these hacks in core-mm and have drivers abuse them :(
>
> Yeah I was also a bit hesitant to use that, but the performance advantage is so high that we probably can't avoid the general approach.
>
>> Honestly, apply_to_pte_range() is just the entry in doing all kinds of weird crap to page tables because "you know better".
>
> Exactly, that's the problem I'm pointing out, drivers *do* know it better. The core memory management has applied incorrect values which caused all kinds of trouble.
>
> The problem is not a bug in PAT nor TTM/drivers but rather how they interact with each other.
>
> What I don't understand is why do we have the PAT in the first place? No other architecture does it this way.
Probably because no other architecture has these weird glitches I assume
... skimming over memtype_reserve() and friends there are quite some
corner cases the code is handling (BIOS, ACPI, low ISA, system RAM, ...)
I did a lot of work on the higher PAT level functions, but I am no
expert on the lower level management functions, and in particular all
the special cases with different memory types.
IIRC, the goal of the PAT subsystem is to make sure that no two page
tables map the same PFN with different caching attributes.
It treats ordinary system RAM (IORESOURCE_SYSTEM_RAM) usually in a
special way: no special caching mode.
For everything else, it expects that someone first reserves a memory
range for a specific caching mode.
For example, remap_pfn_range()...->pfnmap_track()->memtype_reserve()
will make sure that there are no conflicts, and then call
memtype_kernel_map_sync() to make sure the identity mapping is updated
to the new type.
In case someone ends up calling pfnmap_setup_cachemode(), the
expectation is that there was a previous call to memtype_reserve_io() or
similar, such that pfnmap_setup_cachemode() will find that caching mode.
So my assumption would be that that is missing for the drivers here?
Last time I asked where this reservation is done, Peter Xu explained [1]
it at least for VFIO:
vfio_pci_core_mmap
pci_iomap
pci_iomap_range
...
__ioremap_caller
memtype_reserve
Now, could it be that something like that is missing in these drivers
(ioremap etc)?
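I.e. something like this at device init time (minimal sketch, res_start/
res_size are placeholders for the io range in question):
	/* goes through __ioremap_caller() -> memtype_reserve() on x86 */
	void __iomem *regs = ioremap_wc(res_start, res_size);
and then a later vmf_insert_pfn_prot() with pgprot_writecombine() would
find that WC reservation instead of falling back to the default.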
[1] https://lkml.kernel.org/r/aBDXr-Qp4z0tS50P@x1.local
>
> Is that because of the x86 CPUs which have problems when different page tables contain different caching attributes for the same physical memory?
Yes, but I don't think x86 is special here.
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: your mail
2025-08-25 18:35 ` Christian König
@ 2025-08-25 19:20 ` David Hildenbrand
0 siblings, 0 replies; 28+ messages in thread
From: David Hildenbrand @ 2025-08-25 19:20 UTC (permalink / raw)
To: Christian König, Lorenzo Stoakes
Cc: intel-xe, intel-gfx, dri-devel, amd-gfx, x86, airlied,
thomas.hellstrom, matthew.brost, dave.hansen, luto, peterz,
Liam Howlett
On 25.08.25 20:35, Christian König wrote:
> On 21.08.25 12:05, Lorenzo Stoakes wrote:
>> On Thu, Aug 21, 2025 at 11:30:43AM +0200, David Hildenbrand wrote:
>>>> I will add this xen/apply_to_page_range() thing to my TODOs, which atm
>>>> would involve changing these drivers to use vmf_insert_pfn_prot() instead.
>>>>
>>>
>>> Busy today (want to reply to Christian) but
>>>
>>> a) Re: performance, we would want something like
>>> vmf_insert_pfns_prot(), similar to vm_insert_pages(), to bulk-insert
>>> multiple PFNs.
>
> Yes, exactly that. Ideally something like an iterator/callback like interface.
>
> I've seen at least four or five different representations of the PFNs in drivers.
>
>>> b) Re: PAT, we'll have to figure out why PAT information is wrong here
>>> (was there no previous PAT reservation from the driver?), but IF we
>>> really have to override, we'd want a way to tell
>>> vmf_insert_pfn_prot() to force the selected caching mode.
>>>
>
> Well the difference between vmf_insert_pfn() and vmf_insert_pfn_prot() is that the driver actually wants to specify the caching modes.
Yes, it's all a mess. x86/PAT doesn't want inconsistencies, so it
expects that a previous reservation would make sure that that caching
mode is actually valid.
>
> That this is overridden by the PAT even for pages which are not part of the linear mapping is really surprising.
Yes, IIUC, it expects an earlier reservation on PAT systems.
>
> As far as I can see there is no technical necessity for that. Even for pages in the linear mapping only a handful of x86 CPUs actually need that. See Intel's i915 GPU driver for reference.
>
> Intel has used that approach for ages and for AMD CPUs the only reference I could find where the kernel needs it are Athlons produced between 1996 and 2004.
>
> Maybe we should disable the PAT on CPUs which actually don't need it?
Not sure if that will solve our problems on systems that need it because
of some devices.
I guess the problem of pfnmap_setup_cachemode_pfn() is that there is no
interface to undo it: pfnmap_track() is paired with pfnmap_untrack() such
that it can simply do/undo the reservation itself.
That's why pfnmap_setup_cachemode_pfn() leaves it up to the caller to
make sure that a reservation was triggered earlier by different means --
which can properly be undone.
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re:
2025-08-25 19:10 ` Re: David Hildenbrand
@ 2025-08-26 8:38 ` Christian König
2025-08-26 8:46 ` Re: David Hildenbrand
2025-08-26 12:37 ` David Hildenbrand
0 siblings, 2 replies; 28+ messages in thread
From: Christian König @ 2025-08-26 8:38 UTC (permalink / raw)
To: David Hildenbrand, intel-xe, intel-gfx, dri-devel, amd-gfx, x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 25.08.25 21:10, David Hildenbrand wrote:
> On 21.08.25 10:10, Christian König wrote:
>> On 20.08.25 17:23, David Hildenbrand wrote:
>>> CCing Lorenzo
>>>
>>> On 20.08.25 16:33, Christian König wrote:
>>>> Hi everyone,
>>>>
>>>> sorry for CCing so many people, but that rabbit hole turned out to be
>>>> deeper than originally thought.
>>>>
>>>> TTM always had problems with UC/WC mappings on 32bit systems and drivers
>>>> often had to revert to hacks like using GFP_DMA32 to get things working
>>>> while having no rational explanation why that helped (see the TTM AGP,
>>>> radeon and nouveau driver code for that).
>>>>
>>>> It turned out that the PAT implementation we use on x86 not only enforces
>>>> the same caching attributes for pages in the linear kernel mapping, but
>>>> also for highmem pages through a separate R/B tree.
>>>>
>>>> That was unexpected and TTM never updated that R/B tree for highmem pages,
>>>> so the function pgprot_set_cachemode() just overwrote the caching
>>>> attributes drivers passed in to vmf_insert_pfn_prot() and that essentially
>>>> caused all kind of random trouble.
>>>>
>>>> An R/B tree is potentially not a good data structure to hold thousands if
>>>> not millions of different attributes for each page, so updating that is
>>>> probably not the way to solve this issue.
>>>>
>>>> Thomas pointed out that the i915 driver is using apply_page_range()
>>>> instead of vmf_insert_pfn_prot() to circumvent the PAT implementation and
>>>>> just fill in the page tables with what the driver thinks is the right
>>>> caching attribute.
>>>
>>> I assume you mean apply_to_page_range() -- same issue in patch subjects.
>>
>> Oh yes, of course. Sorry.
>>
>>> Oh this sounds horrible. Why oh why do we have these hacks in core-mm and have drivers abuse them :(
>>
>> Yeah I was also a bit hesitant to use that, but the performance advantage is so high that we probably can't avoid the general approach.
>>
>>> Honestly, apply_to_pte_range() is just the entry in doing all kinds of weird crap to page tables because "you know better".
>>
>> Exactly, that's the problem I'm pointing out, drivers *do* know it better. The core memory management has applied incorrect values which caused all kinds of trouble.
>>
>> The problem is not a bug in PAT nor TTM/drivers but rather how they interact with each other.
>>
>> What I don't understand is why do we have the PAT in the first place? No other architecture does it this way.
>
> Probably because no other architecture has these weird glitches I assume ... skimming over memtype_reserve() and friends there are quite some corner cases the code is handling (BIOS, ACPI, low ISA, system RAM, ...)
>
>
> I did a lot of work on the higher PAT level functions, but I am no expert on the lower level management functions, and in particular all the special cases with different memory types.
>
> IIRC, the goal of the PAT subsystem is to make sure that no two page tables map the same PFN with different caching attributes.
Yeah, that actually makes sense. Thomas from Intel recently explained the technical background to me:
Some x86 CPUs write back cache lines even if they aren't dirty, and what can happen is that, because of the linear mapping, the CPU speculatively loads a cache line which is elsewhere mapped uncached.
So the end result is that the writeback of non-dirty cache lines potentially corrupts the data in the otherwise uncached system memory.
But that a) only applies to memory in the linear mapping and b) only to a handful of x86 CPU types (e.g. recently Intel's Lunar Lake, AMD Athlons produced before 2004, maybe others).
> It treats ordinary system RAM (IORESOURCE_SYSTEM_RAM) usually in a special way: no special caching mode.
>
> For everything else, it expects that someone first reserves a memory range for a specific caching mode.
>
> For example, remap_pfn_range()...->pfnmap_track()->memtype_reserve() will make sure that there are no conflicts, and then call memtype_kernel_map_sync() to make sure the identity mapping is updated to the new type.
>
> In case someone ends up calling pfnmap_setup_cachemode(), the expectation is that there was a previous call to memtype_reserve_io() or similar, such that pfnmap_setup_cachemode() will find that caching mode.
>
>
> So my assumption would be that that is missing for the drivers here?
Well yes and no.
See, the PAT is optimized for applying specific caching attributes to ranges [A..B] (e.g. it uses an R/B tree). But what drivers do here is that they have single pages (usually from get_free_page() or similar) and want to apply a certain caching attribute to them.
So what would happen is that we completely clutter the R/B tree used by the PAT with thousands if not millions of entries.
>
> Last time I asked where this reservation is done, Peter Xu explained [1] it at least for VFIO:
>
> vfio_pci_core_mmap
> pci_iomap
> pci_iomap_range
> ...
> __ioremap_caller
> memtype_reserve
>
>
> Now, could it be that something like that is missing in these drivers (ioremap etc)?
Well that would solve the issue temporarily, but I'm pretty sure that will just go boom at a different place then :(
One possibility would be to say that the PAT only overrides the attributes if they aren't normal cached and leaves everything else alone.
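Roughly something like this in pfnmap_setup_cachemode() (pseudo-code only,
I haven't checked all the PAT internals):
	pcm = lookup_memtype(PFN_PHYS(pfn));
	/* leave the caller-supplied pgprot alone unless PAT actually has a
	 * non-default (non-WB) reservation for this range */
	if (pcm == _PAGE_CACHE_MODE_WB)
		return 0;
	pgprot_set_cachemode(prot, pcm);
	return 0;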
What do you think?
Thanks,
Christian.
>
>
>
> [1] https://lkml.kernel.org/r/aBDXr-Qp4z0tS50P@x1.local
>
>
>>
>> Is that because of the x86 CPUs which have problems when different page tables contain different caching attributes for the same physical memory?
>
> Yes, but I don't think x86 is special here.
>
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re:
2025-08-26 8:38 ` Re: Christian König
@ 2025-08-26 8:46 ` David Hildenbrand
2025-08-26 9:00 ` Re: Christian König
2025-08-26 12:37 ` David Hildenbrand
1 sibling, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2025-08-26 8:46 UTC (permalink / raw)
To: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 26.08.25 10:38, Christian König wrote:
> On 25.08.25 21:10, David Hildenbrand wrote:
>> On 21.08.25 10:10, Christian König wrote:
>>> On 20.08.25 17:23, David Hildenbrand wrote:
>>>> CCing Lorenzo
>>>>
>>>> On 20.08.25 16:33, Christian König wrote:
>>>>> Hi everyone,
>>>>>
>>>>> sorry for CCing so many people, but that rabbit hole turned out to be
>>>>> deeper than originally thought.
>>>>>
>>>>> TTM always had problems with UC/WC mappings on 32bit systems and drivers
>>>>> often had to revert to hacks like using GFP_DMA32 to get things working
>>>>> while having no rational explanation why that helped (see the TTM AGP,
>>>>> radeon and nouveau driver code for that).
>>>>>
>>>>> It turned out that the PAT implementation we use on x86 not only enforces
>>>>> the same caching attributes for pages in the linear kernel mapping, but
>>>>> also for highmem pages through a separate R/B tree.
>>>>>
>>>>> That was unexpected and TTM never updated that R/B tree for highmem pages,
>>>>> so the function pgprot_set_cachemode() just overwrote the caching
>>>>> attributes drivers passed in to vmf_insert_pfn_prot() and that essentially
>>>>> caused all kind of random trouble.
>>>>>
>>>>> An R/B tree is potentially not a good data structure to hold thousands if
>>>>> not millions of different attributes for each page, so updating that is
>>>>> probably not the way to solve this issue.
>>>>>
>>>>> Thomas pointed out that the i915 driver is using apply_page_range()
>>>>> instead of vmf_insert_pfn_prot() to circumvent the PAT implementation and
>>>>> just fill in the page tables with what the driver thinks is the right
>>>>> caching attribute.
>>>>
>>>> I assume you mean apply_to_page_range() -- same issue in patch subjects.
>>>
>>> Oh yes, of course. Sorry.
>>>
>>>> Oh this sounds horrible. Why oh why do we have these hacks in core-mm and have drivers abuse them :(
>>>
>>> Yeah I was also a bit hesitant to use that, but the performance advantage is so high that we probably can't avoid the general approach.
>>>
>>>> Honestly, apply_to_pte_range() is just the entry in doing all kinds of weird crap to page tables because "you know better".
>>>
>>> Exactly, that's the problem I'm pointing out, drivers *do* know it better. The core memory management has applied incorrect values which caused all kinds of trouble.
>>>
>>> The problem is not a bug in PAT nor TTM/drivers but rather how they interact with each other.
>>>
>>> What I don't understand is why do we have the PAT in the first place? No other architecture does it this way.
>>
>> Probably because no other architecture has these weird glitches I assume ... skimming over memtype_reserve() and friends there are quite some corner cases the code is handling (BIOS, ACPI, low ISA, system RAM, ...)
>>
>>
>> I did a lot of work on the higher PAT level functions, but I am no expert on the lower level management functions, and in particular all the special cases with different memory types.
>>
>> IIRC, the goal of the PAT subsystem is to make sure that no two page tables map the same PFN with different caching attributes.
>
> Yeah, that actually makes sense. Thomas from Intel recently explained the technical background to me:
>
> Some x86 CPUs write back cache lines even if they aren't dirty and what can happen is that because of the linear mapping the CPU speculatively loads a cache line which is elsewhere mapped uncached.
>
> So the end result is that the writeback of not dirty cache lines potentially corrupts the data in the otherwise uncached system memory.
>
> But that a) only applies to memory in the linear mapping and b) only to a handful of x86 CPU types (e.g. recently Intels Luna Lake, AMD Athlons produced before 2004, maybe others).
>
>> It treats ordinary system RAM (IORESOURCE_SYSTEM_RAM) usually in a special way: no special caching mode.
>>
>> For everything else, it expects that someone first reserves a memory range for a specific caching mode.
>>
>> For example, remap_pfn_range()...->pfnmap_track()->memtype_reserve() will make sure that there are no conflicts, and then call memtype_kernel_map_sync() to make sure the identity mapping is updated to the new type.
>>
>> In case someone ends up calling pfnmap_setup_cachemode(), the expectation is that there was a previous call to memtype_reserve_io() or similar, such that pfnmap_setup_cachemode() will find that caching mode.
>>
>>
>> So my assumption would be that that is missing for the drivers here?
>
> Well yes and no.
>
> See the PAT is optimized for applying specific caching attributes to ranges [A..B] (e.g. it uses an R/B tree). But what drivers do here is that they have single pages (usually for get_free_page or similar) and want to apply a certain caching attribute to it.
>
> So what would happen is that we completely clutter the R/B tree used by the PAT with thousands if not millions of entries.
>
Hm, above you're saying that there is no direct map, but now you are
saying that the pages were obtained through get_free_page()?
I agree that what you describe here sounds suboptimal. But if the pages
were obtained from the buddy, there surely is a direct map -- unless we
explicitly remove it :(
If we're talking about individual pages without a directmap, I would
wonder if they are actually part of a bigger memory region that can just
be reserved in one go (similar to how remap_pfn_range() would handle it).
Can you briefly describe how your use case obtains these PFNs, and how
scattered they + their caching attributes might be?
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re:
2025-08-26 8:46 ` Re: David Hildenbrand
@ 2025-08-26 9:00 ` Christian König
2025-08-26 9:17 ` Re: David Hildenbrand
0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2025-08-26 9:00 UTC (permalink / raw)
To: David Hildenbrand, intel-xe, intel-gfx, dri-devel, amd-gfx, x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 26.08.25 10:46, David Hildenbrand wrote:
>>> So my assumption would be that that is missing for the drivers here?
>>
>> Well yes and no.
>>
>> See the PAT is optimized for applying specific caching attributes to ranges [A..B] (e.g. it uses an R/B tree). But what drivers do here is that they have single pages (usually for get_free_page or similar) and want to apply a certain caching attribute to it.
>>
>> So what would happen is that we completely clutter the R/B tree used by the PAT with thousands if not millions of entries.
>>
>
> Hm, above you're saying that there is no direct map, but now you are saying that the pages were obtained through get_free_page()?
The problem only happens with highmem pages on 32bit kernels. Those pages are not in the linear mapping.
> I agree that what you describe here sounds suboptimal. But if the pages were obtained from the buddy, there surely is a direct map -- unless we explicitly remove it :(
>
> If we're talking about individual pages without a directmap, I would wonder if they are actually part of a bigger memory region that can just be reserved in one go (similar to how remap_pfn_range()) would handle it.
>
> Can you briefly describe how your use case obtains these PFNs, and how scattered they + their caching attributes might be?
What drivers do is to call get_free_page() or alloc_pages_node() with the GFP_HIGHUSER flag set.
For non-highmem pages drivers then call set_pages_wc/uc() which changes the caching of the linear mapping, but for highmem pages there is no linear mapping, so set_pages_wc() or set_pages_uc() doesn't work and drivers avoid calling it.
Those are basically just random system memory pages. So they are potentially scattered over the whole memory address space.
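In (simplified) code the pattern is roughly this (minimal sketch, error
handling omitted, not the verbatim pool code):
	struct page *p = alloc_pages_node(NUMA_NO_NODE, GFP_HIGHUSER, 0);
	if (!PageHighMem(p))
		/* fix up the caching attributes of the linear mapping */
		set_pages_array_wc(&p, 1);
	/* for highmem pages there is no linear mapping to fix up, and
	 * nothing tells PAT about the intended caching mode */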
Regards,
Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re:
2025-08-26 9:00 ` Re: Christian König
@ 2025-08-26 9:17 ` David Hildenbrand
2025-08-26 9:56 ` Re: Christian König
0 siblings, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2025-08-26 9:17 UTC (permalink / raw)
To: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 26.08.25 11:00, Christian König wrote:
> On 26.08.25 10:46, David Hildenbrand wrote:
>>>> So my assumption would be that that is missing for the drivers here?
>>>
>>> Well yes and no.
>>>
>>> See the PAT is optimized for applying specific caching attributes to ranges [A..B] (e.g. it uses an R/B tree). But what drivers do here is that they have single pages (usually for get_free_page or similar) and want to apply a certain caching attribute to it.
>>>
>>> So what would happen is that we completely clutter the R/B tree used by the PAT with thousands if not millions of entries.
>>>
>>
>> Hm, above you're saying that there is no direct map, but now you are saying that the pages were obtained through get_free_page()?
>
> The problem only happens with highmem pages on 32bit kernels. Those pages are not in the linear mapping.
Right, in the common case there is a direct map.
>
>> I agree that what you describe here sounds suboptimal. But if the pages were obtained from the buddy, there surely is a direct map -- unless we explicitly remove it :(
>>
>> If we're talking about individual pages without a directmap, I would wonder if they are actually part of a bigger memory region that can just be reserved in one go (similar to how remap_pfn_range()) would handle it.
>>
>> Can you briefly describe how your use case obtains these PFNs, and how scattered they + their caching attributes might be?
>
> What drivers do is to call get_free_page() or alloc_pages_node() with the GFP_HIGHUSER flag set.
>
> For non highmem pages drivers then calls set_pages_wc/uc() which changes the caching of the linear mapping, but for highmem pages there is no linear mapping so set_pages_wc() or set_pages_uc() doesn't work and drivers avoid calling it.
>
> Those are basically just random system memory pages. So they are potentially scattered over the whole memory address space.
Thanks, that's valuable information.
So essentially these drivers maintain their own consistency and PAT is
not aware of that.
And the real problem is ordinary system RAM.
There are various ways forward.
1) We use another interface that consumes pages instead of PFNs, like a
vm_insert_pages_pgprot() we would be adding.
Is there any strong requirement for inserting non-refcounted PFNs?
2) We add another interface that consumes PFNs, but explicitly states
that it is only for ordinary system RAM, and that the user is
required for updating the direct map.
We could sanity-check the direct map in debug kernels.
3) We teach PAT code in pfnmap_setup_cachemode_pfn() about treating this
system RAM differently.
There is also the option for a mixture between 1 and 2, where we get
pages, but we map them non-refcounted in a VM_PFNMAP.
In general, having pages makes it easier to assert that they are likely
ordinary system ram pages, and that the interface is not getting abused
for something else.
We could also perform the set_pages_wc/uc() from inside that function,
but maybe it depends on the use case whether we want to do that whenever
we map them into a process?
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re:
2025-08-26 9:17 ` Re: David Hildenbrand
@ 2025-08-26 9:56 ` Christian König
2025-08-26 12:07 ` Re: David Hildenbrand
2025-08-26 14:27 ` Thomas Hellström
0 siblings, 2 replies; 28+ messages in thread
From: Christian König @ 2025-08-26 9:56 UTC (permalink / raw)
To: David Hildenbrand, intel-xe, intel-gfx, dri-devel, amd-gfx, x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 26.08.25 11:17, David Hildenbrand wrote:
> On 26.08.25 11:00, Christian König wrote:
>> On 26.08.25 10:46, David Hildenbrand wrote:
>>>>> So my assumption would be that that is missing for the drivers here?
>>>>
>>>> Well yes and no.
>>>>
>>>> See the PAT is optimized for applying specific caching attributes to ranges [A..B] (e.g. it uses an R/B tree). But what drivers do here is that they have single pages (usually for get_free_page or similar) and want to apply a certain caching attribute to it.
>>>>
>>>> So what would happen is that we completely clutter the R/B tree used by the PAT with thousands if not millions of entries.
>>>>
>>>
>>> Hm, above you're saying that there is no direct map, but now you are saying that the pages were obtained through get_free_page()?
>>
>> The problem only happens with highmem pages on 32bit kernels. Those pages are not in the linear mapping.
>
> Right, in the common case there is a direct map.
>
>>
>>> I agree that what you describe here sounds suboptimal. But if the pages were obtained from the buddy, there surely is a direct map -- unless we explicitly remove it :(
>>>
>>> If we're talking about individual pages without a directmap, I would wonder if they are actually part of a bigger memory region that can just be reserved in one go (similar to how remap_pfn_range()) would handle it.
>>>
>>> Can you briefly describe how your use case obtains these PFNs, and how scattered they + their caching attributes might be?
>>
>> What drivers do is to call get_free_page() or alloc_pages_node() with the GFP_HIGHUSER flag set.
>>
>> For non highmem pages drivers then calls set_pages_wc/uc() which changes the caching of the linear mapping, but for highmem pages there is no linear mapping so set_pages_wc() or set_pages_uc() doesn't work and drivers avoid calling it.
>>
>> Those are basically just random system memory pages. So they are potentially scattered over the whole memory address space.
>
> Thanks, that's valuable information.
>
> So essentially these drivers maintain their own consistency and PAT is not aware of that.
>
> And the real problem is ordinary system RAM.
>
> There are various ways forward.
>
> 1) We use another interface that consumes pages instead of PFNs, like a
> vm_insert_pages_pgprot() we would be adding.
>
> Is there any strong requirement for inserting non-refcounted PFNs?
Yes, there is a strong requirement to insert non-refcounted PFNs.
We had a lot of trouble with KVM people trying to grab a reference to those pages even if the VMA had the VM_PFNMAP flag set.
> 2) We add another interface that consumes PFNs, but explicitly states
> that it is only for ordinary system RAM, and that the user is
> required for updating the direct map.
>
> We could sanity-check the direct map in debug kernels.
I would rather like to see vmf_insert_pfn_prot() fixed instead.
That function was explicitly added to insert the PFN with the given attributes and as far as I can see all users of that function expect exactly that.
>
> 3) We teach PAT code in pfnmap_setup_cachemode_pfn() about treating this
> system RAM differently.
>
>
> There is also the option for a mixture between 1 and 2, where we get pages, but we map them non-refcounted in a VM_PFNMAP.
>
> In general, having pages makes it easier to assert that they are likely ordinary system ram pages, and that the interface is not getting abused for something else.
Well, exactly that's the use case here and that is not abusive at all as far as I can see.
What drivers want is to insert a PFN with a certain set of caching attributes regardless of whether it's system memory or iomem. That's why vmf_insert_pfn_prot() was created in the first place.
That drivers need to call set_pages_wc/uc() for the linear mapping on x86 manually is correct and checking that is clearly a good idea for debug kernels.
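The pattern in the fault handlers is basically just (simplified from the
TTM code earlier in this thread):
	prot = pgprot_writecombine(vma->vm_page_prot);
	ret = vmf_insert_pfn_prot(vma, vmf->address, pfn, prot);
and the expectation always was that exactly this prot ends up in the PTE.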
> We could also perform the set_pages_wc/uc() from inside that function, but maybe it depends on the use case whether we want to do that whenever we map them into a process?
It sounds like a good idea in theory, but I think it is potentially too much overhead to be applicable.
Thanks,
Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re:
2025-08-26 9:56 ` Re: Christian König
@ 2025-08-26 12:07 ` David Hildenbrand
2025-08-26 16:09 ` Re: Christian König
2025-08-26 14:27 ` Thomas Hellström
1 sibling, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2025-08-26 12:07 UTC (permalink / raw)
To: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
>>
>> 1) We use another interface that consumes pages instead of PFNs, like a
>> vm_insert_pages_pgprot() we would be adding.
>>
>> Is there any strong requirement for inserting non-refcounted PFNs?
>
> Yes, there is a strong requirement to insert non-refcounted PFNs.
>
> We had a lot of trouble with KVM people trying to grab a reference to those pages even if the VMA had the VM_PFNMAP flag set.
Yes, KVM ignored (and maybe still does) VM_PFNMAP to some degree, which
is rather nasty.
>
>> 2) We add another interface that consumes PFNs, but explicitly states
>> that it is only for ordinary system RAM, and that the user is
>> required for updating the direct map.
>>
>> We could sanity-check the direct map in debug kernels.
>
> I would rather like to see vmf_insert_pfn_prot() fixed instead.
>
> That function was explicitly added to insert the PFN with the given attributes and as far as I can see all users of that function expect exactly that.
It's all a bit tricky :(
>
>>
>> 3) We teach PAT code in pfnmap_setup_cachemode_pfn() about treating this
>> system RAM differently.
>>
>>
>> There is also the option for a mixture between 1 and 2, where we get pages, but we map them non-refcounted in a VM_PFNMAP.
>>
>> In general, having pages makes it easier to assert that they are likely ordinary system ram pages, and that the interface is not getting abused for something else.
>
> Well, exactly that's the use case here and that is not abusive at all as far as I can see.
>
> What drivers want is to insert a PFN with a certain set of caching attributes regardless if it's system memory or iomem. That's why vmf_insert_pfn_prot() was created in the first place.
I mean, the use case of "allocate pages from the buddy and fixup the
linear map" sounds perfectly reasonable to me. Absolutely no reason to
get PAT involved. Nobody else should be messing with that memory after all.
As soon as we are talking about other memory ranges (iomem) that are not
from the buddy, it gets weird to bypass PAT, and the question I am
asking myself is, when is it okay, and when not.
>
> That drivers need to call set_pages_wc/uc() for the linear mapping on x86 manually is correct and checking that is clearly a good idea for debug kernels.
I'll have to think about this a bit: assuming only vmf_insert_pfn() calls pfnmap_setup_cachemode_pfn() but vmf_insert_pfn_prot() doesn't, how could we sanity-check that somebody is doing something against the will of PAT?
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re:
2025-08-26 8:38 ` Re: Christian König
2025-08-26 8:46 ` Re: David Hildenbrand
@ 2025-08-26 12:37 ` David Hildenbrand
1 sibling, 0 replies; 28+ messages in thread
From: David Hildenbrand @ 2025-08-26 12:37 UTC (permalink / raw)
To: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 26.08.25 10:38, Christian König wrote:
> On 25.08.25 21:10, David Hildenbrand wrote:
>> On 21.08.25 10:10, Christian König wrote:
>>> On 20.08.25 17:23, David Hildenbrand wrote:
>>>> CCing Lorenzo
>>>>
>>>> On 20.08.25 16:33, Christian König wrote:
>>>>> Hi everyone,
>>>>>
>>>>> sorry for CCing so many people, but that rabbit hole turned out to be
>>>>> deeper than originally thought.
>>>>>
>>>>> TTM always had problems with UC/WC mappings on 32bit systems and drivers
>>>>> often had to revert to hacks like using GFP_DMA32 to get things working
>>>>> while having no rational explanation why that helped (see the TTM AGP,
>>>>> radeon and nouveau driver code for that).
>>>>>
>>>>> It turned out that the PAT implementation we use on x86 not only enforces
>>>>> the same caching attributes for pages in the linear kernel mapping, but
>>>>> also for highmem pages through a separate R/B tree.
>>>>>
>>>>> That was unexpected and TTM never updated that R/B tree for highmem pages,
>>>>> so the function pgprot_set_cachemode() just overwrote the caching
>>>>> attributes drivers passed in to vmf_insert_pfn_prot() and that essentially
>>>>> caused all kind of random trouble.
>>>>>
>>>>> An R/B tree is potentially not a good data structure to hold thousands if
>>>>> not millions of different attributes for each page, so updating that is
>>>>> probably not the way to solve this issue.
>>>>>
>>>>> Thomas pointed out that the i915 driver is using apply_page_range()
>>>>> instead of vmf_insert_pfn_prot() to circumvent the PAT implementation and
>>>>> just fill in the page tables with what the driver thinks is the right
>>>>> caching attribute.
>>>>
>>>> I assume you mean apply_to_page_range() -- same issue in patch subjects.
>>>
>>> Oh yes, of course. Sorry.
>>>
>>>> Oh this sounds horrible. Why oh why do we have these hacks in core-mm and have drivers abuse them :(
>>>
>>> Yeah, I was also a bit hesitant to use that, but the performance advantage is so high that we probably can't avoid the general approach.
>>>
>>>> Honestly, apply_to_pte_range() is just the entry in doing all kinds of weird crap to page tables because "you know better".
>>>
>>> Exactly that's the problem I'm pointing out, drivers *do* know it better. The core memory management has applied incorrect values which caused all kind of the trouble.
>>>
>>> The problem is not a bug in PAT nor TTM/drivers but rather how they interact with each other.
>>>
>>> What I don't understand is why do we have the PAT in the first place? No other architecture does it this way.
>>
>> Probably because no other architecture has these weird glitches I assume ... skimming over memtype_reserve() and friends there are quite some corner cases the code is handling (BIOS, ACPI, low ISA, system RAM, ...)
>>
>>
>> I did a lot of work on the higher PAT level functions, but I am no expert on the lower level management functions, and in particular all the special cases with different memory types.
>>
>> IIRC, the goal of the PAT subsystem is to make sure that no two page tables map the same PFN with different caching attributes.
>
> Yeah, that actually makes sense. Thomas from Intel recently explained the technical background to me:
>
> Some x86 CPUs write back cache lines even if they aren't dirty and what can happen is that because of the linear mapping the CPU speculatively loads a cache line which is elsewhere mapped uncached.
>
> So the end result is that the writeback of not dirty cache lines potentially corrupts the data in the otherwise uncached system memory.
>
> But that a) only applies to memory in the linear mapping and b) only to a handful of x86 CPU types (e.g. recently Intel's Lunar Lake, AMD Athlons produced before 2004, maybe others).
>
>> It treats ordinary system RAM (IORESOURCE_SYSTEM_RAM) usually in a special way: no special caching mode.
>>
>> For everything else, it expects that someone first reserves a memory range for a specific caching mode.
>>
>> For example, remap_pfn_range()...->pfnmap_track()->memtype_reserve() will make sure that there are no conflicts, and then call memtype_kernel_map_sync() to make sure the identity mapping is updated to the new type.
>>
>> In case someone ends up calling pfnmap_setup_cachemode(), the expectation is that there was a previous call to memtype_reserve_io() or similar, such that pfnmap_setup_cachemode() will find that caching mode.
>>
>>
>> So my assumption would be that that is missing for the drivers here?
>
> Well yes and no.
>
> See the PAT is optimized for applying specific caching attributes to ranges [A..B] (e.g. it uses an R/B tree). But what drivers do here is that they have single pages (usually for get_free_page or similar) and want to apply a certain caching attribute to it.
One clarification after staring at PAT code once again: for pages (RAM),
the caching attribute is stored in the page flags, not in the R/B tree.
If nothing was set, it defaults to _PAGE_CACHE_MODE_WB AFAICS.
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re:
2025-08-26 9:56 ` Re: Christian König
2025-08-26 12:07 ` Re: David Hildenbrand
@ 2025-08-26 14:27 ` Thomas Hellström
2025-08-28 21:01 ` stupid PAT :) David Hildenbrand
1 sibling, 1 reply; 28+ messages in thread
From: Thomas Hellström @ 2025-08-26 14:27 UTC (permalink / raw)
To: Christian König, David Hildenbrand, intel-xe, intel-gfx,
dri-devel, amd-gfx, x86
Cc: airlied, matthew.brost, dave.hansen, luto, peterz,
Lorenzo Stoakes
Hi, Christian,
On Tue, 2025-08-26 at 11:56 +0200, Christian König wrote:
> On 26.08.25 11:17, David Hildenbrand wrote:
> > On 26.08.25 11:00, Christian König wrote:
> > > On 26.08.25 10:46, David Hildenbrand wrote:
> > > > > > So my assumption would be that that is missing for the
> > > > > > drivers here?
> > > > >
> > > > > Well yes and no.
> > > > >
> > > > > See the PAT is optimized for applying specific caching
> > > > > attributes to ranges [A..B] (e.g. it uses an R/B tree). But
> > > > > what drivers do here is that they have single pages (usually
> > > > > for get_free_page or similar) and want to apply a certain
> > > > > caching attribute to it.
> > > > >
> > > > > So what would happen is that we completely clutter the R/B
> > > > > tree used by the PAT with thousands if not millions of
> > > > > entries.
> > > > >
> > > >
> > > > Hm, above you're saying that there is no direct map, but now
> > > > you are saying that the pages were obtained through
> > > > get_free_page()?
> > >
> > > The problem only happens with highmem pages on 32bit kernels.
> > > Those pages are not in the linear mapping.
> >
> > Right, in the common case there is a direct map.
> >
> > >
> > > > I agree that what you describe here sounds suboptimal. But if
> > > > > the pages were obtained from the buddy, there surely is a
> > > > direct map -- unless we explicitly remove it :(
> > > >
> > > > If we're talking about individual pages without a directmap, I
> > > > would wonder if they are actually part of a bigger memory
> > > > region that can just be reserved in one go (similar to how
> > > > > remap_pfn_range() would handle it).
> > > >
> > > > Can you briefly describe how your use case obtains these PFNs,
> > > > > and how scattered they + their caching attributes might be?
> > >
> > > What drivers do is to call get_free_page() or alloc_pages_node()
> > > with the GFP_HIGHUSER flag set.
> > >
> > > For non highmem pages drivers then calls set_pages_wc/uc() which
> > > changes the caching of the linear mapping, but for highmem pages
> > > there is no linear mapping so set_pages_wc() or set_pages_uc()
> > > doesn't work and drivers avoid calling it.
> > >
> > > Those are basically just random system memory pages. So they are
> > > potentially scattered over the whole memory address space.
> >
> > Thanks, that's valuable information.
> >
> > So essentially these drivers maintain their own consistency and PAT
> > is not aware of that.
> >
> > And the real problem is ordinary system RAM.
> >
> > There are various ways forward.
> >
> > 1) We use another interface that consumes pages instead of PFNs,
> > like a
> > vm_insert_pages_pgprot() we would be adding.
> >
> > Is there any strong requirement for inserting non-refcounted
> > PFNs?
>
> Yes, there is a strong requirement to insert non-refcounted PFNs.
>
> We had a lot of trouble with KVM people trying to grab a reference to
> those pages even if the VMA had the VM_PFNMAP flag set.
>
> > 2) We add another interface that consumes PFNs, but explicitly
> > states
> > that it is only for ordinary system RAM, and that the user is
> > required for updating the direct map.
> >
> > We could sanity-check the direct map in debug kernels.
>
> I would rather like to see vmf_insert_pfn_prot() fixed instead.
>
> That function was explicitly added to insert the PFN with the given
> attributes and as far as I can see all users of that function expect
> exactly that.
>
> >
> > 3) We teach PAT code in pfnmap_setup_cachemode_pfn() about treating
> > this
> > system RAM differently.
> >
> >
> > There is also the option for a mixture between 1 and 2, where we
> > get pages, but we map them non-refcounted in a VM_PFNMAP.
> >
> > In general, having pages makes it easier to assert that they are
> > likely ordinary system ram pages, and that the interface is not
> > getting abused for something else.
>
> Well, exactly that's the use case here and that is not abusive at all
> as far as I can see.
>
> What drivers want is to insert a PFN with a certain set of caching
> attributes regardless if it's system memory or iomem. That's why
> vmf_insert_pfn_prot() was created in the first place.
>
> That drivers need to call set_pages_wc/uc() for the linear mapping on
> x86 manually is correct and checking that is clearly a good idea for
> debug kernels.
So where is this trending? Is the current suggestion to continue
disallowing aliased mappings with conflicting caching modes and enforce
checks in debug kernels?
/Thomas
>
> > We could also perform the set_pages_wc/uc() from inside that
> > function, but maybe it depends on the use case whether we want to
> > do that whenever we map them into a process?
>
> It sounds like a good idea in theory, but I think it is potentially
> to much overhead to be applicable.
>
> Thanks,
> Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re:
2025-08-26 12:07 ` Re: David Hildenbrand
@ 2025-08-26 16:09 ` Christian König
2025-08-27 9:13 ` [PATCH 0/3] drm/ttm: Michel Dänzer
2025-08-28 21:18 ` stupid and complicated PAT :) David Hildenbrand
0 siblings, 2 replies; 28+ messages in thread
From: Christian König @ 2025-08-26 16:09 UTC (permalink / raw)
To: David Hildenbrand, intel-xe, intel-gfx, dri-devel, amd-gfx, x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 26.08.25 14:07, David Hildenbrand wrote:
>>
>>> 2) We add another interface that consumes PFNs, but explicitly states
>>> that it is only for ordinary system RAM, and that the user is
>>> required for updating the direct map.
>>>
>>> We could sanity-check the direct map in debug kernels.
>>
>> I would rather like to see vmf_insert_pfn_prot() fixed instead.
>>
>> That function was explicitly added to insert the PFN with the given attributes and as far as I can see all users of that function expect exactly that.
>
> It's all a bit tricky :(
I would rather say horribly complicated :(
>>>
>>> 3) We teach PAT code in pfnmap_setup_cachemode_pfn() about treating this
>>> system RAM differently.
>>>
>>>
>>> There is also the option for a mixture between 1 and 2, where we get pages, but we map them non-refcounted in a VM_PFNMAP.
>>>
>>> In general, having pages makes it easier to assert that they are likely ordinary system ram pages, and that the interface is not getting abused for something else.
>>
>> Well, exactly that's the use case here and that is not abusive at all as far as I can see.
>>
>> What drivers want is to insert a PFN with a certain set of caching attributes regardless if it's system memory or iomem. That's why vmf_insert_pfn_prot() was created in the first place.
>
> I mean, the use case of "allocate pages from the buddy and fixup the linear map" sounds perfectly reasonable to me. Absolutely no reason to get PAT involved. Nobody else should be messing with that memory after all.
>
> As soon as we are talking about other memory ranges (iomem) that are not from the buddy, it gets weird to bypass PAT, and the question I am asking myself is, when is it okay, and when not.
Ok let me try to explain parts of the history and the big picture for at least the graphics use case on x86.
In 1996/97 Intel came up with the idea of AGP: https://en.wikipedia.org/wiki/Accelerated_Graphics_Port
At that time the CPUs, PCI bus and system memory were all connected together through the north bridge: https://en.wikipedia.org/wiki/Northbridge_(computing)
The problem was that AGP also introduced the concept of putting large amounts of data for the video controller (PCI device) into system memory when you don't have enough local device memory (VRAM).
But that meant that, when this memory is cached, the north bridge always had to snoop the CPU cache over the front side bus for every access the video controller made. That was a huge performance bottleneck, so the idea was born to access that data uncached.
Well, that was nearly 30 years ago; PCI, AGP and the front side bus are long gone, but the concept of putting video controller (GPU) data into uncached system memory has prevailed.
So, for example, even modern AMD CPU-based laptops need uncached system memory if their local memory is not large enough to contain the picture to display on the monitor. And with modern 8k monitors that can actually happen quite fast...
What drivers do today is to call vmf_insert_pfn_prot() either with the PFN of their local memory (iomem) or uncached/wc system memory.
To summarize: that we have an interface to fill in the page tables with either iomem or system memory is actually part of the design. That's how the HW driver is expected to work.
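As a small illustration of that design (this is not the actual TTM code; struct my_bo, my_bo_fault() and their fields are made up for the example):

#include <linux/mm.h>

struct my_bo {
	bool is_iomem;
	unsigned long iomem_start_pfn;	/* first PFN of the BAR backing, if iomem */
	struct page **pages;		/* WC/UC system memory pages otherwise */
};

static vm_fault_t my_bo_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct my_bo *bo = vma->vm_private_data;
	unsigned long off = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
	pgprot_t prot = pgprot_writecombine(vma->vm_page_prot);
	unsigned long pfn;

	if (bo->is_iomem)
		pfn = bo->iomem_start_pfn + off;
	else
		pfn = page_to_pfn(bo->pages[off]);

	/* The driver asks for exactly the caching attributes it maintains itself. */
	return vmf_insert_pfn_prot(vma, vmf->address, pfn, prot);
}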
>> That drivers need to call set_pages_wc/uc() for the linear mapping on x86 manually is correct and checking that is clearly a good idea for debug kernels.
>
> I'll have to think about this a bit: assuming only vmf_insert_pfn() calls pfnmap_setup_cachemode_pfn() but vmf_insert_pfn_prot() doesn't, how could we sanity check that somebody is doing something against the will of PAT.
I think the most defensive approach for a quick fix is this change here:
static inline void pgprot_set_cachemode(pgprot_t *prot, enum page_cache_mode pcm)
{
- *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
- cachemode2protval(pcm));
+ if (pcm != _PAGE_CACHE_MODE_WB)
+ *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
+ cachemode2protval(pcm));
}
This applies the PAT value if it's anything other than _PAGE_CACHE_MODE_WB, but still allows callers to use something different on normal WB system memory.
What do you think?
Regards,
Christian
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [PATCH 0/3] drm/ttm: ...
2025-08-26 16:09 ` Re: Christian König
@ 2025-08-27 9:13 ` Michel Dänzer
2025-08-28 21:18 ` stupid and complicated PAT :) David Hildenbrand
1 sibling, 0 replies; 28+ messages in thread
From: Michel Dänzer @ 2025-08-27 9:13 UTC (permalink / raw)
To: Christian König, David Hildenbrand, intel-xe, intel-gfx,
dri-devel, amd-gfx, x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
Public Service Announcement from your friendly neighbourhood mailing list moderation queue processor:
I recommend setting a non-empty subject in follow-ups to this thread, or I might accidentally discard them from the moderation queue (I just almost did).
--
Earthling Michel Dänzer \ GNOME / Xwayland / Mesa developer
https://redhat.com \ Libre software enthusiast
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: stupid PAT :)
2025-08-26 14:27 ` Thomas Hellström
@ 2025-08-28 21:01 ` David Hildenbrand
0 siblings, 0 replies; 28+ messages in thread
From: David Hildenbrand @ 2025-08-28 21:01 UTC (permalink / raw)
To: Thomas Hellström, Christian König, intel-xe, intel-gfx,
dri-devel, amd-gfx, x86
Cc: airlied, matthew.brost, dave.hansen, luto, peterz,
Lorenzo Stoakes
On 26.08.25 16:27, Thomas Hellström wrote:
> Hi, Christian,
>
> On Tue, 2025-08-26 at 11:56 +0200, Christian König wrote:
>> On 26.08.25 11:17, David Hildenbrand wrote:
>>> On 26.08.25 11:00, Christian König wrote:
>>>> On 26.08.25 10:46, David Hildenbrand wrote:
>>>>>>> So my assumption would be that that is missing for the
>>>>>>> drivers here?
>>>>>>
>>>>>> Well yes and no.
>>>>>>
>>>>>> See the PAT is optimized for applying specific caching
>>>>>> attributes to ranges [A..B] (e.g. it uses an R/B tree). But
>>>>>> what drivers do here is that they have single pages (usually
>>>>>> for get_free_page or similar) and want to apply a certain
>>>>>> caching attribute to it.
>>>>>>
>>>>>> So what would happen is that we completely clutter the R/B
>>>>>> tree used by the PAT with thousands if not millions of
>>>>>> entries.
>>>>>>
>>>>>
>>>>> Hm, above you're saying that there is no direct map, but now
>>>>> you are saying that the pages were obtained through
>>>>> get_free_page()?
>>>>
>>>> The problem only happens with highmem pages on 32bit kernels.
>>>> Those pages are not in the linear mapping.
>>>
>>> Right, in the common case there is a direct map.
>>>
>>>>
>>>>> I agree that what you describe here sounds suboptimal. But if
>>>>> the pages were obtained from the buddy, there surely is a
>>>>> direct map -- unless we explicitly remove it :(
>>>>>
>>>>> If we're talking about individual pages without a directmap, I
>>>>> would wonder if they are actually part of a bigger memory
>>>>> region that can just be reserved in one go (similar to how
>>>>> remap_pfn_range() would handle it).
>>>>>
>>>>> Can you briefly describe how your use case obtains these PFNs,
>>>>> and how scattered they + their caching attributes might be?
>>>>
>>>> What drivers do is to call get_free_page() or alloc_pages_node()
>>>> with the GFP_HIGHUSER flag set.
>>>>
>>>> For non highmem pages drivers then calls set_pages_wc/uc() which
>>>> changes the caching of the linear mapping, but for highmem pages
>>>> there is no linear mapping so set_pages_wc() or set_pages_uc()
>>>> doesn't work and drivers avoid calling it.
>>>>
>>>> Those are basically just random system memory pages. So they are
>>>> potentially scattered over the whole memory address space.
>>>
>>> Thanks, that's valuable information.
>>>
>>> So essentially these drivers maintain their own consistency and PAT
>>> is not aware of that.
>>>
>>> And the real problem is ordinary system RAM.
>>>
>>> There are various ways forward.
>>>
>>> 1) We use another interface that consumes pages instead of PFNs,
>>> like a
>>> vm_insert_pages_pgprot() we would be adding.
>>>
>>> Is there any strong requirement for inserting non-refcounted
>>> PFNs?
>>
>> Yes, there is a strong requirement to insert non-refcounted PFNs.
>>
>> We had a lot of trouble with KVM people trying to grab a reference to
>> those pages even if the VMA had the VM_PFNMAP flag set.
>>
>>> 2) We add another interface that consumes PFNs, but explicitly
>>> states
>>> that it is only for ordinary system RAM, and that the user is
>>> required for updating the direct map.
>>>
>>> We could sanity-check the direct map in debug kernels.
>>
>> I would rather like to see vmf_insert_pfn_prot() fixed instead.
>>
>> That function was explicitly added to insert the PFN with the given
>> attributes and as far as I can see all users of that function expect
>> exactly that.
>>
>>>
>>> 3) We teach PAT code in pfnmap_setup_cachemode_pfn() about treating
>>> this
>>> system RAM differently.
>>>
>>>
>>> There is also the option for a mixture between 1 and 2, where we
>>> get pages, but we map them non-refcounted in a VM_PFNMAP.
>>>
>>> In general, having pages makes it easier to assert that they are
>>> likely ordinary system ram pages, and that the interface is not
>>> getting abused for something else.
>>
>> Well, exactly that's the use case here and that is not abusive at all
>> as far as I can see.
>>
>> What drivers want is to insert a PFN with a certain set of caching
>> attributes regardless if it's system memory or iomem. That's why
>> vmf_insert_pfn_prot() was created in the first place.
>>
>> That drivers need to call set_pages_wc/uc() for the linear mapping on
>> x86 manually is correct and checking that is clearly a good idea for
>> debug kernels.
>
> So where is this trending? Is the current suggestion to continue
> disallowing aliased mappings with conflicting caching modes and enforce
> checks in debug kernels?
Not sure, it's a mess. The big question is to find out when it is really
ok to bypass PAT and when to better let it have a say.
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: stupid and complicated PAT :)
2025-08-26 16:09 ` Re: Christian König
2025-08-27 9:13 ` [PATCH 0/3] drm/ttm: Michel Dänzer
@ 2025-08-28 21:18 ` David Hildenbrand
2025-08-28 21:28 ` David Hildenbrand
1 sibling, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2025-08-28 21:18 UTC (permalink / raw)
To: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 26.08.25 18:09, Christian König wrote:
> On 26.08.25 14:07, David Hildenbrand wrote:
>>>
>>>> 2) We add another interface that consumes PFNs, but explicitly states
>>>> that it is only for ordinary system RAM, and that the user is
>>>> required for updating the direct map.
>>>>
>>>> We could sanity-check the direct map in debug kernels.
>>>
>>> I would rather like to see vmf_insert_pfn_prot() fixed instead.
>>>
>>> That function was explicitly added to insert the PFN with the given attributes and as far as I can see all users of that function expect exactly that.
>>
>> It's all a bit tricky :(
>
> I would rather say horrible complicated :(
>
>>>>
>>>> 3) We teach PAT code in pfnmap_setup_cachemode_pfn() about treating this
>>>> system RAM differently.
>>>>
>>>>
>>>> There is also the option for a mixture between 1 and 2, where we get pages, but we map them non-refcounted in a VM_PFNMAP.
>>>>
>>>> In general, having pages makes it easier to assert that they are likely ordinary system ram pages, and that the interface is not getting abused for something else.
>>>
>>> Well, exactly that's the use case here and that is not abusive at all as far as I can see.
>>>
>>> What drivers want is to insert a PFN with a certain set of caching attributes regardless if it's system memory or iomem. That's why vmf_insert_pfn_prot() was created in the first place.
>>
>> I mean, the use case of "allocate pages from the buddy and fixup the linear map" sounds perfectly reasonable to me. Absolutely no reason to get PAT involved. Nobody else should be messing with that memory after all.
>>
>> As soon as we are talking about other memory ranges (iomem) that are not from the buddy, it gets weird to bypass PAT, and the question I am asking myself is, when is it okay, and when not.
>
> Ok let me try to explain parts of the history and the big picture for at least the graphics use case on x86.
>
> In 1996/97 Intel came up with the idea of AGP: https://en.wikipedia.org/wiki/Accelerated_Graphics_Port
>
> At that time the CPUs, PCI bus and system memory were all connected together through the north bridge: https://en.wikipedia.org/wiki/Northbridge_(computing)
>
> The problem was that AGP also introduced the concept of putting large amounts of data for the video controller (PCI device) into system memory when you don't have enough local device memory (VRAM).
>
> But that meant when that memory is cached that the north bridge always had to snoop the CPU cache over the front side bus for every access the video controller made. This meant a huge performance bottleneck, so the idea was born to access that data uncached.
Ack.
>
>
> Well that was nearly 30years ago, PCI, AGP and front side bus are long gone, but the concept of putting video controller (GPU) stuff into uncached system memory has prevailed.
>
> So for example even modern AMD CPU based laptops need uncached system memory if their local memory is not large enough to contain the picture to display on the monitor. And with modern 8k monitors that can actually happen quite fast...
>
> What drivers do today is to call vmf_insert_pfn_prot() either with the PFN of their local memory (iomem) or uncached/wc system memory.
That makes perfect sense. I assume we might or might not have "struct
page" (pfn_valid) for the iomem, depending on where these areas reside,
correct?
>
>
> To summarize that we have an interface to fill in the page tables with either iomem or system memory is actually part of the design. That's how the HW driver is expected to work.
>
>>> That drivers need to call set_pages_wc/uc() for the linear mapping on x86 manually is correct and checking that is clearly a good idea for debug kernels.
>>
>> I'll have to think about this a bit: assuming only vmf_insert_pfn() calls pfnmap_setup_cachemode_pfn() but vmf_insert_pfn_prot() doesn't, how could we sanity check that somebody is doing something against the will of PAT.
>
> I think the most defensive approach for a quick fix is this change here:
>
> static inline void pgprot_set_cachemode(pgprot_t *prot, enum page_cache_mode pcm)
> {
> - *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
> - cachemode2protval(pcm));
> + if (pcm != _PAGE_CACHE_MODE_WB)
> + *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
> + cachemode2protval(pcm));
> }
>
> This applies the PAT value if it's anything else than _PAGE_CACHE_MODE_WB but still allows callers to use something different on normal WB system memory.
>
> What do you think?
This feels like too big of a hammer. In particular, it changes things
like phys_mem_access_prot_allowed(), which requires more care.
First, I thought we should limit what we do to vmf_insert_pfn_prot()
only. But then I realized that we have stuff like
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
I'm still trying to find an easy way out that is not a complete hack.
Will the iomem ever be mapped by the driver again with a different cache
mode? (e.g., WB -> UC -> WB)
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: stupid and complicated PAT :)
2025-08-28 21:18 ` stupid and complicated PAT :) David Hildenbrand
@ 2025-08-28 21:28 ` David Hildenbrand
2025-08-28 21:32 ` David Hildenbrand
0 siblings, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2025-08-28 21:28 UTC (permalink / raw)
To: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 28.08.25 23:18, David Hildenbrand wrote:
> On 26.08.25 18:09, Christian König wrote:
>> On 26.08.25 14:07, David Hildenbrand wrote:
>>>>
>>>>> 2) We add another interface that consumes PFNs, but explicitly states
>>>>> that it is only for ordinary system RAM, and that the user is
>>>>> required for updating the direct map.
>>>>>
>>>>> We could sanity-check the direct map in debug kernels.
>>>>
>>>> I would rather like to see vmf_insert_pfn_prot() fixed instead.
>>>>
>>>> That function was explicitly added to insert the PFN with the given attributes and as far as I can see all users of that function expect exactly that.
>>>
>>> It's all a bit tricky :(
>>
>> I would rather say horrible complicated :(
>>
>>>>>
>>>>> 3) We teach PAT code in pfnmap_setup_cachemode_pfn() about treating this
>>>>> system RAM differently.
>>>>>
>>>>>
>>>>> There is also the option for a mixture between 1 and 2, where we get pages, but we map them non-refcounted in a VM_PFNMAP.
>>>>>
>>>>> In general, having pages makes it easier to assert that they are likely ordinary system ram pages, and that the interface is not getting abused for something else.
>>>>
>>>> Well, exactly that's the use case here and that is not abusive at all as far as I can see.
>>>>
>>>> What drivers want is to insert a PFN with a certain set of caching attributes regardless if it's system memory or iomem. That's why vmf_insert_pfn_prot() was created in the first place.
>>>
>>> I mean, the use case of "allocate pages from the buddy and fixup the linear map" sounds perfectly reasonable to me. Absolutely no reason to get PAT involved. Nobody else should be messing with that memory after all.
>>>
>>> As soon as we are talking about other memory ranges (iomem) that are not from the buddy, it gets weird to bypass PAT, and the question I am asking myself is, when is it okay, and when not.
>>
>> Ok let me try to explain parts of the history and the big picture for at least the graphics use case on x86.
>>
>> In 1996/97 Intel came up with the idea of AGP: https://en.wikipedia.org/wiki/Accelerated_Graphics_Port
>>
>> At that time the CPUs, PCI bus and system memory were all connected together through the north bridge: https://en.wikipedia.org/wiki/Northbridge_(computing)
>>
>> The problem was that AGP also introduced the concept of putting large amounts of data for the video controller (PCI device) into system memory when you don't have enough local device memory (VRAM).
>>
>> But that meant when that memory is cached that the north bridge always had to snoop the CPU cache over the front side bus for every access the video controller made. This meant a huge performance bottleneck, so the idea was born to access that data uncached.
>
> Ack.
>
>>
>>
>> Well that was nearly 30years ago, PCI, AGP and front side bus are long gone, but the concept of putting video controller (GPU) stuff into uncached system memory has prevailed.
>>
>> So for example even modern AMD CPU based laptops need uncached system memory if their local memory is not large enough to contain the picture to display on the monitor. And with modern 8k monitors that can actually happen quite fast...
>>
>> What drivers do today is to call vmf_insert_pfn_prot() either with the PFN of their local memory (iomem) or uncached/wc system memory.
>
> That makes perfect sense. I assume we might or might not have "struct
> page" (pfn_valid) for the iomem, depending on where these areas reside,
> correct?
>
>>
>>
>> To summarize that we have an interface to fill in the page tables with either iomem or system memory is actually part of the design. That's how the HW driver is expected to work.
>>
>>>> That drivers need to call set_pages_wc/uc() for the linear mapping on x86 manually is correct and checking that is clearly a good idea for debug kernels.
>>>
>>> I'll have to think about this a bit: assuming only vmf_insert_pfn() calls pfnmap_setup_cachemode_pfn() but vmf_insert_pfn_prot() doesn't, how could we sanity check that somebody is doing something against the will of PAT.
>>
>> I think the most defensive approach for a quick fix is this change here:
>>
>> static inline void pgprot_set_cachemode(pgprot_t *prot, enum page_cache_mode pcm)
>> {
>> - *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
>> - cachemode2protval(pcm));
>> + if (pcm != _PAGE_CACHE_MODE_WB)
>> + *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
>> + cachemode2protval(pcm));
>> }
>>
>> This applies the PAT value if it's anything else than _PAGE_CACHE_MODE_WB but still allows callers to use something different on normal WB system memory.
>>
>> What do you think?
>
> This feels like too big of a hammer. In particular, it changes things
> like phys_mem_access_prot_allowed(), which requires more care.
>
> First, I thought we should limit what we do to vmf_insert_pfn_prot()
> only. But then I realized that we have stuff like
>
> vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
>
> I'm still trying to find out the easy way out that is not a complete hack.
>
> Will the iomem ever be mapped by the driver again with a different cache
> mode? (e.g., WB -> UC -> WB)
What I am currently wondering is: assume we get a
pfnmap_setup_cachemode_pfn() call and we could reliably identify whether
there was a previous registration, then we could do
(a) No previous registration: don't modify pgprot. Hopefully the driver
knows what it is doing. Maybe we can add sanity checks that the
direct map was already updated etc.
(b) A previous registration: modify pgprot like we do today.
System RAM is the problem. I wonder how many of these registrations we
really get and if we could just store them in the same tree as !system
RAM instead of abusing page flags.
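Something along these lines, as a rough sketch only (memtype_lookup_ram() is a placeholder for however we would detect a previous registration, it does not exist today):

static void pfnmap_setup_cachemode_idea(unsigned long pfn, pgprot_t *prot)
{
	enum page_cache_mode pcm;

	/* (a) no previous registration: trust the caller, leave pgprot alone */
	if (!memtype_lookup_ram(PFN_PHYS(pfn), &pcm))
		return;

	/* (b) a previous registration exists: apply it, like we do today */
	pgprot_set_cachemode(prot, pcm);
}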
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: stupid and complicated PAT :)
2025-08-28 21:28 ` David Hildenbrand
@ 2025-08-28 21:32 ` David Hildenbrand
2025-08-29 10:50 ` Christian König
0 siblings, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2025-08-28 21:32 UTC (permalink / raw)
To: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 28.08.25 23:28, David Hildenbrand wrote:
> On 28.08.25 23:18, David Hildenbrand wrote:
>> On 26.08.25 18:09, Christian König wrote:
>>> On 26.08.25 14:07, David Hildenbrand wrote:
>>>>>
>>>>>> 2) We add another interface that consumes PFNs, but explicitly states
>>>>>> that it is only for ordinary system RAM, and that the user is
>>>>>> required for updating the direct map.
>>>>>>
>>>>>> We could sanity-check the direct map in debug kernels.
>>>>>
>>>>> I would rather like to see vmf_insert_pfn_prot() fixed instead.
>>>>>
>>>>> That function was explicitly added to insert the PFN with the given attributes and as far as I can see all users of that function expect exactly that.
>>>>
>>>> It's all a bit tricky :(
>>>
>>> I would rather say horrible complicated :(
>>>
>>>>>>
>>>>>> 3) We teach PAT code in pfnmap_setup_cachemode_pfn() about treating this
>>>>>> system RAM differently.
>>>>>>
>>>>>>
>>>>>> There is also the option for a mixture between 1 and 2, where we get pages, but we map them non-refcounted in a VM_PFNMAP.
>>>>>>
>>>>>> In general, having pages makes it easier to assert that they are likely ordinary system ram pages, and that the interface is not getting abused for something else.
>>>>>
>>>>> Well, exactly that's the use case here and that is not abusive at all as far as I can see.
>>>>>
>>>>> What drivers want is to insert a PFN with a certain set of caching attributes regardless if it's system memory or iomem. That's why vmf_insert_pfn_prot() was created in the first place.
>>>>
>>>> I mean, the use case of "allocate pages from the buddy and fixup the linear map" sounds perfectly reasonable to me. Absolutely no reason to get PAT involved. Nobody else should be messing with that memory after all.
>>>>
>>>> As soon as we are talking about other memory ranges (iomem) that are not from the buddy, it gets weird to bypass PAT, and the question I am asking myself is, when is it okay, and when not.
>>>
>>> Ok let me try to explain parts of the history and the big picture for at least the graphics use case on x86.
>>>
>>> In 1996/97 Intel came up with the idea of AGP: https://en.wikipedia.org/wiki/Accelerated_Graphics_Port
>>>
>>> At that time the CPUs, PCI bus and system memory were all connected together through the north bridge: https://en.wikipedia.org/wiki/Northbridge_(computing)
>>>
>>> The problem was that AGP also introduced the concept of putting large amounts of data for the video controller (PCI device) into system memory when you don't have enough local device memory (VRAM).
>>>
>>> But that meant when that memory is cached that the north bridge always had to snoop the CPU cache over the front side bus for every access the video controller made. This meant a huge performance bottleneck, so the idea was born to access that data uncached.
>>
>> Ack.
>>
>>>
>>>
>>> Well that was nearly 30years ago, PCI, AGP and front side bus are long gone, but the concept of putting video controller (GPU) stuff into uncached system memory has prevailed.
>>>
>>> So for example even modern AMD CPU based laptops need uncached system memory if their local memory is not large enough to contain the picture to display on the monitor. And with modern 8k monitors that can actually happen quite fast...
>>>
>>> What drivers do today is to call vmf_insert_pfn_prot() either with the PFN of their local memory (iomem) or uncached/wc system memory.
>>
>> That makes perfect sense. I assume we might or might not have "struct
>> page" (pfn_valid) for the iomem, depending on where these areas reside,
>> correct?
>>
>>>
>>>
>>> To summarize that we have an interface to fill in the page tables with either iomem or system memory is actually part of the design. That's how the HW driver is expected to work.
>>>
>>>>> That drivers need to call set_pages_wc/uc() for the linear mapping on x86 manually is correct and checking that is clearly a good idea for debug kernels.
>>>>
>>>> I'll have to think about this a bit: assuming only vmf_insert_pfn() calls pfnmap_setup_cachemode_pfn() but vmf_insert_pfn_prot() doesn't, how could we sanity check that somebody is doing something against the will of PAT.
>>>
>>> I think the most defensive approach for a quick fix is this change here:
>>>
>>> static inline void pgprot_set_cachemode(pgprot_t *prot, enum page_cache_mode pcm)
>>> {
>>> - *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
>>> - cachemode2protval(pcm));
>>> + if (pcm != _PAGE_CACHE_MODE_WB)
>>> + *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
>>> + cachemode2protval(pcm));
>>> }
>>>
>>> This applies the PAT value if it's anything else than _PAGE_CACHE_MODE_WB but still allows callers to use something different on normal WB system memory.
>>>
>>> What do you think?
>>
>> This feels like too big of a hammer. In particular, it changes things
>> like phys_mem_access_prot_allowed(), which requires more care.
>>
>> First, I thought we should limit what we do to vmf_insert_pfn_prot()
>> only. But then I realized that we have stuff like
>>
>> vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
>>
>> I'm still trying to find out the easy way out that is not a complete hack.
>>
>> Will the iomem ever be mapped by the driver again with a different cache
>> mode? (e.g., WB -> UC -> WB)
>
> What I am currently wondering is: assume we get a
> pfnmap_setup_cachemode_pfn() call and we could reliably identify whether
> there was a previous registration, then we could do
>
> (a) No previous registration: don't modify pgprot. Hopefully the driver
> knows what it is doing. Maybe we can add sanity checks that the
> direct map was already updated etc.
>
> (b) A previous registration: modify pgprot like we do today.
>
> System RAM is the problem. I wonder how many of these registrations we
> really get and if we could just store them in the same tree as !system
> RAM instead of abusing page flags.
commit 9542ada803198e6eba29d3289abb39ea82047b92
Author: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed Sep 24 08:53:33 2008 -0700
x86: track memtype for RAM in page struct
Track the memtype for RAM pages in page struct instead of using the
memtype list. This avoids the explosion in the number of entries in
memtype list (of the order of 20,000 with AGP) and makes the PAT
tracking simpler.
We are using PG_arch_1 bit in page->flags.
We still use the memtype list for non RAM pages.
I do wonder if that explosion is still an issue today.
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: stupid and complicated PAT :)
2025-08-28 21:32 ` David Hildenbrand
@ 2025-08-29 10:50 ` Christian König
2025-08-29 19:52 ` David Hildenbrand
0 siblings, 1 reply; 28+ messages in thread
From: Christian König @ 2025-08-29 10:50 UTC (permalink / raw)
To: David Hildenbrand, intel-xe, intel-gfx, dri-devel, amd-gfx, x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
You write mails faster than I can answer :)
I will try to answer all questions and comments here, but please ping me if I miss something.
On 28.08.25 23:32, David Hildenbrand wrote:
> On 28.08.25 23:28, David Hildenbrand wrote:
>> On 28.08.25 23:18, David Hildenbrand wrote:
>>> On 26.08.25 18:09, Christian König wrote:
>>>> On 26.08.25 14:07, David Hildenbrand wrote:
[SNIP]
>>>> Well that was nearly 30years ago, PCI, AGP and front side bus are long gone, but the concept of putting video controller (GPU) stuff into uncached system memory has prevailed.
>>>>
>>>> So for example even modern AMD CPU based laptops need uncached system memory if their local memory is not large enough to contain the picture to display on the monitor. And with modern 8k monitors that can actually happen quite fast...
>>>>
>>>> What drivers do today is to call vmf_insert_pfn_prot() either with the PFN of their local memory (iomem) or uncached/wc system memory.
>>>
>>> That makes perfect sense. I assume we might or might not have "struct
>>> page" (pfn_valid) for the iomem, depending on where these areas reside,
>>> correct?
Exactly that, yes.
>>>> To summarize that we have an interface to fill in the page tables with either iomem or system memory is actually part of the design. That's how the HW driver is expected to work.
>>>>
>>>>>> That drivers need to call set_pages_wc/uc() for the linear mapping on x86 manually is correct and checking that is clearly a good idea for debug kernels.
>>>>>
>>>>> I'll have to think about this a bit: assuming only vmf_insert_pfn() calls pfnmap_setup_cachemode_pfn() but vmf_insert_pfn_prot() doesn't, how could we sanity check that somebody is doing something against the will of PAT.
>>>>
>>>> I think the most defensive approach for a quick fix is this change here:
>>>>
>>>> static inline void pgprot_set_cachemode(pgprot_t *prot, enum page_cache_mode pcm)
>>>> {
>>>> - *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
>>>> - cachemode2protval(pcm));
>>>> + if (pcm != _PAGE_CACHE_MODE_WB)
>>>> + *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
>>>> + cachemode2protval(pcm));
>>>> }
>>>>
>>>> This applies the PAT value if it's anything else than _PAGE_CACHE_MODE_WB but still allows callers to use something different on normal WB system memory.
>>>>
>>>> What do you think?
>>>
>>> This feels like too big of a hammer. In particular, it changes things
>>> like phys_mem_access_prot_allowed(), which requires more care.
>>>
>>> First, I thought we should limit what we do to vmf_insert_pfn_prot()
>>> only. But then I realized that we have stuff like
>>>
>>> vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
>>>
>>> I'm still trying to find out the easy way out that is not a complete hack.
Well, I think if the change is limited to only vmf_insert_pfn_prot() for now, we can limit the risk quite a bit as well.
Background is that only a handful of callers are using vmf_insert_pfn_prot(), and it looks like all of those actually do know what they are doing and are using the right flags.
>>> Will the iomem ever be mapped by the driver again with a different cache
>>> mode? (e.g., WB -> UC -> WB)
Yes, that can absolutely happen. But for iomem we would have an explicit call to ioremap(), ioremap_wc() or ioremap_cache() for that before anybody would map anything into userspace page tables.
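Just as an illustration of that cycle (placeholder BAR values, no specific driver):

#include <linux/io.h>

static void bar_remap_example(resource_size_t bar_start, resource_size_t bar_len)
{
	void __iomem *map;

	map = ioremap_cache(bar_start, bar_len);	/* cacheable (WB) kernel mapping */
	if (!map)
		return;
	/* ... use it ... */
	iounmap(map);

	map = ioremap_wc(bar_start, bar_len);		/* remapped write-combined */
	if (!map)
		return;
	/* ... use it ... */
	iounmap(map);
}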
But thinking more about it I just had an OMFG moment! Is it possible that the PAT currently already has a problem with that?
We had customer projects where BARs of different PCIe devices ended up on different physical addresses after a hot remove/re-add.
Is it possible that the PAT keeps enforcing certain caching attributes for a physical address, for example because a driver doesn't clean up properly on hot remove?
If yes, then that would explain a massive number of problems we had with hot add/remove.
>> What I am currently wondering is: assume we get a
>> pfnmap_setup_cachemode_pfn() call and we could reliably identify whether
>> there was a previous registration, then we could do
>>
>> (a) No previous registration: don't modify pgprot. Hopefully the driver
>> knows what it is doing. Maybe we can add sanity checks that the
>> direct map was already updated etc.
>> (b) A previous registration: modify pgprot like we do today.
That would work for me.
>> System RAM is the problem. I wonder how many of these registrations we
>> really get and if we could just store them in the same tree as !system
>> RAM instead of abusing page flags.
>
> commit 9542ada803198e6eba29d3289abb39ea82047b92
> Author: Suresh Siddha <suresh.b.siddha@intel.com>
> Date: Wed Sep 24 08:53:33 2008 -0700
>
> x86: track memtype for RAM in page struct
> Track the memtype for RAM pages in page struct instead of using the
> memtype list. This avoids the explosion in the number of entries in
> memtype list (of the order of 20,000 with AGP) and makes the PAT
> tracking simpler.
> We are using PG_arch_1 bit in page->flags.
> We still use the memtype list for non RAM pages.
>
>
> I do wonder if that explosion is still an issue today.
Yes it is. That is exactly the issue I'm working on here.
It's just that AGP was replaced by internal GPU MMUs over time and so we don't use the old AGP code any more but just call get_free_pages() (or similar) directly.
Thanks a lot for the help,
Christian.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: stupid and complicated PAT :)
2025-08-29 10:50 ` Christian König
@ 2025-08-29 19:52 ` David Hildenbrand
2025-08-29 19:58 ` David Hildenbrand
0 siblings, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2025-08-29 19:52 UTC (permalink / raw)
To: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
>
> Yes, that can absolutely happen. But for iomem we would have an explicit call to ioremap(), ioremap_wc(), ioremap_cache() for that before anybody would map anything into userspace page tables.
>
> But thinking more about it I just had an OMFG moment! Is it possible that the PAT currently already has a problem with that?
>
> We had customer projects where BARs of different PCIe devices ended up on different physical addresses after a hot remove/re-add.
>
> Is it possible that the PAT keeps enforcing certain caching attributes for a physical address? E.g. for example because a driver doesn't clean up properly on hot remove?
>
> If yes than that would explain a massive number of problems we had with hot add/remove.
The code is a mess, so if a driver messed up, likely everything is possible.
TBH, the more I look at this all, the more WTF moments I am having.
>
>>> What I am currently wondering is: assume we get a
>>> pfnmap_setup_cachemode_pfn() call and we could reliably identify whether
>>> there was a previous registration, then we could do
>>>
>>> (a) No previous registration: don't modify pgprot. Hopefully the driver
>>> knows what it is doing. Maybe we can add sanity checks that the
>>> direct map was already updated etc.
>>> (b) A previous registration: modify pgprot like we do today.
>
> That would work for me.
>
>>> System RAM is the problem. I wonder how many of these registrations we
>>> really get and if we could just store them in the same tree as !system
>>> RAM instead of abusing page flags.
>>
>> commit 9542ada803198e6eba29d3289abb39ea82047b92
>> Author: Suresh Siddha <suresh.b.siddha@intel.com>
>> Date: Wed Sep 24 08:53:33 2008 -0700
>>
>> x86: track memtype for RAM in page struct
>> Track the memtype for RAM pages in page struct instead of using the
>> memtype list. This avoids the explosion in the number of entries in
>> memtype list (of the order of 20,000 with AGP) and makes the PAT
>> tracking simpler.
>> We are using PG_arch_1 bit in page->flags.
>> We still use the memtype list for non RAM pages.
>>
>>
>> I do wonder if that explosion is still an issue today.
>
> Yes it is. That is exactly the issue I'm working on here.
>
> It's just that AGP was replaced by internal GPU MMUs over time and so we don't use the old AGP code any more but just call get_free_pages() (or similar) directly.
Okay, I thought I slowly understood how it works, then I stumbled over
the set_memory_uc / set_memory_wc implementation and now I am *all
confused*.
I mean, that does perform a PAT reservation.
But when is that reservation ever freed again? :/
How can set_memory_wc() followed by set_memory_uc() possibly work? I am
pretty sure I am missing a piece of the puzzle.
I think you mentioned that set_memory_uc() is avoided by drivers because
of the highmem mess, but what are drivers then using to modify the direct map?
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: stupid and complicated PAT :)
2025-08-29 19:52 ` David Hildenbrand
@ 2025-08-29 19:58 ` David Hildenbrand
0 siblings, 0 replies; 28+ messages in thread
From: David Hildenbrand @ 2025-08-29 19:58 UTC (permalink / raw)
To: Christian König, intel-xe, intel-gfx, dri-devel, amd-gfx,
x86
Cc: airlied, thomas.hellstrom, matthew.brost, dave.hansen, luto,
peterz, Lorenzo Stoakes
On 29.08.25 21:52, David Hildenbrand wrote:
>
>>
>> Yes, that can absolutely happen. But for iomem we would have an explicit call to ioremap(), ioremap_wc(), ioremap_cache() for that before anybody would map anything into userspace page tables.
>>
>> But thinking more about it I just had an OMFG moment! Is it possible that the PAT currently already has a problem with that?
>>
>> We had customer projects where BARs of different PCIe devices ended up on different physical addresses after a hot remove/re-add.
>>
>> Is it possible that the PAT keeps enforcing certain caching attributes for a physical address? E.g. for example because a driver doesn't clean up properly on hot remove?
>>
>> If yes than that would explain a massive number of problems we had with hot add/remove.
>
> The code is a mess, so if a driver messed up, likely everything is possible.
>
> TBH, the more I look at this all, the more WTF moments I am having.
>
>>
>>>> What I am currently wondering is: assume we get a
>>>> pfnmap_setup_cachemode_pfn() call and we could reliably identify whether
>>>> there was a previous registration, then we could do
>>>>
>>>> (a) No previous registration: don't modify pgprot. Hopefully the driver
>>>> knows what it is doing. Maybe we can add sanity checks that the
>>>> direct map was already updated etc.
>>>> (b) A previous registration: modify pgprot like we do today.
>>
>> That would work for me.
>>
>>>> System RAM is the problem. I wonder how many of these registrations we
>>>> really get and if we could just store them in the same tree as !system
>>>> RAM instead of abusing page flags.
>>>
>>> commit 9542ada803198e6eba29d3289abb39ea82047b92
>>> Author: Suresh Siddha <suresh.b.siddha@intel.com>
>>> Date: Wed Sep 24 08:53:33 2008 -0700
>>>
>>> x86: track memtype for RAM in page struct
>>> Track the memtype for RAM pages in page struct instead of using the
>>> memtype list. This avoids the explosion in the number of entries in
>>> memtype list (of the order of 20,000 with AGP) and makes the PAT
>>> tracking simpler.
>>> We are using PG_arch_1 bit in page->flags.
>>> We still use the memtype list for non RAM pages.
>>>
>>>
>>> I do wonder if that explosion is still an issue today.
>>
>> Yes it is. That is exactly the issue I'm working on here.
>>
>> It's just that AGP was replaced by internal GPU MMUs over time and so we don't use the old AGP code any more but just call get_free_pages() (or similar) directly.
>
> Okay, I thought I slowly understood how it works, then I stumbled over
> the set_memory_uc / set_memory_wc implementation and now I am *all
> confused*.
>
> I mean, that does perform a PAT reservation.
>
> But when is that reservation ever freed again? :/
Ah, set_memory_wb() does that. It just frees stuff. It should have been
called something like "reset", probably.
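So the pairing would look roughly like this (lowmem page, direct-map address; just my reading of the code, so take it with a grain of salt):

#include <linux/mm.h>
#include <asm/set_memory.h>

static int wc_then_back_to_wb(struct page *page)
{
	unsigned long addr = (unsigned long)page_address(page);	/* lowmem only */
	int ret;

	ret = set_memory_wc(addr, 1);	/* reserves the memtype + rewrites the linear map to WC */
	if (ret)
		return ret;

	/* ... memory is used as WC ... */

	return set_memory_wb(addr, 1);	/* the "reset": frees the reservation, back to WB */
}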
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 28+ messages in thread
end of thread, other threads:[~2025-08-29 19:58 UTC | newest]
Thread overview: 28+ messages
[not found] <20250820143739.3422-1-christian.koenig@amd.com>
2025-08-20 14:33 ` [PATCH 1/3] drm/ttm: use apply_page_range instead of vmf_insert_pfn_prot Christian König
2025-08-20 14:33 ` [PATCH 2/3] drm/ttm: reapply increase ttm pre-fault value to PMD size" Christian König
2025-08-20 14:33 ` [PATCH 3/3] drm/ttm: disable changing the global caching flags on newer AMD CPUs v2 Christian König
2025-08-20 15:12 ` Borislav Petkov
2025-08-20 15:23 ` David Hildenbrand
2025-08-21 8:10 ` Re: Christian König
2025-08-25 19:10 ` Re: David Hildenbrand
2025-08-26 8:38 ` Re: Christian König
2025-08-26 8:46 ` Re: David Hildenbrand
2025-08-26 9:00 ` Re: Christian König
2025-08-26 9:17 ` Re: David Hildenbrand
2025-08-26 9:56 ` Re: Christian König
2025-08-26 12:07 ` Re: David Hildenbrand
2025-08-26 16:09 ` Re: Christian König
2025-08-27 9:13 ` [PATCH 0/3] drm/ttm: Michel Dänzer
2025-08-28 21:18 ` stupid and complicated PAT :) David Hildenbrand
2025-08-28 21:28 ` David Hildenbrand
2025-08-28 21:32 ` David Hildenbrand
2025-08-29 10:50 ` Christian König
2025-08-29 19:52 ` David Hildenbrand
2025-08-29 19:58 ` David Hildenbrand
2025-08-26 14:27 ` Thomas Hellström
2025-08-28 21:01 ` stupid PAT :) David Hildenbrand
2025-08-26 12:37 ` David Hildenbrand
[not found] ` <7db27720-8cfd-457c-8133-5a7a1094004c@lucifer.local>
2025-08-21 9:30 ` your mail David Hildenbrand
[not found] ` <f6f85c73-2a1e-438a-82c9-f3392d91020c@lucifer.local>
2025-08-21 10:16 ` David Hildenbrand
2025-08-25 18:35 ` Christian König
2025-08-25 19:20 ` David Hildenbrand