From: Maciej Patelczyk <maciej.patelczyk@intel.com>
To: Matthew Brost <matthew.brost@intel.com>,
<intel-xe@lists.freedesktop.org>
Cc: <stuart.summers@intel.com>, <arvind.yadav@intel.com>,
<himal.prasad.ghimiray@intel.com>,
<thomas.hellstrom@linux.intel.com>, <francois.dugast@intel.com>
Subject: Re: [PATCH v4 04/12] drm/xe: Use a single page-fault queue with multiple workers
Date: Wed, 6 May 2026 17:46:30 +0200 [thread overview]
Message-ID: <59b9532d-68ad-42b1-b7eb-c693b648b564@intel.com> (raw)
In-Reply-To: <20260226042834.2963245-5-matthew.brost@intel.com>
On 26/02/2026 05:28, Matthew Brost wrote:
> With fine-grained page-fault locking, it no longer makes sense to
> maintain multiple page-fault queues, as we no longer hash queues based
> on the VM’s ASID. Multiple workers can pull page faults from a single
> queue, eliminating any head-of-queue blocking. Refactor the structures
> and code to use a single shared queue.
>
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
> drivers/gpu/drm/xe/xe_device_types.h | 12 +++---
> drivers/gpu/drm/xe/xe_pagefault.c | 52 +++++++++++++------------
> drivers/gpu/drm/xe/xe_pagefault_types.h | 17 +++++++-
> 3 files changed, 50 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 1eb0fe118940..0558dfd52541 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -304,8 +304,8 @@ struct xe_device {
> struct xarray asid_to_vm;
> /** @usm.next_asid: next ASID, used to cyclical alloc asids */
> u32 next_asid;
> - /** @usm.current_pf_queue: current page fault queue */
> - u32 current_pf_queue;
> + /** @usm.current_pf_work: current page fault work item */
> + u32 current_pf_work;
> /** @usm.lock: protects UM state */
> struct rw_semaphore lock;
> /** @usm.pf_wq: page fault work queue, unbound, high priority */
> @@ -315,9 +315,11 @@ struct xe_device {
> * yields the best bandwidth utilization of the kernel paging
> * engine.
> */
> -#define XE_PAGEFAULT_QUEUE_COUNT 4
> - /** @usm.pf_queue: Page fault queues */
> - struct xe_pagefault_queue pf_queue[XE_PAGEFAULT_QUEUE_COUNT];
> +#define XE_PAGEFAULT_WORK_COUNT 4
> + /** @usm.pf_workers: Page fault workers */
> + struct xe_pagefault_work pf_workers[XE_PAGEFAULT_WORK_COUNT];
> + /** @usm.pf_queue: Page fault queue */
> + struct xe_pagefault_queue pf_queue;
> #if IS_ENABLED(CONFIG_DRM_XE_PAGEMAP)
> /** @usm.pagemap_shrinker: Shrinker for unused pagemaps */
> struct drm_pagemap_shrinker *dpagemap_shrinker;
> diff --git a/drivers/gpu/drm/xe/xe_pagefault.c b/drivers/gpu/drm/xe/xe_pagefault.c
> index a372db7cd839..7880fc7e7eb4 100644
> --- a/drivers/gpu/drm/xe/xe_pagefault.c
> +++ b/drivers/gpu/drm/xe/xe_pagefault.c
> @@ -222,6 +222,7 @@ static void xe_pagefault_queue_retry(struct xe_pagefault_queue *pf_queue,
> pf_queue->tail = pf_queue->size - xe_pagefault_entry_size();
> else
> pf_queue->tail -= xe_pagefault_entry_size();
> + memcpy(pf_queue->data + pf_queue->tail, pf, sizeof(*pf));
> spin_unlock_irq(&pf_queue->lock);
> }
>
> @@ -267,8 +268,10 @@ static void xe_pagefault_print(struct xe_pagefault *pf)
>
> static void xe_pagefault_queue_work(struct work_struct *w)
> {
> - struct xe_pagefault_queue *pf_queue =
> - container_of(w, typeof(*pf_queue), worker);
> + struct xe_pagefault_work *pf_work =
> + container_of(w, typeof(*pf_work), work);
> + struct xe_device *xe = pf_work->xe;
> + struct xe_pagefault_queue *pf_queue = &xe->usm.pf_queue;
> struct xe_pagefault pf;
> unsigned long threshold;
>
> @@ -285,7 +288,7 @@ static void xe_pagefault_queue_work(struct work_struct *w)
>
> if (err == -EAGAIN) {
> xe_pagefault_queue_retry(pf_queue, &pf);
> - queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w);
> + queue_work(xe->usm.pf_wq, w);
> break;
> } else if (err) {
> if (!(pf.consumer.access_type & XE_PAGEFAULT_ACCESS_PREFETCH)) {
> @@ -302,7 +305,7 @@ static void xe_pagefault_queue_work(struct work_struct *w)
> pf.producer.ops->ack_fault(&pf, err);
>
> if (time_after(jiffies, threshold)) {
> - queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w);
> + queue_work(xe->usm.pf_wq, w);
> break;
> }
> }
> @@ -348,7 +351,6 @@ static int xe_pagefault_queue_init(struct xe_device *xe,
> xe_pagefault_entry_size(), total_num_eus, pf_queue->size);
>
> spin_lock_init(&pf_queue->lock);
> - INIT_WORK(&pf_queue->worker, xe_pagefault_queue_work);
>
> pf_queue->data = drmm_kzalloc(&xe->drm, pf_queue->size, GFP_KERNEL);
> if (!pf_queue->data)
> @@ -381,14 +383,20 @@ int xe_pagefault_init(struct xe_device *xe)
>
> xe->usm.pf_wq = alloc_workqueue("xe_page_fault_work_queue",
> WQ_UNBOUND | WQ_HIGHPRI,
> - XE_PAGEFAULT_QUEUE_COUNT);
> + XE_PAGEFAULT_WORK_COUNT);
> if (!xe->usm.pf_wq)
> return -ENOMEM;
>
> - for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i) {
> - err = xe_pagefault_queue_init(xe, xe->usm.pf_queue + i);
> - if (err)
> - goto err_out;
> + err = xe_pagefault_queue_init(xe, &xe->usm.pf_queue);
> + if (err)
> + goto err_out;
> +
> + for (i = 0; i < XE_PAGEFAULT_WORK_COUNT; ++i) {
> + struct xe_pagefault_work *pf_work = xe->usm.pf_workers + i;
> +
> + pf_work->xe = xe;
> + pf_work->id = i;
> + INIT_WORK(&pf_work->work, xe_pagefault_queue_work);
> }
>
> return devm_add_action_or_reset(xe->drm.dev, xe_pagefault_fini, xe);
> @@ -430,10 +438,7 @@ static void xe_pagefault_queue_reset(struct xe_device *xe, struct xe_gt *gt,
> */
> void xe_pagefault_reset(struct xe_device *xe, struct xe_gt *gt)
> {
> - int i;
> -
> - for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i)
> - xe_pagefault_queue_reset(xe, gt, xe->usm.pf_queue + i);
> + xe_pagefault_queue_reset(xe, gt, &xe->usm.pf_queue);
> }
>
> static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue)
> @@ -448,13 +453,11 @@ static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue)
> * This function can race with multiple page fault producers, but worst case we
> * stick a page fault on the same queue for consumption.
> */
> -static int xe_pagefault_queue_index(struct xe_device *xe)
> +static int xe_pagefault_work_index(struct xe_device *xe)
> {
> - u32 old_pf_queue = READ_ONCE(xe->usm.current_pf_queue);
> -
> - WRITE_ONCE(xe->usm.current_pf_queue, (old_pf_queue + 1));
> + lockdep_assert_held(&xe->usm.pf_queue.lock);
>
> - return old_pf_queue % XE_PAGEFAULT_QUEUE_COUNT;
> + return xe->usm.current_pf_work++ % XE_PAGEFAULT_WORK_COUNT;
> }
>
> /**
> @@ -469,22 +472,23 @@ static int xe_pagefault_queue_index(struct xe_device *xe)
> */
> int xe_pagefault_handler(struct xe_device *xe, struct xe_pagefault *pf)
> {
> - int queue_index = xe_pagefault_queue_index(xe);
> - struct xe_pagefault_queue *pf_queue = xe->usm.pf_queue + queue_index;
> + struct xe_pagefault_queue *pf_queue = &xe->usm.pf_queue;
> unsigned long flags;
> + int work_index;
> bool full;
>
> spin_lock_irqsave(&pf_queue->lock, flags);
> + work_index = xe_pagefault_work_index(xe);
> full = xe_pagefault_queue_full(pf_queue);
> if (!full) {
> memcpy(pf_queue->data + pf_queue->head, pf, sizeof(*pf));
> pf_queue->head = (pf_queue->head + xe_pagefault_entry_size()) %
> pf_queue->size;
> - queue_work(xe->usm.pf_wq, &pf_queue->worker);
> + queue_work(xe->usm.pf_wq,
> + &xe->usm.pf_workers[work_index].work);
> } else {
> drm_warn(&xe->drm,
> - "PageFault Queue (%d) full, shouldn't be possible\n",
> - queue_index);
> + "PageFault Queue full, shouldn't be possible\n");
> }
> spin_unlock_irqrestore(&pf_queue->lock, flags);
>
> diff --git a/drivers/gpu/drm/xe/xe_pagefault_types.h b/drivers/gpu/drm/xe/xe_pagefault_types.h
> index b3289219b1be..45065c25c25f 100644
> --- a/drivers/gpu/drm/xe/xe_pagefault_types.h
> +++ b/drivers/gpu/drm/xe/xe_pagefault_types.h
> @@ -131,8 +131,21 @@ struct xe_pagefault_queue {
> u32 tail;
> /** @lock: protects page fault queue */
> spinlock_t lock;
> - /** @worker: to process page faults */
> - struct work_struct worker;
> +};
> +
> +/**
> + * struct xe_pagefault_work - Xe page fault work item (consumer)
> + *
> + * Represents a worker that pops a &struct xe_pagefault from the page fault
> + * queue and processes it.
> + */
> +struct xe_pagefault_work {
> + /** @xe: Back-pointer to the Xe device */
> + struct xe_device *xe;
> + /** @id: Identifier for this work item */
> + int id;
> + /** @work: Work item used to process the page fault */
> + struct work_struct work;
> };
>
> #endif
Matt,
There were a total of 4 pf_queues, each of size = (total_num_eus +
XE_NUM_HW_ENGINES) * xe_pagefault_entry_size() * PF_MULTIPLIER,
additionally rounded up via roundup_pow_of_two().
Each of these queues had a dedicated worker.
There is a comment on queue calculation size in xe_pagefault_queue_init():
"XXX: Multiplier required as compute UMD are getting PF queue errors
without it. Follow on why this multiplier is required."
The PF queue errors could be due to slow PF processing by the handler in
the KMD, combined with page faults being generated for a single VM (asid)
and therefore constantly hitting a single queue.
Now there is a single queue which is 4 times smaller (overall), but it
has 4 workers, and there are optimizations which potentially drastically
decrease processing time.
In the end this could be equivalent to a case where a single queue had 4
workers instead of one, which would still be faster than before.
Still, I am not sure whether the queue size is now too small.
Have you given this any thought?
Also, I think the XXX comment becomes obsolete with this change.
Regards,
Maciej
next prev parent reply other threads:[~2026-05-06 15:46 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-26 4:28 [PATCH v4 00/12] Fine grained fault locking, threaded prefetch, storm cache Matthew Brost
2026-02-26 4:28 ` [PATCH v4 01/12] drm/xe: Fine grained page fault locking Matthew Brost
2026-02-26 4:28 ` [PATCH v4 02/12] drm/xe: Allow prefetch-only VM bind IOCTLs to use VM read lock Matthew Brost
2026-02-26 4:28 ` [PATCH v4 03/12] drm/xe: Thread prefetch of SVM ranges Matthew Brost
2026-02-26 4:28 ` [PATCH v4 04/12] drm/xe: Use a single page-fault queue with multiple workers Matthew Brost
2026-05-06 15:46 ` Maciej Patelczyk [this message]
2026-05-06 19:42 ` Matthew Brost
2026-05-07 12:41 ` Maciej Patelczyk
2026-02-26 4:28 ` [PATCH v4 05/12] drm/xe: Add num_pf_work modparam Matthew Brost
2026-05-06 15:59 ` Maciej Patelczyk
2026-02-26 4:28 ` [PATCH v4 06/12] drm/xe: Engine class and instance into a u8 Matthew Brost
2026-05-06 16:04 ` Maciej Patelczyk
2026-05-07 16:20 ` Maciej Patelczyk
2026-02-26 4:28 ` [PATCH v4 07/12] drm/xe: Track pagefault worker runtime Matthew Brost
2026-05-07 12:51 ` Maciej Patelczyk
2026-02-26 4:28 ` [PATCH v4 08/12] drm/xe: Chain page faults via queue-resident cache to avoid fault storms Matthew Brost
2026-05-08 12:03 ` Maciej Patelczyk
2026-02-26 4:28 ` [PATCH v4 09/12] drm/xe: Add pagefault chaining stats Matthew Brost
2026-05-07 13:15 ` Maciej Patelczyk
2026-05-07 13:52 ` Francois Dugast
2026-02-26 4:28 ` [PATCH v4 10/12] drm/xe: Add debugfs pagefault_info Matthew Brost
2026-05-07 10:07 ` Maciej Patelczyk
2026-02-26 4:28 ` [PATCH v4 11/12] drm/xe: batch CT pagefault acks with periodic flush Matthew Brost
2026-05-08 9:24 ` Maciej Patelczyk
2026-02-26 4:28 ` [PATCH v4 12/12] drm/xe: Track parallel page fault activity in GT stats Matthew Brost
2026-05-07 13:56 ` Maciej Patelczyk
2026-05-07 14:23 ` Francois Dugast
2026-02-26 4:35 ` ✗ CI.checkpatch: warning for Fine grained fault locking, threaded prefetch, storm cache (rev4) Patchwork
2026-02-26 4:36 ` ✓ CI.KUnit: success " Patchwork
2026-02-26 5:26 ` ✗ Xe.CI.BAT: failure " Patchwork
2026-02-26 8:59 ` ✗ Xe.CI.FULL: " Patchwork
2026-02-26 13:43 ` [PATCH v4 00/12] Fine grained fault locking, threaded prefetch, storm cache Thomas Hellström
2026-02-26 19:36 ` Matthew Brost
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=59b9532d-68ad-42b1-b7eb-c693b648b564@intel.com \
--to=maciej.patelczyk@intel.com \
--cc=arvind.yadav@intel.com \
--cc=francois.dugast@intel.com \
--cc=himal.prasad.ghimiray@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=matthew.brost@intel.com \
--cc=stuart.summers@intel.com \
--cc=thomas.hellstrom@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox