Re: [PATCH v4 04/12] drm/xe: Use a single page-fault queue with multiple workers

Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: Matthew Brost <matthew.brost@intel.com>
To: Maciej Patelczyk <maciej.patelczyk@intel.com>
Cc: <intel-xe@lists.freedesktop.org>, <stuart.summers@intel.com>,
	<arvind.yadav@intel.com>, <himal.prasad.ghimiray@intel.com>,
	<thomas.hellstrom@linux.intel.com>, <francois.dugast@intel.com>
Subject: Re: [PATCH v4 04/12] drm/xe: Use a single page-fault queue with multiple workers
Date: Wed, 6 May 2026 12:42:35 -0700	[thread overview]
Message-ID: <afuZq0+X070Inmhk@gsse-cloud1.jf.intel.com> (raw)
In-Reply-To: <59b9532d-68ad-42b1-b7eb-c693b648b564@intel.com>

On Wed, May 06, 2026 at 05:46:30PM +0200, Maciej Patelczyk wrote:
> On 26/02/2026 05:28, Matthew Brost wrote:
> 
> > With fine-grained page-fault locking, it no longer makes sense to
> > maintain multiple page-fault queues, as we no longer hash queues based
> > on the VM’s ASID. Multiple workers can pull page faults from a single
> > queue, eliminating any head-of-queue blocking. Refactor the structures
> > and code to use a single shared queue.
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >   drivers/gpu/drm/xe/xe_device_types.h    | 12 +++---
> >   drivers/gpu/drm/xe/xe_pagefault.c       | 52 +++++++++++++------------
> >   drivers/gpu/drm/xe/xe_pagefault_types.h | 17 +++++++-
> >   3 files changed, 50 insertions(+), 31 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> > index 1eb0fe118940..0558dfd52541 100644
> > --- a/drivers/gpu/drm/xe/xe_device_types.h
> > +++ b/drivers/gpu/drm/xe/xe_device_types.h
> > @@ -304,8 +304,8 @@ struct xe_device {
> >   		struct xarray asid_to_vm;
> >   		/** @usm.next_asid: next ASID, used to cyclical alloc asids */
> >   		u32 next_asid;
> > -		/** @usm.current_pf_queue: current page fault queue */
> > -		u32 current_pf_queue;
> > +		/** @usm.current_pf_work: current page fault work item */
> > +		u32 current_pf_work;
> >   		/** @usm.lock: protects UM state */
> >   		struct rw_semaphore lock;
> >   		/** @usm.pf_wq: page fault work queue, unbound, high priority */
> > @@ -315,9 +315,11 @@ struct xe_device {
> >   		 * yields the best bandwidth utilization of the kernel paging
> >   		 * engine.
> >   		 */
> > -#define XE_PAGEFAULT_QUEUE_COUNT	4
> > -		/** @usm.pf_queue: Page fault queues */
> > -		struct xe_pagefault_queue pf_queue[XE_PAGEFAULT_QUEUE_COUNT];
> > +#define XE_PAGEFAULT_WORK_COUNT	4
> > +		/** @usm.pf_workers: Page fault workers */
> > +		struct xe_pagefault_work pf_workers[XE_PAGEFAULT_WORK_COUNT];
> > +		/** @usm.pf_queue: Page fault queue */
> > +		struct xe_pagefault_queue pf_queue;
> >   #if IS_ENABLED(CONFIG_DRM_XE_PAGEMAP)
> >   		/** @usm.pagemap_shrinker: Shrinker for unused pagemaps */
> >   		struct drm_pagemap_shrinker *dpagemap_shrinker;
> > diff --git a/drivers/gpu/drm/xe/xe_pagefault.c b/drivers/gpu/drm/xe/xe_pagefault.c
> > index a372db7cd839..7880fc7e7eb4 100644
> > --- a/drivers/gpu/drm/xe/xe_pagefault.c
> > +++ b/drivers/gpu/drm/xe/xe_pagefault.c
> > @@ -222,6 +222,7 @@ static void xe_pagefault_queue_retry(struct xe_pagefault_queue *pf_queue,
> >   		pf_queue->tail = pf_queue->size - xe_pagefault_entry_size();
> >   	else
> >   		pf_queue->tail -= xe_pagefault_entry_size();
> > +	memcpy(pf_queue->data + pf_queue->tail, pf, sizeof(*pf));
> >   	spin_unlock_irq(&pf_queue->lock);
> >   }
> > @@ -267,8 +268,10 @@ static void xe_pagefault_print(struct xe_pagefault *pf)
> >   static void xe_pagefault_queue_work(struct work_struct *w)
> >   {
> > -	struct xe_pagefault_queue *pf_queue =
> > -		container_of(w, typeof(*pf_queue), worker);
> > +	struct xe_pagefault_work *pf_work =
> > +		container_of(w, typeof(*pf_work), work);
> > +	struct xe_device *xe = pf_work->xe;
> > +	struct xe_pagefault_queue *pf_queue = &xe->usm.pf_queue;
> >   	struct xe_pagefault pf;
> >   	unsigned long threshold;
> > @@ -285,7 +288,7 @@ static void xe_pagefault_queue_work(struct work_struct *w)
> >   		if (err == -EAGAIN) {
> >   			xe_pagefault_queue_retry(pf_queue, &pf);
> > -			queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w);
> > +			queue_work(xe->usm.pf_wq, w);
> >   			break;
> >   		} else if (err) {
> >   			if (!(pf.consumer.access_type & XE_PAGEFAULT_ACCESS_PREFETCH)) {
> > @@ -302,7 +305,7 @@ static void xe_pagefault_queue_work(struct work_struct *w)
> >   		pf.producer.ops->ack_fault(&pf, err);
> >   		if (time_after(jiffies, threshold)) {
> > -			queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w);
> > +			queue_work(xe->usm.pf_wq, w);
> >   			break;
> >   		}
> >   	}
> > @@ -348,7 +351,6 @@ static int xe_pagefault_queue_init(struct xe_device *xe,
> >   		xe_pagefault_entry_size(), total_num_eus, pf_queue->size);
> >   	spin_lock_init(&pf_queue->lock);
> > -	INIT_WORK(&pf_queue->worker, xe_pagefault_queue_work);
> >   	pf_queue->data = drmm_kzalloc(&xe->drm, pf_queue->size, GFP_KERNEL);
> >   	if (!pf_queue->data)
> > @@ -381,14 +383,20 @@ int xe_pagefault_init(struct xe_device *xe)
> >   	xe->usm.pf_wq = alloc_workqueue("xe_page_fault_work_queue",
> >   					WQ_UNBOUND | WQ_HIGHPRI,
> > -					XE_PAGEFAULT_QUEUE_COUNT);
> > +					XE_PAGEFAULT_WORK_COUNT);
> >   	if (!xe->usm.pf_wq)
> >   		return -ENOMEM;
> > -	for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i) {
> > -		err = xe_pagefault_queue_init(xe, xe->usm.pf_queue + i);
> > -		if (err)
> > -			goto err_out;
> > +	err = xe_pagefault_queue_init(xe, &xe->usm.pf_queue);
> > +	if (err)
> > +		goto err_out;
> > +
> > +	for (i = 0; i < XE_PAGEFAULT_WORK_COUNT; ++i) {
> > +		struct xe_pagefault_work *pf_work = xe->usm.pf_workers + i;
> > +
> > +		pf_work->xe = xe;
> > +		pf_work->id = i;
> > +		INIT_WORK(&pf_work->work, xe_pagefault_queue_work);
> >   	}
> >   	return devm_add_action_or_reset(xe->drm.dev, xe_pagefault_fini, xe);
> > @@ -430,10 +438,7 @@ static void xe_pagefault_queue_reset(struct xe_device *xe, struct xe_gt *gt,
> >    */
> >   void xe_pagefault_reset(struct xe_device *xe, struct xe_gt *gt)
> >   {
> > -	int i;
> > -
> > -	for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i)
> > -		xe_pagefault_queue_reset(xe, gt, xe->usm.pf_queue + i);
> > +	xe_pagefault_queue_reset(xe, gt, &xe->usm.pf_queue);
> >   }
> >   static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue)
> > @@ -448,13 +453,11 @@ static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue)
> >    * This function can race with multiple page fault producers, but worst case we
> >    * stick a page fault on the same queue for consumption.
> >    */
> > -static int xe_pagefault_queue_index(struct xe_device *xe)
> > +static int xe_pagefault_work_index(struct xe_device *xe)
> >   {
> > -	u32 old_pf_queue = READ_ONCE(xe->usm.current_pf_queue);
> > -
> > -	WRITE_ONCE(xe->usm.current_pf_queue, (old_pf_queue + 1));
> > +	lockdep_assert_held(&xe->usm.pf_queue.lock);
> > -	return old_pf_queue % XE_PAGEFAULT_QUEUE_COUNT;
> > +	return xe->usm.current_pf_work++ % XE_PAGEFAULT_WORK_COUNT;
> >   }
> >   /**
> > @@ -469,22 +472,23 @@ static int xe_pagefault_queue_index(struct xe_device *xe)
> >    */
> >   int xe_pagefault_handler(struct xe_device *xe, struct xe_pagefault *pf)
> >   {
> > -	int queue_index = xe_pagefault_queue_index(xe);
> > -	struct xe_pagefault_queue *pf_queue = xe->usm.pf_queue + queue_index;
> > +	struct xe_pagefault_queue *pf_queue = &xe->usm.pf_queue;
> >   	unsigned long flags;
> > +	int work_index;
> >   	bool full;
> >   	spin_lock_irqsave(&pf_queue->lock, flags);
> > +	work_index = xe_pagefault_work_index(xe);
> >   	full = xe_pagefault_queue_full(pf_queue);
> >   	if (!full) {
> >   		memcpy(pf_queue->data + pf_queue->head, pf, sizeof(*pf));
> >   		pf_queue->head = (pf_queue->head + xe_pagefault_entry_size()) %
> >   			pf_queue->size;
> > -		queue_work(xe->usm.pf_wq, &pf_queue->worker);
> > +		queue_work(xe->usm.pf_wq,
> > +			   &xe->usm.pf_workers[work_index].work);
> >   	} else {
> >   		drm_warn(&xe->drm,
> > -			 "PageFault Queue (%d) full, shouldn't be possible\n",
> > -			 queue_index);
> > +			 "PageFault Queue full, shouldn't be possible\n");
> >   	}
> >   	spin_unlock_irqrestore(&pf_queue->lock, flags);
> > diff --git a/drivers/gpu/drm/xe/xe_pagefault_types.h b/drivers/gpu/drm/xe/xe_pagefault_types.h
> > index b3289219b1be..45065c25c25f 100644
> > --- a/drivers/gpu/drm/xe/xe_pagefault_types.h
> > +++ b/drivers/gpu/drm/xe/xe_pagefault_types.h
> > @@ -131,8 +131,21 @@ struct xe_pagefault_queue {
> >   	u32 tail;
> >   	/** @lock: protects page fault queue */
> >   	spinlock_t lock;
> > -	/** @worker: to process page faults */
> > -	struct work_struct worker;
> > +};
> > +
> > +/**
> > + * struct xe_pagefault_work - Xe page fault work item (consumer)
> > + *
> > + * Represents a worker that pops a &struct xe_pagefault from the page fault
> > + * queue and processes it.
> > + */
> > +struct xe_pagefault_work {
> > +	/** @xe: Back-pointer to the Xe device */
> > +	struct xe_device *xe;
> > +	/** @id: Identifier for this work item */
> > +	int id;
> > +	/** @work: Work item used to process the page fault */
> > +	struct work_struct work;
> >   };
> >   #endif
> 
> Matt,
> 
> There were a total of 4 pf_queues, each of size (total_num_eus +
> XE_NUM_HW_ENGINES) * xe_pagefault_entry_size() * PF_MULTIPLIER, further
> enlarged by roundup_pow_of_two().
> 
> Each of these queues had a dedicated worker.
> 
> There is a comment on queue calculation size in xe_pagefault_queue_init():
> 
> "XXX: Multiplier required as compute UMD are getting PF queue errors
> 
> without it. Follow on why this multiplier is required."
> 
> PF queue errors could be due to slow PF processing by the handler in the KMD,
> combined with PFs being generated for a single VM (ASID) and therefore
> constantly hitting a single queue.
> 
> 
> Now there is a single queue which is 4 times smaller (overall) but it has 4
> workers and there are optimizations which potentially drastically decrease
> processing time.
> 
> In the end it could reduce to the case where a single queue has 4 workers
> instead of one, which would still be faster than it is now.
> 
> Still, I am not sure whether the queue size is large enough.
> 
> Did you have a thought about it?
> 
> 
> And I think this XXX comment becomes obsolete with this change.
> 

I think the XXX comment was always wrong. We kept increasing the queue
size because of random overflows, but the actual bug was that we didn’t
round up to a power of two, and CIRC_SPACE relies on values being powers
of two.

I believe we never got around to deleting the XXX comment or removing
the multiplier. I’d like a large change like this to sit for a while so
we can test and ensure there are no regressions; then we can clean up
the XXX comment and the multiplier in a follow-up after this series.

Matt

> 
> Regards,
> 
> Maciej
> 
>

next prev parent reply	other threads:[~2026-05-06 19:42 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-26  4:28 [PATCH v4 00/12] Fine grained fault locking, threaded prefetch, storm cache Matthew Brost
2026-02-26  4:28 ` [PATCH v4 01/12] drm/xe: Fine grained page fault locking Matthew Brost
2026-02-26  4:28 ` [PATCH v4 02/12] drm/xe: Allow prefetch-only VM bind IOCTLs to use VM read lock Matthew Brost
2026-02-26  4:28 ` [PATCH v4 03/12] drm/xe: Thread prefetch of SVM ranges Matthew Brost
2026-02-26  4:28 ` [PATCH v4 04/12] drm/xe: Use a single page-fault queue with multiple workers Matthew Brost
2026-05-06 15:46   ` Maciej Patelczyk
2026-05-06 19:42     ` Matthew Brost [this message]
2026-05-07 12:41       ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 05/12] drm/xe: Add num_pf_work modparam Matthew Brost
2026-05-06 15:59   ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 06/12] drm/xe: Engine class and instance into a u8 Matthew Brost
2026-05-06 16:04   ` Maciej Patelczyk
2026-05-07 16:20     ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 07/12] drm/xe: Track pagefault worker runtime Matthew Brost
2026-05-07 12:51   ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 08/12] drm/xe: Chain page faults via queue-resident cache to avoid fault storms Matthew Brost
2026-05-08 12:03   ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 09/12] drm/xe: Add pagefault chaining stats Matthew Brost
2026-05-07 13:15   ` Maciej Patelczyk
2026-05-07 13:52     ` Francois Dugast
2026-02-26  4:28 ` [PATCH v4 10/12] drm/xe: Add debugfs pagefault_info Matthew Brost
2026-05-07 10:07   ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 11/12] drm/xe: batch CT pagefault acks with periodic flush Matthew Brost
2026-05-08  9:24   ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 12/12] drm/xe: Track parallel page fault activity in GT stats Matthew Brost
2026-05-07 13:56   ` Maciej Patelczyk
2026-05-07 14:23     ` Francois Dugast
2026-02-26  4:35 ` ✗ CI.checkpatch: warning for Fine grained fault locking, threaded prefetch, storm cache (rev4) Patchwork
2026-02-26  4:36 ` ✓ CI.KUnit: success " Patchwork
2026-02-26  5:26 ` ✗ Xe.CI.BAT: failure " Patchwork
2026-02-26  8:59 ` ✗ Xe.CI.FULL: " Patchwork
2026-02-26 13:43 ` [PATCH v4 00/12] Fine grained fault locking, threaded prefetch, storm cache Thomas Hellström
2026-02-26 19:36   ` Matthew Brost

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=afuZq0+X070Inmhk@gsse-cloud1.jf.intel.com \
    --to=matthew.brost@intel.com \
    --cc=arvind.yadav@intel.com \
    --cc=francois.dugast@intel.com \
    --cc=himal.prasad.ghimiray@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=maciej.patelczyk@intel.com \
    --cc=stuart.summers@intel.com \
    --cc=thomas.hellstrom@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox