From: Maciej Patelczyk <maciej.patelczyk@intel.com>
To: Matthew Brost <matthew.brost@intel.com>
Cc: <intel-xe@lists.freedesktop.org>, <stuart.summers@intel.com>,
	<arvind.yadav@intel.com>, <himal.prasad.ghimiray@intel.com>,
	<thomas.hellstrom@linux.intel.com>, <francois.dugast@intel.com>
Subject: Re: [PATCH v4 04/12] drm/xe: Use a single page-fault queue with multiple workers
Date: Thu, 7 May 2026 14:41:06 +0200	[thread overview]
Message-ID: <1eb0e018-494e-41a2-9a8b-440711fc30ba@intel.com> (raw)
In-Reply-To: <afuZq0+X070Inmhk@gsse-cloud1.jf.intel.com>

On 06/05/2026 21:42, Matthew Brost wrote:

> On Wed, May 06, 2026 at 05:46:30PM +0200, Maciej Patelczyk wrote:
>> On 26/02/2026 05:28, Matthew Brost wrote:
>>
>>> With fine-grained page-fault locking, it no longer makes sense to
>>> maintain multiple page-fault queues, as we no longer hash queues based
>>> on the VM’s ASID. Multiple workers can pull page faults from a single
>>> queue, eliminating any head-of-queue blocking. Refactor the structures
>>> and code to use a single shared queue.
>>>
>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>> ---
>>>    drivers/gpu/drm/xe/xe_device_types.h    | 12 +++---
>>>    drivers/gpu/drm/xe/xe_pagefault.c       | 52 +++++++++++++------------
>>>    drivers/gpu/drm/xe/xe_pagefault_types.h | 17 +++++++-
>>>    3 files changed, 50 insertions(+), 31 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>>> index 1eb0fe118940..0558dfd52541 100644
>>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>>> @@ -304,8 +304,8 @@ struct xe_device {
>>>    		struct xarray asid_to_vm;
>>>    		/** @usm.next_asid: next ASID, used to cyclical alloc asids */
>>>    		u32 next_asid;
>>> -		/** @usm.current_pf_queue: current page fault queue */
>>> -		u32 current_pf_queue;
>>> +		/** @usm.current_pf_work: current page fault work item */
>>> +		u32 current_pf_work;
>>>    		/** @usm.lock: protects UM state */
>>>    		struct rw_semaphore lock;
>>>    		/** @usm.pf_wq: page fault work queue, unbound, high priority */
>>> @@ -315,9 +315,11 @@ struct xe_device {
>>>    		 * yields the best bandwidth utilization of the kernel paging
>>>    		 * engine.
>>>    		 */
>>> -#define XE_PAGEFAULT_QUEUE_COUNT	4
>>> -		/** @usm.pf_queue: Page fault queues */
>>> -		struct xe_pagefault_queue pf_queue[XE_PAGEFAULT_QUEUE_COUNT];
>>> +#define XE_PAGEFAULT_WORK_COUNT	4
>>> +		/** @usm.pf_workers: Page fault workers */
>>> +		struct xe_pagefault_work pf_workers[XE_PAGEFAULT_WORK_COUNT];
>>> +		/** @usm.pf_queue: Page fault queue */
>>> +		struct xe_pagefault_queue pf_queue;
>>>    #if IS_ENABLED(CONFIG_DRM_XE_PAGEMAP)
>>>    		/** @usm.pagemap_shrinker: Shrinker for unused pagemaps */
>>>    		struct drm_pagemap_shrinker *dpagemap_shrinker;
>>> diff --git a/drivers/gpu/drm/xe/xe_pagefault.c b/drivers/gpu/drm/xe/xe_pagefault.c
>>> index a372db7cd839..7880fc7e7eb4 100644
>>> --- a/drivers/gpu/drm/xe/xe_pagefault.c
>>> +++ b/drivers/gpu/drm/xe/xe_pagefault.c
>>> @@ -222,6 +222,7 @@ static void xe_pagefault_queue_retry(struct xe_pagefault_queue *pf_queue,
>>>    		pf_queue->tail = pf_queue->size - xe_pagefault_entry_size();
>>>    	else
>>>    		pf_queue->tail -= xe_pagefault_entry_size();
>>> +	memcpy(pf_queue->data + pf_queue->tail, pf, sizeof(*pf));
>>>    	spin_unlock_irq(&pf_queue->lock);
>>>    }
>>> @@ -267,8 +268,10 @@ static void xe_pagefault_print(struct xe_pagefault *pf)
>>>    static void xe_pagefault_queue_work(struct work_struct *w)
>>>    {
>>> -	struct xe_pagefault_queue *pf_queue =
>>> -		container_of(w, typeof(*pf_queue), worker);
>>> +	struct xe_pagefault_work *pf_work =
>>> +		container_of(w, typeof(*pf_work), work);
>>> +	struct xe_device *xe = pf_work->xe;
>>> +	struct xe_pagefault_queue *pf_queue = &xe->usm.pf_queue;
>>>    	struct xe_pagefault pf;
>>>    	unsigned long threshold;
>>> @@ -285,7 +288,7 @@ static void xe_pagefault_queue_work(struct work_struct *w)
>>>    		if (err == -EAGAIN) {
>>>    			xe_pagefault_queue_retry(pf_queue, &pf);
>>> -			queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w);
>>> +			queue_work(xe->usm.pf_wq, w);
>>>    			break;
>>>    		} else if (err) {
>>>    			if (!(pf.consumer.access_type & XE_PAGEFAULT_ACCESS_PREFETCH)) {
>>> @@ -302,7 +305,7 @@ static void xe_pagefault_queue_work(struct work_struct *w)
>>>    		pf.producer.ops->ack_fault(&pf, err);
>>>    		if (time_after(jiffies, threshold)) {
>>> -			queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w);
>>> +			queue_work(xe->usm.pf_wq, w);
>>>    			break;
>>>    		}
>>>    	}
>>> @@ -348,7 +351,6 @@ static int xe_pagefault_queue_init(struct xe_device *xe,
>>>    		xe_pagefault_entry_size(), total_num_eus, pf_queue->size);
>>>    	spin_lock_init(&pf_queue->lock);
>>> -	INIT_WORK(&pf_queue->worker, xe_pagefault_queue_work);
>>>    	pf_queue->data = drmm_kzalloc(&xe->drm, pf_queue->size, GFP_KERNEL);
>>>    	if (!pf_queue->data)
>>> @@ -381,14 +383,20 @@ int xe_pagefault_init(struct xe_device *xe)
>>>    	xe->usm.pf_wq = alloc_workqueue("xe_page_fault_work_queue",
>>>    					WQ_UNBOUND | WQ_HIGHPRI,
>>> -					XE_PAGEFAULT_QUEUE_COUNT);
>>> +					XE_PAGEFAULT_WORK_COUNT);
>>>    	if (!xe->usm.pf_wq)
>>>    		return -ENOMEM;
>>> -	for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i) {
>>> -		err = xe_pagefault_queue_init(xe, xe->usm.pf_queue + i);
>>> -		if (err)
>>> -			goto err_out;
>>> +	err = xe_pagefault_queue_init(xe, &xe->usm.pf_queue);
>>> +	if (err)
>>> +		goto err_out;
>>> +
>>> +	for (i = 0; i < XE_PAGEFAULT_WORK_COUNT; ++i) {
>>> +		struct xe_pagefault_work *pf_work = xe->usm.pf_workers + i;
>>> +
>>> +		pf_work->xe = xe;
>>> +		pf_work->id = i;
>>> +		INIT_WORK(&pf_work->work, xe_pagefault_queue_work);
>>>    	}
>>>    	return devm_add_action_or_reset(xe->drm.dev, xe_pagefault_fini, xe);
>>> @@ -430,10 +438,7 @@ static void xe_pagefault_queue_reset(struct xe_device *xe, struct xe_gt *gt,
>>>     */
>>>    void xe_pagefault_reset(struct xe_device *xe, struct xe_gt *gt)
>>>    {
>>> -	int i;
>>> -
>>> -	for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i)
>>> -		xe_pagefault_queue_reset(xe, gt, xe->usm.pf_queue + i);
>>> +	xe_pagefault_queue_reset(xe, gt, &xe->usm.pf_queue);
>>>    }
>>>    static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue)
>>> @@ -448,13 +453,11 @@ static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue)
>>>     * This function can race with multiple page fault producers, but worst case we
>>>     * stick a page fault on the same queue for consumption.
>>>     */
>>> -static int xe_pagefault_queue_index(struct xe_device *xe)
>>> +static int xe_pagefault_work_index(struct xe_device *xe)
>>>    {
>>> -	u32 old_pf_queue = READ_ONCE(xe->usm.current_pf_queue);
>>> -
>>> -	WRITE_ONCE(xe->usm.current_pf_queue, (old_pf_queue + 1));
>>> +	lockdep_assert_held(&xe->usm.pf_queue.lock);
>>> -	return old_pf_queue % XE_PAGEFAULT_QUEUE_COUNT;
>>> +	return xe->usm.current_pf_work++ % XE_PAGEFAULT_WORK_COUNT;
>>>    }
>>>    /**
>>> @@ -469,22 +472,23 @@ static int xe_pagefault_queue_index(struct xe_device *xe)
>>>     */
>>>    int xe_pagefault_handler(struct xe_device *xe, struct xe_pagefault *pf)
>>>    {
>>> -	int queue_index = xe_pagefault_queue_index(xe);
>>> -	struct xe_pagefault_queue *pf_queue = xe->usm.pf_queue + queue_index;
>>> +	struct xe_pagefault_queue *pf_queue = &xe->usm.pf_queue;
>>>    	unsigned long flags;
>>> +	int work_index;
>>>    	bool full;
>>>    	spin_lock_irqsave(&pf_queue->lock, flags);
>>> +	work_index = xe_pagefault_work_index(xe);
>>>    	full = xe_pagefault_queue_full(pf_queue);
>>>    	if (!full) {
>>>    		memcpy(pf_queue->data + pf_queue->head, pf, sizeof(*pf));
>>>    		pf_queue->head = (pf_queue->head + xe_pagefault_entry_size()) %
>>>    			pf_queue->size;
>>> -		queue_work(xe->usm.pf_wq, &pf_queue->worker);
>>> +		queue_work(xe->usm.pf_wq,
>>> +			   &xe->usm.pf_workers[work_index].work);
>>>    	} else {
>>>    		drm_warn(&xe->drm,
>>> -			 "PageFault Queue (%d) full, shouldn't be possible\n",
>>> -			 queue_index);
>>> +			 "PageFault Queue full, shouldn't be possible\n");
>>>    	}
>>>    	spin_unlock_irqrestore(&pf_queue->lock, flags);
>>> diff --git a/drivers/gpu/drm/xe/xe_pagefault_types.h b/drivers/gpu/drm/xe/xe_pagefault_types.h
>>> index b3289219b1be..45065c25c25f 100644
>>> --- a/drivers/gpu/drm/xe/xe_pagefault_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_pagefault_types.h
>>> @@ -131,8 +131,21 @@ struct xe_pagefault_queue {
>>>    	u32 tail;
>>>    	/** @lock: protects page fault queue */
>>>    	spinlock_t lock;
>>> -	/** @worker: to process page faults */
>>> -	struct work_struct worker;
>>> +};
>>> +
>>> +/**
>>> + * struct xe_pagefault_work - Xe page fault work item (consumer)
>>> + *
>>> + * Represents a worker that pops a &struct xe_pagefault from the page fault
>>> + * queue and processes it.
>>> + */
>>> +struct xe_pagefault_work {
>>> +	/** @xe: Back-pointer to the Xe device */
>>> +	struct xe_device *xe;
>>> +	/** @id: Identifier for this work item */
>>> +	int id;
>>> +	/** @work: Work item used to process the page fault */
>>> +	struct work_struct work;
>>>    };
>>>    #endif
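
[Editorial note: the single-queue/multi-worker pattern the commit message
above describes, distilled into a minimal sketch. This is not the driver
code itself; struct pf_queue/pf_worker, struct pf, PF_ENTRY_SIZE, and
handle_fault() are illustrative placeholders.

	static bool pf_queue_pop(struct pf_queue *q, struct pf *out)
	{
		bool found;

		spin_lock_irq(&q->lock);
		found = q->head != q->tail;
		if (found) {
			memcpy(out, q->data + q->tail, sizeof(*out));
			q->tail = (q->tail + PF_ENTRY_SIZE) % q->size;
		}
		spin_unlock_irq(&q->lock);
		return found;
	}

	static void pf_worker_fn(struct work_struct *w)
	{
		struct pf_worker *pw = container_of(w, struct pf_worker, work);
		struct pf pf;

		/* Any of the N workers may pop any entry from the one shared
		 * ring, so a slow fault no longer blocks every fault queued
		 * behind it. */
		while (pf_queue_pop(&pw->xe->pf_queue, &pf))
			handle_fault(&pf);
	}
]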
>> Matt,
>>
>> There were a total of 4 pf_queues, each of size (total_num_eus +
>> XE_NUM_HW_ENGINES) * xe_pagefault_entry_size() * PF_MULTIPLIER, additionally
>> rounded up with roundup_pow_of_two().
>>
>> Each of these queues had a dedicated worker.
>>
>> There is a comment on the queue size calculation in
>> xe_pagefault_queue_init():
>>
>> "XXX: Multiplier required as compute UMD are getting PF queue errors
>> without it. Follow on why this multiplier is required."
>>
>> PF queue errors could be due to slow PF processing by the handler in the
>> KMD, plus PFs being generated for a single VM (ASID) and therefore constantly
>> hitting a single queue.
>>
>>
>> Now there is a single queue which is 4 times smaller overall, but it has 4
>> workers, and there are optimizations which potentially drastically decrease
>> processing time.
>>
>> In the end this could reduce to the case where a single queue has 4 workers
>> instead of one, which would still be faster than it is now.
>>
>> Still, I am not sure whether the queue size is now too small.
>>
>> Have you given this any thought?
>>
>>
>> And I think this XXX comment becomes obsolete with this change.
>>
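[Editorial note: to make the capacity question concrete, a rough
back-of-envelope sketch of the size formula quoted above. The EU count,
engine count, entry size, and multiplier below are made-up placeholders,
not the real driver constants.

	static unsigned int roundup_pow2(unsigned int v)
	{
		/* Next power of two (32-bit), mirroring what
		 * roundup_pow_of_two() does in the kernel. */
		v--;
		v |= v >> 1; v |= v >> 2; v |= v >> 4;
		v |= v >> 8; v |= v >> 16;
		return v + 1;
	}

	/* Placeholder numbers for illustration only. */
	unsigned int total_num_eus = 512, num_hw_engines = 64;
	unsigned int entry_size = 64, multiplier = 8;
	unsigned int one_queue = roundup_pow2((total_num_eus + num_hw_engines) *
					      entry_size * multiplier);

	/* Old scheme: 4 * one_queue bytes in total across 4 queues.
	 * New scheme: one_queue bytes in total, i.e. 4x less headroom,
	 * drained by 4 workers instead of 1 worker per queue. */
]
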
> I think the XXX comment was always wrong. We kept increasing the queue
> size because of random overflows, but the actual bug was that we didn’t
> round up to a power of two, and CIRC_SPACE relies on values being powers
> of two.
>
> I believe we never got around to deleting the XXX comment or removing
> the multiplier. We can clean both up in a follow-up after this series, as
> I’d like a large change like this to sit for a while so we can test and
> ensure there are no regressions.
>
> Matt
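
[Editorial note: for context on the power-of-two point, the CIRC_* helpers
in include/linux/circ_buf.h wrap indices by masking with (size - 1) rather
than taking a modulo, which is only correct when size is a power of two.
A standalone demonstration, not the kernel code itself:

	#define CIRC_CNT(head, tail, size) (((head) - (tail)) & ((size) - 1))
	#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

	/* size = 16 (power of two), head = 5, tail = 0:
	 * CIRC_SPACE(5, 0, 16) == 10, the expected free space. */

	/* size = 12 (not a power of two), head = 5, tail = 0:
	 * CIRC_SPACE(5, 0, 12) == 10, not the expected 6 -- the mask
	 * 0b1011 silently corrupts the free-space accounting, which
	 * matches the "random overflows" described above. */
]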

All right,

Reviewed-by: Maciej Patelczyk <maciej.patelczyk@intel.com>


>> Regards,
>>
>> Maciej
>>
>>

Thread overview: 33+ messages
2026-02-26  4:28 [PATCH v4 00/12] Fine grained fault locking, threaded prefetch, storm cache Matthew Brost
2026-02-26  4:28 ` [PATCH v4 01/12] drm/xe: Fine grained page fault locking Matthew Brost
2026-02-26  4:28 ` [PATCH v4 02/12] drm/xe: Allow prefetch-only VM bind IOCTLs to use VM read lock Matthew Brost
2026-02-26  4:28 ` [PATCH v4 03/12] drm/xe: Thread prefetch of SVM ranges Matthew Brost
2026-02-26  4:28 ` [PATCH v4 04/12] drm/xe: Use a single page-fault queue with multiple workers Matthew Brost
2026-05-06 15:46   ` Maciej Patelczyk
2026-05-06 19:42     ` Matthew Brost
2026-05-07 12:41       ` Maciej Patelczyk [this message]
2026-02-26  4:28 ` [PATCH v4 05/12] drm/xe: Add num_pf_work modparam Matthew Brost
2026-05-06 15:59   ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 06/12] drm/xe: Engine class and instance into a u8 Matthew Brost
2026-05-06 16:04   ` Maciej Patelczyk
2026-05-07 16:20     ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 07/12] drm/xe: Track pagefault worker runtime Matthew Brost
2026-05-07 12:51   ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 08/12] drm/xe: Chain page faults via queue-resident cache to avoid fault storms Matthew Brost
2026-05-08 12:03   ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 09/12] drm/xe: Add pagefault chaining stats Matthew Brost
2026-05-07 13:15   ` Maciej Patelczyk
2026-05-07 13:52     ` Francois Dugast
2026-02-26  4:28 ` [PATCH v4 10/12] drm/xe: Add debugfs pagefault_info Matthew Brost
2026-05-07 10:07   ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 11/12] drm/xe: batch CT pagefault acks with periodic flush Matthew Brost
2026-05-08  9:24   ` Maciej Patelczyk
2026-02-26  4:28 ` [PATCH v4 12/12] drm/xe: Track parallel page fault activity in GT stats Matthew Brost
2026-05-07 13:56   ` Maciej Patelczyk
2026-05-07 14:23     ` Francois Dugast
2026-02-26  4:35 ` ✗ CI.checkpatch: warning for Fine grained fault locking, threaded prefetch, storm cache (rev4) Patchwork
2026-02-26  4:36 ` ✓ CI.KUnit: success " Patchwork
2026-02-26  5:26 ` ✗ Xe.CI.BAT: failure " Patchwork
2026-02-26  8:59 ` ✗ Xe.CI.FULL: " Patchwork
2026-02-26 13:43 ` [PATCH v4 00/12] Fine grained fault locking, threaded prefetch, storm cache Thomas Hellström
2026-02-26 19:36   ` Matthew Brost
