Re: [PATCH] drm/xe: Convert GT stats to per-cpu counters

Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: Matthew Brost <matthew.brost@intel.com>
To: "Summers, Stuart" <stuart.summers@intel.com>
Cc: "intel-xe@lists.freedesktop.org" <intel-xe@lists.freedesktop.org>
Subject: Re: [PATCH] drm/xe: Convert GT stats to per-cpu counters
Date: Tue, 17 Feb 2026 15:18:18 -0800	[thread overview]
Message-ID: <aZT3OqKwKBFk6g99@lstrano-desk.jf.intel.com> (raw)
In-Reply-To: <874e96d6cad9793ae5f8d7cd8657cd1bb923a92c.camel@intel.com>

On Tue, Feb 17, 2026 at 04:13:14PM -0700, Summers, Stuart wrote:
> On Tue, 2026-02-17 at 12:05 -0800, Matthew Brost wrote:
> > Current GT statistics use atomic64_t counters. Atomic operations
> > incur
> > a global coherency penalty.
> > 
> > Transition to dynamic per-cpu counters using alloc_percpu(). This
> > allows
> > stats to be incremented via this_cpu_add(), which compiles to a
> > single
> > non-locking instruction. This approach keeps the hot-path updates
> > local
> > to the CPU, avoiding expensive cross-core cache invalidation traffic.
> > 
> > Use for_each_possible_cpu() during aggregation and clear operations
> > to
> > ensure data consistency across CPU hotplug events.
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_gt.c             |  5 +++
> >  drivers/gpu/drm/xe/xe_gt_stats.c       | 57 +++++++++++++++++++++---
> > --
> >  drivers/gpu/drm/xe/xe_gt_stats.h       |  6 +++
> >  drivers/gpu/drm/xe/xe_gt_stats_types.h | 19 +++++++++
> >  drivers/gpu/drm/xe/xe_gt_types.h       |  5 +--
> >  5 files changed, 78 insertions(+), 14 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> > index 68c4771de040..1203d087b68f 100644
> > --- a/drivers/gpu/drm/xe/xe_gt.c
> > +++ b/drivers/gpu/drm/xe/xe_gt.c
> > @@ -33,6 +33,7 @@
> >  #include "xe_gt_printk.h"
> >  #include "xe_gt_sriov_pf.h"
> >  #include "xe_gt_sriov_vf.h"
> > +#include "xe_gt_stats.h"
> >  #include "xe_gt_sysfs.h"
> >  #include "xe_gt_topology.h"
> >  #include "xe_guc_exec_queue_types.h"
> > @@ -455,6 +456,10 @@ int xe_gt_init_early(struct xe_gt *gt)
> >         if (err)
> >                 return err;
> >  
> > +       err = xe_gt_stats_init(gt);
> > +       if (err)
> > +               return err;
> > +
> >         CLASS(xe_force_wake, fw_ref)(gt_to_fw(gt), XE_FW_GT);
> >         if (!fw_ref.domains)
> >                 return -ETIMEDOUT;
> > diff --git a/drivers/gpu/drm/xe/xe_gt_stats.c
> > b/drivers/gpu/drm/xe/xe_gt_stats.c
> > index 37506434d7a3..f4bb0bf33995 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_stats.c
> > +++ b/drivers/gpu/drm/xe/xe_gt_stats.c
> > @@ -3,12 +3,37 @@
> >   * Copyright © 2024 Intel Corporation
> >   */
> >  
> > -#include <linux/atomic.h>
> > -
> > +#include <drm/drm_managed.h>
> >  #include <drm/drm_print.h>
> >  
> > +#include "xe_device.h"
> >  #include "xe_gt_stats.h"
> > -#include "xe_gt_types.h"
> > +
> > +static void xe_gt_stats_fini(struct drm_device *drm, void *arg)
> > +{
> > +       struct xe_gt *gt = arg;
> > +
> > +       free_percpu(gt->stats);
> > +}
> > +
> > +/**
> > + * xe_gt_stats_init() - Initialize GT statistics
> > + * @gt: GT structure
> > + *
> > + * Allocate per-CPU GT statistics. Using per-CPU stats allows
> > increments
> > + * to occur without cross-CPU atomics.
> > + *
> > + * Return: 0 on success, -ENOMEM on failure.
> > + */
> > +int xe_gt_stats_init(struct xe_gt *gt)
> > +{
> > +       gt->stats = alloc_percpu(struct xe_gt_stats);
> > +       if (!gt->stats)
> > +               return -ENOMEM;
> > +
> > +       return drmm_add_action_or_reset(&gt_to_xe(gt)->drm,
> > xe_gt_stats_fini,
> > +                                       gt);
> > +}
> >  
> >  /**
> >   * xe_gt_stats_incr - Increments the specified stats counter
> > @@ -23,7 +48,7 @@ void xe_gt_stats_incr(struct xe_gt *gt, const enum
> > xe_gt_stats_id id, int incr)
> >         if (id >= __XE_GT_STATS_NUM_IDS)
> >                 return;
> >  
> > -       atomic64_add(incr, &gt->stats.counters[id]);
> > +       this_cpu_add(gt->stats->counters[id], incr);
> >  }
> >  
> >  #define DEF_STAT_STR(ID, name) [XE_GT_STATS_ID_##ID] = name
> > @@ -94,9 +119,18 @@ int xe_gt_stats_print_info(struct xe_gt *gt,
> > struct drm_printer *p)
> >  {
> >         enum xe_gt_stats_id id;
> >  
> > -       for (id = 0; id < __XE_GT_STATS_NUM_IDS; ++id)
> > -               drm_printf(p, "%s: %lld\n", stat_description[id],
> > -                          atomic64_read(&gt->stats.counters[id]));
> > +       for (id = 0; id < __XE_GT_STATS_NUM_IDS; ++id) {
> > +               u64 total = 0;
> > +               int cpu;
> > +
> > +               for_each_possible_cpu(cpu) {
> > +                       struct xe_gt_stats *s = per_cpu_ptr(gt-
> > >stats, cpu);
> > +
> > +                       total += s->counters[id];
> > +               }
> > +
> > +               drm_printf(p, "%s: %lld\n", stat_description[id],
> > total);
> > +       }
> >  
> >         return 0;
> >  }
> > @@ -109,8 +143,11 @@ int xe_gt_stats_print_info(struct xe_gt *gt,
> > struct drm_printer *p)
> >   */
> >  void xe_gt_stats_clear(struct xe_gt *gt)
> >  {
> > -       int id;
> > +       int cpu;
> > +
> > +       for_each_possible_cpu(cpu) {
> > +               struct xe_gt_stats *s = per_cpu_ptr(gt->stats, cpu);
> >  
> > -       for (id = 0; id < ARRAY_SIZE(gt->stats.counters); ++id)
> > -               atomic64_set(&gt->stats.counters[id], 0);
> > +               memset(s, 0, sizeof(*s));
> 
> Do we need a lock around this and the total get above? I get that the
> CPU increments should be ok, but when we aggregate, isn't there a
> chance we're overwriting something here?
> 

Worst case your clear gets overwritten or total reads a somewhat stale
value. I think that is a fair tradeoff vs using atomics. We have started
to add these stats all over the place in hotpaths so not doing atomics
seems like a win given these are typically enable on production builds.

Matt

> Thanks,
> Stuart
> 
> > +       }
> >  }
> > diff --git a/drivers/gpu/drm/xe/xe_gt_stats.h
> > b/drivers/gpu/drm/xe/xe_gt_stats.h
> > index 59a7bf60e242..3d0defab9b30 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_stats.h
> > +++ b/drivers/gpu/drm/xe/xe_gt_stats.h
> > @@ -14,10 +14,16 @@ struct xe_gt;
> >  struct drm_printer;
> >  
> >  #ifdef CONFIG_DEBUG_FS
> > +int xe_gt_stats_init(struct xe_gt *gt);
> >  int xe_gt_stats_print_info(struct xe_gt *gt, struct drm_printer *p);
> >  void xe_gt_stats_clear(struct xe_gt *gt);
> >  void xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id
> > id, int incr);
> >  #else
> > +static inline int xe_gt_stats_init(struct xe_gt *gt)
> > +{
> > +       return 0;
> > +}
> > +
> >  static inline void
> >  xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id id,
> >                  int incr)
> > diff --git a/drivers/gpu/drm/xe/xe_gt_stats_types.h
> > b/drivers/gpu/drm/xe/xe_gt_stats_types.h
> > index b8accdbc54eb..79568591bd67 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_stats_types.h
> > +++ b/drivers/gpu/drm/xe/xe_gt_stats_types.h
> > @@ -6,6 +6,8 @@
> >  #ifndef _XE_GT_STATS_TYPES_H_
> >  #define _XE_GT_STATS_TYPES_H_
> >  
> > +#include <linux/types.h>
> > +
> >  enum xe_gt_stats_id {
> >         XE_GT_STATS_ID_SVM_PAGEFAULT_COUNT,
> >         XE_GT_STATS_ID_TLB_INVAL,
> > @@ -58,4 +60,21 @@ enum xe_gt_stats_id {
> >         __XE_GT_STATS_NUM_IDS,
> >  };
> >  
> > +/**
> > + * struct xe_gt_stats - Per-CPU GT statistics counters
> > + * @counters: Array of 64-bit counters indexed by &enum
> > xe_gt_stats_id
> > + *
> > + * This structure is used for high-frequency, per-CPU statistics
> > collection
> > + * in the Xe driver. By using a per-CPU allocation and ensuring the
> > structure
> > + * is cache-line aligned, we avoid the performance-heavy atomics and
> > cache
> > + * coherency traffic.
> > + *
> > + * Updates to these counters should be performed using the
> > this_cpu_add()
> > + * macro to ensure they are atomic with respect to local interrupts
> > and
> > + * preemption-safe without the overhead of explicit locking.
> > + */
> > +struct xe_gt_stats {
> > +       u64 counters[__XE_GT_STATS_NUM_IDS];
> > +} ____cacheline_aligned;
> > +
> >  #endif
> > diff --git a/drivers/gpu/drm/xe/xe_gt_types.h
> > b/drivers/gpu/drm/xe/xe_gt_types.h
> > index caf7e7e78be9..8b55cf25a75f 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_types.h
> > +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> > @@ -158,10 +158,7 @@ struct xe_gt {
> >  
> >  #if IS_ENABLED(CONFIG_DEBUG_FS)
> >         /** @stats: GT stats */
> > -       struct {
> > -               /** @stats.counters: counters for various GT stats */
> > -               atomic64_t counters[__XE_GT_STATS_NUM_IDS];
> > -       } stats;
> > +       struct xe_gt_stats __percpu *stats;
> >  #endif
> >  
> >         /**
>

next prev parent reply	other threads:[~2026-02-17 23:18 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-17 20:05 [PATCH] drm/xe: Convert GT stats to per-cpu counters Matthew Brost
2026-02-17 20:11 ` ✓ CI.KUnit: success for " Patchwork
2026-02-17 20:55 ` ✓ Xe.CI.BAT: " Patchwork
2026-02-17 23:13 ` [PATCH] " Summers, Stuart
2026-02-17 23:18   ` Matthew Brost [this message]
2026-02-17 23:22     ` Summers, Stuart
2026-02-18  0:16 ` ✗ Xe.CI.FULL: failure for " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aZT3OqKwKBFk6g99@lstrano-desk.jf.intel.com \
    --to=matthew.brost@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=stuart.summers@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox