Re: [PATCH 2/2] drm/i915/pmu: Add queued counter

From: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
To: "tursulin@ursulin.net" <tursulin@ursulin.net>
Cc: "Intel-gfx@lists.freedesktop.org" <Intel-gfx@lists.freedesktop.org>
Subject: Re: [PATCH 2/2] drm/i915/pmu: Add queued counter
Date: Wed, 22 Nov 2017 21:15:24 +0000	[thread overview]
Message-ID: <1511356368.15021.5.camel@intel.com> (raw)
In-Reply-To: <20171122124622.32638-2-tvrtko.ursulin@linux.intel.com>

On Wed, 2017-11-22 at 12:46 +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> We add a PMU counter to expose the number of requests currently submitted
> to the GPU, plus the number of runnable requests waiting on GPU time.
> 
> This is useful to analyze the overall load of the system.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_pmu.c | 30 +++++++++++++++++++++++++-----
>  include/uapi/drm/i915_drm.h     |  6 ++++++
>  2 files changed, 31 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> index 112243720ff3..b2b4b32af35f 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -36,7 +36,8 @@
>  #define ENGINE_SAMPLE_MASK \
>  	(BIT(I915_SAMPLE_BUSY) | \
>  	 BIT(I915_SAMPLE_WAIT) | \
> -	 BIT(I915_SAMPLE_SEMA))
> +	 BIT(I915_SAMPLE_SEMA) | \
> +	 BIT(I915_SAMPLE_QUEUED))
>  
>  #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS)
>  
> @@ -223,6 +224,12 @@ static void engines_sample(struct drm_i915_private *dev_priv)
>  
>  		update_sample(&engine->pmu.sample[I915_SAMPLE_SEMA],
>  			      PERIOD, !!(val & RING_WAIT_SEMAPHORE));
> +
> +		if (engine->pmu.enable & BIT(I915_SAMPLE_QUEUED))
> +			update_sample(&engine->pmu.sample[I915_SAMPLE_QUEUED],
> +				      1 / I915_SAMPLE_QUEUED_SCALE,
> +				      engine->queued +
> +				      (last_seqno - current_seqno));
>  	}
>  
>  	if (fw)
> @@ -310,6 +317,10 @@ static int engine_event_init(struct perf_event *event)
>  		if (INTEL_GEN(i915) < 6)
>  			return -ENODEV;
>  		break;
> +	case I915_SAMPLE_QUEUED:
> +		if (INTEL_GEN(i915) < 8)
> +			return -ENODEV;
> +		break;
>  	default:
>  		return -ENOENT;
>  	}
> @@ -399,6 +410,10 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
>  		} else if (sample == I915_SAMPLE_BUSY &&
>  			   engine->pmu.busy_stats) {
>  			val = ktime_to_ns(intel_engine_get_busy_time(engine));
> +		} else if (sample == I915_SAMPLE_QUEUED) {
> +			val =
> +			   div_u64(engine->pmu.sample[I915_SAMPLE_QUEUED].cur,
> +				   FREQUENCY);
>  		} else {
>  			val = engine->pmu.sample[sample].cur;
>  		}
> @@ -679,13 +694,18 @@ static ssize_t i915_pmu_event_show(struct device *dev,
>  	I915_EVENT_STR(_name.unit, _unit)
>  
>  #define I915_ENGINE_EVENT(_name, _class, _instance, _sample) \
> -	I915_EVENT_ATTR(_name, __I915_PMU_ENGINE(_class, _instance, _sample)), \
> +	I915_EVENT_ATTR(_name, __I915_PMU_ENGINE(_class, _instance, _sample))
> +
> +#define I915_ENGINE_EVENT_NS(_name, _class, _instance, _sample) \
> +	I915_ENGINE_EVENT(_name, _class, _instance, _sample), \
>  	I915_EVENT_STR(_name.unit, "ns")
>  
>  #define I915_ENGINE_EVENTS(_name, _class, _instance) \
> -	I915_ENGINE_EVENT(_name##_instance-busy, _class, _instance, I915_SAMPLE_BUSY), \
> -	I915_ENGINE_EVENT(_name##_instance-sema, _class, _instance, I915_SAMPLE_SEMA), \
> -	I915_ENGINE_EVENT(_name##_instance-wait, _class, _instance, I915_SAMPLE_WAIT)
> +	I915_ENGINE_EVENT_NS(_name##_instance-busy, _class, _instance, I915_SAMPLE_BUSY), \
> +	I915_ENGINE_EVENT_NS(_name##_instance-sema, _class, _instance, I915_SAMPLE_SEMA), \
> +	I915_ENGINE_EVENT_NS(_name##_instance-wait, _class, _instance, I915_SAMPLE_WAIT), \
> +	I915_ENGINE_EVENT(_name##_instance-queued, _class, _instance, I915_SAMPLE_QUEUED), \
> +	I915_EVENT_STR(_name##_instance-queued.scale, __stringify(I915_SAMPLE_QUEUED_SCALE))

We expose queued as an "instant" metric, i.e. it is the number of
requests at the very moment we query the metric, rather than an
ever-growing counter - is that right? I doubt such a metric will make
sense for perf-stat. Can we somehow restrict it so that it can be queried
through the uAPI only, and avoid exposing it to perf-stat?

>  
>  static struct attribute *i915_pmu_events_attrs[] = {
>  	I915_ENGINE_EVENTS(rcs, I915_ENGINE_CLASS_RENDER, 0),
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 915a6e85a855..20ee668d1428 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -111,9 +111,12 @@ enum drm_i915_pmu_engine_sample {
>  	I915_SAMPLE_BUSY = 0,
>  	I915_SAMPLE_WAIT = 1,
>  	I915_SAMPLE_SEMA = 2,
> +	I915_SAMPLE_QUEUED = 3,
>  	I915_ENGINE_SAMPLE_MAX /* non-ABI */
>  };
>  
> +#define I915_SAMPLE_QUEUED_SCALE 1e-2 /* No braces please. */
> +
>  #define I915_PMU_SAMPLE_BITS (4)
>  #define I915_PMU_SAMPLE_MASK (0xf)
>  #define I915_PMU_SAMPLE_INSTANCE_BITS (8)
> @@ -134,6 +137,9 @@ enum drm_i915_pmu_engine_sample {
>  #define I915_PMU_ENGINE_SEMA(class, instance) \
>  	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA)
>  
> +#define I915_PMU_ENGINE_QUEUED(class, instance) \
> +	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_QUEUED)
> +
>  #define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x))
>  
>  #define I915_PMU_ACTUAL_FREQUENCY	__I915_PMU_OTHER(0)

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx