Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
To: Francois Dugast <francois.dugast@intel.com>
Cc: intel-xe@lists.freedesktop.org, Rodrigo Vivi <rodrigo.vivi@intel.com>
Subject: Re: [Intel-xe] [PATCH v3 03/30] drm/xe: Correlate engine and cpu timestamps with better accuracy
Date: Tue, 26 Sep 2023 11:43:36 -0700	[thread overview]
Message-ID: <ZRMmWLusAMpwiGIr@unerlige-ril> (raw)
In-Reply-To: <20230926125540.7-4-francois.dugast@intel.com>

On Tue, Sep 26, 2023 at 12:55:13PM +0000, Francois Dugast wrote:
>From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>
>Perf measurements rely on CPU and engine timestamps to correlate
>events of interest across these time domains. Current mechanisms get
>these timestamps separately and the calculated delta between these
>timestamps lacks enough accuracy.
>
>To improve the accuracy of these time measurements to within a few us,
>add a query that returns the engine and cpu timestamps captured as
>close to each other as possible.
>
>Prior work: https://patchwork.freedesktop.org/series/87552/
>
>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>Signed-off-by: Francois Dugast <francois.dugast@intel.com>
>Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
>---
> drivers/gpu/drm/xe/xe_query.c | 141 ++++++++++++++++++++++++++++++++++
> include/uapi/drm/xe_drm.h     |  95 ++++++++++++++++++-----
> 2 files changed, 218 insertions(+), 18 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
>index cbccd5c3dbc8..be9affd4f181 100644
>--- a/drivers/gpu/drm/xe/xe_query.c
>+++ b/drivers/gpu/drm/xe/xe_query.c
>@@ -6,10 +6,12 @@
> #include "xe_query.h"
>
> #include <linux/nospec.h>
>+#include <linux/sched/clock.h>
>
> #include <drm/ttm/ttm_placement.h>
> #include <drm/xe_drm.h>
>
>+#include "regs/xe_engine_regs.h"
> #include "xe_bo.h"
> #include "xe_device.h"
> #include "xe_exec_queue.h"
>@@ -17,6 +19,7 @@
> #include "xe_gt.h"
> #include "xe_guc_hwconfig.h"
> #include "xe_macros.h"
>+#include "xe_mmio.h"
> #include "xe_ttm_vram_mgr.h"
>
> static const u16 xe_to_user_engine_class[] = {
>@@ -27,6 +30,14 @@ static const u16 xe_to_user_engine_class[] = {
> 	[XE_ENGINE_CLASS_COMPUTE] = DRM_XE_ENGINE_CLASS_COMPUTE,
> };
>
>+static const enum xe_engine_class user_to_xe_engine_class[] = {
>+	[DRM_XE_ENGINE_CLASS_RENDER] = XE_ENGINE_CLASS_RENDER,
>+	[DRM_XE_ENGINE_CLASS_COPY] = XE_ENGINE_CLASS_COPY,
>+	[DRM_XE_ENGINE_CLASS_VIDEO_DECODE] = XE_ENGINE_CLASS_VIDEO_DECODE,
>+	[DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE] = XE_ENGINE_CLASS_VIDEO_ENHANCE,
>+	[DRM_XE_ENGINE_CLASS_COMPUTE] = XE_ENGINE_CLASS_COMPUTE,
>+};
>+
> static size_t calc_hw_engine_info_size(struct xe_device *xe)
> {
> 	struct xe_hw_engine *hwe;
>@@ -45,6 +56,135 @@ static size_t calc_hw_engine_info_size(struct xe_device *xe)
> 	return i * sizeof(struct drm_xe_engine_class_instance);
> }
>
>+typedef u64 (*__ktime_func_t)(void);
>+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
>+{
>+	/*
>+	 * Use logic same as the perf subsystem to allow user to select the
>+	 * reference clock id to be used for timestamps.
>+	 */
>+	switch (clk_id) {
>+	case CLOCK_MONOTONIC:
>+		return &ktime_get_ns;
>+	case CLOCK_MONOTONIC_RAW:
>+		return &ktime_get_raw_ns;
>+	case CLOCK_REALTIME:
>+		return &ktime_get_real_ns;
>+	case CLOCK_BOOTTIME:
>+		return &ktime_get_boottime_ns;
>+	case CLOCK_TAI:
>+		return &ktime_get_clocktai_ns;
>+	default:
>+		return NULL;
>+	}
>+}
>+
>+static void
>+__read_timestamps(struct xe_gt *gt,
>+		  struct xe_reg lower_reg,
>+		  struct xe_reg upper_reg,
>+		  u64 *cs_ts,
>+		  u64 *cpu_ts,
>+		  u64 *cpu_delta,
>+		  __ktime_func_t cpu_clock)
>+{
>+	u32 upper, lower, old_upper, loop = 0;
>+
>+	upper = xe_mmio_read32(gt, upper_reg);
>+	do {
>+		*cpu_delta = local_clock();
>+		*cpu_ts = cpu_clock();
>+		lower = xe_mmio_read32(gt, lower_reg);
>+		*cpu_delta = local_clock() - *cpu_delta;
>+		old_upper = upper;
>+		upper = xe_mmio_read32(gt, upper_reg);
>+	} while (upper != old_upper && loop++ < 2);
>+
>+	*cs_ts = (u64)upper << 32 | lower;
>+}
>+
>+static int
>+query_cs_cycles(struct xe_device *xe,
>+		struct drm_xe_device_query *query)
>+{
>+	struct drm_xe_query_cs_cycles __user *query_ptr;
>+	struct drm_xe_engine_class_instance *eci;
>+	struct drm_xe_query_cs_cycles resp;
>+	size_t size = sizeof(resp);
>+	__ktime_func_t cpu_clock;
>+	struct xe_hw_engine *hwe;
>+	struct xe_gt *gt;
>+
>+	if (query->size == 0) {
>+		query->size = size;
>+		return 0;
>+	} else if (XE_IOCTL_DBG(xe, query->size != size)) {
>+		return -EINVAL;
>+	}
>+
>+	query_ptr = u64_to_user_ptr(query->data);
>+	if (copy_from_user(&resp, query_ptr, size))
>+		return -EFAULT;
>+
>+	if (resp.rsvd)
>+		return -EINVAL;
>+
>+	cpu_clock = __clock_id_to_func(resp.clockid);
>+	if (!cpu_clock)
>+		return -EINVAL;
>+
>+	eci = &resp.eci;
>+	if (eci->gt_id > XE_MAX_GT_PER_TILE)
>+		return -EINVAL;
>+
>+	gt = xe_device_get_gt(xe, eci->gt_id);
>+	if (!gt)
>+		return -EINVAL;
>+
>+	if (eci->engine_class >= ARRAY_SIZE(user_to_xe_engine_class))
>+		return -EINVAL;
>+
>+	hwe = xe_gt_hw_engine(gt, user_to_xe_engine_class[eci->engine_class],
>+			      eci->engine_instance, true);
>+	if (!hwe)
>+		return -EINVAL;
>+
>+	resp.cs_frequency = gt->info.clock_freq;
>+
>+	xe_device_mem_access_get(xe);
>+	xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
>+
>+	__read_timestamps(gt,
>+			  RING_TIMESTAMP(hwe->mmio_base),
>+			  RING_TIMESTAMP_UDW(hwe->mmio_base),
>+			  &resp.cs_cycles,
>+			  &resp.cpu_timestamp,
>+			  &resp.cpu_delta,
>+			  cpu_clock);
>+
>+	xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL);
>+	xe_device_mem_access_put(xe);
>+	resp.width = 36;
>+
>+	/* Only write to the output fields of user query */
>+	if (put_user(resp.cs_frequency, &query_ptr->cs_frequency))
>+		return -EFAULT;
>+
>+	if (put_user(resp.cpu_timestamp, &query_ptr->cpu_timestamp))
>+		return -EFAULT;
>+
>+	if (put_user(resp.cpu_delta, &query_ptr->cpu_delta))
>+		return -EFAULT;
>+
>+	if (put_user(resp.cs_cycles, &query_ptr->cs_cycles))
>+		return -EFAULT;
>+
>+	if (put_user(resp.width, &query_ptr->width))
>+		return -EFAULT;
>+
>+	return 0;
>+}
>+
> static int query_engines(struct xe_device *xe,
> 			 struct drm_xe_device_query *query)
> {
>@@ -369,6 +509,7 @@ static int (* const xe_query_funcs[])(struct xe_device *xe,
> 	query_gts,
> 	query_hwconfig,
> 	query_gt_topology,
>+	query_cs_cycles,
> };
>
> int xe_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
>index d48d8e3c898c..fde1378a60b9 100644
>--- a/include/uapi/drm/xe_drm.h
>+++ b/include/uapi/drm/xe_drm.h
>@@ -128,6 +128,24 @@ struct xe_user_extension {
> #define DRM_IOCTL_XE_WAIT_USER_FENCE		DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_WAIT_USER_FENCE, struct drm_xe_wait_user_fence)
> #define DRM_IOCTL_XE_VM_MADVISE			 DRM_IOW(DRM_COMMAND_BASE + DRM_XE_VM_MADVISE, struct drm_xe_vm_madvise)
>
>+/** struct drm_xe_engine_class_instance - instance of an engine class */
>+struct drm_xe_engine_class_instance {
>+#define DRM_XE_ENGINE_CLASS_RENDER		0
>+#define DRM_XE_ENGINE_CLASS_COPY		1
>+#define DRM_XE_ENGINE_CLASS_VIDEO_DECODE	2
>+#define DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE	3
>+#define DRM_XE_ENGINE_CLASS_COMPUTE		4
>+	/*
>+	 * Kernel only class (not actual hardware engine class). Used for
>+	 * creating ordered queues of VM bind operations.
>+	 */
>+#define DRM_XE_ENGINE_CLASS_VM_BIND		5
>+	__u16 engine_class;
>+
>+	__u16 engine_instance;
>+	__u16 gt_id;
>+};
>+
> /**
>  * enum drm_xe_memory_class - Supported memory classes.
>  */
>@@ -219,6 +237,64 @@ struct drm_xe_query_mem_region {
> 	__u64 reserved[6];
> };
>
>+/**
>+ * struct drm_xe_query_cs_cycles - correlate CPU and GPU timestamps
>+ *
>+ * If a query is made with a struct drm_xe_device_query where .query
>+ * is equal to DRM_XE_QUERY_CS_CYCLES, then the reply uses
>+ * struct drm_xe_query_cs_cycles in .data.
>+ *
>+ * struct drm_xe_query_cs_cycles is allocated by the user and .data points to
>+ * this allocated structure. The user must pass .eci and .clockid as inputs to
>+ * this query. eci determines the engine and tile info required to fetch the
>+ * relevant GPU timestamp. clockid is used to return the specific CPU
>+ * timestamp.
>+ *
>+ * The query returns the command streamer cycles and the frequency that can
>+ * be used to calculate the command streamer timestamp. In addition the
>+ * query returns a set of cpu timestamps that indicate when the command
>+ * streamer cycle count was captured.
>+ */
>+struct drm_xe_query_cs_cycles {
>+	/** Engine for which command streamer cycles is queried. */
>+	struct drm_xe_engine_class_instance eci;
>+
>+	/** MBZ (pad eci to 64 bit) */
>+	__u16 rsvd;
>+
>+	/**
>+	 * Command streamer cycles as read from the command streamer
>+	 * register at 0x358 offset.
>+	 */
>+	__u64 cs_cycles;
>+
>+	/** Frequency of the cs cycles in Hz. */
>+	__u64 cs_frequency;
>+
>+	/**
>+	 * CPU timestamp in ns. The timestamp is captured before reading the
>+	 * cs_cycles register using the reference clockid set by the user.
>+	 */
>+	__u64 cpu_timestamp;
>+
>+	/**
>+	 * Time delta in ns captured around reading the lower dword of the
>+	 * cs_cycles register.
>+	 */
>+	__u64 cpu_delta;
>+
>+	/**
>+	 * Reference clock id for CPU timestamp. For definition, see
>+	 * clock_gettime(2) and perf_event_open(2). Supported clock ids are
>+	 * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME,
>+	 * CLOCK_TAI.
>+	 */
>+	__s32 clockid;
>+
>+	/** Width of the cs cycle counter in bits. */
>+	__u32 width;
>+};
>+
> /**
>  * struct drm_xe_query_mem_usage - describe memory regions and usage
>  *
>@@ -391,6 +467,7 @@ struct drm_xe_device_query {
> #define DRM_XE_DEVICE_QUERY_GTS		3
> #define DRM_XE_DEVICE_QUERY_HWCONFIG	4
> #define DRM_XE_DEVICE_QUERY_GT_TOPOLOGY	5
>+#define DRM_XE_QUERY_CS_CYCLES		6
> 	/** @query: The type of data to query */
> 	__u32 query;
>
>@@ -732,24 +809,6 @@ struct drm_xe_exec_queue_set_property {
> 	__u64 reserved[2];
> };
>
>-/** struct drm_xe_engine_class_instance - instance of an engine class */
>-struct drm_xe_engine_class_instance {
>-#define DRM_XE_ENGINE_CLASS_RENDER		0
>-#define DRM_XE_ENGINE_CLASS_COPY		1
>-#define DRM_XE_ENGINE_CLASS_VIDEO_DECODE	2
>-#define DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE	3
>-#define DRM_XE_ENGINE_CLASS_COMPUTE		4
>-	/*
>-	 * Kernel only class (not actual hardware engine class). Used for
>-	 * creating ordered queues of VM bind operations.
>-	 */
>-#define DRM_XE_ENGINE_CLASS_VM_BIND		5
>-	__u16 engine_class;
>-
>-	__u16 engine_instance;
>-	__u16 gt_id;

Patch 26/30 (drm/xe/uapi: Add pad to drm_xe_engine_class_instance),
which adds a pad to this struct, should appear before this patch in
the series.

Thanks,
Umesh

>-};
>-
> struct drm_xe_exec_queue_create {
> #define XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY               0
> 	/** @extensions: Pointer to the first extension struct, if any */
>-- 
>2.34.1
>

  parent reply	other threads:[~2023-09-26 18:43 UTC|newest]

Thread overview: 60+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-26 12:55 [Intel-xe] [PATCH v3 00/30] uAPI Alignment - take 1 v3 Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 01/30] drm/xe: Fix array bounds check for queries Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 02/30] drm/xe: Set the correct type for xe_to_user_engine_class Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 03/30] drm/xe: Correlate engine and cpu timestamps with better accuracy Francois Dugast
2023-09-26 16:42   ` Souza, Jose
2023-09-26 18:43   ` Umesh Nerlige Ramappa [this message]
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 04/30] drm/xe/uapi: Separate VM_BIND's operation and flag Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 05/30] drm/xe/vm: Remove VM_BIND_OP macro Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 06/30] drm/xe/uapi: Remove MMIO ioctl Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 07/30] drm/xe: Fix xe_exec_queue_is_idle for parallel exec queues Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 08/30] drm/xe: Deprecate XE_EXEC_QUEUE_SET_PROPERTY_COMPUTE_MODE implementation Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 09/30] drm/xe: Rename exec_queue_kill_compute to xe_vm_remove_compute_exec_queue Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 10/30] drm/xe: Remove XE_EXEC_QUEUE_SET_PROPERTY_COMPUTE_MODE from uAPI Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 11/30] drm/xe/uapi: Use common drm_xe_ext_set_property extension Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 12/30] drm/xe: Kill XE_VM_PROPERTY_BIND_OP_ERROR_CAPTURE_ADDRESS extension Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 13/30] drm/xe/uapi: Kill DRM_XE_UFENCE_WAIT_VM_ERROR Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 14/30] drm/xe: Remove async worker and rework sync binds Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 15/30] drm/xe: Fix VM bind out-sync signaling ordering Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 16/30] drm/xe/uapi: Document drm_xe_query_gt Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 17/30] drm/xe/uapi: Replace useless 'instance' per unique gt_id Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 18/30] drm/xe/uapi: Remove unused field of drm_xe_query_gt Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 19/30] drm/xe/uapi: Rename gts to gt_list Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 20/30] drm/xe/uapi: Fix naming of XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 21/30] drm/xe/uapi: Add documentation for query Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 22/30] drm/xe/uapi: Crystal Reference Clock updates Francois Dugast
2023-09-26 16:40   ` Souza, Jose
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 23/30] drm/xe: Extend drm_xe_vm_bind_op Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 24/30] drm/xe: Add uAPI to query micro-controler firmware version Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 25/30] drm/xe/uapi: Document DRM_XE_DEVICE_QUERY_HWCONFIG Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 26/30] drm/xe/uapi: Add pad to drm_xe_engine_class_instance Francois Dugast
2023-09-29  0:36   ` Umesh Nerlige Ramappa
2023-09-29  9:06     ` Francois Dugast
2023-09-29 16:00       ` Umesh Nerlige Ramappa
2023-09-29 16:45         ` Souza, Jose
2023-10-03 18:15           ` Umesh Nerlige Ramappa
2023-10-04 10:55             ` Francois Dugast
2023-10-05  2:35               ` Umesh Nerlige Ramappa
2023-10-09 17:05                 ` Umesh Nerlige Ramappa
2023-10-09 17:16                   ` Francois Dugast
2023-10-06  2:07               ` Umesh Nerlige Ramappa
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 27/30] drm/xe: Extend uAPI to query HuC micro-controler firmware version Francois Dugast
2023-09-26 16:46   ` Souza, Jose
2023-09-27 17:04     ` Rodrigo Vivi
2023-09-27 17:22       ` Souza, Jose
2023-10-04  0:48         ` John Harrison
2023-10-09 13:08           ` Francois Dugast
2023-10-09 13:35             ` Souza, Jose
2023-10-10 19:10   ` Lucas De Marchi
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 28/30] drm/xe: Remove useless query config num_params Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 29/30] drm/xe/uapi: Add missing DRM_ prefix in uAPI constants Francois Dugast
2023-09-26 16:24   ` Souza, Jose
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 30/30] drm/xe/uapi: Add _FLAG to uAPI constants usable for flags Francois Dugast
2023-09-26 13:12 ` [Intel-xe] ✓ CI.Patch_applied: success for uAPI Alignment - take 1 v3 Patchwork
2023-09-26 13:13 ` [Intel-xe] ✗ CI.checkpatch: warning " Patchwork
2023-09-26 13:14 ` [Intel-xe] ✓ CI.KUnit: success " Patchwork
2023-09-26 13:21 ` [Intel-xe] ✓ CI.Build: " Patchwork
2023-09-26 13:22 ` [Intel-xe] ✗ CI.Hooks: failure " Patchwork
2023-09-26 13:23 ` [Intel-xe] ✓ CI.checksparse: success " Patchwork
2023-09-26 13:49 ` [Intel-xe] ✗ CI.BAT: failure " Patchwork
2023-10-04  0:31 ` [Intel-xe] [PATCH v3 00/30] " John Harrison

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZRMmWLusAMpwiGIr@unerlige-ril \
    --to=umesh.nerlige.ramappa@intel.com \
    --cc=francois.dugast@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=rodrigo.vivi@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox