All of lore.kernel.org
 help / color / mirror / Atom feed
From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
To: Francois Dugast <francois.dugast@intel.com>
Cc: intel-xe@lists.freedesktop.org, Rodrigo Vivi <rodrigo.vivi@intel.com>
Subject: Re: [Intel-xe] [PATCH v3 03/30] drm/xe: Correlate engine and cpu timestamps with better accuracy
Date: Tue, 26 Sep 2023 11:43:36 -0700	[thread overview]
Message-ID: <ZRMmWLusAMpwiGIr@unerlige-ril> (raw)
In-Reply-To: <20230926125540.7-4-francois.dugast@intel.com>

On Tue, Sep 26, 2023 at 12:55:13PM +0000, Francois Dugast wrote:
>From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>
>Perf measurements rely on CPU and engine timestamps to correlate
>events of interest across these time domains. Current mechanisms get
>these timestamps separately and the calculated delta between these
>timestamps lacks enough accuracy.
>
>To improve the accuracy of these time measurements to within a few us,
>add a query that returns the engine and cpu timestamps captured as
>close to each other as possible.
>
>Prior work: https://patchwork.freedesktop.org/series/87552/
>
>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>Signed-off-by: Francois Dugast <francois.dugast@intel.com>
>Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
>---
> drivers/gpu/drm/xe/xe_query.c | 141 ++++++++++++++++++++++++++++++++++
> include/uapi/drm/xe_drm.h     |  95 ++++++++++++++++++-----
> 2 files changed, 218 insertions(+), 18 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
>index cbccd5c3dbc8..be9affd4f181 100644
>--- a/drivers/gpu/drm/xe/xe_query.c
>+++ b/drivers/gpu/drm/xe/xe_query.c
>@@ -6,10 +6,12 @@
> #include "xe_query.h"
>
> #include <linux/nospec.h>
>+#include <linux/sched/clock.h>
>
> #include <drm/ttm/ttm_placement.h>
> #include <drm/xe_drm.h>
>
>+#include "regs/xe_engine_regs.h"
> #include "xe_bo.h"
> #include "xe_device.h"
> #include "xe_exec_queue.h"
>@@ -17,6 +19,7 @@
> #include "xe_gt.h"
> #include "xe_guc_hwconfig.h"
> #include "xe_macros.h"
>+#include "xe_mmio.h"
> #include "xe_ttm_vram_mgr.h"
>
> static const u16 xe_to_user_engine_class[] = {
>@@ -27,6 +30,14 @@ static const u16 xe_to_user_engine_class[] = {
> 	[XE_ENGINE_CLASS_COMPUTE] = DRM_XE_ENGINE_CLASS_COMPUTE,
> };
>
>+static const enum xe_engine_class user_to_xe_engine_class[] = {
>+	[DRM_XE_ENGINE_CLASS_RENDER] = XE_ENGINE_CLASS_RENDER,
>+	[DRM_XE_ENGINE_CLASS_COPY] = XE_ENGINE_CLASS_COPY,
>+	[DRM_XE_ENGINE_CLASS_VIDEO_DECODE] = XE_ENGINE_CLASS_VIDEO_DECODE,
>+	[DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE] = XE_ENGINE_CLASS_VIDEO_ENHANCE,
>+	[DRM_XE_ENGINE_CLASS_COMPUTE] = XE_ENGINE_CLASS_COMPUTE,
>+};
>+
> static size_t calc_hw_engine_info_size(struct xe_device *xe)
> {
> 	struct xe_hw_engine *hwe;
>@@ -45,6 +56,135 @@ static size_t calc_hw_engine_info_size(struct xe_device *xe)
> 	return i * sizeof(struct drm_xe_engine_class_instance);
> }
>
>+typedef u64 (*__ktime_func_t)(void);
>+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
>+{
>+	/*
>+	 * Use logic same as the perf subsystem to allow user to select the
>+	 * reference clock id to be used for timestamps.
>+	 */
>+	switch (clk_id) {
>+	case CLOCK_MONOTONIC:
>+		return &ktime_get_ns;
>+	case CLOCK_MONOTONIC_RAW:
>+		return &ktime_get_raw_ns;
>+	case CLOCK_REALTIME:
>+		return &ktime_get_real_ns;
>+	case CLOCK_BOOTTIME:
>+		return &ktime_get_boottime_ns;
>+	case CLOCK_TAI:
>+		return &ktime_get_clocktai_ns;
>+	default:
>+		return NULL;
>+	}
>+}
>+
>+static void
>+__read_timestamps(struct xe_gt *gt,
>+		  struct xe_reg lower_reg,
>+		  struct xe_reg upper_reg,
>+		  u64 *cs_ts,
>+		  u64 *cpu_ts,
>+		  u64 *cpu_delta,
>+		  __ktime_func_t cpu_clock)
>+{
>+	u32 upper, lower, old_upper, loop = 0;
>+
>+	upper = xe_mmio_read32(gt, upper_reg);
>+	do {
>+		*cpu_delta = local_clock();
>+		*cpu_ts = cpu_clock();
>+		lower = xe_mmio_read32(gt, lower_reg);
>+		*cpu_delta = local_clock() - *cpu_delta;
>+		old_upper = upper;
>+		upper = xe_mmio_read32(gt, upper_reg);
>+	} while (upper != old_upper && loop++ < 2);
>+
>+	*cs_ts = (u64)upper << 32 | lower;
>+}
>+
>+static int
>+query_cs_cycles(struct xe_device *xe,
>+		struct drm_xe_device_query *query)
>+{
>+	struct drm_xe_query_cs_cycles __user *query_ptr;
>+	struct drm_xe_engine_class_instance *eci;
>+	struct drm_xe_query_cs_cycles resp;
>+	size_t size = sizeof(resp);
>+	__ktime_func_t cpu_clock;
>+	struct xe_hw_engine *hwe;
>+	struct xe_gt *gt;
>+
>+	if (query->size == 0) {
>+		query->size = size;
>+		return 0;
>+	} else if (XE_IOCTL_DBG(xe, query->size != size)) {
>+		return -EINVAL;
>+	}
>+
>+	query_ptr = u64_to_user_ptr(query->data);
>+	if (copy_from_user(&resp, query_ptr, size))
>+		return -EFAULT;
>+
>+	if (resp.rsvd)
>+		return -EINVAL;
>+
>+	cpu_clock = __clock_id_to_func(resp.clockid);
>+	if (!cpu_clock)
>+		return -EINVAL;
>+
>+	eci = &resp.eci;
>+	if (eci->gt_id > XE_MAX_GT_PER_TILE)
>+		return -EINVAL;
>+
>+	gt = xe_device_get_gt(xe, eci->gt_id);
>+	if (!gt)
>+		return -EINVAL;
>+
>+	if (eci->engine_class >= ARRAY_SIZE(user_to_xe_engine_class))
>+		return -EINVAL;
>+
>+	hwe = xe_gt_hw_engine(gt, user_to_xe_engine_class[eci->engine_class],
>+			      eci->engine_instance, true);
>+	if (!hwe)
>+		return -EINVAL;
>+
>+	resp.cs_frequency = gt->info.clock_freq;
>+
>+	xe_device_mem_access_get(xe);
>+	xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
>+
>+	__read_timestamps(gt,
>+			  RING_TIMESTAMP(hwe->mmio_base),
>+			  RING_TIMESTAMP_UDW(hwe->mmio_base),
>+			  &resp.cs_cycles,
>+			  &resp.cpu_timestamp,
>+			  &resp.cpu_delta,
>+			  cpu_clock);
>+
>+	xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL);
>+	xe_device_mem_access_put(xe);
>+	resp.width = 36;
>+
>+	/* Only write to the output fields of user query */
>+	if (put_user(resp.cs_frequency, &query_ptr->cs_frequency))
>+		return -EFAULT;
>+
>+	if (put_user(resp.cpu_timestamp, &query_ptr->cpu_timestamp))
>+		return -EFAULT;
>+
>+	if (put_user(resp.cpu_delta, &query_ptr->cpu_delta))
>+		return -EFAULT;
>+
>+	if (put_user(resp.cs_cycles, &query_ptr->cs_cycles))
>+		return -EFAULT;
>+
>+	if (put_user(resp.width, &query_ptr->width))
>+		return -EFAULT;
>+
>+	return 0;
>+}
>+
> static int query_engines(struct xe_device *xe,
> 			 struct drm_xe_device_query *query)
> {
>@@ -369,6 +509,7 @@ static int (* const xe_query_funcs[])(struct xe_device *xe,
> 	query_gts,
> 	query_hwconfig,
> 	query_gt_topology,
>+	query_cs_cycles,
> };
>
> int xe_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
>index d48d8e3c898c..fde1378a60b9 100644
>--- a/include/uapi/drm/xe_drm.h
>+++ b/include/uapi/drm/xe_drm.h
>@@ -128,6 +128,24 @@ struct xe_user_extension {
> #define DRM_IOCTL_XE_WAIT_USER_FENCE		DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_WAIT_USER_FENCE, struct drm_xe_wait_user_fence)
> #define DRM_IOCTL_XE_VM_MADVISE			 DRM_IOW(DRM_COMMAND_BASE + DRM_XE_VM_MADVISE, struct drm_xe_vm_madvise)
>
>+/** struct drm_xe_engine_class_instance - instance of an engine class */
>+struct drm_xe_engine_class_instance {
>+#define DRM_XE_ENGINE_CLASS_RENDER		0
>+#define DRM_XE_ENGINE_CLASS_COPY		1
>+#define DRM_XE_ENGINE_CLASS_VIDEO_DECODE	2
>+#define DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE	3
>+#define DRM_XE_ENGINE_CLASS_COMPUTE		4
>+	/*
>+	 * Kernel only class (not actual hardware engine class). Used for
>+	 * creating ordered queues of VM bind operations.
>+	 */
>+#define DRM_XE_ENGINE_CLASS_VM_BIND		5
>+	__u16 engine_class;
>+
>+	__u16 engine_instance;
>+	__u16 gt_id;
>+};
>+
> /**
>  * enum drm_xe_memory_class - Supported memory classes.
>  */
>@@ -219,6 +237,64 @@ struct drm_xe_query_mem_region {
> 	__u64 reserved[6];
> };
>
>+/**
>+ * struct drm_xe_query_cs_cycles - correlate CPU and GPU timestamps
>+ *
>+ * If a query is made with a struct drm_xe_device_query where .query
>+ * is equal to DRM_XE_QUERY_CS_CYCLES, then the reply uses
>+ * struct drm_xe_query_cs_cycles in .data.
>+ *
>+ * struct drm_xe_query_cs_cycles is allocated by the user and .data points to
>+ * this allocated structure. The user must pass .eci and .clockid as inputs to
>+ * this query. eci determines the engine and tile info required to fetch the
>+ * relevant GPU timestamp. clockid is used to return the specific CPU
>+ * timestamp.
>+ *
>+ * The query returns the command streamer cycles and the frequency that can
>+ * be used to calculate the command streamer timestamp. In addition the
>+ * query returns a set of cpu timestamps that indicate when the command
>+ * streamer cycle count was captured.
>+ */
>+struct drm_xe_query_cs_cycles {
>+	/** Engine for which command streamer cycles is queried. */
>+	struct drm_xe_engine_class_instance eci;
>+
>+	/** MBZ (pad eci to 64 bit) */
>+	__u16 rsvd;
>+
>+	/**
>+	 * Command streamer cycles as read from the command streamer
>+	 * register at 0x358 offset.
>+	 */
>+	__u64 cs_cycles;
>+
>+	/** Frequency of the cs cycles in Hz. */
>+	__u64 cs_frequency;
>+
>+	/**
>+	 * CPU timestamp in ns. The timestamp is captured before reading the
>+	 * cs_cycles register using the reference clockid set by the user.
>+	 */
>+	__u64 cpu_timestamp;
>+
>+	/**
>+	 * Time delta in ns captured around reading the lower dword of the
>+	 * cs_cycles register.
>+	 */
>+	__u64 cpu_delta;
>+
>+	/**
>+	 * Reference clock id for CPU timestamp. For definition, see
>+	 * clock_gettime(2) and perf_event_open(2). Supported clock ids are
>+	 * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME,
>+	 * CLOCK_TAI.
>+	 */
>+	__s32 clockid;
>+
>+	/** Width of the cs cycle counter in bits. */
>+	__u32 width;
>+};
>+
> /**
>  * struct drm_xe_query_mem_usage - describe memory regions and usage
>  *
>@@ -391,6 +467,7 @@ struct drm_xe_device_query {
> #define DRM_XE_DEVICE_QUERY_GTS		3
> #define DRM_XE_DEVICE_QUERY_HWCONFIG	4
> #define DRM_XE_DEVICE_QUERY_GT_TOPOLOGY	5
>+#define DRM_XE_QUERY_CS_CYCLES		6
> 	/** @query: The type of data to query */
> 	__u32 query;
>
>@@ -732,24 +809,6 @@ struct drm_xe_exec_queue_set_property {
> 	__u64 reserved[2];
> };
>
>-/** struct drm_xe_engine_class_instance - instance of an engine class */
>-struct drm_xe_engine_class_instance {
>-#define DRM_XE_ENGINE_CLASS_RENDER		0
>-#define DRM_XE_ENGINE_CLASS_COPY		1
>-#define DRM_XE_ENGINE_CLASS_VIDEO_DECODE	2
>-#define DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE	3
>-#define DRM_XE_ENGINE_CLASS_COMPUTE		4
>-	/*
>-	 * Kernel only class (not actual hardware engine class). Used for
>-	 * creating ordered queues of VM bind operations.
>-	 */
>-#define DRM_XE_ENGINE_CLASS_VM_BIND		5
>-	__u16 engine_class;
>-
>-	__u16 engine_instance;
>-	__u16 gt_id;

Patch 26/30 (drm/xe/uapi: Add pad to drm_xe_engine_class_instance), which
adds a pad here, should appear before this patch in the series.

Thanks,
Umesh

>-};
>-
> struct drm_xe_exec_queue_create {
> #define XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY               0
> 	/** @extensions: Pointer to the first extension struct, if any */
>-- 
>2.34.1
>

  parent reply	other threads:[~2023-09-26 18:43 UTC|newest]

Thread overview: 60+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-26 12:55 [Intel-xe] [PATCH v3 00/30] uAPI Alignment - take 1 v3 Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 01/30] drm/xe: Fix array bounds check for queries Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 02/30] drm/xe: Set the correct type for xe_to_user_engine_class Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 03/30] drm/xe: Correlate engine and cpu timestamps with better accuracy Francois Dugast
2023-09-26 16:42   ` Souza, Jose
2023-09-26 18:43   ` Umesh Nerlige Ramappa [this message]
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 04/30] drm/xe/uapi: Separate VM_BIND's operation and flag Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 05/30] drm/xe/vm: Remove VM_BIND_OP macro Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 06/30] drm/xe/uapi: Remove MMIO ioctl Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 07/30] drm/xe: Fix xe_exec_queue_is_idle for parallel exec queues Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 08/30] drm/xe: Deprecate XE_EXEC_QUEUE_SET_PROPERTY_COMPUTE_MODE implementation Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 09/30] drm/xe: Rename exec_queue_kill_compute to xe_vm_remove_compute_exec_queue Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 10/30] drm/xe: Remove XE_EXEC_QUEUE_SET_PROPERTY_COMPUTE_MODE from uAPI Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 11/30] drm/xe/uapi: Use common drm_xe_ext_set_property extension Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 12/30] drm/xe: Kill XE_VM_PROPERTY_BIND_OP_ERROR_CAPTURE_ADDRESS extension Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 13/30] drm/xe/uapi: Kill DRM_XE_UFENCE_WAIT_VM_ERROR Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 14/30] drm/xe: Remove async worker and rework sync binds Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 15/30] drm/xe: Fix VM bind out-sync signaling ordering Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 16/30] drm/xe/uapi: Document drm_xe_query_gt Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 17/30] drm/xe/uapi: Replace useless 'instance' per unique gt_id Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 18/30] drm/xe/uapi: Remove unused field of drm_xe_query_gt Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 19/30] drm/xe/uapi: Rename gts to gt_list Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 20/30] drm/xe/uapi: Fix naming of XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 21/30] drm/xe/uapi: Add documentation for query Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 22/30] drm/xe/uapi: Crystal Reference Clock updates Francois Dugast
2023-09-26 16:40   ` Souza, Jose
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 23/30] drm/xe: Extend drm_xe_vm_bind_op Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 24/30] drm/xe: Add uAPI to query micro-controler firmware version Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 25/30] drm/xe/uapi: Document DRM_XE_DEVICE_QUERY_HWCONFIG Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 26/30] drm/xe/uapi: Add pad to drm_xe_engine_class_instance Francois Dugast
2023-09-29  0:36   ` Umesh Nerlige Ramappa
2023-09-29  9:06     ` Francois Dugast
2023-09-29 16:00       ` Umesh Nerlige Ramappa
2023-09-29 16:45         ` Souza, Jose
2023-10-03 18:15           ` Umesh Nerlige Ramappa
2023-10-04 10:55             ` Francois Dugast
2023-10-05  2:35               ` Umesh Nerlige Ramappa
2023-10-09 17:05                 ` Umesh Nerlige Ramappa
2023-10-09 17:16                   ` Francois Dugast
2023-10-06  2:07               ` Umesh Nerlige Ramappa
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 27/30] drm/xe: Extend uAPI to query HuC micro-controler firmware version Francois Dugast
2023-09-26 16:46   ` Souza, Jose
2023-09-27 17:04     ` Rodrigo Vivi
2023-09-27 17:22       ` Souza, Jose
2023-10-04  0:48         ` John Harrison
2023-10-09 13:08           ` Francois Dugast
2023-10-09 13:35             ` Souza, Jose
2023-10-10 19:10   ` Lucas De Marchi
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 28/30] drm/xe: Remove useless query config num_params Francois Dugast
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 29/30] drm/xe/uapi: Add missing DRM_ prefix in uAPI constants Francois Dugast
2023-09-26 16:24   ` Souza, Jose
2023-09-26 12:55 ` [Intel-xe] [PATCH v3 30/30] drm/xe/uapi: Add _FLAG to uAPI constants usable for flags Francois Dugast
2023-09-26 13:12 ` [Intel-xe] ✓ CI.Patch_applied: success for uAPI Alignment - take 1 v3 Patchwork
2023-09-26 13:13 ` [Intel-xe] ✗ CI.checkpatch: warning " Patchwork
2023-09-26 13:14 ` [Intel-xe] ✓ CI.KUnit: success " Patchwork
2023-09-26 13:21 ` [Intel-xe] ✓ CI.Build: " Patchwork
2023-09-26 13:22 ` [Intel-xe] ✗ CI.Hooks: failure " Patchwork
2023-09-26 13:23 ` [Intel-xe] ✓ CI.checksparse: success " Patchwork
2023-09-26 13:49 ` [Intel-xe] ✗ CI.BAT: failure " Patchwork
2023-10-04  0:31 ` [Intel-xe] [PATCH v3 00/30] " John Harrison

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZRMmWLusAMpwiGIr@unerlige-ril \
    --to=umesh.nerlige.ramappa@intel.com \
    --cc=francois.dugast@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=rodrigo.vivi@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.