From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from gabe.freedesktop.org (gabe.freedesktop.org [131.252.210.177]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.lore.kernel.org (Postfix) with ESMTPS id 8EE4CC48BF8 for ; Fri, 16 Feb 2024 23:17:29 +0000 (UTC) Received: from gabe.freedesktop.org (localhost [127.0.0.1]) by gabe.freedesktop.org (Postfix) with ESMTP id 2B64F10ED15; Fri, 16 Feb 2024 23:17:29 +0000 (UTC) Authentication-Results: gabe.freedesktop.org; dkim=pass (2048-bit key; unprotected) header.d=intel.com header.i=@intel.com header.b="UnTbogA2"; dkim-atps=neutral Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.7]) by gabe.freedesktop.org (Postfix) with ESMTPS id 0966910ECFF for ; Fri, 16 Feb 2024 23:17:12 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1708125433; x=1739661433; h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=7OWNQN42wDCfQVpza8LZIY/+/w5JCbxcDFqKnVpYS5E=; b=UnTbogA25m7Gp5Rc6JiD+ndQ0OWO6Z3ZZiDwnEn2oU4CwlCaKE0X50b8 g4nIAnbVXIG2vflLxCL8efSgPyvmuYtmogZVKAZkU8K/QFrVwQ4v0RoFo gnKZNc8dQeSYKwlTJ4Tq0C7d0G+8UsM56O3lq9qWmBOu2O0Szr70BHnn3 lMPHxxuJycy6o72O4aExmlQOgrqZw2KuNv7PVF4oGf/4bU0+xOZgyMfqV pIF3cPV0z/VaTUiazPfnDKLmXzB8of4ca2g37w40P6BkLUZnU9KnnkZzU wGV4PFwnhVv1Xypwedo/fcTWdWeQIH6/kQfcv22rc6VreLf6l48SkQHJM w==; X-IronPort-AV: E=McAfee;i="6600,9927,10986"; a="27724280" X-IronPort-AV: E=Sophos;i="6.06,165,1705392000"; d="scan'208";a="27724280" Received: from orviesa003.jf.intel.com ([10.64.159.143]) by fmvoesa101.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 16 Feb 2024 15:17:09 -0800 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="6.06,165,1705392000"; d="scan'208";a="8626869" Received: from orsosgc001.jf.intel.com (HELO 
unerlige-ril.jf.intel.com) ([10.165.21.138]) by ORVIESA003-auth.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 16 Feb 2024 15:17:09 -0800 From: Ashutosh Dixit To: igt-dev@lists.freedesktop.org Cc: Umesh Nerlige Ramappa Subject: [PATCH i-g-t 06/18] lib/xe: Complete xe_oa lib changes Date: Fri, 16 Feb 2024 15:16:51 -0800 Message-ID: <20240216231703.845644-7-ashutosh.dixit@intel.com> X-Mailer: git-send-email 2.41.0 In-Reply-To: <20240216231703.845644-1-ashutosh.dixit@intel.com> References: <20240216231703.845644-1-ashutosh.dixit@intel.com> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit X-BeenThere: igt-dev@lists.freedesktop.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Development mailing list for IGT GPU Tools List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: igt-dev-bounces@lists.freedesktop.org Sender: "igt-dev" Add various functionality in lib/xe for OA. This includes: * Support for OA metrics generation * intel_perf_for_devinfo and intel_perf_for_fd support * intel_perf_load_perf_configs * xe_perf_ioctl * drm_xe_query_oa_units Signed-off-by: Ashutosh Dixit --- lib/intel_device_info.c | 1 + lib/meson.build | 74 +++++ lib/xe/xe_oa.c | 699 ++++++++++++++++++++++++++++++++++++++++ lib/xe/xe_oa.h | 328 +++++++++++++++++++ lib/xe/xe_query.c | 38 +++ lib/xe/xe_query.h | 5 + 6 files changed, 1145 insertions(+) create mode 100644 lib/xe/xe_oa.c create mode 100644 lib/xe/xe_oa.h diff --git a/lib/intel_device_info.c b/lib/intel_device_info.c index 64b5246b7783..83ca0a5ed149 100644 --- a/lib/intel_device_info.c +++ b/lib/intel_device_info.c @@ -510,6 +510,7 @@ static const struct intel_device_info intel_lunarlake_info = { .display_ver = 20, .has_4tile = true, .has_flatccs = true, + .has_oam = true, .is_lunarlake = true, .codename = "lunarlake", .cmds_info = &xe2_cmds_info, diff --git a/lib/meson.build b/lib/meson.build index 6122861d8b7a..34de0e1b6ae9 100644 --- 
a/lib/meson.build +++ b/lib/meson.build @@ -374,6 +374,79 @@ install_headers( subdir : 'i915-perf' ) +xe_oa_files = [ + 'igt_list.c', + 'xe/xe_oa.c', +] + +xe_oa_hardware = [ + 'lnl', +] + +xe_xml_files = [] +foreach hw : xe_oa_hardware + xe_xml_files += files('xe/oa-configs/oa-@0@.xml'.format(hw)) +endforeach + +xe_oa_files += custom_target( + 'xe-oa-equations', + input : [ 'xe/oa-configs/oa-equations-codegen.py' ] + xe_xml_files, + output : [ 'xe_oa_equations.c', 'xe_oa_equations.h' ], + command : [ + python3, '@INPUT0@', + '--code', '@OUTPUT0@', + '--header', '@OUTPUT1@', + xe_xml_files, + ]) + +foreach hw : xe_oa_hardware + xe_oa_files += custom_target( + 'xe-oa-registers-@0@'.format(hw), + input : [ 'xe/oa-configs/oa-registers-codegen.py', + 'xe/oa-configs/oa-@0@.xml'.format(hw) ], + output : [ 'xe_oa_registers_@0@.c'.format(hw), + 'xe_oa_registers_@0@.h'.format(hw), ], + command : [ + python3, '@INPUT0@', + '--code', '@OUTPUT0@', + '--header', '@OUTPUT1@', + '--xml-file', '@INPUT1@' + ]) + xe_oa_files += custom_target( + 'xe-oa-metrics-@0@'.format(hw), + input : [ 'xe/oa-configs/oa-metricset-codegen.py', + 'xe/oa-configs/oa-@0@.xml'.format(hw) ], + output : [ 'xe_oa_metrics_@0@.c'.format(hw), + 'xe_oa_metrics_@0@.h'.format(hw), ], + command : [ + python3, '@INPUT0@', + '--code', '@OUTPUT0@', + '--header', '@OUTPUT1@', + '--equations-include', 'xe_oa_equations.h', + '--registers-include', 'xe_oa_registers_@0@.h'.format(hw), + '--xml-file', '@INPUT1@', + ]) +endforeach + +lib_igt_xe_oa_build = shared_library( + 'xe_oa', + xe_oa_files, + dependencies: [lib_igt_chipset,lib_igt,pciaccess], + include_directories : inc, + install: true, + soversion: '1.5') + +lib_igt_xe_oa = declare_dependency( + link_with : lib_igt_xe_oa_build, + include_directories : inc) + +install_headers( + 'igt_list.h', + 'intel_chipset.h', + 'xe/xe_oa.h', + subdir : 'xe-oa' +) + pkgconf = configuration_data() pkgconf.set('prefix', get_option('prefix')) @@ -381,6 +454,7 @@ 
pkgconf.set('exec_prefix', '${prefix}') pkgconf.set('libdir', '${prefix}/@0@'.format(get_option('libdir'))) pkgconf.set('includedir', '${prefix}/@0@'.format(get_option('includedir'))) pkgconf.set('i915_perf_version', '1.5.1') +pkgconf.set('xe_oa_version', '1.5.1') configure_file( input : 'i915-perf.pc.in', diff --git a/lib/xe/xe_oa.c b/lib/xe/xe_oa.c new file mode 100644 index 000000000000..346b23349b4f --- /dev/null +++ b/lib/xe/xe_oa.c @@ -0,0 +1,699 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2024 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "drmtest.h" +#include "i915_pciids.h" +#include "i915_pciids_local.h" +#include "intel_chipset.h" +#include "intel_hwconfig_types.h" +#include "ioctl_wrappers.h" +#include "linux_scaffold.h" +#include "xe_ioctl.h" +#include "xe_oa.h" +#include "xe_query.h" + +#include "xe_oa_metrics_lnl.h" + +static struct intel_perf_logical_counter_group * +intel_perf_logical_counter_group_new(struct intel_perf *perf, + struct intel_perf_logical_counter_group *parent, + const char *name) +{ + struct intel_perf_logical_counter_group *group = calloc(1, sizeof(*group)); + + group->name = strdup(name); + + IGT_INIT_LIST_HEAD(&group->counters); + IGT_INIT_LIST_HEAD(&group->groups); + + if (parent) + igt_list_add_tail(&group->link, &parent->groups); + else + IGT_INIT_LIST_HEAD(&group->link); + + return group; +} + +static void +intel_perf_logical_counter_group_free(struct intel_perf_logical_counter_group *group) +{ + struct intel_perf_logical_counter_group *child, *tmp; + + igt_list_for_each_entry_safe(child, tmp, &group->groups, link) { + igt_list_del(&child->link); + intel_perf_logical_counter_group_free(child); + } + + free(group->name); + free(group); +} + +static void +intel_perf_metric_set_free(struct intel_perf_metric_set *metric_set) +{ + free(metric_set->counters); + free(metric_set); +} + +static bool 
+slice_available(const struct drm_i915_query_topology_info *topo, + int s) +{ + return (topo->data[s / 8] >> (s % 8)) & 1; +} + +static bool +subslice_available(const struct drm_i915_query_topology_info *topo, + int s, int ss) +{ + return (topo->data[topo->subslice_offset + + s * topo->subslice_stride + + ss / 8] >> (ss % 8)) & 1; +} + +static bool +eu_available(const struct drm_i915_query_topology_info *topo, + int s, int ss, int eu) +{ + return (topo->data[topo->eu_offset + + (s * topo->max_subslices + ss) * topo->eu_stride + + eu / 8] >> (eu % 8)) & 1; +} + +static struct intel_perf * +unsupported_xe_oa_platform(struct intel_perf *perf) +{ + intel_perf_free(perf); + return NULL; +} + +struct intel_perf * +intel_perf_for_devinfo(uint32_t device_id, + uint32_t revision, + uint64_t timestamp_frequency, + uint64_t gt_min_freq, + uint64_t gt_max_freq, + const struct drm_i915_query_topology_info *topology) +{ + const struct intel_device_info *devinfo = intel_get_device_info(device_id); + struct intel_perf *perf; + uint32_t subslice_mask_len; + uint32_t eu_mask_len; + uint32_t half_max_subslices; + uint64_t half_subslices_mask; + int bits_per_subslice; + + if (!devinfo) + return NULL; + + perf = calloc(1, sizeof(*perf));; + perf->root_group = intel_perf_logical_counter_group_new(perf, NULL, ""); + + IGT_INIT_LIST_HEAD(&perf->metric_sets); + + /* Initialize the device characteristics first. Loading the + * metrics uses that information to detect whether some + * counters are available on a given device (for example BXT + * 2x6 does not have 2 samplers). 
+ */ + perf->devinfo.devid = device_id; + perf->devinfo.graphics_ver = devinfo->graphics_ver; + perf->devinfo.revision = revision; + perf->devinfo.timestamp_frequency = timestamp_frequency; + perf->devinfo.gt_min_freq = gt_min_freq; + perf->devinfo.gt_max_freq = gt_max_freq; + + if (devinfo->codename) { + snprintf(perf->devinfo.devname, sizeof(perf->devinfo.devname), + "%s", devinfo->codename); + } + + /* Store i915 topology. */ + perf->devinfo.max_slices = topology->max_slices; + perf->devinfo.max_subslices_per_slice = topology->max_subslices; + perf->devinfo.max_eu_per_subslice = topology->max_eus_per_subslice; + + subslice_mask_len = + topology->max_slices * topology->subslice_stride; + igt_assert(sizeof(perf->devinfo.subslice_masks) >= subslice_mask_len); + memcpy(perf->devinfo.subslice_masks, + &topology->data[topology->subslice_offset], + subslice_mask_len); + + eu_mask_len = topology->eu_stride * + topology->max_subslices * topology->max_slices; + igt_assert(sizeof(perf->devinfo.eu_masks) >= eu_mask_len); + memcpy(perf->devinfo.eu_masks, + &topology->data[topology->eu_offset], + eu_mask_len); + + bits_per_subslice = 8; + for (uint32_t s = 0; s < topology->max_slices; s++) { + if (!slice_available(topology, s)) + continue; + + perf->devinfo.slice_mask |= 1ULL << s; + for (uint32_t ss = 0; ss < topology->max_subslices; ss++) { + if (!subslice_available(topology, s, ss)) + continue; + + perf->devinfo.subslice_mask |= 1ULL << (s * bits_per_subslice + ss); + + for (uint32_t eu = 0; eu < topology->max_eus_per_subslice; eu++) { + if (eu_available(topology, s, ss, eu)) + perf->devinfo.n_eus++; + } + } + } + + perf->devinfo.n_eu_slices = __builtin_popcount(perf->devinfo.slice_mask); + perf->devinfo.n_eu_sub_slices = __builtin_popcount(perf->devinfo.subslice_mask); + + /* Compute number of subslices/dualsubslices in first half of + * the GPU. 
+ */ + half_max_subslices = topology->max_subslices / 2; + half_subslices_mask = perf->devinfo.subslice_mask & + ((1 << half_max_subslices) - 1); + perf->devinfo.n_eu_sub_slices_half_slices = __builtin_popcount(half_subslices_mask); + + /* Valid on most generations except Gen9LP. */ + perf->devinfo.eu_threads_count = 7; + + /* Most platforms have full 32bit timestamps. */ + perf->devinfo.oa_timestamp_mask = 0xffffffff; + perf->devinfo.oa_timestamp_shift = 0; + + if (devinfo->is_lunarlake) { + intel_perf_load_metrics_lnl(perf); + } else { + return unsupported_xe_oa_platform(perf); + } + + return perf; +} + +static bool +read_fd_uint64(int fd, uint64_t *out_value) +{ + char buf[32]; + int n; + + n = read(fd, buf, sizeof (buf) - 1); + if (n < 0) + return false; + + buf[n] = '\0'; + *out_value = strtoull(buf, 0, 0); + + return true; +} + +static bool +read_sysfs(int sysfs_dir_fd, const char *file_path, uint64_t *out_value) +{ + int fd = openat(sysfs_dir_fd, file_path, O_RDONLY); + bool res; + + if (fd < 0) + return false; + + res = read_fd_uint64(fd, out_value); + close(fd); + + return res; +} + +static int +open_master_sysfs_dir(int drm_fd) +{ + char path[128]; + struct stat st; + int sysfs; + + if (fstat(drm_fd, &st) || !S_ISCHR(st.st_mode)) + return -1; + + snprintf(path, sizeof(path), "/sys/dev/char/%d:%d", major(st.st_rdev), minor(st.st_rdev)); + sysfs = open(path, O_DIRECTORY); + if (sysfs < 0) + return sysfs; + + if (minor(st.st_rdev) >= 128) { + /* If we were given a renderD* drm_fd, find it's associated cardX node. 
*/ + char device[100], cmp[100]; + int device_len, cmp_len, i; + + device_len = readlinkat(sysfs, "device", device, sizeof(device)); + close(sysfs); + if (device_len < 0) + return device_len; + + for (i = 0; i < 64; i++) { + + snprintf(path, sizeof(path), "/sys/dev/char/%d:%d", major(st.st_rdev), i); + sysfs = open(path, O_DIRECTORY); + if (sysfs < 0) + continue; + + cmp_len = readlinkat(sysfs, "device", cmp, sizeof(cmp)); + if (cmp_len == device_len && !memcmp(cmp, device, cmp_len)) + break; + + close(sysfs); + sysfs = -1; + } + } + + return sysfs; +} + +static void process_hwconfig(void *data, uint32_t len, + struct drm_i915_query_topology_info *topinfo) +{ + + uint32_t *d = (uint32_t*)data; + uint32_t l = len / 4; + uint32_t pos = 0; + + while (pos + 2 < l) { + if (d[pos + 1] == 1) { + switch (d[pos]) { + case INTEL_HWCONFIG_MAX_SLICES_SUPPORTED: + topinfo->max_slices = d[pos + 2]; + igt_debug("hwconfig: max_slices %d\n", topinfo->max_slices); + break; + case INTEL_HWCONFIG_MAX_SUBSLICE: + case INTEL_HWCONFIG_MAX_DUAL_SUBSLICES_SUPPORTED: + topinfo->max_subslices = d[pos + 2]; + igt_debug("hwconfig: max_subslices %d\n", topinfo->max_subslices); + break; + case INTEL_HWCONFIG_MAX_EU_PER_SUBSLICE: + case INTEL_HWCONFIG_MAX_NUM_EU_PER_DSS: + topinfo->max_eus_per_subslice = d[pos + 2]; + igt_debug("hwconfig: max_eus_per_subslice %d\n", + topinfo->max_eus_per_subslice); + break; + default: + break; + } + } + pos += 2 + d[pos + 1]; + } +} + +static void query_hwconfig(int fd, struct drm_i915_query_topology_info *topinfo) +{ + struct drm_xe_device_query query = { + .extensions = 0, + .query = DRM_XE_DEVICE_QUERY_HWCONFIG, + .size = 0, + .data = 0, + }; + void *hwconfig; + + igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0); + igt_assert(query.size); + + hwconfig = malloc(query.size); + igt_assert(hwconfig); + + query.data = to_user_pointer(hwconfig); + igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0); + + process_hwconfig(hwconfig, 
query.size, topinfo); + free(hwconfig); +} + +struct drm_i915_query_topology_info *xe_fill_i915_topology_info(int drm_fd) +{ + struct drm_i915_query_topology_info i915_topinfo = {}; + struct drm_i915_query_topology_info *i915_topo; + struct drm_xe_query_topology_mask *xe_topo; + int total_size, pos = 0; + u8 *ptr; + struct drm_xe_device_query query = { + .extensions = 0, + .query = DRM_XE_DEVICE_QUERY_GT_TOPOLOGY, + .size = 0, + .data = 0, + }; + + query_hwconfig(drm_fd, &i915_topinfo); + + i915_topinfo.subslice_offset = 1; /* always 1 */ + i915_topinfo.subslice_stride = DIV_ROUND_UP(i915_topinfo.max_subslices, 8); + i915_topinfo.eu_offset = i915_topinfo.subslice_offset + i915_topinfo.subslice_stride; + i915_topinfo.eu_stride = DIV_ROUND_UP(i915_topinfo.max_eus_per_subslice, 8); + + /* Allocate and start filling the struct to return */ + total_size = sizeof(i915_topinfo) + i915_topinfo.eu_offset + + i915_topinfo.max_subslices * i915_topinfo.eu_stride; + i915_topo = malloc(total_size); + igt_assert(i915_topo); + + memcpy(i915_topo, &i915_topinfo, sizeof(i915_topinfo)); + ptr = (u8 *)i915_topo + sizeof(i915_topinfo); + *ptr++ = 0x1; /* slice mask */ + + /* Get xe topology masks */ + igt_assert_eq(igt_ioctl(drm_fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0); + igt_assert_neq(query.size, 0); + + xe_topo = malloc(query.size); + igt_assert(xe_topo); + + query.data = to_user_pointer(xe_topo); + igt_assert_eq(igt_ioctl(drm_fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0); + igt_debug("Topology size: %d\n", query.size); + + while (query.size >= sizeof(struct drm_xe_query_topology_mask)) { + struct drm_xe_query_topology_mask *topo = + (struct drm_xe_query_topology_mask*)((unsigned char*)xe_topo + pos); + int i, sz = sizeof(struct drm_xe_query_topology_mask) + topo->num_bytes; + u64 geom_mask, compute_mask; + + igt_debug(" gt_id: %d type: %d n:%d [%d] ", topo->gt_id, topo->type, topo->num_bytes, sz); + for (int j=0; j< topo->num_bytes; j++) + igt_debug(" %02x", topo->mask[j]); + 
igt_debug("\n"); + + /* i915 only returns topology for gt 0, do the same here */ + if (topo->gt_id) + goto next; + + /* Follow the same order as in xe query_gt_topology() */ + switch (topo->type) { + case DRM_XE_TOPO_DSS_GEOMETRY: + igt_assert_lte(i915_topo->subslice_stride, 8); /* Fit in u64 mask */ + memcpy(&geom_mask, topo->mask, i915_topo->subslice_stride); + break; + case DRM_XE_TOPO_DSS_COMPUTE: + memcpy(&compute_mask, topo->mask, i915_topo->subslice_stride); + geom_mask |= compute_mask; + memcpy(ptr, &geom_mask, i915_topo->subslice_stride); + ptr += i915_topo->subslice_stride; + break; + case DRM_XE_TOPO_EU_PER_DSS: + for (i = 0; i < i915_topo->max_subslices; i++) { + memcpy(ptr, topo->mask, i915_topo->eu_stride); + ptr += i915_topo->eu_stride; + } + break; + default: + igt_assert(0); + } +next: + query.size -= sz; + pos += sz; + } + + free(xe_topo); + + return i915_topo; +} + +static struct intel_perf * +xe_perf_for_fd(int drm_fd, int gt) +{ + uint32_t device_id; + uint32_t device_revision = 0; + uint32_t timestamp_frequency; + uint64_t gt_min_freq = 0; + uint64_t gt_max_freq = 0; + struct drm_i915_query_topology_info *topology; + struct intel_perf *ret; + int sysfs_dir_fd = open_master_sysfs_dir(drm_fd); + char path_min[64], path_max[64]; + + if (sysfs_dir_fd < 0) { + igt_warn("open_master_sysfs_dir failed\n"); + return NULL; + } + + if (IS_PONTEVECCHIO(xe_dev_id(drm_fd))) { + sprintf(path_min, "device/tile%d/gt%d/freq%d/min_freq", gt, gt, gt); + sprintf(path_max, "device/tile%d/gt%d/freq%d/max_freq", gt, gt, gt); + } else { + sprintf(path_min, "device/tile0/gt%d/freq%d/min_freq", gt, gt); + sprintf(path_max, "device/tile0/gt%d/freq%d/max_freq", gt, gt); + } + + if (!read_sysfs(sysfs_dir_fd, path_min, &gt_min_freq) || + !read_sysfs(sysfs_dir_fd, path_max, &gt_max_freq)) { + igt_warn("Unable to read freqs from sysfs\n"); + close(sysfs_dir_fd); + return NULL; + } + close(sysfs_dir_fd); + + device_id = intel_get_drm_devid(drm_fd); + timestamp_frequency = 
xe_oa_units(drm_fd)->oa_units[0].oa_timestamp_freq; + + topology = xe_fill_i915_topology_info(drm_fd); + if (!topology) { + igt_warn("xe_fill_i915_topology_info failed\n"); + return NULL; + } + + ret = intel_perf_for_devinfo(device_id, + device_revision, + timestamp_frequency, + gt_min_freq * 1000000, + gt_max_freq * 1000000, + topology); + if (!ret) + igt_warn("intel_perf_for_devinfo failed\n"); + + free(topology); + + return ret; +} + +struct intel_perf * +intel_perf_for_fd(int drm_fd, int gt) +{ + if (!is_xe_device(drm_fd)) + return NULL; + + return xe_perf_for_fd(drm_fd, gt); +} + +void +intel_perf_free(struct intel_perf *perf) +{ + struct intel_perf_metric_set *metric_set, *tmp; + + intel_perf_logical_counter_group_free(perf->root_group); + + igt_list_for_each_entry_safe(metric_set, tmp, &perf->metric_sets, link) { + igt_list_del(&metric_set->link); + intel_perf_metric_set_free(metric_set); + } + + free(perf); +} + +void +intel_perf_add_logical_counter(struct intel_perf *perf, + struct intel_perf_logical_counter *counter, + const char *group_path) +{ + const char *group_path_end = group_path + strlen(group_path); + struct intel_perf_logical_counter_group *group = perf->root_group, *child_group = NULL; + const char *name = group_path; + + while (name < group_path_end) { + const char *name_end = strstr(name, "/"); + char group_name[128] = { 0, }; + struct intel_perf_logical_counter_group *iter_group; + + if (!name_end) + name_end = group_path_end; + + memcpy(group_name, name, name_end - name); + + child_group = NULL; + igt_list_for_each_entry(iter_group, &group->groups, link) { + if (!strcmp(iter_group->name, group_name)) { + child_group = iter_group; + break; + } + } + + if (!child_group) + child_group = intel_perf_logical_counter_group_new(perf, group, group_name); + + name = name_end + 1; + group = child_group; + } + + igt_list_add_tail(&counter->link, &child_group->counters); +} + +void +intel_perf_add_metric_set(struct intel_perf *perf, + struct 
intel_perf_metric_set *metric_set) +{ + igt_list_add_tail(&metric_set->link, &perf->metric_sets); +} + +static void +load_metric_set_config(struct intel_perf_metric_set *metric_set, int drm_fd) +{ + struct drm_xe_oa_config config; + u8 *regs; + int ret; + + memset(&config, 0, sizeof(config)); + + memcpy(config.uuid, metric_set->hw_config_guid, sizeof(config.uuid)); + + config.n_regs = metric_set->n_mux_regs + + metric_set->n_b_counter_regs + + metric_set->n_flex_regs; + config.regs_ptr = to_user_pointer(malloc(2 * config.n_regs * sizeof(u32))); + igt_assert(config.regs_ptr); + regs = (u8 *)config.regs_ptr; + + memcpy(regs, metric_set->mux_regs, 2 * metric_set->n_mux_regs * sizeof(u32)); + regs += 2 * metric_set->n_mux_regs * sizeof(u32); + memcpy(regs, metric_set->b_counter_regs, 2 * metric_set->n_b_counter_regs * sizeof(u32)); + regs += 2 * metric_set->n_b_counter_regs * sizeof(u32); + memcpy(regs, metric_set->flex_regs, 2 * metric_set->n_flex_regs * sizeof(u32)); + regs += 2 * metric_set->n_flex_regs * sizeof(u32); + + ret = xe_perf_ioctl(drm_fd, DRM_XE_PERF_OP_ADD_CONFIG, &config); + if (ret >= 0) + metric_set->perf_oa_metrics_set = ret; + + free((void *)config.regs_ptr); +} + +void +intel_perf_load_perf_configs(struct intel_perf *perf, int drm_fd) +{ + int sysfs_dir_fd = open_master_sysfs_dir(drm_fd); + struct dirent *entry; + int metrics_dir_fd; + DIR *metrics_dir; + struct intel_perf_metric_set *metric_set; + + if (sysfs_dir_fd < 0) + return; + + metrics_dir_fd = openat(sysfs_dir_fd, "metrics", O_DIRECTORY); + close(sysfs_dir_fd); + if (metrics_dir_fd < -1) + return; + + metrics_dir = fdopendir(metrics_dir_fd); + if (!metrics_dir) { + close(metrics_dir_fd); + return; + } + + while ((entry = readdir(metrics_dir))) { + bool metric_id_read; + uint64_t metric_id; + char path[256 + 4]; + int id_fd; + + if (entry->d_type != DT_DIR) + continue; + + snprintf(path, sizeof(path), "%s/id", entry->d_name); + + id_fd = openat(metrics_dir_fd, path, O_RDONLY); + if (id_fd < 
0) + continue; + + metric_id_read = read_fd_uint64(id_fd, &metric_id); + close(id_fd); + + if (!metric_id_read) + continue; + + igt_list_for_each_entry(metric_set, &perf->metric_sets, link) { + if (!strcmp(metric_set->hw_config_guid, entry->d_name)) { + metric_set->perf_oa_metrics_set = metric_id; + break; + } + } + } + + closedir(metrics_dir); + + igt_list_for_each_entry(metric_set, &perf->metric_sets, link) { + if (metric_set->perf_oa_metrics_set) + continue; + + load_metric_set_config(metric_set, drm_fd); + } +} + +static void xe_oa_prop_to_ext(struct drm_xe_oa_open_prop *properties, + struct drm_xe_ext_set_property *extn) +{ + __u64 *prop = (__u64 *)properties->properties_ptr; + struct drm_xe_ext_set_property *ext = extn; + int i, j; + + for (i = 0; i < properties->num_properties; i++) { + ext->base.name = DRM_XE_OA_EXTENSION_SET_PROPERTY; + ext->property = *prop++; + ext->value = *prop++; + ext++; + } + + igt_assert_lte(1, i); + ext = extn; + for (j = 0; j < i - 1; j++) + ext[j].base.next_extension = (__u64)&ext[j + 1]; +} + +int xe_perf_ioctl(int fd, enum drm_xe_perf_op op, void *arg) +{ +#define XE_OA_MAX_SET_PROPERTIES 16 + + struct drm_xe_ext_set_property ext[XE_OA_MAX_SET_PROPERTIES] = {}; + + /* Chain the PERF layer struct */ + struct drm_xe_perf_param p = { + .extensions = 0, + .perf_type = DRM_XE_PERF_TYPE_OA, + .perf_op = op, + .param = (__u64)((op == DRM_XE_PERF_OP_STREAM_OPEN) ? 
ext : arg), + }; + + if (op == DRM_XE_PERF_OP_STREAM_OPEN) { + struct drm_xe_oa_open_prop *oprop = (struct drm_xe_oa_open_prop *)arg; + + igt_assert_lte(oprop->num_properties, XE_OA_MAX_SET_PROPERTIES); + xe_oa_prop_to_ext(oprop, ext); + } + + return igt_ioctl(fd, DRM_IOCTL_XE_PERF, &p); +} + +void xe_perf_ioctl_err(int fd, enum drm_xe_perf_op op, void *arg, int err) +{ + igt_assert_eq(xe_perf_ioctl(fd, op, arg), -1); + igt_assert_eq(errno, err); + errno = 0; +} diff --git a/lib/xe/xe_oa.h b/lib/xe/xe_oa.h new file mode 100644 index 000000000000..f3a9d1f6c7b4 --- /dev/null +++ b/lib/xe/xe_oa.h @@ -0,0 +1,328 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef PERF_METRICS_H +#define PERF_METRICS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include "igt_list.h" +#include + +#define _DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + +#define INTEL_DEVICE_MAX_SLICES (8) +#define INTEL_DEVICE_MAX_SUBSLICES (32) +#define INTEL_DEVICE_MAX_EUS_PER_SUBSLICE (16) /* Maximum on gfx12 */ + +struct intel_perf_devinfo { + char devname[20]; + + /* The following fields are prepared for equations from the XML files. + * Their values are build up from the topology fields. + */ + uint32_t devid; + uint32_t graphics_ver; + uint32_t revision; + /** + * Bit shifting required to put OA report timestamps into + * timestamp_frequency (some HW generations can shift + * timestamp values to the right by a number of bits). + */ + int32_t oa_timestamp_shift; + /** + * On some platforms only part of the timestamp bits are valid + * (on previous platforms we would get full 32bits, newer + * platforms can have fewer). It's important to know when + * correlating the full 36bits timestamps to the OA report + * timestamps. 
+ */ + uint64_t oa_timestamp_mask; + /* Frequency of the timestamps in Hz */ + uint64_t timestamp_frequency; + uint64_t gt_min_freq; + uint64_t gt_max_freq; + + /* Total number of EUs */ + uint64_t n_eus; + /* Number of available slices (bits set in slice_mask) */ + uint64_t n_eu_slices; + /* Total number of subslices/dualsubslices */ + uint64_t n_eu_sub_slices; + /* Number of subslices/dualsubslices in the first half of the + * slices. + */ + uint64_t n_eu_sub_slices_half_slices; + /* Mask of available subslices/dualsubslices */ + uint64_t subslice_mask; + /* Mask of available slices */ + uint64_t slice_mask; + /* Number of threads in one EU */ + uint64_t eu_threads_count; + + /** + * Maximum number of slices present on this device (can be more than + * num_slices if some slices are fused). + */ + uint16_t max_slices; + + /** + * Maximum number of subslices per slice present on this device (can be more + * than the maximum value in the num_subslices[] array if some subslices are + * fused). + */ + uint16_t max_subslices_per_slice; + + /** + * Stride to access subslice_masks[]. + */ + uint16_t subslice_slice_stride; + + /** + * Maximum number of EUs per subslice (can be more than + * num_eu_per_subslice if some EUs are fused off). + */ + uint16_t max_eu_per_subslice; + + /** + * Strides to access eu_masks[]. + */ + uint16_t eu_slice_stride; + uint16_t eu_subslice_stride; + + /** + * A bit mask of the slices available. + */ + uint8_t slice_masks[_DIV_ROUND_UP(INTEL_DEVICE_MAX_SLICES, 8)]; + + /** + * An array of bit mask of the subslices available, use subslice_slice_stride + * to access this array. + */ + uint8_t subslice_masks[INTEL_DEVICE_MAX_SLICES * + _DIV_ROUND_UP(INTEL_DEVICE_MAX_SUBSLICES, 8)]; + + /** + * An array of bit mask of EUs available, use eu_slice_stride & + * eu_subslice_stride to access this array. 
+ */ + uint8_t eu_masks[INTEL_DEVICE_MAX_SLICES * + INTEL_DEVICE_MAX_SUBSLICES * + _DIV_ROUND_UP(INTEL_DEVICE_MAX_EUS_PER_SUBSLICE, 8)]; +}; + +typedef enum { + INTEL_PERF_LOGICAL_COUNTER_STORAGE_UINT64, + INTEL_PERF_LOGICAL_COUNTER_STORAGE_UINT32, + INTEL_PERF_LOGICAL_COUNTER_STORAGE_DOUBLE, + INTEL_PERF_LOGICAL_COUNTER_STORAGE_FLOAT, + INTEL_PERF_LOGICAL_COUNTER_STORAGE_BOOL32, +} intel_perf_logical_counter_storage_t; + +typedef enum { + INTEL_PERF_LOGICAL_COUNTER_TYPE_RAW, + INTEL_PERF_LOGICAL_COUNTER_TYPE_DURATION_RAW, + INTEL_PERF_LOGICAL_COUNTER_TYPE_DURATION_NORM, + INTEL_PERF_LOGICAL_COUNTER_TYPE_EVENT, + INTEL_PERF_LOGICAL_COUNTER_TYPE_THROUGHPUT, + INTEL_PERF_LOGICAL_COUNTER_TYPE_TIMESTAMP, +} intel_perf_logical_counter_type_t; + +typedef enum { + /* size */ + INTEL_PERF_LOGICAL_COUNTER_UNIT_BYTES, + + /* frequency */ + INTEL_PERF_LOGICAL_COUNTER_UNIT_HZ, + + /* time */ + INTEL_PERF_LOGICAL_COUNTER_UNIT_NS, + INTEL_PERF_LOGICAL_COUNTER_UNIT_US, + + /**/ + INTEL_PERF_LOGICAL_COUNTER_UNIT_PIXELS, + INTEL_PERF_LOGICAL_COUNTER_UNIT_TEXELS, + INTEL_PERF_LOGICAL_COUNTER_UNIT_THREADS, + INTEL_PERF_LOGICAL_COUNTER_UNIT_PERCENT, + + /* events */ + INTEL_PERF_LOGICAL_COUNTER_UNIT_MESSAGES, + INTEL_PERF_LOGICAL_COUNTER_UNIT_NUMBER, + INTEL_PERF_LOGICAL_COUNTER_UNIT_CYCLES, + INTEL_PERF_LOGICAL_COUNTER_UNIT_EVENTS, + INTEL_PERF_LOGICAL_COUNTER_UNIT_UTILIZATION, + + /**/ + INTEL_PERF_LOGICAL_COUNTER_UNIT_EU_SENDS_TO_L3_CACHE_LINES, + INTEL_PERF_LOGICAL_COUNTER_UNIT_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES, + INTEL_PERF_LOGICAL_COUNTER_UNIT_EU_REQUESTS_TO_L3_CACHE_LINES, + INTEL_PERF_LOGICAL_COUNTER_UNIT_EU_BYTES_PER_L3_CACHE_LINE, + INTEL_PERF_LOGICAL_COUNTER_UNIT_GBPS, + + INTEL_PERF_LOGICAL_COUNTER_UNIT_MAX +} intel_perf_logical_counter_unit_t; + +/* Hold deltas of raw performance counters. 
*/ +struct intel_perf_accumulator { +#define INTEL_PERF_MAX_RAW_OA_COUNTERS 64 + uint64_t deltas[INTEL_PERF_MAX_RAW_OA_COUNTERS]; +}; + +struct intel_perf; +struct intel_perf_metric_set; +struct intel_perf_logical_counter { + const struct intel_perf_metric_set *metric_set; + const char *name; + const char *symbol_name; + const char *desc; + const char *group; + bool (*availability)(const struct intel_perf *perf); + intel_perf_logical_counter_storage_t storage; + intel_perf_logical_counter_type_t type; + intel_perf_logical_counter_unit_t unit; + union { + uint64_t (*max_uint64)(const struct intel_perf *perf, + const struct intel_perf_metric_set *metric_set, + uint64_t *deltas); + double (*max_float)(const struct intel_perf *perf, + const struct intel_perf_metric_set *metric_set, + uint64_t *deltas); + }; + + union { + uint64_t (*read_uint64)(const struct intel_perf *perf, + const struct intel_perf_metric_set *metric_set, + uint64_t *deltas); + double (*read_float)(const struct intel_perf *perf, + const struct intel_perf_metric_set *metric_set, + uint64_t *deltas); + }; + + struct igt_list_head link; /* list from intel_perf_logical_counter_group.counters */ +}; + +struct intel_perf_register_prog { + uint32_t reg; + uint32_t val; +}; + +struct intel_perf_metric_set { + const char *name; + const char *symbol_name; + const char *hw_config_guid; + + struct intel_perf_logical_counter *counters; + int n_counters; + + uint64_t perf_oa_metrics_set; + int perf_oa_format; + int perf_raw_size; + + /* For indexing into accumulator->deltas[] ... 
*/ + int gpu_time_offset; + int gpu_clock_offset; + int a_offset; + int b_offset; + int c_offset; + int perfcnt_offset; + + const struct intel_perf_register_prog *b_counter_regs; + uint32_t n_b_counter_regs; + + const struct intel_perf_register_prog *mux_regs; + uint32_t n_mux_regs; + + const struct intel_perf_register_prog *flex_regs; + uint32_t n_flex_regs; + + struct igt_list_head link; +}; + +/* A tree structure with group having subgroups and counters. */ +struct intel_perf_logical_counter_group { + char *name; + + struct igt_list_head counters; + struct igt_list_head groups; + + struct igt_list_head link; /* link for intel_perf_logical_counter_group.groups */ +}; + +struct intel_perf { + const char *name; + + struct intel_perf_logical_counter_group *root_group; + + struct igt_list_head metric_sets; + + struct intel_perf_devinfo devinfo; +}; + +struct drm_i915_query_topology_info; + +static inline bool +intel_perf_devinfo_slice_available(const struct intel_perf_devinfo *devinfo, + int slice) +{ + return (devinfo->slice_masks[slice / 8] & (1U << (slice % 8))) != 0; +} + +static inline bool +intel_perf_devinfo_subslice_available(const struct intel_perf_devinfo *devinfo, + int slice, int subslice) +{ + return (devinfo->subslice_masks[slice * devinfo->subslice_slice_stride + + subslice / 8] & (1U << (subslice % 8))) != 0; +} + +static inline bool +intel_perf_devinfo_eu_available(const struct intel_perf_devinfo *devinfo, + int slice, int subslice, int eu) +{ + unsigned subslice_offset = slice * devinfo->eu_slice_stride + + subslice * devinfo->eu_subslice_stride; + + return (devinfo->eu_masks[subslice_offset + eu / 8] & (1U << eu % 8)) != 0; +} + +struct drm_i915_query_topology_info *xe_fill_i915_topology_info(int drm_fd); +struct intel_perf *intel_perf_for_fd(int drm_fd, int gt); +struct intel_perf *intel_perf_for_devinfo(uint32_t device_id, + uint32_t revision, + uint64_t timestamp_frequency, + uint64_t gt_min_freq, + uint64_t gt_max_freq, + const struct 
drm_i915_query_topology_info *topology); +void intel_perf_free(struct intel_perf *perf); + +void intel_perf_add_logical_counter(struct intel_perf *perf, + struct intel_perf_logical_counter *counter, + const char *group); + +void intel_perf_add_metric_set(struct intel_perf *perf, + struct intel_perf_metric_set *metric_set); + +void intel_perf_load_perf_configs(struct intel_perf *perf, int drm_fd); + + +struct drm_xe_oa_open_prop { + uint32_t num_properties; + uint32_t reserved; + uint64_t properties_ptr; +}; + +int xe_perf_ioctl(int fd, enum drm_xe_perf_op op, void *arg); +void xe_perf_ioctl_err(int fd, enum drm_xe_perf_op op, void *arg, int err); + +#ifdef __cplusplus +}; +#endif + +#endif /* PERF_METRICS_H */ diff --git a/lib/xe/xe_query.c b/lib/xe/xe_query.c index 729fba6b1a43..9e08caa74b1b 100644 --- a/lib/xe/xe_query.c +++ b/lib/xe/xe_query.c @@ -114,6 +114,27 @@ static struct drm_xe_query_mem_regions *xe_query_mem_regions_new(int fd) return mem_regions; } +static struct drm_xe_query_oa_units *xe_query_oa_units_new(int fd) +{ + struct drm_xe_query_oa_units *oa_units; + struct drm_xe_device_query query = { + .extensions = 0, + .query = DRM_XE_DEVICE_QUERY_OA_UNITS, + .size = 0, + .data = 0, + }; + + igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0); + + oa_units = malloc(query.size); + igt_assert(oa_units); + + query.data = to_user_pointer(oa_units); + igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query), 0); + + return oa_units; +} + static uint64_t native_region_for_gt(const struct drm_xe_query_gt_list *gt_list, int gt) { uint64_t region; @@ -251,6 +272,7 @@ struct xe_device *xe_device_get(int fd) xe_dev->memory_regions = __memory_regions(xe_dev->gt_list); xe_dev->engines = xe_query_engines(fd); xe_dev->mem_regions = xe_query_mem_regions_new(fd); + xe_dev->oa_units = xe_query_oa_units_new(fd); xe_dev->vram_size = calloc(xe_dev->gt_list->num_gt, sizeof(*xe_dev->vram_size)); xe_dev->visible_vram_size = calloc(xe_dev->gt_list->num_gt, 
sizeof(*xe_dev->visible_vram_size)); for (int gt = 0; gt < xe_dev->gt_list->num_gt; gt++) { @@ -524,6 +546,22 @@ uint32_t xe_min_page_size(int fd, uint64_t region) */ xe_dev_FN(xe_config, config, struct drm_xe_query_config *); +/** + * xe_gt_list: + * @fd: xe device fd + * + * Returns query gts of xe device @fd. + */ +xe_dev_FN(xe_gt_list, gt_list, struct drm_xe_query_gt_list *); + +/** + * xe_oa_units: + * @fd: xe device fd + * + * Returns query OA units of xe device @fd. + */ +xe_dev_FN(xe_oa_units, oa_units, struct drm_xe_query_oa_units *); + /** * xe_number_engine: * @fd: xe device fd diff --git a/lib/xe/xe_query.h b/lib/xe/xe_query.h index 2460384c99af..5e2b7d223a65 100644 --- a/lib/xe/xe_query.h +++ b/lib/xe/xe_query.h @@ -38,6 +38,9 @@ struct xe_device { /** @mem_regions: regions memory information and usage */ struct drm_xe_query_mem_regions *mem_regions; + /** @oa_units: information about OA units */ + struct drm_xe_query_oa_units *oa_units; + /** @vram_size: array of vram sizes for all gt_list */ uint64_t *vram_size; @@ -85,6 +88,8 @@ const char *xe_region_name(uint64_t region); uint16_t xe_region_class(int fd, uint64_t region); uint32_t xe_min_page_size(int fd, uint64_t region); struct drm_xe_query_config *xe_config(int fd); +struct drm_xe_query_gt_list *xe_gt_list(int fd); +struct drm_xe_query_oa_units *xe_oa_units(int fd); unsigned int xe_number_engines(int fd); bool xe_has_vram(int fd); uint64_t xe_vram_size(int fd, int gt); -- 2.41.0