Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Raag Jadav <raag.jadav@intel.com>
To: Karthik Poosa <karthik.poosa@intel.com>
Cc: intel-xe@lists.freedesktop.org, anshuman.gupta@intel.com,
	badal.nilawar@intel.com, rodrigo.vivi@intel.com,
	riana.tauro@intel.com
Subject: Re: [PATCH v3 1/4] drm/xe/hwmon: Expose temperature limits
Date: Wed, 17 Dec 2025 18:05:12 +0100	[thread overview]
Message-ID: <aULiyK1tWOxKZLI8@black.igk.intel.com> (raw)
In-Reply-To: <20251216114030.226399-2-karthik.poosa@intel.com>

On Tue, Dec 16, 2025 at 05:10:27PM +0530, Karthik Poosa wrote:
> Read temperature limits using pcode mailbox and
> expose shutdown temperature limit as tempX_emergency,
> critical temperature limit of as tempX_crit and
> GPU average temperature limit as tempX_max.
> 
> Update Xe hwmon documentation for these entries.
> 
> Signed-off-by: Karthik Poosa <karthik.poosa@intel.com>
> ---
>  .../ABI/testing/sysfs-driver-intel-xe-hwmon   |  40 ++++++
>  drivers/gpu/drm/xe/xe_device_types.h          |   3 +
>  drivers/gpu/drm/xe/xe_hwmon.c                 | 116 +++++++++++++++++-
>  drivers/gpu/drm/xe/xe_pci.c                   |   5 +
>  drivers/gpu/drm/xe/xe_pci_types.h             |   1 +
>  drivers/gpu/drm/xe/xe_pcode_api.h             |   3 +
>  6 files changed, 164 insertions(+), 4 deletions(-)
> 
> diff --git a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
> index d9e2b17c6872..c8f211c336be 100644
> --- a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
> +++ b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
> @@ -196,3 +196,43 @@ Description:	RW. Package burst power limit interval (Tau in PL2/Tau) in
>  		milliseconds over which sustained power is averaged.
>  
>  		Only supported for particular Intel Xe graphics platforms.
> +
> +What:		/sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp2_emergency
> +Date:		December 2025
> +KernelVersion:	6.19
> +Contact:	intel-xe@lists.freedesktop.org
> +Description:	RO. Package shutdown temperature in millidegree Celsius.
> +
> +		Only supported for particular Intel Xe graphics platforms.
> +
> +What:		/sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp2_crit
> +Date:		December 2025
> +KernelVersion:	6.19
> +Contact:	intel-xe@lists.freedesktop.org
> +Description:	RO. Package critical temperature in millidegree Celsius.
> +
> +		Only supported for particular Intel Xe graphics platforms.
> +
> +What:		/sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp2_max
> +Date:		December 2025
> +KernelVersion:	6.19
> +Contact:	intel-xe@lists.freedesktop.org
> +Description:	RO. Package average temperature limit in millidegree Celsius.
> +
> +		Only supported for particular Intel Xe graphics platforms.
> +
> +What:		/sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp3_emergency
> +Date:		December 2025
> +KernelVersion:	6.19
> +Contact:	intel-xe@lists.freedesktop.org
> +Description:	RO. VRAM shutdown temperature in millidegree Celsius.
> +
> +		Only supported for particular Intel Xe graphics platforms.
> +
> +What:		/sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp3_crit
> +Date:		December 2025
> +KernelVersion:	6.19
> +Contact:	intel-xe@lists.freedesktop.org
> +Description:	RO. VRAM critical temperature in millidegree Celsius.
> +
> +		Only supported for particular Intel Xe graphics platforms.

As per hwmon documentation all these attributes are RW so they don't
match with ABI expectations.

> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 413ba4c8b62e..8e6dd2665596 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -321,6 +321,9 @@ struct xe_device {
>  		 * pcode mailbox commands.
>  		 */
>  		u8 has_mbx_power_limits:1;
> +		/** @info.has_mbx_thermal_info: Device has support for thermal mailbox commands.
> +		 */

Can be one line.

> +		u8 has_mbx_thermal_info:1;
>  		/** @info.has_mem_copy_instr: Device supports MEM_COPY instruction */
>  		u8 has_mem_copy_instr:1;
>  		/** @info.has_mert: Device has standalone MERT */
> diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c
> index ff2aea52ef75..66a8c3e40027 100644
> --- a/drivers/gpu/drm/xe/xe_hwmon.c
> +++ b/drivers/gpu/drm/xe/xe_hwmon.c
> @@ -53,6 +53,14 @@ enum xe_fan_channel {
>  	FAN_MAX,
>  };
>  
> +enum xe_temp_limit {
> +	TEMP_LIMIT_PKG_SHUTDOWN,
> +	TEMP_LIMIT_PKG_TJMAX,
> +	TEMP_LIMIT_MEM_SHUTDOWN,
> +	TEMP_LIMIT_PKG_CRIT,
> +	TEMP_LIMIT_MEM_TJMAX,
> +};
> +
>  /* Attribute index for powerX_xxx_interval sysfs entries */
>  enum sensor_attr_power {
>  	SENSOR_INDEX_PSYS_PL1,
> @@ -111,6 +119,18 @@ struct xe_hwmon_fan_info {
>  	u64 time_prev;
>  };
>  
> +/**
> + * struct xe_hwmon_thermal_info - to store temperature data
> + */
> +struct xe_hwmon_thermal_info {
> +	union {
> +		/** @limits: temperatures limits */
> +		u8 limit[8];

What are the magic numbers? Can we have proper defines?

> +		/** @data: temperature limits in dwords */
> +		u32 data[2];
> +	};
> +};
> +
>  /**
>   * struct xe_hwmon - xe hwmon data structure
>   */
> @@ -137,7 +157,8 @@ struct xe_hwmon {
>  	u32 pl1_on_boot[CHANNEL_MAX];
>  	/** @pl2_on_boot: power limit PL2 on boot */
>  	u32 pl2_on_boot[CHANNEL_MAX];
> -
> +	/** @temp: Temperature info */
> +	struct xe_hwmon_thermal_info temp;
>  };
>  
>  static int xe_hwmon_pcode_read_power_limit(const struct xe_hwmon *hwmon, u32 attr, int channel,
> @@ -677,8 +698,11 @@ static const struct attribute_group *hwmon_groups[] = {
>  };
>  
>  static const struct hwmon_channel_info * const hwmon_info[] = {
> -	HWMON_CHANNEL_INFO(temp, HWMON_T_LABEL, HWMON_T_INPUT | HWMON_T_LABEL,
> -			   HWMON_T_INPUT | HWMON_T_LABEL),
> +	HWMON_CHANNEL_INFO(temp,
> +			   HWMON_T_LABEL,
> +			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT |
> +			   HWMON_T_MAX,
> +			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT),
>  	HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_LABEL | HWMON_P_CRIT |
>  			   HWMON_P_CAP,
>  			   HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_LABEL | HWMON_P_CAP),
> @@ -689,6 +713,23 @@ static const struct hwmon_channel_info * const hwmon_info[] = {
>  	NULL
>  };
>  
> +static int xe_hwmon_pcode_read_thermal_info(struct xe_hwmon *hwmon)
> +{
> +	struct xe_tile *root_tile = xe_device_get_root_tile(hwmon->xe);
> +	int ret = 0;
> +
> +	if (!hwmon->xe->info.has_mbx_power_limits)
> +		return -EOPNOTSUPP;
> +
> +	/* Read thermal info */
> +	ret = xe_pcode_read(root_tile, PCODE_MBOX(PCODE_THERMAL_INFO, READ_THERMAL_LIMITS, 0),
> +			    &hwmon->temp.data[0], &hwmon->temp.data[1]);
> +	drm_dbg(&hwmon->xe->drm, "thermal info read ret %d, val 0x%x val1 0x%x\n", ret,
> +		hwmon->temp.data[0], hwmon->temp.data[1]);
> +
> +	return ret;
> +}
> +
>  /* I1 is exposed as power_crit or as curr_crit depending on bit 31 */
>  static int xe_hwmon_pcode_read_i1(const struct xe_hwmon *hwmon, u32 *uval)
>  {
> @@ -787,6 +828,34 @@ static umode_t
>  xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
>  {
>  	switch (attr) {
> +	case hwmon_temp_emergency:
> +		switch (channel) {
> +		case CHANNEL_PKG:
> +			return hwmon->temp.limit[TEMP_LIMIT_PKG_SHUTDOWN] ? 0444 : 0;
> +		case CHANNEL_VRAM:
> +			return hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN] ? 0444 : 0;
> +		default:
> +			return 0;
> +		}
> +		break;
> +	case hwmon_temp_crit:
> +		switch (channel) {
> +		case CHANNEL_PKG:
> +			return hwmon->temp.limit[TEMP_LIMIT_PKG_TJMAX] ? 0444 : 0;
> +		case CHANNEL_VRAM:
> +			return hwmon->temp.limit[TEMP_LIMIT_MEM_TJMAX] ? 0444 : 0;
> +		default:
> +			return 0;
> +		}
> +		break;
> +	case hwmon_temp_max:
> +		switch (channel) {
> +		case CHANNEL_PKG:
> +			return hwmon->temp.limit[TEMP_LIMIT_PKG_CRIT] ? 0444 : 0;
> +		default:
> +			return 0;
> +		}
> +		break;
>  	case hwmon_temp_input:
>  	case hwmon_temp_label:
>  		return xe_reg_is_valid(xe_hwmon_get_reg(hwmon, REG_TEMP, channel)) ? 0444 : 0;
> @@ -807,10 +876,46 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
>  
>  		/* HW register value is in degrees Celsius, convert to millidegrees. */
>  		*val = REG_FIELD_GET(TEMP_MASK, reg_val) * MILLIDEGREE_PER_DEGREE;
> -		return 0;
> +		break;
> +	case hwmon_temp_emergency:
> +		switch (channel) {
> +		case CHANNEL_PKG:
> +			*val = hwmon->temp.limit[TEMP_LIMIT_PKG_SHUTDOWN] * MILLIDEGREE_PER_DEGREE;
> +			break;
> +		case CHANNEL_VRAM:
> +			*val = hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN] * MILLIDEGREE_PER_DEGREE;
> +			break;
> +		default:
> +			*val = 0;

Do you need this?

> +			return -EOPNOTSUPP;
> +		}
> +		break;
> +	case hwmon_temp_crit:
> +		switch (channel) {
> +		case CHANNEL_PKG:
> +			*val = hwmon->temp.limit[TEMP_LIMIT_PKG_TJMAX] * MILLIDEGREE_PER_DEGREE;
> +			break;
> +		case CHANNEL_VRAM:
> +			*val = hwmon->temp.limit[TEMP_LIMIT_MEM_TJMAX] * MILLIDEGREE_PER_DEGREE;
> +			break;
> +		default:
> +			*val = 0;

Ditto.

> +			return -EOPNOTSUPP;
> +		}
> +		break;
> +	case hwmon_temp_max:
> +		switch (channel) {
> +		case CHANNEL_PKG:
> +			*val = hwmon->temp.limit[TEMP_LIMIT_PKG_CRIT] * MILLIDEGREE_PER_DEGREE;
> +			break;
> +		default:
> +			return 0;
> +		}
> +		break;
>  	default:
>  		return -EOPNOTSUPP;
>  	}
> +	return 0;
>  }
>  
>  static umode_t
> @@ -1263,6 +1368,9 @@ xe_hwmon_get_preregistration_info(struct xe_hwmon *hwmon)
>  	for (channel = 0; channel < FAN_MAX; channel++)
>  		if (xe_hwmon_is_visible(hwmon, hwmon_fan, hwmon_fan_input, channel))
>  			xe_hwmon_fan_input_read(hwmon, channel, &fan_speed);
> +
> +	if (xe_hwmon_pcode_read_thermal_info(hwmon))
> +		drm_dbg(&hwmon->xe->drm, "Thermal mailbox support not present in firmware\n");
>  }
>  
>  int xe_hwmon_register(struct xe_device *xe)
> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
> index 673761c8623e..a47637900e84 100644
> --- a/drivers/gpu/drm/xe/xe_pci.c
> +++ b/drivers/gpu/drm/xe/xe_pci.c
> @@ -311,6 +311,7 @@ static const struct xe_device_desc dg2_desc = {
>  	.has_display = true,
>  	.has_fan_control = true,
>  	.has_mbx_power_limits = false,
> +	.has_mbx_thermal_info = false,

Why?

>  };
>  
>  static const __maybe_unused struct xe_device_desc pvc_desc = {
> @@ -328,6 +329,7 @@ static const __maybe_unused struct xe_device_desc pvc_desc = {
>  	.vm_max_level = 4,
>  	.vram_flags = XE_VRAM_FLAGS_NEED64K,
>  	.has_mbx_power_limits = false,
> +	.has_mbx_thermal_info = false,

Ditto.

>  };
>  
>  static const struct xe_device_desc mtl_desc = {
> @@ -365,6 +367,7 @@ static const struct xe_device_desc bmg_desc = {
>  	.has_fan_control = true,
>  	.has_flat_ccs = 1,
>  	.has_mbx_power_limits = true,
> +	.has_mbx_thermal_info = true,
>  	.has_gsc_nvm = 1,
>  	.has_heci_cscfi = 1,
>  	.has_i2c = true,
> @@ -418,6 +421,7 @@ static const struct xe_device_desc cri_desc = {
>  	.has_flat_ccs = false,
>  	.has_i2c = true,
>  	.has_mbx_power_limits = true,
> +	.has_mbx_thermal_info = true,
>  	.has_mert = true,
>  	.has_pre_prod_wa = 1,
>  	.has_sriov = true,
> @@ -681,6 +685,7 @@ static int xe_info_init_early(struct xe_device *xe,
>  	/* runtime fusing may force flat_ccs to disabled later */
>  	xe->info.has_flat_ccs = desc->has_flat_ccs;
>  	xe->info.has_mbx_power_limits = desc->has_mbx_power_limits;
> +	xe->info.has_mbx_thermal_info = desc->has_mbx_thermal_info;
>  	xe->info.has_gsc_nvm = desc->has_gsc_nvm;
>  	xe->info.has_heci_gscfi = desc->has_heci_gscfi;
>  	xe->info.has_heci_cscfi = desc->has_heci_cscfi;
> diff --git a/drivers/gpu/drm/xe/xe_pci_types.h b/drivers/gpu/drm/xe/xe_pci_types.h
> index 3bb51d155951..34d2b66188e4 100644
> --- a/drivers/gpu/drm/xe/xe_pci_types.h
> +++ b/drivers/gpu/drm/xe/xe_pci_types.h
> @@ -48,6 +48,7 @@ struct xe_device_desc {
>  	u8 has_late_bind:1;
>  	u8 has_llc:1;
>  	u8 has_mbx_power_limits:1;
> +	u8 has_mbx_thermal_info:1;
>  	u8 has_mem_copy_instr:1;
>  	u8 has_mert:1;
>  	u8 has_pre_prod_wa:1;
> diff --git a/drivers/gpu/drm/xe/xe_pcode_api.h b/drivers/gpu/drm/xe/xe_pcode_api.h
> index 975892d6b230..a3bff76a074d 100644
> --- a/drivers/gpu/drm/xe/xe_pcode_api.h
> +++ b/drivers/gpu/drm/xe/xe_pcode_api.h
> @@ -65,6 +65,9 @@
>  #define       FAN_TABLE				1
>  #define       VR_CONFIG				2
>  
> +#define	PCODE_THERMAL_INFO			0x25
> +#define	READ_THERMAL_LIMITS			0x0

Does this match with spacing convention and ordering?

Raag

>  #define   PCODE_FREQUENCY_CONFIG		0x6e
>  /* Frequency Config Sub Commands (param1) */
>  #define     PCODE_MBOX_FC_SC_READ_FUSED_P0	0x0
> -- 
> 2.25.1
> 

  reply	other threads:[~2025-12-17 17:05 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-12-16 11:40 [PATCH v3 0/4] drm/xe/hwmon: Expose new temperature attributes Karthik Poosa
2025-12-16 11:40 ` [PATCH v3 1/4] drm/xe/hwmon: Expose temperature limits Karthik Poosa
2025-12-17 17:05   ` Raag Jadav [this message]
2025-12-18  7:37     ` Poosa, Karthik
2025-12-18 11:15       ` Raag Jadav
2025-12-19  8:37         ` Nilawar, Badal
2025-12-19 15:06           ` Guenter Roeck
2025-12-19 15:56         ` Rodrigo Vivi
2025-12-16 11:40 ` [PATCH v3 2/4] drm/xe/hwmon: Expose memory controller temperature Karthik Poosa
2025-12-19  7:55   ` Raag Jadav
2025-12-23 15:49     ` Poosa, Karthik
2025-12-24 10:13     ` Poosa, Karthik
2025-12-16 11:40 ` [PATCH v3 3/4] drm/xe/hwmon: Expose GPU pcie temperature Karthik Poosa
2025-12-19  8:23   ` Raag Jadav
2025-12-24 10:19     ` Poosa, Karthik
2025-12-16 11:40 ` [PATCH v3 4/4] drm/xe/hwmon: Expose individual vram temperature Karthik Poosa
2025-12-21 18:52   ` Raag Jadav
2025-12-23 11:51     ` Poosa, Karthik
2025-12-16 19:28 ` ✓ CI.KUnit: success for drm/xe/hwmon: Expose new temperature attributes (rev4) Patchwork
2025-12-16 21:05 ` ✓ Xe.CI.BAT: " Patchwork
2025-12-17 19:17 ` ✗ Xe.CI.Full: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aULiyK1tWOxKZLI8@black.igk.intel.com \
    --to=raag.jadav@intel.com \
    --cc=anshuman.gupta@intel.com \
    --cc=badal.nilawar@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=karthik.poosa@intel.com \
    --cc=riana.tauro@intel.com \
    --cc=rodrigo.vivi@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox