All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Poosa, Karthik" <karthik.poosa@intel.com>
To: Raag Jadav <raag.jadav@intel.com>
Cc: <intel-xe@lists.freedesktop.org>, <anshuman.gupta@intel.com>,
	<badal.nilawar@intel.com>, <rodrigo.vivi@intel.com>
Subject: Re: [PATCH v6 4/4] drm/xe/hwmon: Expose individual VRAM channel temperature
Date: Tue, 13 Jan 2026 00:02:21 +0530	[thread overview]
Message-ID: <34da973b-e1b6-48d2-b0d2-d341c767c4b3@intel.com> (raw)
In-Reply-To: <aWU4mvctA-yb7bFN@black.igk.intel.com>


On 12-01-2026 23:38, Raag Jadav wrote:
> On Mon, Jan 12, 2026 at 05:47:24PM +0530, Karthik Poosa wrote:
>> Expose individual VRAM temperature attributes.
>> Update Xe hwmon documentation for this entry.
>>
>> v2:
>>   - Avoid using default switch case for VRAM individual temperatures.
>>   - Append labels with VRAM channel number.
>>   - Update kernel version in Xe hwmon documentation.
>>
>> v3:
>>   - Add missing brackets in Xe hwmon documentation from VRAM channel sysfs.
>>   - Reorder BMG_VRAM_TEMPERATURE_N macro in xe_pcode_regs.h.
>>   - Add api to check if VRAM is available on the channel.
>>
>> v4:
>>   - Improve VRAM label handling to eliminate temp variable by
>>     introducing a dedicated array vram_label in xe_hwmon_thermal_info.
>>   - Remove a magic number.
>>   - Change the label from vram_X to vram_ch_X.
>>
>> v5:
>>   - Address review comments from Raag.
>>   - Change vram to VRAM in commit title and subject.
>>   - Refactor BMG_VRAM_TEMPERATURE_N macro.
>>   - Refactor is_vram_ch_available().
>>   - Rephrase a comment.
>>   - Check individual VRAM temperature limits in addition to VRAM
>>     availability in xe_hwmon_temp_is_visible. (Raag)
>>   - Move VRAM label change out of this patch.
>>
>> Signed-off-by: Karthik Poosa <karthik.poosa@intel.com>
>> ---
>>   .../ABI/testing/sysfs-driver-intel-xe-hwmon   | 22 +++++++
>>   drivers/gpu/drm/xe/regs/xe_pcode_regs.h       |  3 +
>>   drivers/gpu/drm/xe/xe_hwmon.c                 | 66 +++++++++++++++++++
>>   3 files changed, 91 insertions(+)
>>
>> diff --git a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
>> index 6e21bebf0e0d..55ab45f669ac 100644
>> --- a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
>> +++ b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
>> @@ -211,6 +211,28 @@ KernelVersion:	7.0
>>   Contact:	intel-xe@lists.freedesktop.org
>>   Description:	RO. GPU PCIe temperature in millidegree Celsius.
>>   
>> +What:		/sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp[6-21]_crit
>> +Date:		January 2026
>> +KernelVersion:	7.0
>> +Contact:	intel-xe@lists.freedesktop.org
>> +Description:	RO. VRAM channel critical temperature in millidegree Celsius.
>> +
>> +		Only supported for particular Intel Xe graphics platforms.
>> +
>> +What:		/sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp[6-21]_emergency
>> +Date:		January 2026
>> +KernelVersion:	7.0
>> +Contact:	intel-xe@lists.freedesktop.org
>> +Description:	RO. VRAM channel shutdown temperature in millidegree Celsius.
>> +
>> +		Only supported for particular Intel Xe graphics platforms.
>> +
>> +What:		/sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp[6-21]_input
>> +Date:		January 2026
>> +KernelVersion:	7.0
>> +Contact:	intel-xe@lists.freedesktop.org
>> +Description:	RO. VRAM channel temperature in millidegree Celsius.
>> +
>>   		Only supported for particular Intel Xe graphics platforms.
>>   
>>   What:		/sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/fan1_input
>> diff --git a/drivers/gpu/drm/xe/regs/xe_pcode_regs.h b/drivers/gpu/drm/xe/regs/xe_pcode_regs.h
>> index fb097607b86c..45f09f39df96 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_pcode_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_pcode_regs.h
>> @@ -22,6 +22,9 @@
>>   #define BMG_FAN_2_SPEED				XE_REG(0x138170)
>>   #define BMG_FAN_3_SPEED				XE_REG(0x1381a0)
>>   #define BMG_VRAM_TEMPERATURE			XE_REG(0x1382c0)
>> +#define BMG_VRAM_TEMPERATURE_N(n)		XE_REG(0x138260 + ((n) * (sizeof(u32))))
> Ascending order please!
>
>> +#define   TEMP_MASK_VRAM_N			REG_GENMASK(30, 8)
>> +#define   TEMP_SIGN_MASK			BIT(31)
> Use REG_BIT() for consistency.
>
>>   #define BMG_PACKAGE_TEMPERATURE			XE_REG(0x138434)
>>   
>>   #endif /* _XE_PCODE_REGS_H_ */
>> diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c
>> index e8604e6300ac..b1737403a38b 100644
>> --- a/drivers/gpu/drm/xe/xe_hwmon.c
>> +++ b/drivers/gpu/drm/xe/xe_hwmon.c
>> @@ -39,12 +39,16 @@ enum xe_hwmon_reg_operation {
>>   	REG_READ64,
>>   };
>>   
>> +#define MAX_VRAM_CHANNELS      (16)
>> +
>>   enum xe_hwmon_channel {
>>   	CHANNEL_CARD,
>>   	CHANNEL_PKG,
>>   	CHANNEL_VRAM,
>>   	CHANNEL_MCTRL,
>>   	CHANNEL_PCIE,
>> +	CHANNEL_VRAM_N,
>> +	CHANNEL_VRAM_N_MAX = CHANNEL_VRAM_N + MAX_VRAM_CHANNELS,
>>   	CHANNEL_MAX,
>>   };
>>   
>> @@ -105,6 +109,9 @@ enum sensor_attr_power {
>>   /* Index of memory controller in READ_THERMAL_DATA output */
>>   #define TEMP_INDEX_MCTRL	2
>>   
>> +/* Maximum characters in hwmon label name */
>> +#define MAX_LABEL_SIZE		16
>> +
>>   /**
>>    * struct xe_hwmon_energy_info - to accumulate energy
>>    */
>> @@ -139,6 +146,8 @@ struct xe_hwmon_thermal_info {
>>   	u8 count;
>>   	/** @value: signed value from each sensor */
>>   	s8 value[U8_MAX];
>> +	/** @vram_label: vram label names */
>> +	char vram_label[MAX_VRAM_CHANNELS][MAX_LABEL_SIZE];
>>   };
>>   
>>   /**
>> @@ -255,6 +264,8 @@ static struct xe_reg xe_hwmon_get_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg
>>   				return BMG_PACKAGE_TEMPERATURE;
>>   			else if (channel == CHANNEL_VRAM)
>>   				return BMG_VRAM_TEMPERATURE;
>> +			else if (channel >= CHANNEL_VRAM_N && channel <= CHANNEL_VRAM_N_MAX)
> This looks like it can be in_range() but please double check.
The current check is sufficient; in_range() is not required here.
>
>> +				return BMG_VRAM_TEMPERATURE_N(channel - CHANNEL_VRAM_N);
>>   		} else if (xe->info.platform == XE_DG2) {
>>   			if (channel == CHANNEL_PKG)
>>   				return PCU_CR_PACKAGE_TEMPERATURE;
>> @@ -714,6 +725,22 @@ static const struct hwmon_channel_info * const hwmon_info[] = {
>>   			   HWMON_T_MAX,
>>   			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>>   			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>> +			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
>>   			   HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL),
>>   	HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_LABEL | HWMON_P_CRIT |
>>   			   HWMON_P_CAP,
>> @@ -888,6 +915,21 @@ static void xe_hwmon_get_voltage(struct xe_hwmon *hwmon, int channel, long *valu
>>   	*value = DIV_ROUND_CLOSEST(REG_FIELD_GET(VOLTAGE_MASK, reg_val) * 2500, SF_VOLTAGE);
>>   }
>>   
>> +static inline bool is_vram_ch_available(struct xe_hwmon *hwmon, int channel)
>> +{
>> +	struct xe_mmio *mmio = xe_root_tile_mmio(hwmon->xe);
>> +	int vram_id = channel - CHANNEL_VRAM_N;
>> +	struct xe_reg vram_reg;
>> +
>> +	vram_reg = xe_hwmon_get_reg(hwmon, REG_TEMP, channel);
>> +	if (!xe_reg_is_valid(vram_reg) || !xe_mmio_read32(mmio, vram_reg))
>> +		return false;
>> +
>> +	/* Create label only for available vram channel */
>> +	sprintf(hwmon->temp.vram_label[vram_id], "vram_ch_%d", vram_id);
>> +	return true;
>> +}
>> +
>>   static umode_t
>>   xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
>>   {
>> @@ -901,6 +943,9 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
>>   		case CHANNEL_MCTRL:
>>   		case CHANNEL_PCIE:
>>   			return hwmon->temp.count ? 0444 : 0;
>> +		case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
>> +			return (is_vram_ch_available(hwmon, channel) &&
>> +				hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN]) ? 0444 : 0;
>>   		default:
>>   			return 0;
>>   		}
>> @@ -913,6 +958,9 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
>>   		case CHANNEL_MCTRL:
>>   		case CHANNEL_PCIE:
>>   			return hwmon->temp.count ? 0444 : 0;
>> +		case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
>> +			return (is_vram_ch_available(hwmon, channel) &&
>> +				hwmon->temp.limit[TEMP_LIMIT_MEM_CRIT]) ? 0444 : 0;
>>   		default:
>>   			return 0;
>>   		}
>> @@ -933,6 +981,8 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
>>   		case CHANNEL_MCTRL:
>>   		case CHANNEL_PCIE:
>>   			return hwmon->temp.count ? 0444 : 0;
>> +		case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
>> +			return is_vram_ch_available(hwmon, channel) ? 0444 : 0;
>>   		default:
>>   			return 0;
>>   		}
>> @@ -961,6 +1011,16 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
>>   			return get_mc_temp(hwmon, val);
>>   		case CHANNEL_PCIE:
>>   			return get_pcie_temp(hwmon, val);
>> +		case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
>> +			reg_val = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_TEMP, channel));
>> +			/*
>> +			 * This temperature format is 24 bit [31:8] signed integer and 8 bits
> Nit: Either use bits with 's' or don't, but let's be consistent.
>
>> +			 * [7:0] fraction.
>> +			 */
>> +			*val = (s32)(REG_FIELD_GET(TEMP_MASK_VRAM_N, reg_val)) *
>> +				(REG_FIELD_GET(TEMP_SIGN_MASK, reg_val) ? -1 : 1) *
>> +				 MILLIDEGREE_PER_DEGREE;
>> +			return 0;
>>   		default:
>>   			return -EOPNOTSUPP;
>>   		}
>> @@ -972,6 +1032,7 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
>>   			*val = hwmon->temp.limit[TEMP_LIMIT_PKG_SHUTDOWN] * MILLIDEGREE_PER_DEGREE;
>>   			return 0;
>>   		case CHANNEL_VRAM:
>> +		case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
>>   			*val = hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN] * MILLIDEGREE_PER_DEGREE;
>>   			return 0;
>>   		default:
>> @@ -985,6 +1046,7 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
>>   			*val = hwmon->temp.limit[TEMP_LIMIT_PKG_CRIT] * MILLIDEGREE_PER_DEGREE;
>>   			return 0;
>>   		case CHANNEL_VRAM:
>> +		case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
>>   			*val = hwmon->temp.limit[TEMP_LIMIT_MEM_CRIT] * MILLIDEGREE_PER_DEGREE;
>>   			return 0;
>>   		default:
>> @@ -1353,6 +1415,8 @@ static int xe_hwmon_read_label(struct device *dev,
>>   			       enum hwmon_sensor_types type,
>>   			       u32 attr, int channel, const char **str)
>>   {
>> +	struct xe_hwmon *hwmon = dev_get_drvdata(dev);
>> +
>>   	switch (type) {
>>   	case hwmon_temp:
>>   		if (channel == CHANNEL_PKG)
>> @@ -1363,6 +1427,8 @@ static int xe_hwmon_read_label(struct device *dev,
>>   			*str = "mctrl";
>>   		else if (channel == CHANNEL_PCIE)
>>   			*str = "pcie";
>> +		else if (channel >= CHANNEL_VRAM_N && channel <= CHANNEL_VRAM_N_MAX)
> Ditto for in_range().
Same as above: the current check is sufficient, so in_range() is not needed here either.
>
> Reviewed-by: Raag Jadav <raag.jadav@intel.com>
>
>> +			*str = hwmon->temp.vram_label[channel - CHANNEL_VRAM_N];
>>   		return 0;
>>   	case hwmon_power:
>>   	case hwmon_energy:
>> -- 
>> 2.25.1
>>

  reply	other threads:[~2026-01-12 18:32 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-12 12:17 [PATCH v6 0/4] drm/xe/hwmon: Expose new temperature attributes Karthik Poosa
2026-01-12 12:17 ` [PATCH v6 1/4] drm/xe/hwmon: Expose temperature limits Karthik Poosa
2026-01-12 12:17 ` [PATCH v6 2/4] drm/xe/hwmon: Expose memory controller temperature Karthik Poosa
2026-01-12 12:17 ` [PATCH v6 3/4] drm/xe/hwmon: Expose GPU PCIe temperature Karthik Poosa
2026-01-12 12:17 ` [PATCH v6 4/4] drm/xe/hwmon: Expose individual VRAM channel temperature Karthik Poosa
2026-01-12 18:08   ` Raag Jadav
2026-01-12 18:32     ` Poosa, Karthik [this message]
2026-01-12 12:29 ` ✓ CI.KUnit: success for drm/xe/hwmon: Expose new temperature attributes (rev8) Patchwork
2026-01-12 13:03 ` ✓ Xe.CI.BAT: " Patchwork
2026-01-12 14:10 ` ✗ Xe.CI.Full: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=34da973b-e1b6-48d2-b0d2-d341c767c4b3@intel.com \
    --to=karthik.poosa@intel.com \
    --cc=anshuman.gupta@intel.com \
    --cc=badal.nilawar@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=raag.jadav@intel.com \
    --cc=rodrigo.vivi@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.