On 22-12-2025 00:22, Raag Jadav wrote:
On Tue, Dec 16, 2025 at 05:10:30PM +0530, Karthik Poosa wrote:
Expose individual VRAM temperature attributes.
Update Xe hwmon documentation for this entry.

v2:
 - Avoid using default switch case for VRAM individual temperatures.
 - Append labels with vram number.
 - Update kernel version in Xe hwmon documentation.

Signed-off-by: Karthik Poosa <karthik.poosa@intel.com>
---
 .../ABI/testing/sysfs-driver-intel-xe-hwmon   | 24 ++++++++++
 drivers/gpu/drm/xe/regs/xe_pcode_regs.h       |  2 +
 drivers/gpu/drm/xe/xe_hwmon.c                 | 47 ++++++++++++++++++-
 3 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
index 51a35fcfb393..b58a96b1857d 100644
--- a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
+++ b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
@@ -284,3 +284,27 @@ Contact:	intel-xe@lists.freedesktop.org
 Description:	RO. GPU PCIe critical temperature in millidegree Celsius.
 
 		Only supported for particular Intel Xe graphics platforms.
+
+What:		/sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp6-21_input
Brace around channel range with '[]'.
ok

+Date:		December 2025
+KernelVersion:	6.19
+Contact:	intel-xe@lists.freedesktop.org
+Description:	RO. Individual VRAM temperature in millidegree Celsius.
+
+		Only supported for particular Intel Xe graphics platforms.
+
+What:		/sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp6-21_emergency
Ditto.

+Date:		December 2025
+KernelVersion:	6.19
+Contact:	intel-xe@lists.freedesktop.org
+Description:	RO. Individual VRAM shutdown temperature in millidegree Celsius.
+
+		Only supported for particular Intel Xe graphics platforms.
+
+What:		/sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp6-21_crit
Ditto.

+Date:		December 2025
+KernelVersion:	6.19
+Contact:	intel-xe@lists.freedesktop.org
+Description:	RO. Individual VRAM critical temperature in millidegree Celsius.
+
+		Only supported for particular Intel Xe graphics platforms.
diff --git a/drivers/gpu/drm/xe/regs/xe_pcode_regs.h b/drivers/gpu/drm/xe/regs/xe_pcode_regs.h
index fb097607b86c..7c35e2605d2f 100644
--- a/drivers/gpu/drm/xe/regs/xe_pcode_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_pcode_regs.h
@@ -23,5 +23,7 @@
 #define BMG_FAN_3_SPEED				XE_REG(0x1381a0)
 #define BMG_VRAM_TEMPERATURE			XE_REG(0x1382c0)
 #define BMG_PACKAGE_TEMPERATURE			XE_REG(0x138434)
+#define BMG_VRAM_TEMPERATURE_N(n)		XE_REG(0x138260 + (n))
Is this the correct ordering?
We can move this after BMG_VRAM_TEMPERATURE

+#define TEMP_MASK_VRAM_N			REG_GENMASK(31, 8)
We usually have spacing for masks and bitfields. Please refer to
xe_mchbar_regs.h.

 #endif /* _XE_PCODE_REGS_H_ */
diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c
index b8519c734b4e..8eb1dd8f4b2f 100644
--- a/drivers/gpu/drm/xe/xe_hwmon.c
+++ b/drivers/gpu/drm/xe/xe_hwmon.c
@@ -39,12 +39,16 @@ enum xe_hwmon_reg_operation {
 	REG_READ64,
 };
 
+#define MAX_VRAM_CHANNELS      (16)
+
 enum xe_hwmon_channel {
 	CHANNEL_CARD,
 	CHANNEL_PKG,
 	CHANNEL_VRAM,
 	CHANNEL_MCTRL,
 	CHANNEL_PCIE,
+	CHANNEL_VRAM_N,
+	CHANNEL_VRAM_N_MAX = CHANNEL_VRAM_N + MAX_VRAM_CHANNELS,
Please consider the future prospects (or risks) of this approach. We're
writing ourselves into a corner here, because now any new component will
be id'ed after CHANNEL_VRAM_N_MAX and with that we can potentially hit
scaling problem with MAX_VRAM_CHANNELS.

I have already mentioned it once[1] but again, not my call.

[1] https://lore.kernel.org/intel-xe/Z5SPWwiB15ptK4hR@black.fi.intel.com/

what is your suggestion here ? to have separate channel enum for each sensor type ? 


 	CHANNEL_MAX,
 };
 
@@ -256,6 +260,8 @@ static struct xe_reg xe_hwmon_get_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg
 				return BMG_PACKAGE_TEMPERATURE;
 			else if (channel == CHANNEL_VRAM)
 				return BMG_VRAM_TEMPERATURE;
+			else if (channel >= CHANNEL_VRAM_N && channel <= CHANNEL_VRAM_N_MAX)
+				return BMG_VRAM_TEMPERATURE_N(((channel % CHANNEL_VRAM_N) * 4));
 		} else if (xe->info.platform == XE_DG2) {
 			if (channel == CHANNEL_PKG)
 				return PCU_CR_PACKAGE_TEMPERATURE;
@@ -715,6 +721,22 @@ static const struct hwmon_channel_info * const hwmon_info[] = {
 			   HWMON_T_MAX,
 			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
 			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
+			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT,
 			   HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT),
 	HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_LABEL | HWMON_P_CRIT |
 			   HWMON_P_CAP,
@@ -921,6 +943,9 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
 			return (!get_mc_temp(hwmon, &val)) ? 0444 : 0;
 		case CHANNEL_PCIE:
 			return (!get_pcie_temp(hwmon, &val)) ? 0444 : 0;
+		case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
So this means we're exposing them all but do we really have 16 VRAMs?

we have support for maximum of 16, we don't have mechanism to detect VRAM if it is present, 

we could check returned temperature is non-zero, but if temp is 0 C, it would be valid.


+			return xe_reg_is_valid(xe_hwmon_get_reg(hwmon, REG_TEMP,
+					       channel)) ? 0444 : 0;
 		default:
 			return 0;
 		}
@@ -935,6 +960,9 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
 			return (!get_mc_temp(hwmon, &val)) ? 0444 : 0;
 		case CHANNEL_PCIE:
 			return (!get_pcie_temp(hwmon, &val)) ? 0444 : 0;
+		case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
Ditto.

+			return xe_reg_is_valid(xe_hwmon_get_reg(hwmon, REG_TEMP,
+					       channel)) ? 0444 : 0;
 		default:
 			return 0;
 		}
@@ -958,6 +986,9 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
 			return (!get_mc_temp(hwmon, &val)) ? 0444 : 0;
 		case CHANNEL_PCIE:
 			return (!get_pcie_temp(hwmon, &val)) ? 0444 : 0;
+		case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
Ditto.

+			return xe_reg_is_valid(xe_hwmon_get_reg(hwmon, REG_TEMP,
+					       channel)) ? 0444 : 0;
 		default:
 			return 0;
 		}
@@ -986,6 +1017,12 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
 			return get_mc_temp(hwmon, val);
 		case CHANNEL_PCIE:
 			return get_pcie_temp(hwmon, val);
+		case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
+			reg_val = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_TEMP, channel));
+			/* Temperature format here is S 31.23.8 */
I'm not confident if this is translatable to human beings, could you
please elaborate?

This is signed format, 31 bit for sign, 23 bits for whole number part and 8 bits for fraction.

I shall add a comment about this next revision.


+			*val = REG_FIELD_GET(TEMP_MASK_VRAM_N, reg_val) *
+					     ((reg_val >> 31) ? -1 : 1) * MILLIDEGREE_PER_DEGREE;
+			break;
 		default:
 			*val = 0;
 			return -EOPNOTSUPP;
@@ -999,6 +1036,7 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
 			break;
 		case CHANNEL_VRAM:
 		case CHANNEL_MCTRL:
+		case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
 			*val = hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN] * MILLIDEGREE_PER_DEGREE;
 			break;
 		default:
@@ -1014,6 +1052,7 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
 			break;
 		case CHANNEL_VRAM:
 		case CHANNEL_MCTRL:
+		case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:
 			*val = hwmon->temp.limit[TEMP_LIMIT_MEM_TJMAX] * MILLIDEGREE_PER_DEGREE;
 			break;
 		default:
@@ -1386,16 +1425,22 @@ static int xe_hwmon_read_label(struct device *dev,
 			       enum hwmon_sensor_types type,
 			       u32 attr, int channel, const char **str)
 {
+	char temp[16] = {0};
+
 	switch (type) {
 	case hwmon_temp:
 		if (channel == CHANNEL_PKG)
 			*str = "pkg";
 		else if (channel == CHANNEL_VRAM)
-			*str = "vram";
+			*str = "vram_avg";
 		else if (channel == CHANNEL_MCTRL)
 			*str = "mctrl_avg";
 		else if (channel == CHANNEL_PCIE)
 			*str = "pcie";
+		else if (channel >= CHANNEL_VRAM_N && channel <= CHANNEL_VRAM_N_MAX) {
+			sprintf(temp, "vram_%d", (channel - CHANNEL_VRAM_N));
+			*str = temp;
Can this be done without temp?

Raag
I experimented with different methods; this one compiles without warnings.
Do you have any suggestions ?


+		}
 		return 0;
 	case hwmon_power:
 	case hwmon_energy:
-- 
2.25.1