okOn Tue, Dec 16, 2025 at 05:10:30PM +0530, Karthik Poosa wrote:Expose individual VRAM temperature attributes. Update Xe hwmon documentation for this entry. v2: - Avoid using default switch case for VRAM individual temperatures. - Append labels with vram number. - Update kernel version in Xe hwmon documentation. Signed-off-by: Karthik Poosa <karthik.poosa@intel.com> --- .../ABI/testing/sysfs-driver-intel-xe-hwmon | 24 ++++++++++ drivers/gpu/drm/xe/regs/xe_pcode_regs.h | 2 + drivers/gpu/drm/xe/xe_hwmon.c | 47 ++++++++++++++++++- 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon index 51a35fcfb393..b58a96b1857d 100644 --- a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon +++ b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon @@ -284,3 +284,27 @@ Contact: intel-xe@lists.freedesktop.org Description: RO. GPU PCIe critical temperature in millidegree Celsius. Only supported for particular Intel Xe graphics platforms. + +What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp6-21_inputBrace around channel range with '[]'.
We can move this after BMG_VRAM_TEMPERATURE+Date: December 2025 +KernelVersion: 6.19 +Contact: intel-xe@lists.freedesktop.org +Description: RO. Individual VRAM temperature in millidegree Celsius. + + Only supported for particular Intel Xe graphics platforms. + +What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp6-21_emergencyDitto.+Date: December 2025 +KernelVersion: 6.19 +Contact: intel-xe@lists.freedesktop.org +Description: RO. Individual VRAM shutdown temperature in millidegree Celsius. + + Only supported for particular Intel Xe graphics platforms. + +What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp6-21_critDitto.+Date: December 2025 +KernelVersion: 6.19 +Contact: intel-xe@lists.freedesktop.org +Description: RO. Individual VRAM critical temperature in millidegree Celsius. + + Only supported for particular Intel Xe graphics platforms. diff --git a/drivers/gpu/drm/xe/regs/xe_pcode_regs.h b/drivers/gpu/drm/xe/regs/xe_pcode_regs.h index fb097607b86c..7c35e2605d2f 100644 --- a/drivers/gpu/drm/xe/regs/xe_pcode_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_pcode_regs.h @@ -23,5 +23,7 @@ #define BMG_FAN_3_SPEED XE_REG(0x1381a0) #define BMG_VRAM_TEMPERATURE XE_REG(0x1382c0) #define BMG_PACKAGE_TEMPERATURE XE_REG(0x138434) +#define BMG_VRAM_TEMPERATURE_N(n) XE_REG(0x138260 + (n))Is this the correct ordering?
+#define TEMP_MASK_VRAM_N REG_GENMASK(31, 8)We usually have spacing for masks and bitfields. Please refer to xe_mchbar_regs.h.#endif /* _XE_PCODE_REGS_H_ */ diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index b8519c734b4e..8eb1dd8f4b2f 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -39,12 +39,16 @@ enum xe_hwmon_reg_operation { REG_READ64, }; +#define MAX_VRAM_CHANNELS (16) + enum xe_hwmon_channel { CHANNEL_CARD, CHANNEL_PKG, CHANNEL_VRAM, CHANNEL_MCTRL, CHANNEL_PCIE, + CHANNEL_VRAM_N, + CHANNEL_VRAM_N_MAX = CHANNEL_VRAM_N + MAX_VRAM_CHANNELS,Please consider the future prospects (or risks) of this approach. We're writing ourselves into a corner here, because now any new component will be id'ed after CHANNEL_VRAM_N_MAX and with that we can potentially hit scaling problem with MAX_VRAM_CHANNELS. I have already mentioned it once[1] but again, not my call. [1] https://lore.kernel.org/intel-xe/Z5SPWwiB15ptK4hR@black.fi.intel.com/
what is your suggestion here ? to have separate channel enum for each sensor type ?
CHANNEL_MAX, }; @@ -256,6 +260,8 @@ static struct xe_reg xe_hwmon_get_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg return BMG_PACKAGE_TEMPERATURE; else if (channel == CHANNEL_VRAM) return BMG_VRAM_TEMPERATURE; + else if (channel >= CHANNEL_VRAM_N && channel <= CHANNEL_VRAM_N_MAX) + return BMG_VRAM_TEMPERATURE_N(((channel % CHANNEL_VRAM_N) * 4)); } else if (xe->info.platform == XE_DG2) { if (channel == CHANNEL_PKG) return PCU_CR_PACKAGE_TEMPERATURE; @@ -715,6 +721,22 @@ static const struct hwmon_channel_info * const hwmon_info[] = { HWMON_T_MAX, HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, + HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT, HWMON_T_LABEL | HWMON_T_INPUT | HWMON_T_EMERGENCY | HWMON_T_CRIT), HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_LABEL | HWMON_P_CRIT | HWMON_P_CAP, @@ -921,6 +943,9 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel) return (!get_mc_temp(hwmon, &val)) ? 0444 : 0; case CHANNEL_PCIE: return (!get_pcie_temp(hwmon, &val)) ? 0444 : 0; + case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:So this means we're exposing them all but do we really have 16 VRAMs?
we have support for maximum of 16, we don't have mechanism to detect VRAM if it is present,
we could check returned temperature is non-zero, but if temp is 0 C, it would be valid.
+ return xe_reg_is_valid(xe_hwmon_get_reg(hwmon, REG_TEMP, + channel)) ? 0444 : 0; default: return 0; } @@ -935,6 +960,9 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel) return (!get_mc_temp(hwmon, &val)) ? 0444 : 0; case CHANNEL_PCIE: return (!get_pcie_temp(hwmon, &val)) ? 0444 : 0; + case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:Ditto.+ return xe_reg_is_valid(xe_hwmon_get_reg(hwmon, REG_TEMP, + channel)) ? 0444 : 0; default: return 0; } @@ -958,6 +986,9 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel) return (!get_mc_temp(hwmon, &val)) ? 0444 : 0; case CHANNEL_PCIE: return (!get_pcie_temp(hwmon, &val)) ? 0444 : 0; + case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX:Ditto.+ return xe_reg_is_valid(xe_hwmon_get_reg(hwmon, REG_TEMP, + channel)) ? 0444 : 0; default: return 0; } @@ -986,6 +1017,12 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val) return get_mc_temp(hwmon, val); case CHANNEL_PCIE: return get_pcie_temp(hwmon, val); + case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX: + reg_val = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_TEMP, channel)); + /* Temperature format here is S 31.23.8 */I'm not confident if this is translatable to human beings, could you please elaborate?
This is signed format, 31 bit for sign, 23 bits for whole number part and 8 bits for fraction.
I shall add a comment about this next revision.
+ *val = REG_FIELD_GET(TEMP_MASK_VRAM_N, reg_val) * + ((reg_val >> 31) ? -1 : 1) * MILLIDEGREE_PER_DEGREE; + break; default: *val = 0; return -EOPNOTSUPP; @@ -999,6 +1036,7 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val) break; case CHANNEL_VRAM: case CHANNEL_MCTRL: + case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX: *val = hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN] * MILLIDEGREE_PER_DEGREE; break; default: @@ -1014,6 +1052,7 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val) break; case CHANNEL_VRAM: case CHANNEL_MCTRL: + case CHANNEL_VRAM_N...CHANNEL_VRAM_N_MAX: *val = hwmon->temp.limit[TEMP_LIMIT_MEM_TJMAX] * MILLIDEGREE_PER_DEGREE; break; default: @@ -1386,16 +1425,22 @@ static int xe_hwmon_read_label(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, const char **str) { + char temp[16] = {0}; + switch (type) { case hwmon_temp: if (channel == CHANNEL_PKG) *str = "pkg"; else if (channel == CHANNEL_VRAM) - *str = "vram"; + *str = "vram_avg"; else if (channel == CHANNEL_MCTRL) *str = "mctrl_avg"; else if (channel == CHANNEL_PCIE) *str = "pcie"; + else if (channel >= CHANNEL_VRAM_N && channel <= CHANNEL_VRAM_N_MAX) { + sprintf(temp, "vram_%d", (channel - CHANNEL_VRAM_N)); + *str = temp;Can this be done without temp? Raag
+ } return 0; case hwmon_power: case hwmon_energy: -- 2.25.1