LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] powerpc/powermac: New windfarm driver for PowerMac G5 (AGP) and Xserve G5
From: Benjamin Herrenschmidt @ 2012-04-30  1:42 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: khali, Andreas Schwab

This replaces the old therm_pm72 using the same windfarm infrastructure
that was used for other PowerMac G5 models. The fan speeds and sensors
should now be visible in the same location in sysfs.

The driver is split into separate core modules for PowerMac7,2 (and 7,3)
and RackMac3,1, with a lot of the shared code now in the separate sensor
and control modules.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

v2: - Use programmed value when reading back from FCU
      (like therm_pm72, behaviour is preferable and appears more
       in line with what OS X does)

    - Remove tickle code, it's annoying. Instead, don't compare
      programmed values with previous values, which means that
      the normal loops should be enough to keep the FCU from
      timing out. That also means that it should now be possible
      to manually program the slots fan from sysfs.

 drivers/macintosh/Kconfig                  |   23 +-
 drivers/macintosh/Makefile                 |   14 +
 drivers/macintosh/windfarm.h               |    3 +-
 drivers/macintosh/windfarm_core.c          |   10 +-
 drivers/macintosh/windfarm_cpufreq_clamp.c |    6 -
 drivers/macintosh/windfarm_fcu_controls.c  |   10 +-
 drivers/macintosh/windfarm_mpu.h           |  105 ++++
 drivers/macintosh/windfarm_pm72.c          |  847 ++++++++++++++++++++++++++++
 drivers/macintosh/windfarm_rm31.c          |  740 ++++++++++++++++++++++++
 9 files changed, 1736 insertions(+), 22 deletions(-)
 create mode 100644 drivers/macintosh/windfarm_mpu.h
 create mode 100644 drivers/macintosh/windfarm_pm72.c
 create mode 100644 drivers/macintosh/windfarm_rm31.c

diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
index fa51af1..a555da6 100644
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -204,11 +204,14 @@ config THERM_ADT746X
 	  better fan behaviour by default, and some manual control.
 
 config THERM_PM72
-	tristate "Support for thermal management on PowerMac G5"
+	tristate "Support for thermal management on PowerMac G5 (AGP)"
 	depends on I2C && I2C_POWERMAC && PPC_PMAC64
+	default n
 	help
 	  This driver provides thermostat and fan control for the desktop
-	  G5 machines. 
+	  G5 machines.
+
+	  This is deprecated, use windfarm instead.
 
 config WINDFARM
 	tristate "New PowerMac thermal control infrastructure"
@@ -221,6 +224,22 @@ config WINDFARM_PM81
 	help
 	  This driver provides thermal control for the iMacG5
 
+config WINDFARM_PM72
+	tristate "Support for thermal management on PowerMac G5 (AGP)"
+	depends on WINDFARM && I2C && CPU_FREQ_PMAC64 && ADB_PMU
+	select I2C_POWERMAC
+	help
+	  This driver provides thermal control for the PowerMac G5
+	  "AGP" variants (PowerMac 7,2 and 7,3)
+
+config WINDFARM_RM31
+	tristate "Support for thermal management on Xserve G5"
+	depends on WINDFARM && I2C && CPU_FREQ_PMAC64 && ADB_PMU
+	select I2C_POWERMAC
+	help
+	  This driver provides thermal control for the Xserve G5
+	  (RackMac3,1)
+
 config WINDFARM_PM91
 	tristate "Support for thermal management on PowerMac9,1"
 	depends on WINDFARM && I2C && CPU_FREQ_PMAC64 && PMAC_SMU
diff --git a/drivers/macintosh/Makefile b/drivers/macintosh/Makefile
index 6652a6e..6753b65 100644
--- a/drivers/macintosh/Makefile
+++ b/drivers/macintosh/Makefile
@@ -29,6 +29,20 @@ obj-$(CONFIG_THERM_PM72)	+= therm_pm72.o
 obj-$(CONFIG_THERM_WINDTUNNEL)	+= therm_windtunnel.o
 obj-$(CONFIG_THERM_ADT746X)	+= therm_adt746x.o
 obj-$(CONFIG_WINDFARM)	        += windfarm_core.o
+obj-$(CONFIG_WINDFARM_PM72)     += windfarm_fcu_controls.o \
+				   windfarm_ad7417_sensor.o \
+				   windfarm_lm75_sensor.o \
+				   windfarm_max6690_sensor.o \
+				   windfarm_pid.o \
+				   windfarm_cpufreq_clamp.o \
+				   windfarm_pm72.o
+obj-$(CONFIG_WINDFARM_RM31)     += windfarm_fcu_controls.o \
+				   windfarm_ad7417_sensor.o \
+				   windfarm_lm75_sensor.o \
+				   windfarm_lm87_sensor.o \
+				   windfarm_pid.o \
+				   windfarm_cpufreq_clamp.o \
+				   windfarm_rm31.o
 obj-$(CONFIG_WINDFARM_PM81)     += windfarm_smu_controls.o \
 				   windfarm_smu_sensors.o \
 				   windfarm_lm75_sensor.o windfarm_pid.o \
diff --git a/drivers/macintosh/windfarm.h b/drivers/macintosh/windfarm.h
index a9e385e..028cdac 100644
--- a/drivers/macintosh/windfarm.h
+++ b/drivers/macintosh/windfarm.h
@@ -17,7 +17,7 @@
 #include <linux/device.h>
 
 /* Display a 16.16 fixed point value */
-#define FIX32TOPRINT(f)	((f) >> 16),((((f) & 0xffff) * 1000) >> 16)
+#define FIX32TOPRINT(f)	(((s32)(f)) >> 16),(((((s32)(f)) & 0xffff) * 1000) >> 16)
 
 /*
  * Control objects
@@ -41,6 +41,7 @@ struct wf_control {
 	int				type;
 	struct kref			ref;
 	struct device_attribute		attr;
+	void				*priv;
 };
 
 #define WF_CONTROL_TYPE_GENERIC		0
diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c
index ebafc25..3ee198b 100644
--- a/drivers/macintosh/windfarm_core.c
+++ b/drivers/macintosh/windfarm_core.c
@@ -169,8 +169,11 @@ static ssize_t wf_show_control(struct device *dev,
 	int err;
 
 	err = ctrl->ops->get_value(ctrl, &val);
-	if (err < 0)
+	if (err < 0) {
+		if (err == -EFAULT)
+			return sprintf(buf, "<HW FAULT>\n");
 		return err;
+	}
 	switch(ctrl->type) {
 	case WF_CONTROL_RPM_FAN:
 		typestr = " RPM";
@@ -481,11 +484,6 @@ static int __init windfarm_core_init(void)
 {
 	DBG("wf: core loaded\n");
 
-	/* Don't register on old machines that use therm_pm72 for now */
-	if (of_machine_is_compatible("PowerMac7,2") ||
-	    of_machine_is_compatible("PowerMac7,3") ||
-	    of_machine_is_compatible("RackMac3,1"))
-		return -ENODEV;
 	platform_device_register(&wf_platform_device);
 	return 0;
 }
diff --git a/drivers/macintosh/windfarm_cpufreq_clamp.c b/drivers/macintosh/windfarm_cpufreq_clamp.c
index 1a77a7c..72d1fdf 100644
--- a/drivers/macintosh/windfarm_cpufreq_clamp.c
+++ b/drivers/macintosh/windfarm_cpufreq_clamp.c
@@ -75,12 +75,6 @@ static int __init wf_cpufreq_clamp_init(void)
 {
 	struct wf_control *clamp;
 
-	/* Don't register on old machines that use therm_pm72 for now */
-	if (of_machine_is_compatible("PowerMac7,2") ||
-	    of_machine_is_compatible("PowerMac7,3") ||
-	    of_machine_is_compatible("RackMac3,1"))
-		return -ENODEV;
-
 	clamp = kmalloc(sizeof(struct wf_control), GFP_KERNEL);
 	if (clamp == NULL)
 		return -ENOMEM;
diff --git a/drivers/macintosh/windfarm_fcu_controls.c b/drivers/macintosh/windfarm_fcu_controls.c
index 871f8b4..b3411ed 100644
--- a/drivers/macintosh/windfarm_fcu_controls.c
+++ b/drivers/macintosh/windfarm_fcu_controls.c
@@ -41,10 +41,10 @@
  * applied to the setpoint RPM speed, that is basically the
  * speed we proviously "asked" for.
  *
- * I'm not sure which of these Apple's algorithm is supposed
- * to use
+ * I'm using 0 for now which is what therm_pm72 used to do and
+ * what Darwin -apparently- does based on observed behaviour.
  */
-#define RPM_PID_USE_ACTUAL_SPEED	1
+#define RPM_PID_USE_ACTUAL_SPEED	0
 
 /* Default min/max for pumps */
 #define CPU_PUMP_OUTPUT_MAX		3200
@@ -154,8 +154,6 @@ static int wf_fcu_fan_set_rpm(struct wf_control *ct, s32 value)
 	if (value > fan->max)
 		value = fan->max;
 
-	if (fan->target && fan->target == value)
-		return 0;
 	fan->target = value;
 
 	buf[0] = value >> (8 - shift);
@@ -213,8 +211,6 @@ static int wf_fcu_fan_set_pwm(struct wf_control *ct, s32 value)
 	if (value > fan->max)
 		value = fan->max;
 
-	if (fan->target && fan->target == value)
-		return 0;
 	fan->target = value;
 
 	value = (value * 2559) / 1000;
diff --git a/drivers/macintosh/windfarm_mpu.h b/drivers/macintosh/windfarm_mpu.h
new file mode 100644
index 0000000..046edc8
--- /dev/null
+++ b/drivers/macintosh/windfarm_mpu.h
@@ -0,0 +1,105 @@
+/*
+ * Windfarm PowerMac thermal control
+ *
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * Released under the term of the GNU GPL v2.
+ */
+
+#ifndef __WINDFARM_MPU_H
+#define __WINDFARM_MPU_H
+
+typedef unsigned short fu16;
+typedef int fs32;
+typedef short fs16;
+
+/* Definition of the MPU data structure which contains per CPU
+ * calibration information (among others) for the G5 machines
+ */
+struct mpu_data
+{
+	u8	signature;		/* 0x00 - EEPROM sig. */
+	u8	bytes_used;		/* 0x01 - Bytes used in eeprom (160 ?) */
+	u8	size;			/* 0x02 - EEPROM size (256 ?) */
+	u8	version;		/* 0x03 - EEPROM version */
+	u32	data_revision;		/* 0x04 - Dataset revision */
+	u8	processor_bin_code[3];	/* 0x08 - Processor BIN code */
+	u8	bin_code_expansion;	/* 0x0b - ??? (padding ?) */
+	u8	processor_num;		/* 0x0c - Number of CPUs on this MPU */
+	u8	input_mul_bus_div;	/* 0x0d - Clock input multiplier/bus divider */
+	u8	reserved1[2];		/* 0x0e - */
+	u32	input_clk_freq_high;	/* 0x10 - Input clock frequency high */
+	u8	cpu_nb_target_cycles;	/* 0x14 - ??? */
+	u8	cpu_statlat;		/* 0x15 - ??? */
+	u8	cpu_snooplat;		/* 0x16 - ??? */
+	u8	cpu_snoopacc;		/* 0x17 - ??? */
+	u8	nb_paamwin;		/* 0x18 - ??? */
+	u8	nb_statlat;		/* 0x19 - ??? */
+	u8	nb_snooplat;		/* 0x1a - ??? */
+	u8	nb_snoopwin;		/* 0x1b - ??? */
+	u8	api_bus_mode;		/* 0x1c - ??? */
+	u8	reserved2[3];		/* 0x1d - */
+	u32	input_clk_freq_low;	/* 0x20 - Input clock frequency low */
+	u8	processor_card_slot;	/* 0x24 - Processor card slot number */
+	u8	reserved3[2];		/* 0x25 - */
+	u8	padjmax;       		/* 0x27 - Max power adjustment (Not in OF!) */
+	u8	ttarget;		/* 0x28 - Target temperature */
+	u8	tmax;			/* 0x29 - Max temperature */
+	u8	pmaxh;			/* 0x2a - Max power */
+	u8	tguardband;		/* 0x2b - Guardband temp ??? Hist. len in OSX */
+	fs32	pid_gp;			/* 0x2c - PID proportional gain */
+	fs32	pid_gr;			/* 0x30 - PID reset gain */
+	fs32	pid_gd;			/* 0x34 - PID derivative gain */
+	fu16	voph;			/* 0x38 - Vop High */
+	fu16	vopl;			/* 0x3a - Vop Low */
+	fs16	nactual_die;		/* 0x3c - nActual Die */
+	fs16	nactual_heatsink;	/* 0x3e - nActual Heatsink */
+	fs16	nactual_system;		/* 0x40 - nActual System */
+	u16	calibration_flags;	/* 0x42 - Calibration flags */
+	fu16	mdiode;			/* 0x44 - Diode M value (scaling factor) */
+	fs16	bdiode;			/* 0x46 - Diode B value (offset) */
+	fs32	theta_heat_sink;	/* 0x48 - Theta heat sink */
+	u16	rminn_intake_fan;	/* 0x4c - Intake fan min RPM */
+	u16	rmaxn_intake_fan;	/* 0x4e - Intake fan max RPM */
+	u16	rminn_exhaust_fan;	/* 0x50 - Exhaust fan min RPM */
+	u16	rmaxn_exhaust_fan;	/* 0x52 - Exhaust fan max RPM */
+	u8	processor_part_num[8];	/* 0x54 - Processor part number XX pumps min/max */
+	u32	processor_lot_num;	/* 0x5c - Processor lot number */
+	u8	orig_card_sernum[0x10];	/* 0x60 - Card original serial number */
+	u8	curr_card_sernum[0x10];	/* 0x70 - Card current serial number */
+	u8	mlb_sernum[0x18];	/* 0x80 - MLB serial number */
+	u32	checksum1;		/* 0x98 - */
+	u32	checksum2;		/* 0x9c - */	
+}; /* Total size = 0xa0 */
+
+static inline const struct mpu_data *wf_get_mpu(int cpu)
+{
+	struct device_node *np;
+	char nodename[64];
+	const void *data;
+	int len;
+
+	/*
+	 * prom.c routine for finding a node by path is a bit brain dead
+	 * and requires exact @xxx unit numbers. This is a bit ugly but
+	 * will work for these machines
+	 */
+	sprintf(nodename, "/u3@0,f8000000/i2c@f8001000/cpuid@a%d", cpu ? 2 : 0);
+	np = of_find_node_by_path(nodename);
+	if (!np)
+		return NULL;
+	data = of_get_property(np, "cpuid", &len);	
+	of_node_put(np);
+	if (!data)
+		return NULL;
+
+	/*
+	 * We are naughty, we have dropped the reference to the device
+	 * node and still return a pointer to the content. We know we
+	 * can do that though as this is only ever called on PowerMac
+	 * which cannot remove those nodes
+	 */
+	return data;
+}
+
+#endif /*  __WINDFARM_MPU_H */
diff --git a/drivers/macintosh/windfarm_pm72.c b/drivers/macintosh/windfarm_pm72.c
new file mode 100644
index 0000000..84ac913
--- /dev/null
+++ b/drivers/macintosh/windfarm_pm72.c
@@ -0,0 +1,847 @@
+/*
+ * Windfarm PowerMac thermal control.
+ * Control loops for PowerMac7,2 and 7,3
+ *
+ * Copyright (C) 2012 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * Use and redistribute under the terms of the GNU GPL v2.
+ */
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <asm/prom.h>
+#include <asm/smu.h>
+
+#include "windfarm.h"
+#include "windfarm_pid.h"
+#include "windfarm_mpu.h"
+
+#define VERSION "1.0"
+
+#undef DEBUG
+#undef LOTSA_DEBUG
+
+#ifdef DEBUG
+#define DBG(args...)	printk(args)
+#else
+#define DBG(args...)	do { } while(0)
+#endif
+
+#ifdef LOTSA_DEBUG
+#define DBG_LOTS(args...)	printk(args)
+#else
+#define DBG_LOTS(args...)	do { } while(0)
+#endif
+
+/* define this to force CPU overtemp to 60 degree, useful for testing
+ * the overtemp code
+ */
+#undef HACKED_OVERTEMP
+
+/* We currently only handle 2 chips */
+#define NR_CHIPS	2
+#define NR_CPU_FANS	3 * NR_CHIPS
+
+/* Controls and sensors */
+static struct wf_sensor *sens_cpu_temp[NR_CHIPS];
+static struct wf_sensor *sens_cpu_volts[NR_CHIPS];
+static struct wf_sensor *sens_cpu_amps[NR_CHIPS];
+static struct wf_sensor *backside_temp;
+static struct wf_sensor *drives_temp;
+
+static struct wf_control *cpu_front_fans[NR_CHIPS];
+static struct wf_control *cpu_rear_fans[NR_CHIPS];
+static struct wf_control *cpu_pumps[NR_CHIPS];
+static struct wf_control *backside_fan;
+static struct wf_control *drives_fan;
+static struct wf_control *slots_fan;
+static struct wf_control *cpufreq_clamp;
+
+/* We keep a temperature history for average calculation of 180s */
+#define CPU_TEMP_HIST_SIZE	180
+
+/* Fixed speed for slot fan */
+#define	SLOTS_FAN_DEFAULT_PWM	40
+
+/* Scale value for CPU intake fans */
+#define CPU_INTAKE_SCALE	0x0000f852
+
+/* PID loop state */
+static const struct mpu_data *cpu_mpu_data[NR_CHIPS];
+static struct wf_cpu_pid_state cpu_pid[NR_CHIPS];
+static bool cpu_pid_combined;
+static u32 cpu_thist[CPU_TEMP_HIST_SIZE];
+static int cpu_thist_pt;
+static s64 cpu_thist_total;
+static s32 cpu_all_tmax = 100 << 16;
+static struct wf_pid_state backside_pid;
+static int backside_tick;
+static struct wf_pid_state drives_pid;
+static int drives_tick;
+
+static int nr_chips;
+static bool have_all_controls;
+static bool have_all_sensors;
+static bool started;
+
+static int failure_state;
+#define FAILURE_SENSOR		1
+#define FAILURE_FAN		2
+#define FAILURE_PERM		4
+#define FAILURE_LOW_OVERTEMP	8
+#define FAILURE_HIGH_OVERTEMP	16
+
+/* Overtemp values */
+#define LOW_OVER_AVERAGE	0
+#define LOW_OVER_IMMEDIATE	(10 << 16)
+#define LOW_OVER_CLEAR		((-10) << 16)
+#define HIGH_OVER_IMMEDIATE	(14 << 16)
+#define HIGH_OVER_AVERAGE	(10 << 16)
+#define HIGH_OVER_IMMEDIATE	(14 << 16)
+
+
+static void cpu_max_all_fans(void)
+{
+	int i;
+
+	/* We max all CPU fans in case of a sensor error. We also do the
+	 * cpufreq clamping now, even if it's supposedly done later by the
+	 * generic code anyway, we do it earlier here to react faster
+	 */
+	if (cpufreq_clamp)
+		wf_control_set_max(cpufreq_clamp);
+	for (i = 0; i < nr_chips; i++) {
+		if (cpu_front_fans[i])
+			wf_control_set_max(cpu_front_fans[i]);
+		if (cpu_rear_fans[i])
+			wf_control_set_max(cpu_rear_fans[i]);
+		if (cpu_pumps[i])
+			wf_control_set_max(cpu_pumps[i]);
+	}
+}
+
+static int cpu_check_overtemp(s32 temp)
+{
+	int new_state = 0;
+	s32 t_avg, t_old;
+	static bool first = true;
+
+	/* First check for immediate overtemps */
+	if (temp >= (cpu_all_tmax + LOW_OVER_IMMEDIATE)) {
+		new_state |= FAILURE_LOW_OVERTEMP;
+		if ((failure_state & FAILURE_LOW_OVERTEMP) == 0)
+			printk(KERN_ERR "windfarm: Overtemp due to immediate CPU"
+			       " temperature !\n");
+	}
+	if (temp >= (cpu_all_tmax + HIGH_OVER_IMMEDIATE)) {
+		new_state |= FAILURE_HIGH_OVERTEMP;
+		if ((failure_state & FAILURE_HIGH_OVERTEMP) == 0)
+			printk(KERN_ERR "windfarm: Critical overtemp due to"
+			       " immediate CPU temperature !\n");
+	}
+
+	/*
+	 * The first time around, initialize the array with the first
+	 * temperature reading
+	 */
+	if (first) {
+		int i;
+
+		cpu_thist_total = 0;
+		for (i = 0; i < CPU_TEMP_HIST_SIZE; i++) {
+			cpu_thist[i] = temp;
+			cpu_thist_total += temp;
+		}
+		first = false;
+	}
+
+	/*
+	 * We calculate a history of max temperatures and use that for the
+	 * overtemp management
+	 */
+	t_old = cpu_thist[cpu_thist_pt];
+	cpu_thist[cpu_thist_pt] = temp;
+	cpu_thist_pt = (cpu_thist_pt + 1) % CPU_TEMP_HIST_SIZE;
+	cpu_thist_total -= t_old;
+	cpu_thist_total += temp;
+	t_avg = cpu_thist_total / CPU_TEMP_HIST_SIZE;
+
+	DBG_LOTS("  t_avg = %d.%03d (out: %d.%03d, in: %d.%03d)\n",
+		 FIX32TOPRINT(t_avg), FIX32TOPRINT(t_old), FIX32TOPRINT(temp));
+
+	/* Now check for average overtemps */
+	if (t_avg >= (cpu_all_tmax + LOW_OVER_AVERAGE)) {
+		new_state |= FAILURE_LOW_OVERTEMP;
+		if ((failure_state & FAILURE_LOW_OVERTEMP) == 0)
+			printk(KERN_ERR "windfarm: Overtemp due to average CPU"
+			       " temperature !\n");
+	}
+	if (t_avg >= (cpu_all_tmax + HIGH_OVER_AVERAGE)) {
+		new_state |= FAILURE_HIGH_OVERTEMP;
+		if ((failure_state & FAILURE_HIGH_OVERTEMP) == 0)
+			printk(KERN_ERR "windfarm: Critical overtemp due to"
+			       " average CPU temperature !\n");
+	}
+
+	/* Now handle overtemp conditions. We don't currently use the windfarm
+	 * overtemp handling core as it's not fully suited to the needs of those
+	 * new machine. This will be fixed later.
+	 */
+	if (new_state) {
+		/* High overtemp -> immediate shutdown */
+		if (new_state & FAILURE_HIGH_OVERTEMP)
+			machine_power_off();
+		if ((failure_state & new_state) != new_state)
+			cpu_max_all_fans();
+		failure_state |= new_state;
+	} else if ((failure_state & FAILURE_LOW_OVERTEMP) &&
+		   (temp < (cpu_all_tmax + LOW_OVER_CLEAR))) {
+		printk(KERN_ERR "windfarm: Overtemp condition cleared !\n");
+		failure_state &= ~FAILURE_LOW_OVERTEMP;
+	}
+
+	return failure_state & (FAILURE_LOW_OVERTEMP | FAILURE_HIGH_OVERTEMP);
+}
+
+static int read_one_cpu_vals(int cpu, s32 *temp, s32 *power)
+{
+	s32 dtemp, volts, amps;
+	int rc;
+
+	/* Get diode temperature */
+	rc = wf_sensor_get(sens_cpu_temp[cpu], &dtemp);
+	if (rc) {
+		DBG("  CPU%d: temp reading error !\n", cpu);
+		return -EIO;
+	}
+	DBG_LOTS("  CPU%d: temp   = %d.%03d\n", cpu, FIX32TOPRINT((dtemp)));
+	*temp = dtemp;
+
+	/* Get voltage */
+	rc = wf_sensor_get(sens_cpu_volts[cpu], &volts);
+	if (rc) {
+		DBG("  CPU%d, volts reading error !\n", cpu);
+		return -EIO;
+	}
+	DBG_LOTS("  CPU%d: volts  = %d.%03d\n", cpu, FIX32TOPRINT((volts)));
+
+	/* Get current */
+	rc = wf_sensor_get(sens_cpu_amps[cpu], &amps);
+	if (rc) {
+		DBG("  CPU%d, current reading error !\n", cpu);
+		return -EIO;
+	}
+	DBG_LOTS("  CPU%d: amps   = %d.%03d\n", cpu, FIX32TOPRINT((amps)));
+
+	/* Calculate power */
+
+	/* Scale voltage and current raw sensor values according to fixed scales
+	 * obtained in Darwin and calculate power from I and V
+	 */
+	*power = (((u64)volts) * ((u64)amps)) >> 16;
+
+	DBG_LOTS("  CPU%d: power  = %d.%03d\n", cpu, FIX32TOPRINT((*power)));
+
+	return 0;
+
+}
+
+static void cpu_fans_tick_split(void)
+{
+	int err, cpu;
+	s32 intake, temp, power, t_max = 0;
+
+	DBG_LOTS("* cpu fans_tick_split()\n");
+
+	for (cpu = 0; cpu < nr_chips; ++cpu) {
+		struct wf_cpu_pid_state *sp = &cpu_pid[cpu];
+
+		/* Read current speed */
+		wf_control_get(cpu_rear_fans[cpu], &sp->target);
+
+		DBG_LOTS("  CPU%d: cur_target = %d RPM\n", cpu, sp->target);
+
+		err = read_one_cpu_vals(cpu, &temp, &power);
+		if (err) {
+			failure_state |= FAILURE_SENSOR;
+			cpu_max_all_fans();
+			return;
+		}
+
+		/* Keep track of highest temp */
+		t_max = max(t_max, temp);
+
+		/* Handle possible overtemps */
+		if (cpu_check_overtemp(t_max))
+			return;
+
+		/* Run PID */
+		wf_cpu_pid_run(sp, power, temp);
+
+		DBG_LOTS("  CPU%d: target = %d RPM\n", cpu, sp->target);
+
+		/* Apply result directly to exhaust fan */
+		err = wf_control_set(cpu_rear_fans[cpu], sp->target);
+		if (err) {
+			pr_warning("wf_pm72: Fan %s reports error %d\n",
+			       cpu_rear_fans[cpu]->name, err);
+			failure_state |= FAILURE_FAN;
+			break;
+		}
+
+		/* Scale result for intake fan */
+		intake = (sp->target * CPU_INTAKE_SCALE) >> 16;
+		DBG_LOTS("  CPU%d: intake = %d RPM\n", cpu, intake);
+		err = wf_control_set(cpu_front_fans[cpu], intake);
+		if (err) {
+			pr_warning("wf_pm72: Fan %s reports error %d\n",
+			       cpu_front_fans[cpu]->name, err);
+			failure_state |= FAILURE_FAN;
+			break;
+		}
+	}
+}
+
+static void cpu_fans_tick_combined(void)
+{
+	s32 temp0, power0, temp1, power1, t_max = 0;
+	s32 temp, power, intake, pump;
+	struct wf_control *pump0, *pump1;
+	struct wf_cpu_pid_state *sp = &cpu_pid[0];
+	int err, cpu;
+
+	DBG_LOTS("* cpu fans_tick_combined()\n");
+
+	/* Read current speed from cpu 0 */
+	wf_control_get(cpu_rear_fans[0], &sp->target);
+
+	DBG_LOTS("  CPUs: cur_target = %d RPM\n", sp->target);
+
+	/* Read values for both CPUs */
+	err = read_one_cpu_vals(0, &temp0, &power0);
+	if (err) {
+		failure_state |= FAILURE_SENSOR;
+		cpu_max_all_fans();
+		return;
+	}
+	err = read_one_cpu_vals(1, &temp1, &power1);
+	if (err) {
+		failure_state |= FAILURE_SENSOR;
+		cpu_max_all_fans();
+		return;
+	}
+
+	/* Keep track of highest temp */
+	t_max = max(t_max, max(temp0, temp1));
+
+	/* Handle possible overtemps */
+	if (cpu_check_overtemp(t_max))
+		return;
+
+	/* Use the max temp & power of both */
+	temp = max(temp0, temp1);
+	power = max(power0, power1);
+
+	/* Run PID */
+	wf_cpu_pid_run(sp, power, temp);
+
+	/* Scale result for intake fan */
+	intake = (sp->target * CPU_INTAKE_SCALE) >> 16;
+
+	/* Same deal with pump speed */
+	pump0 = cpu_pumps[0];
+	pump1 = cpu_pumps[1];
+	if (!pump0) {
+		pump0 = pump1;
+		pump1 = NULL;
+	}
+	pump = (sp->target * wf_control_get_max(pump0)) /
+		cpu_mpu_data[0]->rmaxn_exhaust_fan;
+
+	DBG_LOTS("  CPUs: target = %d RPM\n", sp->target);
+	DBG_LOTS("  CPUs: intake = %d RPM\n", intake);
+	DBG_LOTS("  CPUs: pump   = %d RPM\n", pump);
+
+	for (cpu = 0; cpu < nr_chips; cpu++) {
+		err = wf_control_set(cpu_rear_fans[cpu], sp->target);
+		if (err) {
+			pr_warning("wf_pm72: Fan %s reports error %d\n",
+				   cpu_rear_fans[cpu]->name, err);
+			failure_state |= FAILURE_FAN;
+		}
+		err = wf_control_set(cpu_front_fans[cpu], intake);
+		if (err) {
+			pr_warning("wf_pm72: Fan %s reports error %d\n",
+				   cpu_front_fans[cpu]->name, err);
+			failure_state |= FAILURE_FAN;
+		}
+		err = 0;
+		if (cpu_pumps[cpu])
+			err = wf_control_set(cpu_pumps[cpu], pump);
+		if (err) {
+			pr_warning("wf_pm72: Pump %s reports error %d\n",
+				   cpu_pumps[cpu]->name, err);
+			failure_state |= FAILURE_FAN;
+		}
+	}
+}
+
+/* Implementation... */
+static int cpu_setup_pid(int cpu)
+{
+	struct wf_cpu_pid_param pid;
+	const struct mpu_data *mpu = cpu_mpu_data[cpu];
+	s32 tmax, ttarget, ptarget;
+	int fmin, fmax, hsize;
+
+	/* Get PID params from the appropriate MPU EEPROM */
+	tmax = mpu->tmax << 16;
+	ttarget = mpu->ttarget << 16;
+	ptarget = ((s32)(mpu->pmaxh - mpu->padjmax)) << 16;
+
+	DBG("wf_72: CPU%d ttarget = %d.%03d, tmax = %d.%03d\n",
+	    cpu, FIX32TOPRINT(ttarget), FIX32TOPRINT(tmax));
+
+	/* We keep a global tmax for overtemp calculations */
+	if (tmax < cpu_all_tmax)
+		cpu_all_tmax = tmax;
+
+	/* Set PID min/max by using the rear fan min/max */
+	fmin = wf_control_get_min(cpu_rear_fans[cpu]);
+	fmax = wf_control_get_max(cpu_rear_fans[cpu]);
+	DBG("wf_72: CPU%d max RPM range = [%d..%d]\n", cpu, fmin, fmax);
+
+	/* History size */
+	hsize = min_t(int, mpu->tguardband, WF_PID_MAX_HISTORY);
+	DBG("wf_72: CPU%d history size = %d\n", cpu, hsize);
+
+	/* Initialize PID loop */
+	pid.interval	= 1;	/* seconds */
+	pid.history_len = hsize;
+	pid.gd		= mpu->pid_gd;
+	pid.gp		= mpu->pid_gp;
+	pid.gr		= mpu->pid_gr;
+	pid.tmax	= tmax;
+	pid.ttarget	= ttarget;
+	pid.pmaxadj	= ptarget;
+	pid.min		= fmin;
+	pid.max		= fmax;
+
+	wf_cpu_pid_init(&cpu_pid[cpu], &pid);
+	cpu_pid[cpu].target = 1000;
+
+	return 0;
+}
+
+/* Backside/U3 fan */
+static struct wf_pid_param backside_u3_param = {
+	.interval	= 5,
+	.history_len	= 2,
+	.gd		= 40 << 20,
+	.gp		= 5 << 20,
+	.gr		= 0,
+	.itarget	= 65 << 16,
+	.additive	= 1,
+	.min		= 20,
+	.max		= 100,
+};
+
+static struct wf_pid_param backside_u3h_param = {
+	.interval	= 5,
+	.history_len	= 2,
+	.gd		= 20 << 20,
+	.gp		= 5 << 20,
+	.gr		= 0,
+	.itarget	= 75 << 16,
+	.additive	= 1,
+	.min		= 20,
+	.max		= 100,
+};
+
+static void backside_fan_tick(void)
+{
+	s32 temp;
+	int speed;
+	int err;
+
+	if (!backside_fan || !backside_temp || !backside_tick)
+		return;
+	if (--backside_tick > 0)
+		return;
+	backside_tick = backside_pid.param.interval;
+
+	DBG_LOTS("* backside fans tick\n");
+
+	/* Update fan speed from actual fans */
+	err = wf_control_get(backside_fan, &speed);
+	if (!err)
+		backside_pid.target = speed;
+
+	err = wf_sensor_get(backside_temp, &temp);
+	if (err) {
+		printk(KERN_WARNING "windfarm: U4 temp sensor error %d\n",
+		       err);
+		failure_state |= FAILURE_SENSOR;
+		wf_control_set_max(backside_fan);
+		return;
+	}
+	speed = wf_pid_run(&backside_pid, temp);
+
+	DBG_LOTS("backside PID temp=%d.%.3d speed=%d\n",
+		 FIX32TOPRINT(temp), speed);
+
+	err = wf_control_set(backside_fan, speed);
+	if (err) {
+		printk(KERN_WARNING "windfarm: backside fan error %d\n", err);
+		failure_state |= FAILURE_FAN;
+	}
+}
+
+static void backside_setup_pid(void)
+{
+	/* first time initialize things */
+	s32 fmin = wf_control_get_min(backside_fan);
+	s32 fmax = wf_control_get_max(backside_fan);
+	struct wf_pid_param param;
+	struct device_node *u3;
+	int u3h = 1; /* conservative by default */
+
+	u3 = of_find_node_by_path("/u3@0,f8000000");
+	if (u3 != NULL) {
+		const u32 *vers = of_get_property(u3, "device-rev", NULL);
+		if (vers)
+			if (((*vers) & 0x3f) < 0x34)
+				u3h = 0;
+		of_node_put(u3);
+	}
+
+	param = u3h ? backside_u3h_param : backside_u3_param;
+
+	param.min = max(param.min, fmin);
+	param.max = min(param.max, fmax);
+	wf_pid_init(&backside_pid, &param);
+	backside_tick = 1;
+
+	pr_info("wf_pm72: Backside control loop started.\n");
+}
+
+/* Drive bay fan */
+static const struct wf_pid_param drives_param = {
+	.interval	= 5,
+	.history_len	= 2,
+	.gd		= 30 << 20,
+	.gp		= 5 << 20,
+	.gr		= 0,
+	.itarget	= 40 << 16,
+	.additive	= 1,
+	.min		= 300,
+	.max		= 4000,
+};
+
+static void drives_fan_tick(void)
+{
+	s32 temp;
+	int speed;
+	int err;
+
+	if (!drives_fan || !drives_temp || !drives_tick)
+		return;
+	if (--drives_tick > 0)
+		return;
+	drives_tick = drives_pid.param.interval;
+
+	DBG_LOTS("* drives fans tick\n");
+
+	/* Update fan speed from actual fans */
+	err = wf_control_get(drives_fan, &speed);
+	if (!err)
+		drives_pid.target = speed;
+
+	err = wf_sensor_get(drives_temp, &temp);
+	if (err) {
+		pr_warning("wf_pm72: drive bay temp sensor error %d\n", err);
+		failure_state |= FAILURE_SENSOR;
+		wf_control_set_max(drives_fan);
+		return;
+	}
+	speed = wf_pid_run(&drives_pid, temp);
+
+	DBG_LOTS("drives PID temp=%d.%.3d speed=%d\n",
+		 FIX32TOPRINT(temp), speed);
+
+	err = wf_control_set(drives_fan, speed);
+	if (err) {
+		printk(KERN_WARNING "windfarm: drive bay fan error %d\n", err);
+		failure_state |= FAILURE_FAN;
+	}
+}
+
+static void drives_setup_pid(void)
+{
+	/* first time initialize things */
+	s32 fmin = wf_control_get_min(drives_fan);
+	s32 fmax = wf_control_get_max(drives_fan);
+	struct wf_pid_param param = drives_param;
+
+	param.min = max(param.min, fmin);
+	param.max = min(param.max, fmax);
+	wf_pid_init(&drives_pid, &param);
+	drives_tick = 1;
+
+	pr_info("wf_pm72: Drive bay control loop started.\n");
+}
+
+static void set_fail_state(void)
+{
+	cpu_max_all_fans();
+
+	if (backside_fan)
+		wf_control_set_max(backside_fan);
+	if (slots_fan)
+		wf_control_set_max(slots_fan);
+	if (drives_fan)
+		wf_control_set_max(drives_fan);
+}
+
+static void pm72_tick(void)
+{
+	int i, last_failure;
+
+	if (!started) {
+		started = 1;
+		printk(KERN_INFO "windfarm: CPUs control loops started.\n");
+		for (i = 0; i < nr_chips; ++i) {
+			if (cpu_setup_pid(i) < 0) {
+				failure_state = FAILURE_PERM;
+				set_fail_state();
+				break;
+			}
+		}
+		DBG_LOTS("cpu_all_tmax=%d.%03d\n", FIX32TOPRINT(cpu_all_tmax));
+
+		backside_setup_pid();
+		drives_setup_pid();
+
+		/*
+		 * We don't have the right stuff to drive the PCI fan
+		 * so we fix it to a default value
+		 */
+		wf_control_set(slots_fan, SLOTS_FAN_DEFAULT_PWM);
+
+#ifdef HACKED_OVERTEMP
+		cpu_all_tmax = 60 << 16;
+#endif
+	}
+
+	/* Permanent failure, bail out */
+	if (failure_state & FAILURE_PERM)
+		return;
+
+	/*
+	 * Clear all failure bits except low overtemp which will be eventually
+	 * cleared by the control loop itself
+	 */
+	last_failure = failure_state;
+	failure_state &= FAILURE_LOW_OVERTEMP;
+	if (cpu_pid_combined)
+		cpu_fans_tick_combined();
+	else
+		cpu_fans_tick_split();
+	backside_fan_tick();
+	drives_fan_tick();
+
+	DBG_LOTS("  last_failure: 0x%x, failure_state: %x\n",
+		 last_failure, failure_state);
+
+	/* Check for failures. Any failure causes cpufreq clamping */
+	if (failure_state && last_failure == 0 && cpufreq_clamp)
+		wf_control_set_max(cpufreq_clamp);
+	if (failure_state == 0 && last_failure && cpufreq_clamp)
+		wf_control_set_min(cpufreq_clamp);
+
+	/* That's it for now, we might want to deal with other failures
+	 * differently in the future though
+	 */
+}
+
+static void pm72_new_control(struct wf_control *ct)
+{
+	bool all_controls;
+	bool had_pump = cpu_pumps[0] || cpu_pumps[1];
+
+	if (!strcmp(ct->name, "cpu-front-fan-0"))
+		cpu_front_fans[0] = ct;
+	else if (!strcmp(ct->name, "cpu-front-fan-1"))
+		cpu_front_fans[1] = ct;
+	else if (!strcmp(ct->name, "cpu-rear-fan-0"))
+		cpu_rear_fans[0] = ct;
+	else if (!strcmp(ct->name, "cpu-rear-fan-1"))
+		cpu_rear_fans[1] = ct;
+	else if (!strcmp(ct->name, "cpu-pump-0"))
+		cpu_pumps[0] = ct;
+	else if (!strcmp(ct->name, "cpu-pump-1"))
+		cpu_pumps[1] = ct;
+	else if (!strcmp(ct->name, "backside-fan"))
+		backside_fan = ct;
+	else if (!strcmp(ct->name, "slots-fan"))
+		slots_fan = ct;
+	else if (!strcmp(ct->name, "drive-bay-fan"))
+		drives_fan = ct;
+	else if (!strcmp(ct->name, "cpufreq-clamp"))
+		cpufreq_clamp = ct;
+
+	all_controls =
+		cpu_front_fans[0] &&
+		cpu_rear_fans[0] &&
+		backside_fan &&
+		slots_fan &&
+		drives_fan;
+	if (nr_chips > 1)
+		all_controls &=
+			cpu_front_fans[1] &&
+			cpu_rear_fans[1];
+	have_all_controls = all_controls;
+
+	if ((cpu_pumps[0] || cpu_pumps[1]) && !had_pump) {
+		pr_info("wf_pm72: Liquid cooling pump(s) detected,"
+			" using new algorithm !\n");
+		cpu_pid_combined = true;
+	}
+}
+
+
+static void pm72_new_sensor(struct wf_sensor *sr)
+{
+	bool all_sensors;
+
+	if (!strcmp(sr->name, "cpu-diode-temp-0"))
+		sens_cpu_temp[0] = sr;
+	else if (!strcmp(sr->name, "cpu-diode-temp-1"))
+		sens_cpu_temp[1] = sr;
+	else if (!strcmp(sr->name, "cpu-voltage-0"))
+		sens_cpu_volts[0] = sr;
+	else if (!strcmp(sr->name, "cpu-voltage-1"))
+		sens_cpu_volts[1] = sr;
+	else if (!strcmp(sr->name, "cpu-current-0"))
+		sens_cpu_amps[0] = sr;
+	else if (!strcmp(sr->name, "cpu-current-1"))
+		sens_cpu_amps[1] = sr;
+	else if (!strcmp(sr->name, "backside-temp"))
+		backside_temp = sr;
+	else if (!strcmp(sr->name, "hd-temp"))
+		drives_temp = sr;
+
+	all_sensors =
+		sens_cpu_temp[0] &&
+		sens_cpu_volts[0] &&
+		sens_cpu_amps[0] &&
+		backside_temp &&
+		drives_temp;
+	if (nr_chips > 1)
+		all_sensors &=
+			sens_cpu_temp[1] &&
+			sens_cpu_volts[1] &&
+			sens_cpu_amps[1];
+
+	have_all_sensors = all_sensors;
+}
+
+static int pm72_wf_notify(struct notifier_block *self,
+			  unsigned long event, void *data)
+{
+	switch (event) {
+	case WF_EVENT_NEW_SENSOR:
+		pm72_new_sensor(data);
+		break;
+	case WF_EVENT_NEW_CONTROL:
+		pm72_new_control(data);
+		break;
+	case WF_EVENT_TICK:
+		if (have_all_controls && have_all_sensors)
+			pm72_tick();
+	}
+	return 0;
+}
+
+static struct notifier_block pm72_events = {
+	.notifier_call = pm72_wf_notify,
+};
+
+static int wf_pm72_probe(struct platform_device *dev)
+{
+	wf_register_client(&pm72_events);
+	return 0;
+}
+
+static int __devexit wf_pm72_remove(struct platform_device *dev)
+{
+	wf_unregister_client(&pm72_events);
+
+	/* should release all sensors and controls */
+	return 0;
+}
+
+static struct platform_driver wf_pm72_driver = {
+	.probe	= wf_pm72_probe,
+	.remove	= wf_pm72_remove,
+	.driver	= {
+		.name = "windfarm",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init wf_pm72_init(void)
+{
+	struct device_node *cpu;
+	int i;
+
+	if (!of_machine_is_compatible("PowerMac7,2") &&
+	    !of_machine_is_compatible("PowerMac7,3"))
+		return -ENODEV;
+
+	/* Count the number of CPU cores */
+	nr_chips = 0;
+	for (cpu = NULL; (cpu = of_find_node_by_type(cpu, "cpu")) != NULL; )
+		++nr_chips;
+	if (nr_chips > NR_CHIPS)
+		nr_chips = NR_CHIPS;
+
+	pr_info("windfarm: Initializing for desktop G5 with %d chips\n",
+		nr_chips);
+
+	/* Get MPU data for each CPU */
+	for (i = 0; i < nr_chips; i++) {
+		cpu_mpu_data[i] = wf_get_mpu(i);
+		if (!cpu_mpu_data[i]) {
+			pr_err("wf_pm72: Failed to find MPU data for CPU %d\n", i);
+			return -ENXIO;
+		}
+	}
+
+#ifdef MODULE
+	request_module("windfarm_fcu_controls");
+	request_module("windfarm_lm75_sensor");
+	request_module("windfarm_ad7417_sensor");
+	request_module("windfarm_max6690_sensor");
+	request_module("windfarm_cpufreq_clamp");
+#endif /* MODULE */
+
+	platform_driver_register(&wf_pm72_driver);
+	return 0;
+}
+
+static void __exit wf_pm72_exit(void)
+{
+	platform_driver_unregister(&wf_pm72_driver);
+}
+
+module_init(wf_pm72_init);
+module_exit(wf_pm72_exit);
+
+MODULE_AUTHOR("Benjamin Herrenschmidt <benh@kernel.crashing.org>");
+MODULE_DESCRIPTION("Thermal control for AGP PowerMac G5s");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:windfarm");
diff --git a/drivers/macintosh/windfarm_rm31.c b/drivers/macintosh/windfarm_rm31.c
new file mode 100644
index 0000000..3eca6d4
--- /dev/null
+++ b/drivers/macintosh/windfarm_rm31.c
@@ -0,0 +1,740 @@
+/*
+ * Windfarm PowerMac thermal control.
+ * Control loops for RackMack3,1 (Xserve G5)
+ *
+ * Copyright (C) 2012 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * Use and redistribute under the terms of the GNU GPL v2.
+ */
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <asm/prom.h>
+#include <asm/smu.h>
+
+#include "windfarm.h"
+#include "windfarm_pid.h"
+#include "windfarm_mpu.h"
+
+#define VERSION "1.0"
+
+#undef DEBUG
+#undef LOTSA_DEBUG
+
+#ifdef DEBUG
+#define DBG(args...)	printk(args)
+#else
+#define DBG(args...)	do { } while(0)
+#endif
+
+#ifdef LOTSA_DEBUG
+#define DBG_LOTS(args...)	printk(args)
+#else
+#define DBG_LOTS(args...)	do { } while(0)
+#endif
+
+/* define this to force CPU overtemp to 60 degree, useful for testing
+ * the overtemp code
+ */
+#undef HACKED_OVERTEMP
+
+/* We currently only handle 2 chips */
+#define NR_CHIPS	2
+#define NR_CPU_FANS	3 * NR_CHIPS
+
+/* Controls and sensors */
+static struct wf_sensor *sens_cpu_temp[NR_CHIPS];
+static struct wf_sensor *sens_cpu_volts[NR_CHIPS];
+static struct wf_sensor *sens_cpu_amps[NR_CHIPS];
+static struct wf_sensor *backside_temp;
+static struct wf_sensor *slots_temp;
+static struct wf_sensor *dimms_temp;
+
+static struct wf_control *cpu_fans[NR_CHIPS][3];
+static struct wf_control *backside_fan;
+static struct wf_control *slots_fan;
+static struct wf_control *cpufreq_clamp;
+
+/* We keep a temperature history for average calculation of 180s */
+#define CPU_TEMP_HIST_SIZE	180
+
+/* PID loop state */
+static const struct mpu_data *cpu_mpu_data[NR_CHIPS];
+static struct wf_cpu_pid_state cpu_pid[NR_CHIPS];
+static u32 cpu_thist[CPU_TEMP_HIST_SIZE];
+static int cpu_thist_pt;
+static s64 cpu_thist_total;
+static s32 cpu_all_tmax = 100 << 16;
+static struct wf_pid_state backside_pid;
+static int backside_tick;
+static struct wf_pid_state slots_pid;
+static int slots_tick;
+static int slots_speed;
+static struct wf_pid_state dimms_pid;
+static int dimms_output_clamp;
+
+static int nr_chips;
+static bool have_all_controls;
+static bool have_all_sensors;
+static bool started;
+
+static int failure_state;
+#define FAILURE_SENSOR		1
+#define FAILURE_FAN		2
+#define FAILURE_PERM		4
+#define FAILURE_LOW_OVERTEMP	8
+#define FAILURE_HIGH_OVERTEMP	16
+
+/* Overtemp values */
+#define LOW_OVER_AVERAGE	0
+#define LOW_OVER_IMMEDIATE	(10 << 16)
+#define LOW_OVER_CLEAR		((-10) << 16)
+#define HIGH_OVER_IMMEDIATE	(14 << 16)
+#define HIGH_OVER_AVERAGE	(10 << 16)
+#define HIGH_OVER_IMMEDIATE	(14 << 16)
+
+
+static void cpu_max_all_fans(void)
+{
+	int i;
+
+	/* We max all CPU fans in case of a sensor error. We also do the
+	 * cpufreq clamping now, even if it's supposedly done later by the
+	 * generic code anyway, we do it earlier here to react faster
+	 */
+	if (cpufreq_clamp)
+		wf_control_set_max(cpufreq_clamp);
+	for (i = 0; i < nr_chips; i++) {
+		if (cpu_fans[i][0])
+			wf_control_set_max(cpu_fans[i][0]);
+		if (cpu_fans[i][1])
+			wf_control_set_max(cpu_fans[i][1]);
+		if (cpu_fans[i][2])
+			wf_control_set_max(cpu_fans[i][2]);
+	}
+}
+
+static int cpu_check_overtemp(s32 temp)
+{
+	int new_state = 0;
+	s32 t_avg, t_old;
+	static bool first = true;
+
+	/* First check for immediate overtemps */
+	if (temp >= (cpu_all_tmax + LOW_OVER_IMMEDIATE)) {
+		new_state |= FAILURE_LOW_OVERTEMP;
+		if ((failure_state & FAILURE_LOW_OVERTEMP) == 0)
+			printk(KERN_ERR "windfarm: Overtemp due to immediate CPU"
+			       " temperature !\n");
+	}
+	if (temp >= (cpu_all_tmax + HIGH_OVER_IMMEDIATE)) {
+		new_state |= FAILURE_HIGH_OVERTEMP;
+		if ((failure_state & FAILURE_HIGH_OVERTEMP) == 0)
+			printk(KERN_ERR "windfarm: Critical overtemp due to"
+			       " immediate CPU temperature !\n");
+	}
+
+	/*
+	 * The first time around, initialize the array with the first
+	 * temperature reading
+	 */
+	if (first) {
+		int i;
+
+		cpu_thist_total = 0;
+		for (i = 0; i < CPU_TEMP_HIST_SIZE; i++) {
+			cpu_thist[i] = temp;
+			cpu_thist_total += temp;
+		}
+		first = false;
+	}
+
+	/*
+	 * We calculate a history of max temperatures and use that for the
+	 * overtemp management
+	 */
+	t_old = cpu_thist[cpu_thist_pt];
+	cpu_thist[cpu_thist_pt] = temp;
+	cpu_thist_pt = (cpu_thist_pt + 1) % CPU_TEMP_HIST_SIZE;
+	cpu_thist_total -= t_old;
+	cpu_thist_total += temp;
+	t_avg = cpu_thist_total / CPU_TEMP_HIST_SIZE;
+
+	DBG_LOTS("  t_avg = %d.%03d (out: %d.%03d, in: %d.%03d)\n",
+		 FIX32TOPRINT(t_avg), FIX32TOPRINT(t_old), FIX32TOPRINT(temp));
+
+	/* Now check for average overtemps */
+	if (t_avg >= (cpu_all_tmax + LOW_OVER_AVERAGE)) {
+		new_state |= FAILURE_LOW_OVERTEMP;
+		if ((failure_state & FAILURE_LOW_OVERTEMP) == 0)
+			printk(KERN_ERR "windfarm: Overtemp due to average CPU"
+			       " temperature !\n");
+	}
+	if (t_avg >= (cpu_all_tmax + HIGH_OVER_AVERAGE)) {
+		new_state |= FAILURE_HIGH_OVERTEMP;
+		if ((failure_state & FAILURE_HIGH_OVERTEMP) == 0)
+			printk(KERN_ERR "windfarm: Critical overtemp due to"
+			       " average CPU temperature !\n");
+	}
+
+	/* Now handle overtemp conditions. We don't currently use the windfarm
+	 * overtemp handling core as it's not fully suited to the needs of those
+	 * new machine. This will be fixed later.
+	 */
+	if (new_state) {
+		/* High overtemp -> immediate shutdown */
+		if (new_state & FAILURE_HIGH_OVERTEMP)
+			machine_power_off();
+		if ((failure_state & new_state) != new_state)
+			cpu_max_all_fans();
+		failure_state |= new_state;
+	} else if ((failure_state & FAILURE_LOW_OVERTEMP) &&
+		   (temp < (cpu_all_tmax + LOW_OVER_CLEAR))) {
+		printk(KERN_ERR "windfarm: Overtemp condition cleared !\n");
+		failure_state &= ~FAILURE_LOW_OVERTEMP;
+	}
+
+	return failure_state & (FAILURE_LOW_OVERTEMP | FAILURE_HIGH_OVERTEMP);
+}
+
+static int read_one_cpu_vals(int cpu, s32 *temp, s32 *power)
+{
+	s32 dtemp, volts, amps;
+	int rc;
+
+	/* Get diode temperature */
+	rc = wf_sensor_get(sens_cpu_temp[cpu], &dtemp);
+	if (rc) {
+		DBG("  CPU%d: temp reading error !\n", cpu);
+		return -EIO;
+	}
+	DBG_LOTS("  CPU%d: temp   = %d.%03d\n", cpu, FIX32TOPRINT((dtemp)));
+	*temp = dtemp;
+
+	/* Get voltage */
+	rc = wf_sensor_get(sens_cpu_volts[cpu], &volts);
+	if (rc) {
+		DBG("  CPU%d, volts reading error !\n", cpu);
+		return -EIO;
+	}
+	DBG_LOTS("  CPU%d: volts  = %d.%03d\n", cpu, FIX32TOPRINT((volts)));
+
+	/* Get current */
+	rc = wf_sensor_get(sens_cpu_amps[cpu], &amps);
+	if (rc) {
+		DBG("  CPU%d, current reading error !\n", cpu);
+		return -EIO;
+	}
+	DBG_LOTS("  CPU%d: amps   = %d.%03d\n", cpu, FIX32TOPRINT((amps)));
+
+	/* Calculate power */
+
+	/* Scale voltage and current raw sensor values according to fixed scales
+	 * obtained in Darwin and calculate power from I and V
+	 */
+	*power = (((u64)volts) * ((u64)amps)) >> 16;
+
+	DBG_LOTS("  CPU%d: power  = %d.%03d\n", cpu, FIX32TOPRINT((*power)));
+
+	return 0;
+
+}
+
+static void cpu_fans_tick(void)
+{
+	int err, cpu, i;
+	s32 speed, temp, power, t_max = 0;
+
+	DBG_LOTS("* cpu fans_tick_split()\n");
+
+	for (cpu = 0; cpu < nr_chips; ++cpu) {
+		struct wf_cpu_pid_state *sp = &cpu_pid[cpu];
+
+		/* Read current speed */
+		wf_control_get(cpu_fans[cpu][0], &sp->target);
+
+		err = read_one_cpu_vals(cpu, &temp, &power);
+		if (err) {
+			failure_state |= FAILURE_SENSOR;
+			cpu_max_all_fans();
+			return;
+		}
+
+		/* Keep track of highest temp */
+		t_max = max(t_max, temp);
+
+		/* Handle possible overtemps */
+		if (cpu_check_overtemp(t_max))
+			return;
+
+		/* Run PID */
+		wf_cpu_pid_run(sp, power, temp);
+
+		DBG_LOTS("  CPU%d: target = %d RPM\n", cpu, sp->target);
+
+		/* Apply DIMMs clamp */
+		speed = max(sp->target, dimms_output_clamp);
+
+		/* Apply result to all cpu fans */
+		for (i = 0; i < 3; i++) {
+			err = wf_control_set(cpu_fans[cpu][i], speed);
+			if (err) {
+				pr_warning("wf_rm31: Fan %s reports error %d\n",
+					   cpu_fans[cpu][i]->name, err);
+				failure_state |= FAILURE_FAN;
+			}
+		}
+	}
+}
+
+/* Implementation... */
+static int cpu_setup_pid(int cpu)
+{
+	struct wf_cpu_pid_param pid;
+	const struct mpu_data *mpu = cpu_mpu_data[cpu];
+	s32 tmax, ttarget, ptarget;
+	int fmin, fmax, hsize;
+
+	/* Get PID params from the appropriate MPU EEPROM */
+	tmax = mpu->tmax << 16;
+	ttarget = mpu->ttarget << 16;
+	ptarget = ((s32)(mpu->pmaxh - mpu->padjmax)) << 16;
+
+	DBG("wf_72: CPU%d ttarget = %d.%03d, tmax = %d.%03d\n",
+	    cpu, FIX32TOPRINT(ttarget), FIX32TOPRINT(tmax));
+
+	/* We keep a global tmax for overtemp calculations */
+	if (tmax < cpu_all_tmax)
+		cpu_all_tmax = tmax;
+
+	/* Set PID min/max by using the rear fan min/max */
+	fmin = wf_control_get_min(cpu_fans[cpu][0]);
+	fmax = wf_control_get_max(cpu_fans[cpu][0]);
+	DBG("wf_72: CPU%d max RPM range = [%d..%d]\n", cpu, fmin, fmax);
+
+	/* History size */
+	hsize = min_t(int, mpu->tguardband, WF_PID_MAX_HISTORY);
+	DBG("wf_72: CPU%d history size = %d\n", cpu, hsize);
+
+	/* Initialize PID loop */
+	pid.interval	= 1;	/* seconds */
+	pid.history_len = hsize;
+	pid.gd		= mpu->pid_gd;
+	pid.gp		= mpu->pid_gp;
+	pid.gr		= mpu->pid_gr;
+	pid.tmax	= tmax;
+	pid.ttarget	= ttarget;
+	pid.pmaxadj	= ptarget;
+	pid.min		= fmin;
+	pid.max		= fmax;
+
+	wf_cpu_pid_init(&cpu_pid[cpu], &pid);
+	cpu_pid[cpu].target = 4000;
+	
+	return 0;
+}
+
+/* Backside/U3 fan */
+static struct wf_pid_param backside_param = {
+	.interval	= 1,
+	.history_len	= 2,
+	.gd		= 0x00500000,
+	.gp		= 0x0004cccc,
+	.gr		= 0,
+	.itarget	= 70 << 16,
+	.additive	= 0,
+	.min		= 20,
+	.max		= 100,
+};
+
+/* DIMMs temperature (clamp the backside fan) */
+static struct wf_pid_param dimms_param = {
+	.interval	= 1,
+	.history_len	= 20,
+	.gd		= 0,
+	.gp		= 0,
+	.gr		= 0x06553600,
+	.itarget	= 50 << 16,
+	.additive	= 0,
+	.min		= 4000,
+	.max		= 14000,
+};
+
+static void backside_fan_tick(void)
+{
+	s32 temp, dtemp;
+	int speed, dspeed, fan_min;
+	int err;
+
+	if (!backside_fan || !backside_temp || !dimms_temp || !backside_tick)
+		return;
+	if (--backside_tick > 0)
+		return;
+	backside_tick = backside_pid.param.interval;
+
+	DBG_LOTS("* backside fans tick\n");
+
+	/* Update fan speed from actual fans */
+	err = wf_control_get(backside_fan, &speed);
+	if (!err)
+		backside_pid.target = speed;
+
+	err = wf_sensor_get(backside_temp, &temp);
+	if (err) {
+		printk(KERN_WARNING "windfarm: U3 temp sensor error %d\n",
+		       err);
+		failure_state |= FAILURE_SENSOR;
+		wf_control_set_max(backside_fan);
+		return;
+	}
+	speed = wf_pid_run(&backside_pid, temp);
+
+	DBG_LOTS("backside PID temp=%d.%.3d speed=%d\n",
+		 FIX32TOPRINT(temp), speed);
+
+	err = wf_sensor_get(dimms_temp, &dtemp);
+	if (err) {
+		printk(KERN_WARNING "windfarm: DIMMs temp sensor error %d\n",
+		       err);
+		failure_state |= FAILURE_SENSOR;
+		wf_control_set_max(backside_fan);
+		return;
+	}
+	dspeed = wf_pid_run(&dimms_pid, dtemp);
+	dimms_output_clamp = dspeed;
+
+	fan_min = (dspeed * 100) / 14000;
+	fan_min = max(fan_min, backside_param.min);
+	speed = max(speed, fan_min);
+
+	err = wf_control_set(backside_fan, speed);
+	if (err) {
+		printk(KERN_WARNING "windfarm: backside fan error %d\n", err);
+		failure_state |= FAILURE_FAN;
+	}
+}
+
+static void backside_setup_pid(void)
+{
+	/* first time initialize things */
+	s32 fmin = wf_control_get_min(backside_fan);
+	s32 fmax = wf_control_get_max(backside_fan);
+	struct wf_pid_param param;
+
+	param = backside_param;
+	param.min = max(param.min, fmin);
+	param.max = min(param.max, fmax);
+	wf_pid_init(&backside_pid, &param);
+
+	param = dimms_param;
+	wf_pid_init(&dimms_pid, &param);
+
+	backside_tick = 1;
+
+	pr_info("wf_rm31: Backside control loop started.\n");
+}
+
+/* Slots fan */
+static const struct wf_pid_param slots_param = {
+	.interval	= 5,
+	.history_len	= 2,
+	.gd		= 30 << 20,
+	.gp		= 5 << 20,
+	.gr		= 0,
+	.itarget	= 40 << 16,
+	.additive	= 1,
+	.min		= 300,
+	.max		= 4000,
+};
+
+static void slots_fan_tick(void)
+{
+	s32 temp;
+	int speed;
+	int err;
+
+	if (!slots_fan || !slots_temp || !slots_tick)
+		return;
+	if (--slots_tick > 0)
+		return;
+	slots_tick = slots_pid.param.interval;
+
+	DBG_LOTS("* slots fans tick\n");
+
+	err = wf_sensor_get(slots_temp, &temp);
+	if (err) {
+		pr_warning("wf_rm31: slots temp sensor error %d\n", err);
+		failure_state |= FAILURE_SENSOR;
+		wf_control_set_max(slots_fan);
+		return;
+	}
+	speed = wf_pid_run(&slots_pid, temp);
+
+	DBG_LOTS("slots PID temp=%d.%.3d speed=%d\n",
+		 FIX32TOPRINT(temp), speed);
+
+	slots_speed = speed;
+	err = wf_control_set(slots_fan, speed);
+	if (err) {
+		printk(KERN_WARNING "windfarm: slots bay fan error %d\n", err);
+		failure_state |= FAILURE_FAN;
+	}
+}
+
+static void slots_setup_pid(void)
+{
+	/* first time initialize things */
+	s32 fmin = wf_control_get_min(slots_fan);
+	s32 fmax = wf_control_get_max(slots_fan);
+	struct wf_pid_param param = slots_param;
+
+	param.min = max(param.min, fmin);
+	param.max = min(param.max, fmax);
+	wf_pid_init(&slots_pid, &param);
+	slots_tick = 1;
+
+	pr_info("wf_rm31: Slots control loop started.\n");
+}
+
+static void set_fail_state(void)
+{
+	cpu_max_all_fans();
+
+	if (backside_fan)
+		wf_control_set_max(backside_fan);
+	if (slots_fan)
+		wf_control_set_max(slots_fan);
+}
+
+static void rm31_tick(void)
+{
+	int i, last_failure;
+
+	if (!started) {
+		started = 1;
+		printk(KERN_INFO "windfarm: CPUs control loops started.\n");
+		for (i = 0; i < nr_chips; ++i) {
+			if (cpu_setup_pid(i) < 0) {
+				failure_state = FAILURE_PERM;
+				set_fail_state();
+				break;
+			}
+		}
+		DBG_LOTS("cpu_all_tmax=%d.%03d\n", FIX32TOPRINT(cpu_all_tmax));
+
+		backside_setup_pid();
+		slots_setup_pid();
+
+#ifdef HACKED_OVERTEMP
+		cpu_all_tmax = 60 << 16;
+#endif
+	}
+
+	/* Permanent failure, bail out */
+	if (failure_state & FAILURE_PERM)
+		return;
+
+	/*
+	 * Clear all failure bits except low overtemp which will be eventually
+	 * cleared by the control loop itself
+	 */
+	last_failure = failure_state;
+	failure_state &= FAILURE_LOW_OVERTEMP;
+	backside_fan_tick();
+	slots_fan_tick();
+
+	/* We do CPUs last because they can be clamped high by
+	 * DIMM temperature
+	 */
+	cpu_fans_tick();
+
+	DBG_LOTS("  last_failure: 0x%x, failure_state: %x\n",
+		 last_failure, failure_state);
+
+	/* Check for failures. Any failure causes cpufreq clamping */
+	if (failure_state && last_failure == 0 && cpufreq_clamp)
+		wf_control_set_max(cpufreq_clamp);
+	if (failure_state == 0 && last_failure && cpufreq_clamp)
+		wf_control_set_min(cpufreq_clamp);
+
+	/* That's it for now, we might want to deal with other failures
+	 * differently in the future though
+	 */
+}
+
+static void rm31_new_control(struct wf_control *ct)
+{
+	bool all_controls;
+
+	if (!strcmp(ct->name, "cpu-fan-a-0"))
+		cpu_fans[0][0] = ct;
+	else if (!strcmp(ct->name, "cpu-fan-b-0"))
+		cpu_fans[0][1] = ct;
+	else if (!strcmp(ct->name, "cpu-fan-c-0"))
+		cpu_fans[0][2] = ct;
+	else if (!strcmp(ct->name, "cpu-fan-a-1"))
+		cpu_fans[1][0] = ct;
+	else if (!strcmp(ct->name, "cpu-fan-b-1"))
+		cpu_fans[1][1] = ct;
+	else if (!strcmp(ct->name, "cpu-fan-c-1"))
+		cpu_fans[1][2] = ct;
+	else if (!strcmp(ct->name, "backside-fan"))
+		backside_fan = ct;
+	else if (!strcmp(ct->name, "slots-fan"))
+		slots_fan = ct;
+	else if (!strcmp(ct->name, "cpufreq-clamp"))
+		cpufreq_clamp = ct;
+
+	all_controls =
+		cpu_fans[0][0] &&
+		cpu_fans[0][1] &&
+		cpu_fans[0][2] &&
+		backside_fan &&
+		slots_fan;
+	if (nr_chips > 1)
+		all_controls &=
+			cpu_fans[1][0] &&
+			cpu_fans[1][1] &&
+			cpu_fans[1][2];
+	have_all_controls = all_controls;
+}
+
+
+static void rm31_new_sensor(struct wf_sensor *sr)
+{
+	bool all_sensors;
+
+	if (!strcmp(sr->name, "cpu-diode-temp-0"))
+		sens_cpu_temp[0] = sr;
+	else if (!strcmp(sr->name, "cpu-diode-temp-1"))
+		sens_cpu_temp[1] = sr;
+	else if (!strcmp(sr->name, "cpu-voltage-0"))
+		sens_cpu_volts[0] = sr;
+	else if (!strcmp(sr->name, "cpu-voltage-1"))
+		sens_cpu_volts[1] = sr;
+	else if (!strcmp(sr->name, "cpu-current-0"))
+		sens_cpu_amps[0] = sr;
+	else if (!strcmp(sr->name, "cpu-current-1"))
+		sens_cpu_amps[1] = sr;
+	else if (!strcmp(sr->name, "backside-temp"))
+		backside_temp = sr;
+	else if (!strcmp(sr->name, "slots-temp"))
+		slots_temp = sr;
+	else if (!strcmp(sr->name, "dimms-temp"))
+		dimms_temp = sr;
+
+	all_sensors =
+		sens_cpu_temp[0] &&
+		sens_cpu_volts[0] &&
+		sens_cpu_amps[0] &&
+		backside_temp &&
+		slots_temp &&
+		dimms_temp;
+	if (nr_chips > 1)
+		all_sensors &=
+			sens_cpu_temp[1] &&
+			sens_cpu_volts[1] &&
+			sens_cpu_amps[1];
+
+	have_all_sensors = all_sensors;
+}
+
+static int rm31_wf_notify(struct notifier_block *self,
+			  unsigned long event, void *data)
+{
+	switch (event) {
+	case WF_EVENT_NEW_SENSOR:
+		rm31_new_sensor(data);
+		break;
+	case WF_EVENT_NEW_CONTROL:
+		rm31_new_control(data);
+		break;
+	case WF_EVENT_TICK:
+		if (have_all_controls && have_all_sensors)
+			rm31_tick();
+	}
+	return 0;
+}
+
+static struct notifier_block rm31_events = {
+	.notifier_call = rm31_wf_notify,
+};
+
+static int wf_rm31_probe(struct platform_device *dev)
+{
+	wf_register_client(&rm31_events);
+	return 0;
+}
+
+static int __devexit wf_rm31_remove(struct platform_device *dev)
+{
+	wf_unregister_client(&rm31_events);
+
+	/* should release all sensors and controls */
+	return 0;
+}
+
+static struct platform_driver wf_rm31_driver = {
+	.probe	= wf_rm31_probe,
+	.remove	= wf_rm31_remove,
+	.driver	= {
+		.name = "windfarm",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init wf_rm31_init(void)
+{
+	struct device_node *cpu;
+	int i;
+
+	if (!of_machine_is_compatible("RackMac3,1"))
+		return -ENODEV;
+
+	/* Count the number of CPU cores */
+	nr_chips = 0;
+	for (cpu = NULL; (cpu = of_find_node_by_type(cpu, "cpu")) != NULL; )
+		++nr_chips;
+	if (nr_chips > NR_CHIPS)
+		nr_chips = NR_CHIPS;
+
+	pr_info("windfarm: Initializing for desktop G5 with %d chips\n",
+		nr_chips);
+
+	/* Get MPU data for each CPU */
+	for (i = 0; i < nr_chips; i++) {
+		cpu_mpu_data[i] = wf_get_mpu(i);
+		if (!cpu_mpu_data[i]) {
+			pr_err("wf_rm31: Failed to find MPU data for CPU %d\n", i);
+			return -ENXIO;
+		}
+	}
+
+#ifdef MODULE
+	request_module("windfarm_fcu_controls");
+	request_module("windfarm_lm75_sensor");
+	request_module("windfarm_lm87_sensor");
+	request_module("windfarm_ad7417_sensor");
+	request_module("windfarm_max6690_sensor");
+	request_module("windfarm_cpufreq_clamp");
+#endif /* MODULE */
+
+	platform_driver_register(&wf_rm31_driver);
+	return 0;
+}
+
+static void __exit wf_rm31_exit(void)
+{
+	platform_driver_unregister(&wf_rm31_driver);
+}
+
+module_init(wf_rm31_init);
+module_exit(wf_rm31_exit);
+
+MODULE_AUTHOR("Benjamin Herrenschmidt <benh@kernel.crashing.org>");
+MODULE_DESCRIPTION("Thermal control for Xserve G5");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:windfarm");

^ permalink raw reply related

* [git pull] Please pull powerpc.git merge branch
From: Benjamin Herrenschmidt @ 2012-04-30  1:44 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linuxppc-dev list, Andrew Morton, Linux Kernel list

Hi Linus !

Here are a handful more fixes for powerpc. The irq stuff are all
regression fixes, and Gavin's patch is a simple compile fix.

Cheers,
Ben.

The following changes since commit 69964ea4c7b68c9399f7977aa5b9aa6539a6a98a:

  Linux 3.4-rc5 (2012-04-29 15:19:10 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git merge

for you to fetch changes up to 810b4de25e53459323ff48957b0162b48d6cbd57:

  tty/serial/pmac_zilog: Fix "nobody cared" IRQ message (2012-04-30 10:59:58 +1000)

----------------------------------------------------------------
Gavin Shan (1):
      powerpc/pseries: Rivet CONFIG_EEH for pSeries platform

Grant Likely (2):
      powerpc/8xx: Fix NR_IRQ bugs and refactor 8xx interrupt controller
      powerpc/irqdomain: Fix broken NR_IRQ references

Larry Finger (1):
      tty/serial/pmac_zilog: Fix "nobody cared" IRQ message

 arch/powerpc/include/asm/irq.h               |    4 --
 arch/powerpc/kernel/irq.c                    |    6 +--
 arch/powerpc/kernel/machine_kexec.c          |    7 +--
 arch/powerpc/platforms/cell/axon_msi.c       |    8 ++--
 arch/powerpc/platforms/cell/beat_interrupt.c |    2 +-
 arch/powerpc/platforms/powermac/pic.c        |    6 +--
 arch/powerpc/platforms/pseries/Kconfig       |    4 +-
 arch/powerpc/sysdev/cpm2_pic.c               |    3 +-
 arch/powerpc/sysdev/mpc8xx_pic.c             |   61 +++++++++-----------------
 arch/powerpc/sysdev/xics/xics-common.c       |    7 ++-
 drivers/tty/serial/pmac_zilog.c              |    6 +--
 11 files changed, 39 insertions(+), 75 deletions(-)

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Benjamin Herrenschmidt @ 2012-04-30  2:43 UTC (permalink / raw)
  To: David Miller
  Cc: kaffeemonster, eric.dumazet, matt, netdev, linux-kernel,
	linuxppc-dev
In-Reply-To: <1333491102.3040.12.camel@pasglop>

On Wed, 2012-04-04 at 08:11 +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2012-04-03 at 18:03 -0400, David Miller wrote:
> 
> > > Signed-off-by: Jan Seiffert <kaffeemonster@googlemail.com>
> > > 
> > > I have only compile tested this, -ENOHARDWARE.
> > > Can someone with more powerpc kung-fu review and maybe test this?
> > > Esp. powerpc asm is not my strong point. I think i botched the
> > > stack frame in the call setup. Help?
> > 
> > I'm not applying this until a powerpc person tests it.
> > 
> > Also, we have an ARM JIT in the tree which probably needs to
> > be fixed similarly.
> 
> Matt's having a look at powerpc

Ok, he hasn't so I'll dig a bit.

No obvious wrongness (but I'm not very familiar with bpf), though I do
have a comment: sk_negative_common() and bpf_slow_path_common() should
be made one and single macro which takes the fallback function as an
argument.

I'll mess around & try to test using Jan test case & will come back
with an updated patch.

Cheers,
Ben.

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Benjamin Herrenschmidt @ 2012-04-30  3:40 UTC (permalink / raw)
  To: David Miller
  Cc: kaffeemonster, eric.dumazet, matt, netdev, linux-kernel,
	linuxppc-dev
In-Reply-To: <1335753820.20866.27.camel@pasglop>

On Mon, 2012-04-30 at 12:43 +1000, Benjamin Herrenschmidt wrote:

> Ok, he hasn't so I'll dig a bit.
> 
> No obvious wrongness (but I'm not very familiar with bpf), though I do
> have a comment: sk_negative_common() and bpf_slow_path_common() should
> be made one and single macro which takes the fallback function as an
> argument.
> 
> I'll mess around & try to test using Jan test case & will come back
> with an updated patch.

Wow, hit that nasty along the way: The test program will not work
on big endian machines because of a nasty difference between
the kernel struct sock_fprog and libpcap struct bpf_program:

Kernel expects:

struct sock_fprog {     /* Required for SO_ATTACH_FILTER. */
        unsigned short          len;    /* Number of filter blocks */
        struct sock_filter __user *filter;
};

libpcap provides:

struct bpf_program {
        u_int bf_len;
        struct bpf_insn *bf_insns;
};

Note the unsigned short vs. unsigned int there ? This totally
breaks it here.

Is it expected that one can pass a struct bpf_program directly
to the kernel or should it be "converted" by the library in which
case it's just a bug in Jan's test program ?

Cheers,
Ben.

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Jan Seiffert @ 2012-04-30  3:43 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: eric.dumazet, matt, netdev, linux-kernel, linuxppc-dev,
	David Miller
In-Reply-To: <1335753820.20866.27.camel@pasglop>

Benjamin Herrenschmidt schrieb:
> On Wed, 2012-04-04 at 08:11 +1000, Benjamin Herrenschmidt wrote:
>> On Tue, 2012-04-03 at 18:03 -0400, David Miller wrote:
>> 
>>>> Signed-off-by: Jan Seiffert <kaffeemonster@googlemail.com>
>>>> 
>>>> I have only compile tested this, -ENOHARDWARE. Can someone with
>>>> more powerpc kung-fu review and maybe test this? Esp. powerpc
>>>> asm is not my strong point. I think i botched the stack frame
>>>> in the call setup. Help?
>>> 
>>> I'm not applying this until a powerpc person tests it.
>>> 
>>> Also, we have an ARM JIT in the tree which probably needs to be
>>> fixed similarly.
>> 
>> Matt's having a look at powerpc
> 
> Ok, he hasn't so I'll dig a bit.
> 

That would be great Benjamin!

> No obvious wrongness (but I'm not very familiar with bpf),

As long as you know PPC ASM you are my man ;-)

> though I do have a comment: sk_negative_common() and
> bpf_slow_path_common() should be made one and single macro which
> takes the fallback function as an argument.
> 

I don't know if this is possible.
The return value is different (one returns 0 on success, the other != 0,
the return value of != is needed). I didn't wanted to change to much,
because i'm not fluent in ppc.

> I'll mess around & try to test using Jan test case & will come back 
> with an updated patch.
> 

Would be great!

> Cheers, Ben.
> 

Greetings
	Jan

-- 
A UDP packet walks into a

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Benjamin Herrenschmidt @ 2012-04-30  4:11 UTC (permalink / raw)
  To: David Miller
  Cc: kaffeemonster, eric.dumazet, matt, netdev, linux-kernel,
	linuxppc-dev
In-Reply-To: <1335753820.20866.27.camel@pasglop>

On Mon, 2012-04-30 at 12:43 +1000, Benjamin Herrenschmidt wrote:

> > Matt's having a look at powerpc
> 
> Ok, he hasn't so I'll dig a bit.
> 
> No obvious wrongness (but I'm not very familiar with bpf), though I do
> have a comment: sk_negative_common() and bpf_slow_path_common() should
> be made one and single macro which takes the fallback function as an
> argument.

Ok, with the compile fix below it seems to work for me:

(Feel free to fold that into the original patch)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index af1ab5e..5c3cf2d 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -48,7 +48,13 @@
 /*
  * Assembly helpers from arch/powerpc/net/bpf_jit.S:
  */
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
+#define DECLARE_LOAD_FUNC(func)	\
+	extern u8 func[], func##_negative_offset[], func##_positive_offset[]
+
+DECLARE_LOAD_FUNC(sk_load_word);
+DECLARE_LOAD_FUNC(sk_load_half);
+DECLARE_LOAD_FUNC(sk_load_byte);
+DECLARE_LOAD_FUNC(sk_load_byte_msh);
 
 #define FUNCTION_DESCR_SIZE	24
 
Cheers,
Ben.

^ permalink raw reply related

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Jan Seiffert @ 2012-04-30  4:27 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: eric.dumazet, matt, netdev, linux-kernel, linuxppc-dev,
	David Miller
In-Reply-To: <1335759088.20866.32.camel@pasglop>

Benjamin Herrenschmidt schrieb:
> On Mon, 2012-04-30 at 12:43 +1000, Benjamin Herrenschmidt wrote:
> 
>>> Matt's having a look at powerpc
>>
>> Ok, he hasn't so I'll dig a bit.
>>
>> No obvious wrongness (but I'm not very familiar with bpf), though I do
>> have a comment: sk_negative_common() and bpf_slow_path_common() should
>> be made one and single macro which takes the fallback function as an
>> argument.
> 
> Ok, with the compile fix below it seems to work for me:
> 
> (Feel free to fold that into the original patch)
> 

Should i resend the complete patch with the compile fix?

[snip]
>  
> Cheers,
> Ben.
> 


Greetings
	Jan


PS: I am sure i compile tested the orig. patch here, hmmm, must have
lost that part when moving trees, #GitIsNotMyFriend

-- 
en.gin.eer en-ji-nir n 1: a mechanism for converting caffeine into designs.

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Benjamin Herrenschmidt @ 2012-04-30  4:29 UTC (permalink / raw)
  To: kaffeemonster
  Cc: eric.dumazet, matt, netdev, linux-kernel, linuxppc-dev,
	David Miller
In-Reply-To: <4F9E1496.9060603@googlemail.com>

On Mon, 2012-04-30 at 06:27 +0200, Jan Seiffert wrote:
> Benjamin Herrenschmidt schrieb:
> > On Mon, 2012-04-30 at 12:43 +1000, Benjamin Herrenschmidt wrote:
> > 
> >>> Matt's having a look at powerpc
> >>
> >> Ok, he hasn't so I'll dig a bit.
> >>
> >> No obvious wrongness (but I'm not very familiar with bpf), though I do
> >> have a comment: sk_negative_common() and bpf_slow_path_common() should
> >> be made one and single macro which takes the fallback function as an
> >> argument.
> > 
> > Ok, with the compile fix below it seems to work for me:
> > 
> > (Feel free to fold that into the original patch)
> > 
> 
> Should i resend the complete patch with the compile fix?

Won't hurt...

BTW. Any idea about that bpf_program vs. sock_fprog issue I mentioned
earlier ?

Cheers,
Ben.

^ permalink raw reply

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Jan Seiffert @ 2012-04-30  4:43 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: eric.dumazet, matt, netdev, linux-kernel, linuxppc-dev,
	David Miller
In-Reply-To: <1335760199.20866.33.camel@pasglop>

Benjamin Herrenschmidt schrieb:
> On Mon, 2012-04-30 at 06:27 +0200, Jan Seiffert wrote:
>> Benjamin Herrenschmidt schrieb:
>>> On Mon, 2012-04-30 at 12:43 +1000, Benjamin Herrenschmidt wrote:
>>>
>>>>> Matt's having a look at powerpc
>>>>
>>>> Ok, he hasn't so I'll dig a bit.
>>>>
>>>> No obvious wrongness (but I'm not very familiar with bpf), though I do
>>>> have a comment: sk_negative_common() and bpf_slow_path_common() should
>>>> be made one and single macro which takes the fallback function as an
>>>> argument.
>>>
>>> Ok, with the compile fix below it seems to work for me:
>>>
>>> (Feel free to fold that into the original patch)
>>>
>>
>> Should i resend the complete patch with the compile fix?
> 
> Won't hurt...
> 

Ok

> BTW. Any idea about that bpf_program vs. sock_fprog issue I mentioned
> earlier ?
> 

No idea, i was going by the old saying:
"Thou shall not include kernel header, or you will feel the wrath of angry
kernel gurus."

> Cheers,
> Ben.
> 

Greetings
	Jan

-- 
The OO-Hype keeps on spinning, C stays.

^ permalink raw reply

* Re: [PATCH] Disable /dev/port interface on powerpc systems
From: Benjamin Herrenschmidt @ 2012-04-30  5:00 UTC (permalink / raw)
  To: Haren Myneni; +Cc: linuxppc-dev, anton
In-Reply-To: <1332314613.2982.38.camel@pasglop>

On Wed, 2012-03-21 at 18:23 +1100, Benjamin Herrenschmidt wrote:
> On Tue, 2012-03-20 at 22:37 -0700, Haren Myneni wrote:
> > Some power systems do not have legacy ISA devices. So, /dev/port is not
> > a valid interface on these systems. User level tools such as kbdrate is
> > trying to access the device using this interface which is causing the
> > system crash. 
> > 
> > This patch will fix this issue by not creating this interface on these
> > powerpc systems. 
> 
> Doesn't fix 32-bit... not a big deal for now I suppose... But I'd rather
> you change the patch a tiny bit to change legacy_isa_device_found() to
> arch_has_dev_port() instead. There may be other reason than legacy ISA
> to enable dev/port or not so let's make the arch hook more generic.
> 
> Another approach which might be even better is to filter per-access,
> ie for each read/write to /dev/port, have a hook to check if that
> specific port is valid, but that may be overkill for what is
> essentially a legacy interface that should have died a long time ago :)

The more I think about this the less I like it ...

At the end of the day, the bug is in kbdrate (or the distro, whatever)

It's just completely broken to have a bit of userspace running as root
randomly whacking IO ports or MMIO registers based on the assumption
that there must be some kind of ISA kbd controller there... It's going
to break on more than just powerpc (ARM anyone ?).

So kbdrate is busted and needs to be either fixed or removed.

Cheers,
Ben.

^ permalink raw reply

* [REGRESSION][PATCH V5 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Jan Seiffert @ 2012-04-30  5:02 UTC (permalink / raw)
  To: netdev; +Cc: eric.dumazet, matt, linux-kernel, linuxppc-dev, David Miller
In-Reply-To: <1335759088.20866.32.camel@pasglop>

Now the helper function from filter.c for negative offsets is exported,
it can be used it in the jit to handle negative offsets.

First modify the asm load helper functions to handle:
- know positive offsets
- know negative offsets
- any offset

then the compiler can be modified to explicitly use these helper
when appropriate.

This fixes the case of a negative X register and allows to lift
the restriction that bpf programs with negative offsets can't
be jited.

Tested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Jan Seiffert <kaffeemonster@googlemail.com>
---
 arch/powerpc/net/bpf_jit.h      |    8 +++-
 arch/powerpc/net/bpf_jit_64.S   |  108 ++++++++++++++++++++++++++++++++++-----
 arch/powerpc/net/bpf_jit_comp.c |   26 +++------
 3 files changed, 111 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index af1ab5e..5c3cf2d 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -48,7 +48,13 @@
 /*
  * Assembly helpers from arch/powerpc/net/bpf_jit.S:
  */
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
+#define DECLARE_LOAD_FUNC(func)	\
+	extern u8 func[], func##_negative_offset[], func##_positive_offset[]
+
+DECLARE_LOAD_FUNC(sk_load_word);
+DECLARE_LOAD_FUNC(sk_load_half);
+DECLARE_LOAD_FUNC(sk_load_byte);
+DECLARE_LOAD_FUNC(sk_load_byte_msh);
 
 #define FUNCTION_DESCR_SIZE	24
 
diff --git a/arch/powerpc/net/bpf_jit_64.S b/arch/powerpc/net/bpf_jit_64.S
index ff4506e..55ba385 100644
--- a/arch/powerpc/net/bpf_jit_64.S
+++ b/arch/powerpc/net/bpf_jit_64.S
@@ -31,14 +31,13 @@
  * then branch directly to slow_path_XXX if required.  (In fact, could
  * load a spare GPR with the address of slow_path_generic and pass size
  * as an argument, making the call site a mtlr, li and bllr.)
- *
- * Technically, the "is addr < 0" check is unnecessary & slowing down
- * the ABS path, as it's statically checked on generation.
  */
 	.globl	sk_load_word
 sk_load_word:
 	cmpdi	r_addr, 0
-	blt	bpf_error
+	blt	bpf_slow_path_word_neg
+	.globl	sk_load_word_positive_offset
+sk_load_word_positive_offset:
 	/* Are we accessing past headlen? */
 	subi	r_scratch1, r_HL, 4
 	cmpd	r_scratch1, r_addr
@@ -51,7 +50,9 @@ sk_load_word:
 	.globl	sk_load_half
 sk_load_half:
 	cmpdi	r_addr, 0
-	blt	bpf_error
+	blt	bpf_slow_path_half_neg
+	.globl	sk_load_half_positive_offset
+sk_load_half_positive_offset:
 	subi	r_scratch1, r_HL, 2
 	cmpd	r_scratch1, r_addr
 	blt	bpf_slow_path_half
@@ -61,7 +62,9 @@ sk_load_half:
 	.globl	sk_load_byte
 sk_load_byte:
 	cmpdi	r_addr, 0
-	blt	bpf_error
+	blt	bpf_slow_path_byte_neg
+	.globl	sk_load_byte_positive_offset
+sk_load_byte_positive_offset:
 	cmpd	r_HL, r_addr
 	ble	bpf_slow_path_byte
 	lbzx	r_A, r_D, r_addr
@@ -69,22 +72,20 @@ sk_load_byte:
 
 /*
  * BPF_S_LDX_B_MSH: ldxb  4*([offset]&0xf)
- * r_addr is the offset value, already known positive
+ * r_addr is the offset value
  */
 	.globl sk_load_byte_msh
 sk_load_byte_msh:
+	cmpdi	r_addr, 0
+	blt	bpf_slow_path_byte_msh_neg
+	.globl sk_load_byte_msh_positive_offset
+sk_load_byte_msh_positive_offset:
 	cmpd	r_HL, r_addr
 	ble	bpf_slow_path_byte_msh
 	lbzx	r_X, r_D, r_addr
 	rlwinm	r_X, r_X, 2, 32-4-2, 31-2
 	blr
 
-bpf_error:
-	/* Entered with cr0 = lt */
-	li	r3, 0
-	/* Generated code will 'blt epilogue', returning 0. */
-	blr
-
 /* Call out to skb_copy_bits:
  * We'll need to back up our volatile regs first; we have
  * local variable space at r1+(BPF_PPC_STACK_BASIC).
@@ -136,3 +137,84 @@ bpf_slow_path_byte_msh:
 	lbz	r_X, BPF_PPC_STACK_BASIC+(2*8)(r1)
 	rlwinm	r_X, r_X, 2, 32-4-2, 31-2
 	blr
+
+/* Call out to bpf_internal_load_pointer_neg_helper:
+ * We'll need to back up our volatile regs first; we have
+ * local variable space at r1+(BPF_PPC_STACK_BASIC).
+ * Allocate a new stack frame here to remain ABI-compliant in
+ * stashing LR.
+ */
+#define sk_negative_common(SIZE)				\
+	mflr	r0;						\
+	std	r0, 16(r1);					\
+	/* R3 goes in parameter space of caller's frame */	\
+	std	r_skb, (BPF_PPC_STACKFRAME+48)(r1);		\
+	std	r_A, (BPF_PPC_STACK_BASIC+(0*8))(r1);		\
+	std	r_X, (BPF_PPC_STACK_BASIC+(1*8))(r1);		\
+	stdu	r1, -BPF_PPC_SLOWPATH_FRAME(r1);		\
+	/* R3 = r_skb, as passed */				\
+	mr	r4, r_addr;					\
+	li	r5, SIZE;					\
+	bl	bpf_internal_load_pointer_neg_helper;		\
+	/* R3 != 0 on success */				\
+	addi	r1, r1, BPF_PPC_SLOWPATH_FRAME;			\
+	ld	r0, 16(r1);					\
+	ld	r_A, (BPF_PPC_STACK_BASIC+(0*8))(r1);		\
+	ld	r_X, (BPF_PPC_STACK_BASIC+(1*8))(r1);		\
+	mtlr	r0;						\
+	cmpldi	r3, 0;						\
+	beq	bpf_error_slow;	/* cr0 = EQ */			\
+	mr	r_addr, r3;					\
+	ld	r_skb, (BPF_PPC_STACKFRAME+48)(r1);		\
+	/* Great success! */
+
+bpf_slow_path_word_neg:
+	lis     r_scratch1,-32	/* SKF_LL_OFF */
+	cmpd	r_addr, r_scratch1	/* addr < SKF_* */
+	blt	bpf_error	/* cr0 = LT */
+	.globl	sk_load_word_negative_offset
+sk_load_word_negative_offset:
+	sk_negative_common(4)
+	lwz	r_A, 0(r_addr)
+	blr
+
+bpf_slow_path_half_neg:
+	lis     r_scratch1,-32	/* SKF_LL_OFF */
+	cmpd	r_addr, r_scratch1	/* addr < SKF_* */
+	blt	bpf_error	/* cr0 = LT */
+	.globl	sk_load_half_negative_offset
+sk_load_half_negative_offset:
+	sk_negative_common(2)
+	lhz	r_A, 0(r_addr)
+	blr
+
+bpf_slow_path_byte_neg:
+	lis     r_scratch1,-32	/* SKF_LL_OFF */
+	cmpd	r_addr, r_scratch1	/* addr < SKF_* */
+	blt	bpf_error	/* cr0 = LT */
+	.globl	sk_load_byte_negative_offset
+sk_load_byte_negative_offset:
+	sk_negative_common(1)
+	lbz	r_A, 0(r_addr)
+	blr
+
+bpf_slow_path_byte_msh_neg:
+	lis     r_scratch1,-32	/* SKF_LL_OFF */
+	cmpd	r_addr, r_scratch1	/* addr < SKF_* */
+	blt	bpf_error	/* cr0 = LT */
+	.globl	sk_load_byte_msh_negative_offset
+sk_load_byte_msh_negative_offset:
+	sk_negative_common(1)
+	lbz	r_X, 0(r_addr)
+	rlwinm	r_X, r_X, 2, 32-4-2, 31-2
+	blr
+
+bpf_error_slow:
+	/* fabricate a cr0 = lt */
+	li	r_scratch1, -1
+	cmpdi	r_scratch1, 0
+bpf_error:
+	/* Entered with cr0 = lt */
+	li	r3, 0
+	/* Generated code will 'blt epilogue', returning 0. */
+	blr
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 73619d3..2dc8b14 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -127,6 +127,9 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
 	PPC_BLR();
 }
 
+#define CHOOSE_LOAD_FUNC(K, func) \
+	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
+
 /* Assemble the body code between the prologue & epilogue. */
 static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
 			      struct codegen_context *ctx,
@@ -391,21 +394,16 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
 
 			/*** Absolute loads from packet header/data ***/
 		case BPF_S_LD_W_ABS:
-			func = sk_load_word;
+			func = CHOOSE_LOAD_FUNC(K, sk_load_word);
 			goto common_load;
 		case BPF_S_LD_H_ABS:
-			func = sk_load_half;
+			func = CHOOSE_LOAD_FUNC(K, sk_load_half);
 			goto common_load;
 		case BPF_S_LD_B_ABS:
-			func = sk_load_byte;
+			func = CHOOSE_LOAD_FUNC(K, sk_load_byte);
 		common_load:
-			/*
-			 * Load from [K].  Reference with the (negative)
-			 * SKF_NET_OFF/SKF_LL_OFF offsets is unsupported.
-			 */
+			/* Load from [K]. */
 			ctx->seen |= SEEN_DATAREF;
-			if ((int)K < 0)
-				return -ENOTSUPP;
 			PPC_LI64(r_scratch1, func);
 			PPC_MTLR(r_scratch1);
 			PPC_LI32(r_addr, K);
@@ -429,7 +427,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
 		common_load_ind:
 			/*
 			 * Load from [X + K].  Negative offsets are tested for
-			 * in the helper functions, and result in a 'ret 0'.
+			 * in the helper functions.
 			 */
 			ctx->seen |= SEEN_DATAREF | SEEN_XREG;
 			PPC_LI64(r_scratch1, func);
@@ -443,13 +441,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image,
 			break;
 
 		case BPF_S_LDX_B_MSH:
-			/*
-			 * x86 version drops packet (RET 0) when K<0, whereas
-			 * interpreter does allow K<0 (__load_pointer, special
-			 * ancillary data).  common_load returns ENOTSUPP if K<0,
-			 * so we fall back to interpreter & filter works.
-			 */
-			func = sk_load_byte_msh;
+			func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh);
 			goto common_load;
 			break;
 

^ permalink raw reply related

* Re: [REGRESSION][PATCH V4 3/3] bpf jit: Let the powerpc jit handle negative offsets
From: Benjamin Herrenschmidt @ 2012-04-30  5:26 UTC (permalink / raw)
  To: kaffeemonster
  Cc: eric.dumazet, matt, netdev, linux-kernel, linuxppc-dev,
	David Miller
In-Reply-To: <4F9E188E.80503@googlemail.com>


> No idea, i was going by the old saying:
> "Thou shall not include kernel header, or you will feel the wrath of angry
> kernel gurus."

Heh :-)

Well, googling around, it looks like there's a mix of both type of
programs out there. Those using a struct bpf_program and those using a
struct soft_fprog.

Only the latter will work on BE machines as far as I can tell.

David, what's the right way to fix that ?

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH] powerpc: Support lower minimum entitlement for virtual processors
From: Benjamin Herrenschmidt @ 2012-04-30  5:30 UTC (permalink / raw)
  To: Robert Jennings; +Cc: linuxppc-dev
In-Reply-To: <20120323212213.GB9456@linux.vnet.ibm.com>

On Fri, 2012-03-23 at 16:22 -0500, Robert Jennings wrote:
> This patch changes the architecture vector to advertise support for a
> lower minimum virtual processor entitled capacity.  The default
> minimum without this patch is 10%, this patch specifies 5%.  This will
> allow 20 LPARs per CPU rather than only 10.

Any reason why we don't just put 1% in there and thus don't have to
change again when pHyp decides to lower their minimum ? :-)

(Can you test that it works, ie, that pHyp doesn't barf if we put 1% and
properly uses 5% in that case ?)

Cheers,
Ben.

> Signed-off-by: Robert Jennings <rcj@linux.vnet.ibm.com>
> ---
>  arch/powerpc/kernel/prom_init.c |    8 ++++++--
>  1 files changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
> index eca626e..3d882ea 100644
> --- a/arch/powerpc/kernel/prom_init.c
> +++ b/arch/powerpc/kernel/prom_init.c
> @@ -689,6 +689,9 @@ static void __init early_cmdline_parse(void)
>  #define OV3_VMX			0x40	/* VMX/Altivec */
>  #define OV3_DFP			0x20	/* decimal FP */
>  
> +/* Option vector 4: IBM PAPR implementation */
> +#define OV4_MIN_ENT_CAP		0x05	/* minimum VP entitled capacity */
> +
>  /* Option vector 5: PAPR/OF options supported */
>  #define OV5_LPAR		0x80	/* logical partitioning supported */
>  #define OV5_SPLPAR		0x40	/* shared-processor LPAR supported */
> @@ -753,8 +756,9 @@ static unsigned char ibm_architecture_vec[] = {
>  	OV3_FP | OV3_VMX | OV3_DFP,
>  
>  	/* option vector 4: IBM PAPR implementation */
> -	2 - 2,				/* length */
> +	3 - 2,				/* length */
>  	0,				/* don't halt */
> +	OV4_MIN_ENT_CAP,		/* minimum VP entitled capacity */
>  
>  	/* option vector 5: PAPR/OF options */
>  	13 - 2,				/* length */
> @@ -771,7 +775,7 @@ static unsigned char ibm_architecture_vec[] = {
>  	 * must match by the macro below. Update the definition if
>  	 * the structure layout changes.
>  	 */
> -#define IBM_ARCH_VEC_NRCORES_OFFSET	100
> +#define IBM_ARCH_VEC_NRCORES_OFFSET	101
>  	W(NR_CPUS),			/* number of cores supported */
>  
>  	/* option vector 6: IBM PAPR hints */

^ permalink raw reply

* Re: [PATCH 3/4] powerpc: Add 64-bit CPU targets for gcc
From: Benjamin Herrenschmidt @ 2012-04-30  5:57 UTC (permalink / raw)
  To: Kumar Gala; +Cc: linuxppc-dev, paulus, Anton Blanchard
In-Reply-To: <7A78BE77-1D97-4E8D-B8F3-026E2B3B2EEA@kernel.crashing.org>

On Wed, 2012-04-18 at 09:33 -0500, Kumar Gala wrote:
> On Apr 17, 2012, at 11:45 PM, Anton Blanchard wrote:
> 
> > 
> > Add a menu to select various 64-bit CPU targets for gcc. We
> > default to -mtune=power7 and if gcc doesn't understand that we
> > fallback to -mtune=power4.
> > 
> > Signed-off-by: Anton Blanchard <anton@samba.org>
> > ---
> 
> Can you add a target for e5500 cpu.

I'm going to put Anton patch in, can you send an add-on for e5500 ?

Cheers,
Ben.

> - k
> 
> > 
> > Index: linux-build/arch/powerpc/Makefile
> > ===================================================================
> > --- linux-build.orig/arch/powerpc/Makefile	2012-04-18 14:31:31.614666419 +1000
> > +++ linux-build/arch/powerpc/Makefile	2012-04-18 14:37:08.680708678 +1000
> > @@ -69,6 +69,16 @@ LDFLAGS_vmlinux	:= $(LDFLAGS_vmlinux-y)
> > 
> > CFLAGS-$(CONFIG_PPC64)	:= -mminimal-toc -mtraceback=no -mcall-aixdesc
> > CFLAGS-$(CONFIG_PPC32)	:= -ffixed-r2 -mmultiple
> > +
> > +CFLAGS-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=power7,-mtune=power4)
> > +CFLAGS-$(CONFIG_CELL_CPU) += $(call cc-option,-mcpu=cell)
> > +CFLAGS-$(CONFIG_POWER4_CPU) += $(call cc-option,-mcpu=power4)
> > +CFLAGS-$(CONFIG_POWER5_CPU) += $(call cc-option,-mcpu=power5)
> > +CFLAGS-$(CONFIG_POWER6_CPU) += $(call cc-option,-mcpu=power6)
> > +CFLAGS-$(CONFIG_POWER7_CPU) += $(call cc-option,-mcpu=power7)
> > +
> > +CFLAGS-$(CONFIG_TUNE_CELL) += $(call cc-option,-mtune=cell)
> > +
> > KBUILD_CPPFLAGS	+= -Iarch/$(ARCH)
> > KBUILD_AFLAGS	+= -Iarch/$(ARCH)
> > KBUILD_CFLAGS	+= -msoft-float -pipe -Iarch/$(ARCH) $(CFLAGS-y)
> > @@ -76,22 +86,11 @@ CPP		= $(CC) -E $(KBUILD_CFLAGS)
> > 
> > CHECKFLAGS	+= -m$(CONFIG_WORD_SIZE) -D__powerpc__ -D__powerpc$(CONFIG_WORD_SIZE)__
> > 
> > -ifeq ($(CONFIG_PPC64),y)
> > -ifeq ($(CONFIG_POWER4_ONLY),y)
> > -	KBUILD_CFLAGS += $(call cc-option,-mcpu=power4)
> > -else
> > -	KBUILD_CFLAGS += $(call cc-option,-mtune=power4)
> > -endif
> > -endif
> > -
> > KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
> > 
> > -ifeq ($(CONFIG_TUNE_CELL),y)
> > -	KBUILD_CFLAGS += $(call cc-option,-mtune=cell)
> > -endif
> > -
> > -# No AltiVec instruction when building kernel
> > +# No AltiVec or VSX instructions when building kernel
> > KBUILD_CFLAGS += $(call cc-option,-mno-altivec)
> > +KBUILD_CFLAGS += $(call cc-option,-mno-vsx)
> > 
> > # No SPE instruction when building kernel
> > # (We use all available options to help semi-broken compilers)
> > Index: linux-build/arch/powerpc/platforms/Kconfig.cputype
> > ===================================================================
> > --- linux-build.orig/arch/powerpc/platforms/Kconfig.cputype	2012-04-18 14:31:25.134549903 +1000
> > +++ linux-build/arch/powerpc/platforms/Kconfig.cputype	2012-04-18 14:36:40.576207829 +1000
> > @@ -78,6 +78,36 @@ config PPC_BOOK3E_64
> > 
> > endchoice
> > 
> > +choice
> > +	prompt "CPU selection"
> > +	depends on PPC64
> > +	default GENERIC_CPU
> > +	help
> > +	  This will create a kernel which is optimised for a particular CPU.
> > +	  The resulting kernel may not run on other CPUs, so use this with care.
> > +
> > +	  If unsure, select Generic.
> > +
> > +config GENERIC_CPU
> > +	bool "Generic"
> > +
> > +config CELL_CPU
> > +	bool "Cell Broadband Engine"
> > +
> > +config POWER4_CPU
> > +	bool "POWER4"
> > +
> > +config POWER5_CPU
> > +	bool "POWER5"
> > +
> > +config POWER6_CPU
> > +	bool "POWER6"
> > +
> > +config POWER7_CPU
> > +	bool "POWER7"
> > +
> > +endchoice
> > +
> > config PPC_BOOK3S
> > 	def_bool y
> > 	depends on PPC_BOOK3S_32 || PPC_BOOK3S_64
> > _______________________________________________
> > Linuxppc-dev mailing list
> > Linuxppc-dev@lists.ozlabs.org
> > https://lists.ozlabs.org/listinfo/linuxppc-dev

^ permalink raw reply

* Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
From: Borislav Petkov @ 2012-04-30  7:59 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: Shaohui Xie, Jason Uhlenkott, Aristeu Rozanski, Hitoshi Mitake,
	Mark Gross, Dmitry Eremin-Solenikov, Ranganathan Desikan,
	Egor Martovetsky, Niklas Söderlund, Tim Small, Arvind R.,
	Chris Metcalf, Olof Johansson, Doug Thompson,
	Linux Edac Mailing List, Michal Marek, Jiri Kosina,
	Linux Kernel Mailing List, Joe Perches, Andrew Morton,
	linuxppc-dev
In-Reply-To: <4F9D4D55.9070705@redhat.com>

On Sun, Apr 29, 2012 at 11:16:53AM -0300, Mauro Carvalho Chehab wrote:
> > Hey, are you looking at compiled code or at source code? Because I'm
> > looking at source code, and it is a pretty safe bet the majority of the
> > people here do that too.
> 
> What I said is that, from source code POV, a code where the loop variables are
> initialized just before the loop is easier to read it when the initialization
> of those vars are on another part of the code.
> 
> That's basically why the "for" syntax starts with a var initialization clause.
> 
> The tot_dimms & friends are loop vars: their value is calculated within the loop.
> 
> At the object code, this won't bring any difference.
> 
> > 
> >> it, either by using registers for those vars or by moving the initialization
> >> to the top of the function.
> >>
> >> This function is too complex, so it is better to initialize those vars
> >> just before the loops that are calculating those totals.
> > 
> > Simply initialize those variables at declaration time and that's it.
> > Initializing them before the loop doesn't make the function less complex
> > - splitting it and sanitizing it does.
> 
> Initializing loop-calculated vars just before the loop makes the code easier
> to read, and may avoid issues that might happen during code lifecycle.

This is getting ridiculous: the variable declaration and initialization
are on the same screen as the loop (unless one uses a screen which can
only show less than 40ish lines).

So the argument about making the code easier to read is bogus.

This function is already cluttered with a lot of crap, and is very large
so adding more lines which can simply be stashed away at declaration
time is better readability.

Besides, every modern editor can jump to the declaration of a local
variable so that the user can see to what it is initialized to.

> +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
> +                                    unsigned n_layers,
> +                                    struct edac_mc_layer *layers,
> +                                    bool rev_order,
> +                                    unsigned sz_pvt)
>  {
>       void *ptr = NULL;
>       struct mem_ctl_info *mci;
> -     struct csrow_info *csi, *csrow;
> +     struct edac_mc_layer *layer;
> +     struct csrow_info *csi, *csr;
>       struct rank_info *chi, *chp, *chan;
>       struct dimm_info *dimm;
> +     u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
>       void *pvt;
> -     unsigned size;
> -     int row, chn;
> +     unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS];
> +     unsigned tot_csrows, tot_channels, tot_errcount = 0;
> +     int i, j;
>       int err;
> +     int row, chn;
> +     bool per_rank = false;
> +
> +     BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
> +     /*
> +      * Calculate the total amount of dimms and csrows/cschannels while
> +      * in the old API emulation mode
> +      */
> +     tot_dimms = 1;
> +     tot_channels = 1;
> +     tot_csrows = 1;
> +     for (i = 0; i < n_layers; i++) {
> +             tot_dimms *= layers[i].size;
> +             if (layers[i].is_virt_csrow)
> +                     tot_csrows *= layers[i].size;
> +             else
> +                     tot_channels *= layers[i].size;
> +
> +             if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
> +                     per_rank = true;


-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply

* Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
From: Borislav Petkov @ 2012-04-30  8:15 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: Shaohui Xie, Jason Uhlenkott, Aristeu Rozanski, Hitoshi Mitake,
	Mark Gross, Dmitry Eremin-Solenikov, Ranganathan Desikan,
	Egor Martovetsky, Niklas Söderlund, Tim Small, Arvind R.,
	Chris Metcalf, Olof Johansson, Doug Thompson,
	Linux Edac Mailing List, Michal Marek, Jiri Kosina,
	Linux Kernel Mailing List, Joe Perches, Andrew Morton,
	linuxppc-dev
In-Reply-To: <4F9D46F8.1020104@redhat.com>

On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote:
> > [   10.486440] EDAC MC: DCT0 chip selects:
> > [   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
> > [   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
> > [   10.486448] EDAC amd64: MC: 4:     0MB 5:     0MB
> > [   10.486450] EDAC amd64: MC: 6:     0MB 7:     0MB
> > [   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088
> > [   10.486455] EDAC MC: DCT1 chip selects:
> > [   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
> > [   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
> > [   10.486463] EDAC amd64: MC: 4:     0MB 5:     0MB
> > [   10.486465] EDAC amd64: MC: 6:     0MB 7:     0MB
> > [   10.486467] EDAC amd64: using x8 syndromes.
> > [   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100
> > [   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; all DIMMs support ECC: yes
> > [   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
> > [   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 64b
> > [   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no
> > [   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding
> > [   10.486488] EDAC amd64: MCT channel count: 2
> > [   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
> > [   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0
> > [   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1
> > [   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0
> > [   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1
> > [   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0
> > [   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1
> > [   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0
> > [   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1
> > [   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0
> > [   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1
> > [   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0
> > [   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1
> > [   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0
> > [   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1
> > [   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0
> > [   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1
> > 
> > DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.
> > 
> > Now your change is showing 16 ranks. Still b0rked.
> > 
> No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK.
> 
> As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc
> doesn't know how many ranks are filled, as the driver logic first calls it to 
> allocate for the max amount of ranks, and then fills the rank with their info 
> (or let them untouched with 0 pages, if they're empty).

Basically you're saying you're generating dimm_info structs for all
_possible_ dimms and the loop where this debug message comes from goes
and marrily initializes them all although some of them are empty:

+       for (i = 0; i < tot_dimms; i++) {
+               chan = &csi[row].channels[chn];
+               dimm = EDAC_DIMM_PTR(lay, mci->dimms, n_layers,
+                              pos[0], pos[1], pos[2]);
+               dimm->mci = mci;
+
+               debugf2("%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n", __func__,
+                       i, (dimm - mci->dimms),
+                       pos[0], pos[1], pos[2], row, chn);
+
+               /* Copy DIMM location */
+               for (j = 0; j < n_layers; j++)
+                       dimm->location[j] = pos[j];
...

definitely superfluous.

Oh well, looking at edac_mc_alloc, it used to allocate structs for all
csrows on the controller even though some of them were empty...

Ok, then please remove this debug call because it is misleading. Having

[   10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)

is enough.

You probably want to say how many channels/csrows there are, though:

[   10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 8 csrows, 2 channels)

or something similar. Simply dump tot_dimms, tot_channels and tot_csrows
and that's it.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply

* Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
From: Mauro Carvalho Chehab @ 2012-04-30 10:58 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Shaohui Xie, Jason Uhlenkott, Aristeu Rozanski, Hitoshi Mitake,
	Mark Gross, Dmitry Eremin-Solenikov, Ranganathan Desikan,
	Egor Martovetsky, Niklas Söderlund, Tim Small, Arvind R.,
	Chris Metcalf, Olof Johansson, Doug Thompson,
	Linux Edac Mailing List, Michal Marek, Jiri Kosina,
	Linux Kernel Mailing List, Joe Perches, Andrew Morton,
	linuxppc-dev
In-Reply-To: <20120430081513.GD8182@aftab.osrc.amd.com>

Em 30-04-2012 05:15, Borislav Petkov escreveu:
> On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote:
>>> [   10.486440] EDAC MC: DCT0 chip selects:
>>> [   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
>>> [   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
>>> [   10.486448] EDAC amd64: MC: 4:     0MB 5:     0MB
>>> [   10.486450] EDAC amd64: MC: 6:     0MB 7:     0MB
>>> [   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088
>>> [   10.486455] EDAC MC: DCT1 chip selects:
>>> [   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
>>> [   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
>>> [   10.486463] EDAC amd64: MC: 4:     0MB 5:     0MB
>>> [   10.486465] EDAC amd64: MC: 6:     0MB 7:     0MB
>>> [   10.486467] EDAC amd64: using x8 syndromes.
>>> [   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100
>>> [   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; all DIMMs support ECC: yes
>>> [   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
>>> [   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 64b
>>> [   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no
>>> [   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding
>>> [   10.486488] EDAC amd64: MCT channel count: 2
>>> [   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
>>> [   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0
>>> [   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1
>>> [   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0
>>> [   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1
>>> [   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0
>>> [   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1
>>> [   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0
>>> [   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1
>>> [   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0
>>> [   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1
>>> [   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0
>>> [   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1
>>> [   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0
>>> [   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1
>>> [   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0
>>> [   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1
>>>
>>> DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.
>>>
>>> Now your change is showing 16 ranks. Still b0rked.
>>>
>> No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK.
>>
>> As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc
>> doesn't know how many ranks are filled, as the driver logic first calls it to 
>> allocate for the max amount of ranks, and then fills the rank with their info 
>> (or let them untouched with 0 pages, if they're empty).
> 
> Basically you're saying you're generating dimm_info structs for all
> _possible_ dimms and the loop where this debug message comes from goes
> and marrily initializes them all although some of them are empty:
> 
> +       for (i = 0; i < tot_dimms; i++) {
> +               chan = &csi[row].channels[chn];
> +               dimm = EDAC_DIMM_PTR(lay, mci->dimms, n_layers,
> +                              pos[0], pos[1], pos[2]);
> +               dimm->mci = mci;
> +
> +               debugf2("%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n", __func__,
> +                       i, (dimm - mci->dimms),
> +                       pos[0], pos[1], pos[2], row, chn);
> +
> +               /* Copy DIMM location */
> +               for (j = 0; j < n_layers; j++)
> +                       dimm->location[j] = pos[j];
> ...
> 
> definitely superfluous.
> 
> Oh well, looking at edac_mc_alloc, it used to allocate structs for all
> csrows on the controller even though some of them were empty...
> 
> Ok, then please remove this debug call because it is misleading. Having
> 
> [   10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
> 
> is enough.
> 
> You probably want to say how many channels/csrows there are, though:
> 
> [   10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 8 csrows, 2 channels)
> 
> or something similar. Simply dump tot_dimms, tot_channels and tot_csrows
> and that's it.
> 

It seems you have a very short memory. We had a similar discussion about that a while ago:
	https://lkml.org/lkml/2012/3/8/440

See my comments at:
	https://lkml.org/lkml/2012/3/9/101
	https://lkml.org/lkml/2012/3/9/267

As it was explained there, those debug messages provide a map between the legacy per-csrow
data, used by the old API and the dimm_info representation. For a per-csrow memory controller,
the map is trivial, as the memory location will match the csrow/channel information, but
for modern memory controllers, the map info is not trivial and it helps to check what it is
expected to be found when retrieving information via the legacy EDAC API.

For example, this is the mapping used by the second memory controller of the SB machine
I'm using on my tests:

[52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2)
...
[52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels)
[52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms
[52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0
[52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1
[52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2
[52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3
[52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0
[52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1
[52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2
[52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3
[52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0
[52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1
[52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2
[52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3

With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is
called "dimm11" at the new API, and corresponds to "csrow 2, channel 3" for a legacy
EDAC API call.

Regards,
Mauro

^ permalink raw reply

* Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
From: Borislav Petkov @ 2012-04-30 11:11 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: Shaohui Xie, Jason Uhlenkott, Aristeu Rozanski, Hitoshi Mitake,
	Mark Gross, Dmitry Eremin-Solenikov, Ranganathan Desikan,
	Egor Martovetsky, Niklas Söderlund, Tim Small, Arvind R.,
	Chris Metcalf, Olof Johansson, Doug Thompson,
	Linux Edac Mailing List, Michal Marek, Jiri Kosina,
	Linux Kernel Mailing List, Joe Perches, Andrew Morton,
	linuxppc-dev
In-Reply-To: <4F9E7059.5070804@redhat.com>

On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:
> It seems you have a very short memory.

Oh, puh-lease, let's don't start with the insults now. You're not a
saint yourself. And maybe the fact that I'm having hard time grasping
your code is maybe because it is a load of crap and you seem to generate
a lot of senseless drivel when explaining what it does. And don't get me
started on the patch bombs.

So, let's stay constructive here before I, as the last and only one
person reviewing this stinking pile stops messing with it (I got other
stuff to do, you know) and NACK it completely.

> For example, this is the mapping used by the second memory controller of the SB machine
> I'm using on my tests:
> 
> [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2)
> ...
> [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels)
> [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms
> [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0
> [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1
> [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2
> [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3
> [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0
> [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1
> [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2
> [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3
> [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0
> [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1
> [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2
> [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3
> 
> With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is
> called "dimm11" at the new API, and corresponds to "csrow 2, channel 3" for a legacy
> EDAC API call.

Are all those DIMM slots above populated? What happens if they're not,
are you issuing the same dimm0-dimm11 lines for slots which aren't even
populated?

I have a much better idea: Generally, this debug info should come from
the specific driver that allocates the dimm descriptors, not from the
EDAC core. This way, you know in the driver which slots are populated
and those which are not should be omitted.

This way it says "initializing 12 dimms" and the user thinks there are
12 DIMMs on his system where this might not be true.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply

* Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
From: Mauro Carvalho Chehab @ 2012-04-30 11:23 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Shaohui Xie, Jason Uhlenkott, Aristeu Rozanski, Hitoshi Mitake,
	Mark Gross, Dmitry Eremin-Solenikov, Ranganathan Desikan,
	Egor Martovetsky, Niklas Söderlund, Tim Small, Arvind R.,
	Chris Metcalf, Olof Johansson, Doug Thompson,
	Linux Edac Mailing List, Michal Marek, Jiri Kosina,
	Linux Kernel Mailing List, Joe Perches, Andrew Morton,
	linuxppc-dev
In-Reply-To: <20120430075940.GC8182@aftab.osrc.amd.com>

Em 30-04-2012 04:59, Borislav Petkov escreveu:
> On Sun, Apr 29, 2012 at 11:16:53AM -0300, Mauro Carvalho Chehab wrote:
>>> Hey, are you looking at compiled code or at source code? Because I'm
>>> looking at source code, and it is a pretty safe bet the majority of the
>>> people here do that too.
>>
>> What I said is that, from source code POV, a code where the loop variables are
>> initialized just before the loop is easier to read it when the initialization
>> of those vars are on another part of the code.
>>
>> That's basically why the "for" syntax starts with a var initialization clause.
>>
>> The tot_dimms & friends are loop vars: their value is calculated within the loop.
>>
>> At the object code, this won't bring any difference.
>>
>>>
>>>> it, either by using registers for those vars or by moving the initialization
>>>> to the top of the function.
>>>>
>>>> This function is too complex, so it is better to initialize those vars
>>>> just before the loops that are calculating those totals.
>>>
>>> Simply initialize those variables at declaration time and that's it.
>>> Initializing them before the loop doesn't make the function less complex
>>> - splitting it and sanitizing it does.
>>
>> Initializing loop-calculated vars just before the loop makes the code easier
>> to read, and may avoid issues that might happen during code lifecycle.
> 
> This is getting ridiculous:

With this I fully agree: you're nacking patches because it is not the way you
write your code, not because the code there is doing anything wrong.

If you point anything wrong on the way I wrote, then I'll fix. Otherwise, why
should I do a change that will obfuscate the code?

> the variable declaration and initialization
> are on the same screen as the loop (unless one uses a screen which can
> only show less than 40ish lines).
> 
> So the argument about making the code easier to read is bogus.
> 
> This function is already cluttered with a lot of crap, and is very large
> so adding more lines which can simply be stashed away at declaration
> time is better readability.
> 
> Besides, every modern editor can jump to the declaration of a local
> variable so that the user can see to what it is initialized to.

The editor used by te developer is not relevant. This is not a reason
to obfuscate the code.

>> +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
>> +                                    unsigned n_layers,
>> +                                    struct edac_mc_layer *layers,
>> +                                    bool rev_order,
>> +                                    unsigned sz_pvt)
>>  {
>>       void *ptr = NULL;
>>       struct mem_ctl_info *mci;
>> -     struct csrow_info *csi, *csrow;
>> +     struct edac_mc_layer *layer;
>> +     struct csrow_info *csi, *csr;
>>       struct rank_info *chi, *chp, *chan;
>>       struct dimm_info *dimm;
>> +     u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
>>       void *pvt;
>> -     unsigned size;
>> -     int row, chn;
>> +     unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS];
>> +     unsigned tot_csrows, tot_channels, tot_errcount = 0;
>> +     int i, j;
>>       int err;
>> +     int row, chn;
>> +     bool per_rank = false;
>> +
>> +     BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
>> +     /*
>> +      * Calculate the total amount of dimms and csrows/cschannels while
>> +      * in the old API emulation mode
>> +      */
>> +     tot_dimms = 1;
>> +     tot_channels = 1;
>> +     tot_csrows = 1;
>> +     for (i = 0; i < n_layers; i++) {
>> +             tot_dimms *= layers[i].size;
>> +             if (layers[i].is_virt_csrow)
>> +                     tot_csrows *= layers[i].size;
>> +             else
>> +                     tot_channels *= layers[i].size;
>> +
>> +             if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
>> +                     per_rank = true;

Regards,
Mauro

^ permalink raw reply

* Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
From: Mauro Carvalho Chehab @ 2012-04-30 11:37 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Shaohui Xie, Jason Uhlenkott, Aristeu Rozanski, Hitoshi Mitake,
	Mark Gross, Dmitry Eremin-Solenikov, Ranganathan Desikan,
	Egor Martovetsky, Niklas Söderlund, Tim Small, Arvind R.,
	Chris Metcalf, Olof Johansson, Doug Thompson,
	Linux Edac Mailing List, Michal Marek, Jiri Kosina,
	Linux Kernel Mailing List, Joe Perches, Andrew Morton,
	linuxppc-dev
In-Reply-To: <20120430081513.GD8182@aftab.osrc.amd.com>

Em 30-04-2012 05:15, Borislav Petkov escreveu:
> On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote:
>>> [   10.486440] EDAC MC: DCT0 chip selects:
>>> [   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
>>> [   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
>>> [   10.486448] EDAC amd64: MC: 4:     0MB 5:     0MB
>>> [   10.486450] EDAC amd64: MC: 6:     0MB 7:     0MB
>>> [   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x00000088
>>> [   10.486455] EDAC MC: DCT1 chip selects:
>>> [   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
>>> [   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
>>> [   10.486463] EDAC amd64: MC: 4:     0MB 5:     0MB
>>> [   10.486465] EDAC amd64: MC: 6:     0MB 7:     0MB
>>> [   10.486467] EDAC amd64: using x8 syndromes.
>>> [   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100
>>> [   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; all DIMMs support ECC: yes
>>> [   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
>>> [   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 64b
>>> [   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no
>>> [   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding
>>> [   10.486488] EDAC amd64: MCT channel count: 2
>>> [   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
>>> [   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0
>>> [   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1
>>> [   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0
>>> [   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1
>>> [   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0
>>> [   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1
>>> [   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0
>>> [   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1
>>> [   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0
>>> [   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1
>>> [   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0
>>> [   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1
>>> [   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0
>>> [   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1
>>> [   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0
>>> [   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1
>>>
>>> DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.
>>>
>>> Now your change is showing 16 ranks. Still b0rked.
>>>
>> No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK.
>>
>> As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc
>> doesn't know how many ranks are filled, as the driver logic first calls it to 
>> allocate for the max amount of ranks, and then fills the rank with their info 
>> (or let them untouched with 0 pages, if they're empty).
> 
> Basically you're saying you're generating dimm_info structs for all
> _possible_ dimms and the loop where this debug message comes from goes
> and marrily initializes them all although some of them are empty:
> 
> +       for (i = 0; i < tot_dimms; i++) {
> +               chan = &csi[row].channels[chn];
> +               dimm = EDAC_DIMM_PTR(lay, mci->dimms, n_layers,
> +                              pos[0], pos[1], pos[2]);
> +               dimm->mci = mci;
> +
> +               debugf2("%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n", __func__,
> +                       i, (dimm - mci->dimms),
> +                       pos[0], pos[1], pos[2], row, chn);
> +
> +               /* Copy DIMM location */
> +               for (j = 0; j < n_layers; j++)
> +                       dimm->location[j] = pos[j];
> ...
> 
> definitely superfluous.

This is the way the EDAC core works: everything is allocated, on one shot, when this
function is called, and, on most drivers, before the code that probes how many DIMMS/ranks
got initialized. That happens because the edac_mc_alloc() arguments provide the total
amount of ranks/dimms, but doesn't say anything about what is used there.

Changing from this model to another model that would dynamically initialize the per-dimm/rank
data is possible, but that would require another set of patches that will touch on all
drivers, and to convert the edac_mc_alloc function into 3 or 4 function calls, with the
corresponding changes on all drivers. Also, the changes at the drivers won't likely be
trivial.

The patches that convert kobj into "struct device" does part of the job, as, after it,
each dimm/csrow will be allocated by a separate kmalloc.

After having this series fully applied, it would be possible to work on such solution.
I'll eventually do that, as this would simplify the code at i7core_edac and sb_edac.

Regards,
Mauro

^ permalink raw reply

* Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
From: Mauro Carvalho Chehab @ 2012-04-30 11:45 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Shaohui Xie, Jason Uhlenkott, Aristeu Rozanski, Hitoshi Mitake,
	Mark Gross, Dmitry Eremin-Solenikov, Ranganathan Desikan,
	Egor Martovetsky, Niklas Söderlund, Tim Small, Arvind R.,
	Chris Metcalf, Olof Johansson, Doug Thompson,
	Linux Edac Mailing List, Michal Marek, Jiri Kosina,
	Linux Kernel Mailing List, Joe Perches, Andrew Morton,
	linuxppc-dev
In-Reply-To: <20120430111126.GD9303@aftab.osrc.amd.com>

Em 30-04-2012 08:11, Borislav Petkov escreveu:
> On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:

>> For example, this is the mapping used by the second memory controller of the SB machine
>> I'm using on my tests:
>>
>> [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2)
>> ...
>> [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels)
>> [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms
>> [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0
>> [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1
>> [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2
>> [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3
>> [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0
>> [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1
>> [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2
>> [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3
>> [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0
>> [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1
>> [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2
>> [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3
>>
>> With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is
>> called "dimm11" at the new API, and corresponds to "csrow 2, channel 3" for a legacy
>> EDAC API call.
> 
> Are all those DIMM slots above populated? What happens if they're not,
> are you issuing the same dimm0-dimm11 lines for slots which aren't even
> populated?
> 
> I have a much better idea: Generally, this debug info should come from
> the specific driver that allocates the dimm descriptors, not from the
> EDAC core. This way, you know in the driver which slots are populated
> and those which are not should be omitted.

The drivers don't allocate the dimm descriptors. They're allocated by the
core.

> This way it says "initializing 12 dimms" and the user thinks there are
> 12 DIMMs on his system where this might not be true.


I'm OK to remove the "initializing 12 dimms" message. It doesn't add anything
new.

With regards do the other messages, if the debug messages are not clear, 
then let's fix them, instead of removing. What if we print, instead,
on a message like:

	"row 1, chan 1 will represent dimm5 (1:2:0) if not empty"

Regards,
Mauro

^ permalink raw reply

* Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
From: Borislav Petkov @ 2012-04-30 12:38 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: Shaohui Xie, Jason Uhlenkott, Aristeu Rozanski, Hitoshi Mitake,
	Mark Gross, Dmitry Eremin-Solenikov, Ranganathan Desikan,
	Egor Martovetsky, Niklas Söderlund, Tim Small, Arvind R.,
	Chris Metcalf, Olof Johansson, Doug Thompson,
	Linux Edac Mailing List, Michal Marek, Jiri Kosina,
	Linux Kernel Mailing List, Joe Perches, Andrew Morton,
	linuxppc-dev
In-Reply-To: <4F9E7B45.8010908@redhat.com>

On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote:
> Em 30-04-2012 08:11, Borislav Petkov escreveu:
> > On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:
> 
> >> For example, this is the mapping used by the second memory controller of the SB machine
> >> I'm using on my tests:
> >>
> >> [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2)
> >> ...
> >> [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels)
> >> [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms
> >> [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0
> >> [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1
> >> [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2
> >> [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3
> >> [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0
> >> [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1
> >> [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2
> >> [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3
> >> [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0
> >> [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1
> >> [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2
> >> [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3
> >>
> >> With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is
> >> called "dimm11" at the new API, and corresponds to "csrow 2, channel 3" for a legacy
> >> EDAC API call.
> > 
> > Are all those DIMM slots above populated? What happens if they're not,
> > are you issuing the same dimm0-dimm11 lines for slots which aren't even
> > populated?
> > 
> > I have a much better idea: Generally, this debug info should come from
> > the specific driver that allocates the dimm descriptors, not from the
> > EDAC core. This way, you know in the driver which slots are populated
> > and those which are not should be omitted.
> 
> The drivers don't allocate the dimm descriptors. They're allocated by the
> core.

I know that. The drivers call into EDAC core using edac_mc_alloc, this
is what I meant above.

> > This way it says "initializing 12 dimms" and the user thinks there are
> > 12 DIMMs on his system where this might not be true.
> 
> 
> I'm OK to remove the "initializing 12 dimms" message. It doesn't add anything
> new.
> 
> With regards do the other messages, if the debug messages are not clear, 
> then let's fix them, instead of removing. What if we print, instead,
> on a message like:
> 
> 	"row 1, chan 1 will represent dimm5 (1:2:0) if not empty"

How about the following instead: the specific driver calls
edac_mc_alloc(), it gets the allocated dimm array in mci->dimms
_without_ dumping each dimm%d line. Then, each driver figures out which
subset of that dimms array actually has populated slots and prints only
the populated rank/slot/...

This information is much more valuable than saying how many _possible_
slots the edac core has allocated.

Then, each driver can decide whether it makes sense to dump that info or
not.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply

* Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
From: Borislav Petkov @ 2012-04-30 12:51 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: Shaohui Xie, Jason Uhlenkott, Aristeu Rozanski, Hitoshi Mitake,
	Mark Gross, Dmitry Eremin-Solenikov, Ranganathan Desikan,
	Egor Martovetsky, Niklas Söderlund, Tim Small, Arvind R.,
	Chris Metcalf, Olof Johansson, Doug Thompson,
	Linux Edac Mailing List, Michal Marek, Jiri Kosina,
	Linux Kernel Mailing List, Joe Perches, Andrew Morton,
	linuxppc-dev
In-Reply-To: <4F9E763E.2050808@redhat.com>

On Mon, Apr 30, 2012 at 08:23:42AM -0300, Mauro Carvalho Chehab wrote:
> With this I fully agree: you're nacking patches because it is not the way you

Where? Have I written Nacked-by somewhere?

> write your code, not because the code there is doing anything wrong.
> 
> If you point anything wrong on the way I wrote, then I'll fix. Otherwise, why
> should I do a change that will obfuscate the code?

What obfuscation are you talking about? Having the initialization of
variables along with their declaration is not it.

Now let's look what you're doing:

> >> +     unsigned tot_csrows, tot_channels, tot_errcount = 0;
> >> +     unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS];

just to reassign 1 to some of them

> >> +     tot_dimms = 1;
> >> +     tot_channels = 1;
> >> +     tot_csrows = 1;

a couple of lines below.

Now this is misleading.

Now let's look at what I'm proposing

	unsigned tot_dimms = 1;
	unsigned tot_csrows = 1;
	unsigned tot_channels = 1;

How is this an obfuscation? It is basic code layout practices.

> The editor used by te developer is not relevant. This is not a reason
> to obfuscate the code.
> 
> >> +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
> >> +                                    unsigned n_layers,
> >> +                                    struct edac_mc_layer *layers,
> >> +                                    bool rev_order,
> >> +                                    unsigned sz_pvt)
> >>  {
> >>       void *ptr = NULL;
> >>       struct mem_ctl_info *mci;
> >> -     struct csrow_info *csi, *csrow;
> >> +     struct edac_mc_layer *layer;
> >> +     struct csrow_info *csi, *csr;
> >>       struct rank_info *chi, *chp, *chan;
> >>       struct dimm_info *dimm;
> >> +     u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
> >>       void *pvt;
> >> -     unsigned size;
> >> -     int row, chn;
> >> +     unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS];
> >> +     unsigned tot_csrows, tot_channels, tot_errcount = 0;
> >> +     int i, j;
> >>       int err;
> >> +     int row, chn;
> >> +     bool per_rank = false;
> >> +
> >> +     BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
> >> +     /*
> >> +      * Calculate the total amount of dimms and csrows/cschannels while
> >> +      * in the old API emulation mode
> >> +      */
> >> +     tot_dimms = 1;
> >> +     tot_channels = 1;
> >> +     tot_csrows = 1;
> >> +     for (i = 0; i < n_layers; i++) {
> >> +             tot_dimms *= layers[i].size;
> >> +             if (layers[i].is_virt_csrow)
> >> +                     tot_csrows *= layers[i].size;
> >> +             else
> >> +                     tot_channels *= layers[i].size;
> >> +
> >> +             if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
> >> +                     per_rank = true;

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply

* Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
From: Mauro Carvalho Chehab @ 2012-04-30 13:00 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Shaohui Xie, Jason Uhlenkott, Aristeu Rozanski, Hitoshi Mitake,
	Mark Gross, Dmitry Eremin-Solenikov, Ranganathan Desikan,
	Egor Martovetsky, Niklas Söderlund, Tim Small, Arvind R.,
	Chris Metcalf, Olof Johansson, Doug Thompson,
	Linux Edac Mailing List, Michal Marek, Jiri Kosina,
	Linux Kernel Mailing List, Joe Perches, Andrew Morton,
	linuxppc-dev
In-Reply-To: <20120430123819.GF9303@aftab.osrc.amd.com>

Em 30-04-2012 09:38, Borislav Petkov escreveu:
> On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote:
>> Em 30-04-2012 08:11, Borislav Petkov escreveu:
>>> On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:

>>> This way it says "initializing 12 dimms" and the user thinks there are
>>> 12 DIMMs on his system where this might not be true.
>>
>>
>> I'm OK to remove the "initializing 12 dimms" message. It doesn't add anything
>> new.
>>
>> With regards do the other messages, if the debug messages are not clear, 
>> then let's fix them, instead of removing. What if we print, instead,
>> on a message like:
>>
>> 	"row 1, chan 1 will represent dimm5 (1:2:0) if not empty"
> 
> How about the following instead: the specific driver calls
> edac_mc_alloc(), it gets the allocated dimm array in mci->dimms
> _without_ dumping each dimm%d line. Then, each driver figures out which
> subset of that dimms array actually has populated slots and prints only
> the populated rank/slot/...
> 
> This information is much more valuable than saying how many _possible_
> slots the edac core has allocated.
> 
> Then, each driver can decide whether it makes sense to dump that info or
> not.

No, that would add extra complexity at the drivers level just due to debug
messages. I think that the better is to move this printk to the debug-specific
routine that is called only when the dimm is filled (edac_mc_dump_dimm).

With this cange, the message will be printed only for the filled dimms.

This is a cleanup patch, so I'll write it, together with the change that
will get rid of the loop that uses KERN_CONT. It will use a function added
by a latter patch at edac_mc_sysfs so it can't be merged on this patch
anyway.

Regards,
Mauro

^ permalink raw reply

* Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
From: Mauro Carvalho Chehab @ 2012-04-30 13:53 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Shaohui Xie, Jason Uhlenkott, Aristeu Rozanski, Hitoshi Mitake,
	Mark Gross, Dmitry Eremin-Solenikov, Ranganathan Desikan,
	Egor Martovetsky, Niklas Söderlund, Tim Small, Arvind R.,
	Chris Metcalf, Olof Johansson, Doug Thompson,
	Linux Edac Mailing List, Michal Marek, Jiri Kosina,
	Linux Kernel Mailing List, Joe Perches, Andrew Morton,
	linuxppc-dev
In-Reply-To: <4F9E8CFE.1040801@redhat.com>

Em 30-04-2012 10:00, Mauro Carvalho Chehab escreveu:
> Em 30-04-2012 09:38, Borislav Petkov escreveu:
>> On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote:
>>> Em 30-04-2012 08:11, Borislav Petkov escreveu:
>>>> On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:
> 
>>>> This way it says "initializing 12 dimms" and the user thinks there are
>>>> 12 DIMMs on his system where this might not be true.
>>>
>>>
>>> I'm OK to remove the "initializing 12 dimms" message. It doesn't add anything
>>> new.
>>>
>>> With regards do the other messages, if the debug messages are not clear, 
>>> then let's fix them, instead of removing. What if we print, instead,
>>> on a message like:
>>>
>>> 	"row 1, chan 1 will represent dimm5 (1:2:0) if not empty"
>>
>> How about the following instead: the specific driver calls
>> edac_mc_alloc(), it gets the allocated dimm array in mci->dimms
>> _without_ dumping each dimm%d line. Then, each driver figures out which
>> subset of that dimms array actually has populated slots and prints only
>> the populated rank/slot/...
>>
>> This information is much more valuable than saying how many _possible_
>> slots the edac core has allocated.
>>
>> Then, each driver can decide whether it makes sense to dump that info or
>> not.
> 
> No, that would add extra complexity at the drivers level just due to debug
> messages. I think that the better is to move this printk to the debug-specific
> routine that is called only when the dimm is filled (edac_mc_dump_dimm).
> 
> With this cange, the message will be printed only for the filled dimms.
> 
> This is a cleanup patch, so I'll write it, together with the change that
> will get rid of the loop that uses KERN_CONT. It will use a function added
> by a latter patch at edac_mc_sysfs so it can't be merged on this patch
> anyway.

The following patch dos the debug cleanup. I'll add at the end of my tree.

Regards,
Mauro.


From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Mon, 30 Apr 2012 10:24:43 -0300
Subject: [PATCH] edac_mc: Cleanup per-dimm_info debug messages

The edac_mc_alloc() routine allocates one dimm_info device for all
possible memories, including the non-filled ones. The debug messages
there are somewhat confusing. So, cleans them, by moving the code
that prints the memory location to edac_mc, and using it on both
edac_mc_sysfs and edac_mc.

After this patch, a dimm-based memory controller will print the debug
info as:

[  728.430828] EDAC DEBUG: edac_mc_dump_dimm: 	dimm2: channel 0 slot 2 mapped as virtual row 0, chan 2
[  728.430834] EDAC DEBUG: edac_mc_dump_dimm: 	dimm->label = 'mc#0channel#0slot#2'
[  728.430839] EDAC DEBUG: edac_mc_dump_dimm: 	dimm->nr_pages = 0x0
[  728.430846] EDAC DEBUG: edac_mc_dump_dimm: 	dimm->grain = 0
[  728.430850] EDAC DEBUG: edac_mc_dump_dimm: 	dimm->nr_pages = 0x0

(a rank-based memory controller would print, instead, "rank2"
 on the above debug info)

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index d8278b3..1bc2843 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -40,6 +40,25 @@
 static DEFINE_MUTEX(mem_ctls_mutex);
 static LIST_HEAD(mc_devices);
 
+unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
+			         int len)
+{
+	struct mem_ctl_info *mci = dimm->mci;
+	int i, n, count = 0;
+	char *p = buf;
+
+	for (i = 0; i < mci->n_layers; i++) {
+		n = snprintf(p, len, "%s %d ",
+			      edac_layer_name[mci->layers[i].type],
+			      dimm->location[i]);
+		p += n;
+		len -= n;
+		count += n;
+	}
+
+	return count;
+}
+
 #ifdef CONFIG_EDAC_DEBUG
 
 static void edac_mc_dump_channel(struct rank_info *chan)
@@ -50,20 +69,18 @@ static void edac_mc_dump_channel(struct rank_info *chan)
 	debugf4("\tchannel->dimm = %p\n", chan->dimm);
 }
 
-static void edac_mc_dump_dimm(struct dimm_info *dimm)
+static void edac_mc_dump_dimm(struct dimm_info *dimm, int number)
 {
-	int i;
+	char location[80];
+
+	edac_dimm_info_location(dimm, location, sizeof(location));
 
 	debugf4("\tdimm = %p\n", dimm);
+	debugf4("\t%s%i: %smapped as virtual row %d, chan %d\n",
+		dimm->mci->mem_is_per_rank ? "rank" : "dimm",
+		number, location, dimm->csrow, dimm->cschannel);
 	debugf4("\tdimm->label = '%s'\n", dimm->label);
 	debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
-	debugf4("\tdimm location ");
-	for (i = 0; i < dimm->mci->n_layers; i++) {
-		printk(KERN_CONT "%d", dimm->location[i]);
-		if (i < dimm->mci->n_layers - 1)
-			printk(KERN_CONT ".");
-	}
-	printk(KERN_CONT "\n");
 	debugf4("\tdimm->grain = %d\n", dimm->grain);
 	debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
 }
@@ -337,8 +354,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned edac_index,
 	memset(&pos, 0, sizeof(pos));
 	row = 0;
 	chn = 0;
-	debugf4("initializing %d %s\n", tot_dimms,
-		per_rank ? "ranks" : "dimms");
 	for (i = 0; i < tot_dimms; i++) {
 		chan = mci->csrows[row]->channels[chn];
 		off = EDAC_DIMM_OFF(layer, n_layers, pos[0], pos[1], pos[2]);
@@ -351,10 +366,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned edac_index,
 		mci->dimms[off] = dimm;
 		dimm->mci = mci;
 
-		debugf2("%d: %s%i (%d:%d:%d): row %d, chan %d\n", i,
-			per_rank ? "rank" : "dimm", off,
-			pos[0], pos[1], pos[2], row, chn);
-
 		/*
 		 * Copy DIMM location and initialize the memory location
 		 */
@@ -730,7 +741,7 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
 				edac_mc_dump_channel(mci->csrows[i]->channels[j]);
 		}
 		for (i = 0; i < mci->tot_dimms; i++)
-			edac_mc_dump_dimm(mci->dimms[i]);
+			edac_mc_dump_dimm(mci->dimms[i], i);
 	}
 #endif
 	mutex_lock(&mem_ctls_mutex);
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 8f96c49..e3e9e75 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -488,13 +488,7 @@ static ssize_t dimmdev_location_show(struct device *dev,
 	int i;
 	char *p = data;
 
-	for (i = 0; i < mci->n_layers; i++) {
-		p += sprintf(p, "%s %d ",
-			     edac_layer_name[mci->layers[i].type],
-			     dimm->location[i]);
-	}
-
-	return p - data;
+	return edac_dimm_info_location(dimm, data, PAGE_SIZE);
 }
 
 static ssize_t dimmdev_label_show(struct device *dev,
diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h
index 1af1367..de92756 100644
--- a/drivers/edac/edac_module.h
+++ b/drivers/edac/edac_module.h
@@ -34,6 +34,9 @@ extern int edac_mc_get_panic_on_ue(void);
 extern int edac_get_poll_msec(void);
 extern int edac_mc_get_poll_msec(void);
 
+unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
+				 int len);
+
 	/* on edac_device.c */
 extern int edac_device_register_sysfs_main_kobj(
 				struct edac_device_ctl_info *edac_dev);

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox