LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [RFC v9 PATCH 02/21] memory-hotplug: implement offline_memory()
From: wency @ 2012-09-05  9:25 UTC (permalink / raw)
  To: x86, linux-mm, linux-kernel, linuxppc-dev, linux-acpi, linux-s390,
	linux-sh, linux-ia64, cmetcalf, sparclinux
  Cc: len.brown, Wen Congyang, Vasilis Liaskovitis, isimatu.yasuaki,
	paulus, minchan.kim, kosaki.motohiro, rientjes, cl, akpm, liuj97
In-Reply-To: <1346837155-534-1-git-send-email-wency@cn.fujitsu.com>

From: Wen Congyang <wency@cn.fujitsu.com>

The function offline_memory() will be called when hot removing a
memory device. The memory device may contain more than one memory
block. If the memory block has been offlined, __offline_pages()
will fail. So we should try to offline one memory block at a
time.

If the memory block is offlined in offline_memory(), we also
update it's state, and notify the userspace that its state is
changed.

The function offline_memory() also check each memory block's
state. So there is no need to check the memory block's state
before calling offline_memory().

CC: David Rientjes <rientjes@google.com>
CC: Jiang Liu <liuj97@gmail.com>
CC: Len Brown <len.brown@intel.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Christoph Lameter <cl@linux.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
CC: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
CC: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 drivers/base/memory.c          |   31 +++++++++++++++++++++++++++----
 include/linux/memory_hotplug.h |    2 ++
 mm/memory_hotplug.c            |   37 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 44e7de6..86c8821 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -275,13 +275,11 @@ memory_block_action(unsigned long phys_index, unsigned long action)
 	return ret;
 }
 
-static int memory_block_change_state(struct memory_block *mem,
+static int __memory_block_change_state(struct memory_block *mem,
 		unsigned long to_state, unsigned long from_state_req)
 {
 	int ret = 0;
 
-	mutex_lock(&mem->state_mutex);
-
 	if (mem->state != from_state_req) {
 		ret = -EINVAL;
 		goto out;
@@ -309,10 +307,20 @@ static int memory_block_change_state(struct memory_block *mem,
 		break;
 	}
 out:
-	mutex_unlock(&mem->state_mutex);
 	return ret;
 }
 
+static int memory_block_change_state(struct memory_block *mem,
+		unsigned long to_state, unsigned long from_state_req)
+{
+	int ret;
+
+	mutex_lock(&mem->state_mutex);
+	ret = __memory_block_change_state(mem, to_state, from_state_req);
+	mutex_unlock(&mem->state_mutex);
+
+	return ret;
+}
 static ssize_t
 store_mem_state(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
@@ -653,6 +661,21 @@ int unregister_memory_section(struct mem_section *section)
 }
 
 /*
+ * offline one memory block. If the memory block has been offlined, do nothing.
+ */
+int offline_memory_block(struct memory_block *mem)
+{
+	int ret = 0;
+
+	mutex_lock(&mem->state_mutex);
+	if (mem->state != MEM_OFFLINE)
+		ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
+	mutex_unlock(&mem->state_mutex);
+
+	return ret;
+}
+
+/*
  * Initialize the sysfs support for memory devices...
  */
 int __init memory_dev_init(void)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index c183f39..0b040bb 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -10,6 +10,7 @@ struct page;
 struct zone;
 struct pglist_data;
 struct mem_section;
+struct memory_block;
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 
@@ -234,6 +235,7 @@ extern int mem_online_node(int nid);
 extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
+extern int offline_memory_block(struct memory_block *mem);
 extern int offline_memory(u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 								int nr_pages);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index bb42316..6fc1908 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1001,7 +1001,42 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 
 int offline_memory(u64 start, u64 size)
 {
-	return -EINVAL;
+	struct memory_block *mem = NULL;
+	struct mem_section *section;
+	unsigned long start_pfn, end_pfn;
+	unsigned long pfn, section_nr;
+	int ret;
+
+	start_pfn = PFN_DOWN(start);
+	end_pfn = start_pfn + PFN_DOWN(size);
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		section_nr = pfn_to_section_nr(pfn);
+		if (!present_section_nr(section_nr))
+			continue;
+
+		section = __nr_to_section(section_nr);
+		/* same memblock? */
+		if (mem)
+			if ((section_nr >= mem->start_section_nr) &&
+			    (section_nr <= mem->end_section_nr))
+				continue;
+
+		mem = find_memory_block_hinted(section, mem);
+		if (!mem)
+			continue;
+
+		ret = offline_memory_block(mem);
+		if (ret) {
+			kobject_put(&mem->dev.kobj);
+			return ret;
+		}
+	}
+
+	if (mem)
+		kobject_put(&mem->dev.kobj);
+
+	return 0;
 }
 #else
 int offline_pages(unsigned long start, unsigned long size)
-- 
1.7.1

^ permalink raw reply related

* [RFC v9 PATCH 04/21] memory-hotplug: offline and remove memory when removing the memory device
From: wency @ 2012-09-05  9:25 UTC (permalink / raw)
  To: x86, linux-mm, linux-kernel, linuxppc-dev, linux-acpi, linux-s390,
	linux-sh, linux-ia64, cmetcalf, sparclinux
  Cc: len.brown, Wen Congyang, isimatu.yasuaki, paulus, minchan.kim,
	kosaki.motohiro, rientjes, cl, akpm, liuj97
In-Reply-To: <1346837155-534-1-git-send-email-wency@cn.fujitsu.com>

From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

We should offline and remove memory when removing the memory device.
The memory device can be removed by 2 ways:
1. send eject request by SCI
2. echo 1 >/sys/bus/pci/devices/PNP0C80:XX/eject

In the 1st case, acpi_memory_disable_device() will be called. In the 2nd
case, acpi_memory_device_remove() will be called. acpi_memory_device_remove()
will also be called when we unbind the memory device from the driver
acpi_memhotplug. If the type is ACPI_BUS_REMOVAL_EJECT, it means
that the user wants to eject the memory device, and we should offline
and remove memory in acpi_memory_device_remove().

The function remove_memory() is not implemeted now. It only check whether
all memory has been offllined now.

CC: David Rientjes <rientjes@google.com>
CC: Jiang Liu <liuj97@gmail.com>
CC: Len Brown <len.brown@intel.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Christoph Lameter <cl@linux.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 drivers/acpi/acpi_memhotplug.c |   45 +++++++++++++++++++++++++++++++++------
 drivers/base/memory.c          |   39 ++++++++++++++++++++++++++++++++++
 include/linux/memory.h         |    5 ++++
 include/linux/memory_hotplug.h |    5 ++++
 mm/memory_hotplug.c            |   22 +++++++++++++++++++
 5 files changed, 109 insertions(+), 7 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 7873832..9d47458 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/memory.h>
 #include <linux/memory_hotplug.h>
 #include <linux/slab.h>
 #include <acpi/acpi_drivers.h>
@@ -310,25 +311,44 @@ static int acpi_memory_powerdown_device(struct acpi_memory_device *mem_device)
 	return 0;
 }
 
-static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+static int
+acpi_memory_device_remove_memory(struct acpi_memory_device *mem_device)
 {
 	int result;
 	struct acpi_memory_info *info, *n;
+	int node = mem_device->nid;
 
-
-	/*
-	 * Ask the VM to offline this memory range.
-	 * Note: Assume that this function returns zero on success
-	 */
 	list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
 		if (info->enabled) {
 			result = offline_memory(info->start_addr, info->length);
 			if (result)
 				return result;
+
+			result = remove_memory(node, info->start_addr,
+					       info->length);
+			if (result)
+				return result;
 		}
+
+		list_del(&info->list);
 		kfree(info);
 	}
 
+	return 0;
+}
+
+static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
+{
+	int result;
+
+	/*
+	 * Ask the VM to offline this memory range.
+	 * Note: Assume that this function returns zero on success
+	 */
+	result = acpi_memory_device_remove_memory(mem_device);
+	if (result)
+		return result;
+
 	/* Power-off and eject the device */
 	result = acpi_memory_powerdown_device(mem_device);
 	if (result) {
@@ -477,12 +497,23 @@ static int acpi_memory_device_add(struct acpi_device *device)
 static int acpi_memory_device_remove(struct acpi_device *device, int type)
 {
 	struct acpi_memory_device *mem_device = NULL;
-
+	int result;
 
 	if (!device || !acpi_driver_data(device))
 		return -EINVAL;
 
 	mem_device = acpi_driver_data(device);
+
+	if (type == ACPI_BUS_REMOVAL_EJECT) {
+		/*
+		 * offline and remove memory only when the memory device is
+		 * ejected.
+		 */
+		result = acpi_memory_device_remove_memory(mem_device);
+		if (result)
+			return result;
+	}
+
 	kfree(mem_device);
 
 	return 0;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 86c8821..038be73 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -70,6 +70,45 @@ void unregister_memory_isolate_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_memory_isolate_notifier);
 
+bool is_memblk_offline(unsigned long start, unsigned long size)
+{
+	struct memory_block *mem = NULL;
+	struct mem_section *section;
+	unsigned long start_pfn, end_pfn;
+	unsigned long pfn, section_nr;
+
+	start_pfn = PFN_DOWN(start);
+	end_pfn = PFN_UP(start + size);
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		section_nr = pfn_to_section_nr(pfn);
+		if (!present_section_nr(section_nr))
+			continue;
+
+		section = __nr_to_section(section_nr);
+		/* same memblock? */
+		if (mem)
+			if ((section_nr >= mem->start_section_nr) &&
+			    (section_nr <= mem->end_section_nr))
+				continue;
+
+		mem = find_memory_block_hinted(section, mem);
+		if (!mem)
+			continue;
+		if (mem->state == MEM_OFFLINE)
+			continue;
+
+		kobject_put(&mem->dev.kobj);
+		return false;
+	}
+
+	if (mem)
+		kobject_put(&mem->dev.kobj);
+
+	return true;
+}
+EXPORT_SYMBOL(is_memblk_offline);
+
 /*
  * register_memory - Setup a sysfs device for a memory block
  */
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 1ac7f6e..7c66126 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -106,6 +106,10 @@ static inline int memory_isolate_notify(unsigned long val, void *v)
 {
 	return 0;
 }
+static inline bool is_memblk_offline(unsigned long start, unsigned long size)
+{
+	return false;
+}
 #else
 extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
@@ -120,6 +124,7 @@ extern int memory_isolate_notify(unsigned long val, void *v);
 extern struct memory_block *find_memory_block_hinted(struct mem_section *,
 							struct memory_block *);
 extern struct memory_block *find_memory_block(struct mem_section *);
+extern bool is_memblk_offline(unsigned long start, unsigned long size);
 #define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
 enum mem_add_context { BOOT, HOTPLUG };
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 0b040bb..fd84ea9 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -222,6 +222,7 @@ static inline void unlock_memory_hotplug(void) {}
 #ifdef CONFIG_MEMORY_HOTREMOVE
 
 extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
+extern int remove_memory(int nid, u64 start, u64 size);
 
 #else
 static inline int is_mem_section_removable(unsigned long pfn,
@@ -229,6 +230,10 @@ static inline int is_mem_section_removable(unsigned long pfn,
 {
 	return 0;
 }
+static inline int remove_memory(int nid, u64 start, u64 size)
+{
+	return -EBUSY;
+}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 extern int mem_online_node(int nid);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6fc1908..49f7747 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1038,6 +1038,28 @@ int offline_memory(u64 start, u64 size)
 
 	return 0;
 }
+
+int remove_memory(int nid, u64 start, u64 size)
+{
+	int ret = -EBUSY;
+	lock_memory_hotplug();
+	/*
+	 * The memory might become online by other task, even if you offine it.
+	 * So we check whether the memory has been onlined or not.
+	 */
+	if (!is_memblk_offline(start, size)) {
+		pr_warn("memory removing [mem %#010llx-%#010llx] failed, "
+			"because the memmory range is online\n",
+			start, start + size);
+		ret = -EAGAIN;
+	}
+
+	unlock_memory_hotplug();
+	return ret;
+
+}
+EXPORT_SYMBOL_GPL(remove_memory);
+
 #else
 int offline_pages(unsigned long start, unsigned long size)
 {
-- 
1.7.1

^ permalink raw reply related

* [RFC v9 PATCH 05/21] memory-hotplug: check whether memory is present or not
From: wency @ 2012-09-05  9:25 UTC (permalink / raw)
  To: x86, linux-mm, linux-kernel, linuxppc-dev, linux-acpi, linux-s390,
	linux-sh, linux-ia64, cmetcalf, sparclinux
  Cc: len.brown, Wen Congyang, isimatu.yasuaki, paulus, minchan.kim,
	kosaki.motohiro, rientjes, cl, akpm, liuj97
In-Reply-To: <1346837155-534-1-git-send-email-wency@cn.fujitsu.com>

From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

If system supports memory hot-remove, online_pages() may online removed pages.
So online_pages() need to check whether onlining pages are present or not.

CC: David Rientjes <rientjes@google.com>
CC: Jiang Liu <liuj97@gmail.com>
CC: Len Brown <len.brown@intel.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Christoph Lameter <cl@linux.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
CC: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
---
 include/linux/mmzone.h |   19 +++++++++++++++++++
 mm/memory_hotplug.c    |   13 +++++++++++++
 2 files changed, 32 insertions(+), 0 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2daa54f..ac3ae30 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1180,6 +1180,25 @@ void sparse_init(void);
 #define sparse_index_init(_sec, _nid)  do {} while (0)
 #endif /* CONFIG_SPARSEMEM */
 
+#ifdef CONFIG_SPARSEMEM
+static inline int pfns_present(unsigned long pfn, unsigned long nr_pages)
+{
+	int i;
+	for (i = 0; i < nr_pages; i++) {
+		if (pfn_present(pfn + i))
+			continue;
+		else
+			return -EINVAL;
+	}
+	return 0;
+}
+#else
+static inline int pfns_present(unsigned long pfn, unsigned long nr_pages)
+{
+	return 0;
+}
+#endif /* CONFIG_SPARSEMEM*/
+
 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
 bool early_pfn_in_nid(unsigned long pfn, int nid);
 #else
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 49f7747..299747d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -467,6 +467,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
 	struct memory_notify arg;
 
 	lock_memory_hotplug();
+	/*
+	 * If system supports memory hot-remove, the memory may have been
+	 * removed. So we check whether the memory has been removed or not.
+	 *
+	 * Note: When CONFIG_SPARSEMEM is defined, pfns_present() become
+	 *       effective. If CONFIG_SPARSEMEM is not defined, pfns_present()
+	 *       always returns 0.
+	 */
+	ret = pfns_present(pfn, nr_pages);
+	if (ret) {
+		unlock_memory_hotplug();
+		return ret;
+	}
 	arg.start_pfn = pfn;
 	arg.nr_pages = nr_pages;
 	arg.status_change_nid = -1;
-- 
1.7.1

^ permalink raw reply related

* [RFC v9 PATCH 06/21] memory-hotplug: export the function acpi_bus_remove()
From: wency @ 2012-09-05  9:25 UTC (permalink / raw)
  To: x86, linux-mm, linux-kernel, linuxppc-dev, linux-acpi, linux-s390,
	linux-sh, linux-ia64, cmetcalf, sparclinux
  Cc: len.brown, Wen Congyang, isimatu.yasuaki, paulus, minchan.kim,
	kosaki.motohiro, rientjes, cl, akpm, liuj97
In-Reply-To: <1346837155-534-1-git-send-email-wency@cn.fujitsu.com>

From: Wen Congyang <wency@cn.fujitsu.com>

The function acpi_bus_remove() can remove a acpi device from acpi device.
When a acpi device is removed, we need to call this function to remove
the acpi device from acpi bus. So export this function.

CC: David Rientjes <rientjes@google.com>
CC: Jiang Liu <liuj97@gmail.com>
CC: Len Brown <len.brown@intel.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Christoph Lameter <cl@linux.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
CC: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 drivers/acpi/scan.c     |    3 ++-
 include/acpi/acpi_bus.h |    1 +
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index d1ecca2..1cefc34 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1224,7 +1224,7 @@ static int acpi_device_set_context(struct acpi_device *device)
 	return -ENODEV;
 }
 
-static int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
+int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
 {
 	if (!dev)
 		return -EINVAL;
@@ -1246,6 +1246,7 @@ static int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
 
 	return 0;
 }
+EXPORT_SYMBOL(acpi_bus_remove);
 
 static int acpi_add_single_object(struct acpi_device **child,
 				  acpi_handle handle, int type,
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index bde976e..2ccf109 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -360,6 +360,7 @@ bool acpi_bus_power_manageable(acpi_handle handle);
 bool acpi_bus_can_wakeup(acpi_handle handle);
 int acpi_power_resource_register_device(struct device *dev, acpi_handle handle);
 void acpi_power_resource_unregister_device(struct device *dev, acpi_handle handle);
+int acpi_bus_remove(struct acpi_device *dev, int rmdevice);
 #ifdef CONFIG_ACPI_PROC_EVENT
 int acpi_bus_generate_proc_event(struct acpi_device *device, u8 type, int data);
 int acpi_bus_generate_proc_event4(const char *class, const char *bid, u8 type, int data);
-- 
1.7.1

^ permalink raw reply related

* [RFC v9 PATCH 08/21] memory-hotplug: remove /sys/firmware/memmap/X sysfs
From: wency @ 2012-09-05  9:25 UTC (permalink / raw)
  To: x86, linux-mm, linux-kernel, linuxppc-dev, linux-acpi, linux-s390,
	linux-sh, linux-ia64, cmetcalf, sparclinux
  Cc: len.brown, Wen Congyang, isimatu.yasuaki, paulus, minchan.kim,
	kosaki.motohiro, rientjes, cl, akpm, liuj97
In-Reply-To: <1346837155-534-1-git-send-email-wency@cn.fujitsu.com>

From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

When (hot)adding memory into system, /sys/firmware/memmap/X/{end, start, type}
sysfs files are created. But there is no code to remove these files. The patch
implements the function to remove them.

Note : The code does not free firmware_map_entry since there is no way to free
       memory which is allocated by bootmem.

CC: David Rientjes <rientjes@google.com>
CC: Jiang Liu <liuj97@gmail.com>
CC: Len Brown <len.brown@intel.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Christoph Lameter <cl@linux.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 drivers/firmware/memmap.c    |   98 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/firmware-map.h |    6 +++
 mm/memory_hotplug.c          |    9 +++-
 3 files changed, 109 insertions(+), 4 deletions(-)

diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index c1cdc92..6740d26 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -21,6 +21,7 @@
 #include <linux/types.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 
 /*
  * Data types ------------------------------------------------------------------
@@ -41,6 +42,7 @@ struct firmware_map_entry {
 	const char		*type;	/* type of the memory range */
 	struct list_head	list;	/* entry for the linked list */
 	struct kobject		kobj;   /* kobject for each entry */
+	unsigned int		bootmem:1; /* allocated from bootmem */
 };
 
 /*
@@ -79,7 +81,26 @@ static const struct sysfs_ops memmap_attr_ops = {
 	.show = memmap_attr_show,
 };
 
+
+static inline struct firmware_map_entry *
+to_memmap_entry(struct kobject *kobj)
+{
+	return container_of(kobj, struct firmware_map_entry, kobj);
+}
+
+static void release_firmware_map_entry(struct kobject *kobj)
+{
+	struct firmware_map_entry *entry = to_memmap_entry(kobj);
+
+	if (entry->bootmem)
+		/* There is no way to free memory allocated from bootmem */
+		return;
+
+	kfree(entry);
+}
+
 static struct kobj_type memmap_ktype = {
+	.release	= release_firmware_map_entry,
 	.sysfs_ops	= &memmap_attr_ops,
 	.default_attrs	= def_attrs,
 };
@@ -94,6 +115,7 @@ static struct kobj_type memmap_ktype = {
  * in firmware initialisation code in one single thread of execution.
  */
 static LIST_HEAD(map_entries);
+static DEFINE_SPINLOCK(map_entries_lock);
 
 /**
  * firmware_map_add_entry() - Does the real work to add a firmware memmap entry.
@@ -118,11 +140,25 @@ static int firmware_map_add_entry(u64 start, u64 end,
 	INIT_LIST_HEAD(&entry->list);
 	kobject_init(&entry->kobj, &memmap_ktype);
 
+	spin_lock(&map_entries_lock);
 	list_add_tail(&entry->list, &map_entries);
+	spin_unlock(&map_entries_lock);
 
 	return 0;
 }
 
+/**
+ * firmware_map_remove_entry() - Does the real work to remove a firmware
+ * memmap entry.
+ * @entry: removed entry.
+ **/
+static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
+{
+	spin_lock(&map_entries_lock);
+	list_del(&entry->list);
+	spin_unlock(&map_entries_lock);
+}
+
 /*
  * Add memmap entry on sysfs
  */
@@ -144,6 +180,35 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry)
 	return 0;
 }
 
+/*
+ * Remove memmap entry on sysfs
+ */
+static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
+{
+	kobject_put(&entry->kobj);
+}
+
+/*
+ * Search memmap entry
+ */
+
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry(u64 start, u64 end, const char *type)
+{
+	struct firmware_map_entry *entry;
+
+	spin_lock(&map_entries_lock);
+	list_for_each_entry(entry, &map_entries, list)
+		if ((entry->start == start) && (entry->end == end) &&
+		    (!strcmp(entry->type, type))) {
+			spin_unlock(&map_entries_lock);
+			return entry;
+		}
+
+	spin_unlock(&map_entries_lock);
+	return NULL;
+}
+
 /**
  * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
  * memory hotplug.
@@ -193,9 +258,36 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
 	if (WARN_ON(!entry))
 		return -ENOMEM;
 
+	entry->bootmem = 1;
 	return firmware_map_add_entry(start, end, type, entry);
 }
 
+/**
+ * firmware_map_remove() - remove a firmware mapping entry
+ * @start: Start of the memory range.
+ * @end:   End of the memory range.
+ * @type:  Type of the memory range.
+ *
+ * removes a firmware mapping entry.
+ *
+ * Returns 0 on success, or -EINVAL if no entry.
+ **/
+int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
+{
+	struct firmware_map_entry *entry;
+
+	entry = firmware_map_find_entry(start, end - 1, type);
+	if (!entry)
+		return -EINVAL;
+
+	firmware_map_remove_entry(entry);
+
+	/* remove the memmap entry */
+	remove_sysfs_fw_map_entry(entry);
+
+	return 0;
+}
+
 /*
  * Sysfs functions -------------------------------------------------------------
  */
@@ -217,8 +309,10 @@ static ssize_t type_show(struct firmware_map_entry *entry, char *buf)
 	return snprintf(buf, PAGE_SIZE, "%s\n", entry->type);
 }
 
-#define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, attr)
-#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj)
+static inline struct memmap_attribute *to_memmap_attr(struct attribute *attr)
+{
+	return container_of(attr, struct memmap_attribute, attr);
+}
 
 static ssize_t memmap_attr_show(struct kobject *kobj,
 				struct attribute *attr, char *buf)
diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h
index 43fe52f..71d4fa7 100644
--- a/include/linux/firmware-map.h
+++ b/include/linux/firmware-map.h
@@ -25,6 +25,7 @@
 
 int firmware_map_add_early(u64 start, u64 end, const char *type);
 int firmware_map_add_hotplug(u64 start, u64 end, const char *type);
+int firmware_map_remove(u64 start, u64 end, const char *type);
 
 #else /* CONFIG_FIRMWARE_MEMMAP */
 
@@ -38,6 +39,11 @@ static inline int firmware_map_add_hotplug(u64 start, u64 end, const char *type)
 	return 0;
 }
 
+static inline int firmware_map_remove(u64 start, u64 end, const char *type)
+{
+	return 0;
+}
+
 #endif /* CONFIG_FIRMWARE_MEMMAP */
 
 #endif /* _LINUX_FIRMWARE_MAP_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 299747d..e74a01d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1052,9 +1052,9 @@ int offline_memory(u64 start, u64 size)
 	return 0;
 }
 
-int remove_memory(int nid, u64 start, u64 size)
+int __ref remove_memory(int nid, u64 start, u64 size)
 {
-	int ret = -EBUSY;
+	int ret = 0;
 	lock_memory_hotplug();
 	/*
 	 * The memory might become online by other task, even if you offine it.
@@ -1065,8 +1065,13 @@ int remove_memory(int nid, u64 start, u64 size)
 			"because the memmory range is online\n",
 			start, start + size);
 		ret = -EAGAIN;
+		goto out;
 	}
 
+	/* remove memmap entry */
+	firmware_map_remove(start, start + size, "System RAM");
+
+out:
 	unlock_memory_hotplug();
 	return ret;
 
-- 
1.7.1

^ permalink raw reply related

* [RFC v9 PATCH 00/21] memory-hotplug: hot-remove physical memory
From: wency @ 2012-09-05  9:25 UTC (permalink / raw)
  To: x86, linux-mm, linux-kernel, linuxppc-dev, linux-acpi, linux-s390,
	linux-sh, linux-ia64, cmetcalf, sparclinux
  Cc: len.brown, Wen Congyang, isimatu.yasuaki, paulus, minchan.kim,
	kosaki.motohiro, rientjes, cl, akpm, liuj97

From: Wen Congyang <wency@cn.fujitsu.com>

This patch series aims to support physical memory hot-remove.

The patches can free/remove the following things:

  - acpi_memory_info                          : [RFC PATCH 4/19]
  - /sys/firmware/memmap/X/{end, start, type} : [RFC PATCH 8/19]
  - iomem_resource                            : [RFC PATCH 9/19]
  - mem_section and related sysfs files       : [RFC PATCH 10-11, 13-16/19]
  - page table of removed memory              : [RFC PATCH 12/19]
  - node and related sysfs files              : [RFC PATCH 18-19/19]

If you find lack of function for physical memory hot-remove, please let me
know.

How to test this patchset?
1. apply this patchset and build the kernel. MEMORY_HOTPLUG, MEMORY_HOTREMOVE,
   ACPI_HOTPLUG_MEMORY must be selected.
2. load the module acpi_memhotplug
3. hotplug the memory device(it depends on your hardware)
   You will see the memory device under the directory /sys/bus/acpi/devices/.
   Its name is PNP0C80:XX.
4. online/offline pages provided by this memory device
   You can write online/offline to /sys/devices/system/memory/memoryX/state to
   online/offline pages provided by this memory device
5. hotremove the memory device
   You can hotremove the memory device by the hardware, or writing 1 to
   /sys/bus/acpi/devices/PNP0C80:XX/eject.

Note: if the memory provided by the memory device is used by the kernel, it
can't be offlined. It is not a bug.

Known problems:
1. memory can't be offlined when CONFIG_MEMCG is selected.
   For example: there is a memory device on node 1. The address range
   is [1G, 1.5G). You will find 4 new directories memory8, memory9, memory10,
   and memory11 under the directory /sys/devices/system/memory/.
   If CONFIG_MEMCG is selected, we will allocate memory to store page cgroup
   when we online pages. When we online memory8, the memory stored page cgroup
   is not provided by this memory device. But when we online memory9, the memory
   stored page cgroup may be provided by memory8. So we can't offline memory8
   now. We should offline the memory in the reversed order.
   When the memory device is hotremoved, we will auto offline memory provided
   by this memory device. But we don't know which memory is onlined first, so
   offlining memory may fail. In such case, you should offline the memory by
   hand before hotremoving the memory device.
2. hotremoving memory device may cause kernel panicked
   This bug will be fixed by Liu Jiang's patch:
   https://lkml.org/lkml/2012/7/3/1

change log of v9:
 [RFC PATCH v9 8/21]
   * add a lock to protect the list map_entries
   * add an indicator to firmware_map_entry to remember whether the memory
     is allocated from bootmem
 [RFC PATCH v9 10/21]
   * change the macro to inline function
 [RFC PATCH v9 19/21]
   * don't offline the node if the cpu on the node is onlined
 [RFC PATCH v9 21/21]
   * create new patch: auto offline page_cgroup when onlining memory block
     failed

change log of v8:
 [RFC PATCH v8 17/20]
   * Fix problems when one node's range include the other nodes
 [RFC PATCH v8 18/20]
   * fix building error when CONFIG_MEMORY_HOTPLUG_SPARSE or CONFIG_HUGETLBFS
     is not defined.
 [RFC PATCH v8 19/20]
   * don't offline node when some memory sections are not removed
 [RFC PATCH v8 20/20]
   * create new patch: clear hwpoisoned flag when onlining pages

change log of v7:
 [RFC PATCH v7 4/19]
   * do not continue if acpi_memory_device_remove_memory() fails.
 [RFC PATCH v7 15/19]
   * handle usemap in register_page_bootmem_info_section() too.

change log of v6:
 [RFC PATCH v6 12/19]
   * fix building error on other archtitectures than x86

 [RFC PATCH v6 15-16/19]
   * fix building error on other archtitectures than x86

change log of v5:
 * merge the patchset to clear page table and the patchset to hot remove
   memory(from ishimatsu) to one big patchset.

 [RFC PATCH v5 1/19]
   * rename remove_memory() to offline_memory()/offline_pages()

 [RFC PATCH v5 2/19]
   * new patch: implement offline_memory(). This function offlines pages,
     update memory block's state, and notify the userspace that the memory
     block's state is changed.

 [RFC PATCH v5 4/19]
   * offline and remove memory in acpi_memory_disable_device() too.

 [RFC PATCH v5 17/19]
   * new patch: add a new function __remove_zone() to revert the things done
     in the function __add_zone().

 [RFC PATCH v5 18/19]
   * flush work befor reseting node device.

change log of v4:
 * remove "memory-hotplug : unify argument of firmware_map_add_early/hotplug"
   from the patch series, since the patch is a bugfix. It is being disccussed
   on other thread. But for testing the patch series, the patch is needed.
   So I added the patch as [PATCH 0/13].

 [RFC PATCH v4 2/13]
   * check memory is online or not at remove_memory()
   * add memory_add_physaddr_to_nid() to acpi_memory_device_remove() for
     getting node id
 
 [RFC PATCH v4 3/13]
   * create new patch : check memory is online or not at online_pages()

 [RFC PATCH v4 4/13]
   * add __ref section to remove_memory()
   * call firmware_map_remove_entry() before remove_sysfs_fw_map_entry()

 [RFC PATCH v4 11/13]
   * rewrite register_page_bootmem_memmap() for removing page used as PT/PMD

change log of v3:
 * rebase to 3.5.0-rc6

 [RFC PATCH v2 2/13]
   * remove extra kobject_put()

   * The patch was commented by Wen. Wen's comment is
     "acpi_memory_device_remove() should ignore a return value of
     remove_memory() since caller does not care the return value".
     But I did not change it since I think caller should care the
     return value. And I am trying to fix it as follow:

     https://lkml.org/lkml/2012/7/5/624

 [RFC PATCH v2 4/13]
   * remove a firmware_memmap_entry allocated by kzmalloc()

change log of v2:
 [RFC PATCH v2 2/13]
   * check whether memory block is offline or not before calling offline_memory()
   * check whether section is valid or not in is_memblk_offline()
   * call kobject_put() for each memory_block in is_memblk_offline()

 [RFC PATCH v2 3/13]
   * unify the end argument of firmware_map_add_early/hotplug

 [RFC PATCH v2 4/13]
   * add release_firmware_map_entry() for freeing firmware_map_entry

 [RFC PATCH v2 6/13]
  * add release_memory_block() for freeing memory_block

 [RFC PATCH v2 11/13]
  * fix wrong arguments of free_pages()


Wen Congyang (8):
  memory-hotplug: implement offline_memory()
  memory-hotplug: store the node id in acpi_memory_device
  memory-hotplug: export the function acpi_bus_remove()
  memory-hotplug: call acpi_bus_remove() to remove memory device
  memory-hotplug: introduce new function arch_remove_memory()
  memory-hotplug: remove sysfs file of node
  memory-hotplug: clear hwpoisoned flag when onlining pages
  memory-hotplug: auto offline page_cgroup when onlining memory block
    failed

Yasuaki Ishimatsu (13):
  memory-hotplug: rename remove_memory() to
    offline_memory()/offline_pages()
  memory-hotplug: offline and remove memory when removing the memory
    device
  memory-hotplug: check whether memory is present or not
  memory-hotplug: remove /sys/firmware/memmap/X sysfs
  memory-hotplug: does not release memory region in PAGES_PER_SECTION
    chunks
  memory-hotplug: add memory_block_release
  memory-hotplug: remove_memory calls __remove_pages
  memory-hotplug: check page type in get_page_bootmem
  memory-hotplug: move register_page_bootmem_info_node and
    put_page_bootmem for sparse-vmemmap
  memory-hotplug: implement register_page_bootmem_info_section of
    sparse-vmemmap
  memory-hotplug: free memmap of sparse-vmemmap
  memory_hotplug: clear zone when the memory is removed
  memory-hotplug: add node_device_release

 arch/ia64/mm/discontig.c                        |   14 +
 arch/ia64/mm/init.c                             |   16 +
 arch/powerpc/mm/init_64.c                       |   14 +
 arch/powerpc/mm/mem.c                           |   14 +
 arch/powerpc/platforms/pseries/hotplug-memory.c |   16 +-
 arch/s390/mm/init.c                             |   12 +
 arch/s390/mm/vmem.c                             |   14 +
 arch/sh/mm/init.c                               |   15 +
 arch/sparc/mm/init_64.c                         |   14 +
 arch/tile/mm/init.c                             |    8 +
 arch/x86/include/asm/pgtable_types.h            |    1 +
 arch/x86/mm/init_32.c                           |   10 +
 arch/x86/mm/init_64.c                           |  331 ++++++++++++++++++
 arch/x86/mm/pageattr.c                          |   47 ++--
 drivers/acpi/acpi_memhotplug.c                  |   54 +++-
 drivers/acpi/scan.c                             |    3 +-
 drivers/base/memory.c                           |   88 ++++-
 drivers/base/node.c                             |   11 +
 drivers/firmware/memmap.c                       |   98 +++++-
 include/acpi/acpi_bus.h                         |    1 +
 include/linux/firmware-map.h                    |    6 +
 include/linux/memory.h                          |    5 +
 include/linux/memory_hotplug.h                  |   25 +-
 include/linux/mm.h                              |    5 +-
 include/linux/mmzone.h                          |   19 +
 mm/memory_hotplug.c                             |  424 +++++++++++++++++++++--
 mm/page_cgroup.c                                |    3 +
 mm/sparse.c                                     |    5 +-
 28 files changed, 1181 insertions(+), 92 deletions(-)

^ permalink raw reply

* [RFC v9 PATCH 01/21] memory-hotplug: rename remove_memory() to offline_memory()/offline_pages()
From: wency @ 2012-09-05  9:25 UTC (permalink / raw)
  To: x86, linux-mm, linux-kernel, linuxppc-dev, linux-acpi, linux-s390,
	linux-sh, linux-ia64, cmetcalf, sparclinux
  Cc: len.brown, Wen Congyang, isimatu.yasuaki, paulus, minchan.kim,
	kosaki.motohiro, rientjes, cl, akpm, liuj97
In-Reply-To: <1346837155-534-1-git-send-email-wency@cn.fujitsu.com>

From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

remove_memory() only try to offline pages. It is called in two cases:
1. hot remove a memory device
2. echo offline >/sys/devices/system/memory/memoryXX/state

In the 1st case, we should also change memory block's state, and notify
the userspace that the memory block's state is changed after offlining
pages.

So rename remove_memory() to offline_memory()/offline_pages(). And in
the 1st case, offline_memory() will be used. The function offline_memory()
is not implemented. In the 2nd case, offline_pages() will be used.

CC: David Rientjes <rientjes@google.com>
CC: Jiang Liu <liuj97@gmail.com>
CC: Len Brown <len.brown@intel.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Christoph Lameter <cl@linux.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 drivers/acpi/acpi_memhotplug.c |    2 +-
 drivers/base/memory.c          |    9 +++------
 include/linux/memory_hotplug.h |    3 ++-
 mm/memory_hotplug.c            |   22 ++++++++++++++--------
 4 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24c807f..2a7beac 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -318,7 +318,7 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
 	 */
 	list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
 		if (info->enabled) {
-			result = remove_memory(info->start_addr, info->length);
+			result = offline_memory(info->start_addr, info->length);
 			if (result)
 				return result;
 		}
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 7dda4f7..44e7de6 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -248,26 +248,23 @@ static bool pages_correctly_reserved(unsigned long start_pfn,
 static int
 memory_block_action(unsigned long phys_index, unsigned long action)
 {
-	unsigned long start_pfn, start_paddr;
+	unsigned long start_pfn;
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	struct page *first_page;
 	int ret;
 
 	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
+	start_pfn = page_to_pfn(first_page);
 
 	switch (action) {
 		case MEM_ONLINE:
-			start_pfn = page_to_pfn(first_page);
-
 			if (!pages_correctly_reserved(start_pfn, nr_pages))
 				return -EBUSY;
 
 			ret = online_pages(start_pfn, nr_pages);
 			break;
 		case MEM_OFFLINE:
-			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
-			ret = remove_memory(start_paddr,
-					    nr_pages << PAGE_SHIFT);
+			ret = offline_pages(start_pfn, nr_pages);
 			break;
 		default:
 			WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 910550f..c183f39 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -233,7 +233,8 @@ static inline int is_mem_section_removable(unsigned long pfn,
 extern int mem_online_node(int nid);
 extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
-extern int remove_memory(u64 start, u64 size);
+extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
+extern int offline_memory(u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 								int nr_pages);
 extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3ad25f9..bb42316 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -866,7 +866,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
 	return offlined;
 }
 
-static int __ref offline_pages(unsigned long start_pfn,
+static int __ref __offline_pages(unsigned long start_pfn,
 		  unsigned long end_pfn, unsigned long timeout)
 {
 	unsigned long pfn, nr_pages, expire;
@@ -994,18 +994,24 @@ out:
 	return ret;
 }
 
-int remove_memory(u64 start, u64 size)
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
-	unsigned long start_pfn, end_pfn;
+	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
+}
 
-	start_pfn = PFN_DOWN(start);
-	end_pfn = start_pfn + PFN_DOWN(size);
-	return offline_pages(start_pfn, end_pfn, 120 * HZ);
+int offline_memory(u64 start, u64 size)
+{
+	return -EINVAL;
 }
 #else
-int remove_memory(u64 start, u64 size)
+int offline_pages(unsigned long start, unsigned long size)
+{
+	return -EINVAL;
+}
+
+int offline_memory(u64 start, u64 size)
 {
 	return -EINVAL;
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
-EXPORT_SYMBOL_GPL(remove_memory);
+EXPORT_SYMBOL_GPL(offline_memory);
-- 
1.7.1

^ permalink raw reply related

* [RFC v9 PATCH 09/21] memory-hotplug: does not release memory region in PAGES_PER_SECTION chunks
From: wency @ 2012-09-05  9:25 UTC (permalink / raw)
  To: x86, linux-mm, linux-kernel, linuxppc-dev, linux-acpi, linux-s390,
	linux-sh, linux-ia64, cmetcalf, sparclinux
  Cc: len.brown, Wen Congyang, isimatu.yasuaki, paulus, minchan.kim,
	kosaki.motohiro, rientjes, cl, akpm, liuj97
In-Reply-To: <1346837155-534-1-git-send-email-wency@cn.fujitsu.com>

From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

Since applying a patch(de7f0cba96786c), release_mem_region() has been changed
as called in PAGES_PER_SECTION chunks because register_memory_resource() is
called in PAGES_PER_SECTION chunks by add_memory(). But it seems firmware
dependency. If CRS are written in the PAGES_PER_SECTION chunks in ACPI DSDT
Table, register_memory_resource() is called in PAGES_PER_SECTION chunks.
But if CRS are written in the DIMM unit in ACPI DSDT Table,
register_memory_resource() is called in DIMM unit. So release_mem_region()
should not be called in PAGES_PER_SECTION chunks. The patch fixes it.

CC: David Rientjes <rientjes@google.com>
CC: Jiang Liu <liuj97@gmail.com>
CC: Len Brown <len.brown@intel.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Christoph Lameter <cl@linux.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
CC: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |   13 +++++++++----
 mm/memory_hotplug.c                             |    4 ++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 11d8e05..dc0a035 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -77,7 +77,8 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz
 {
 	unsigned long start, start_pfn;
 	struct zone *zone;
-	int ret;
+	int i, ret;
+	int sections_to_remove;
 
 	start_pfn = base >> PAGE_SHIFT;
 
@@ -97,9 +98,13 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz
 	 * to sysfs "state" file and we can't remove sysfs entries
 	 * while writing to it. So we have to defer it to here.
 	 */
-	ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT);
-	if (ret)
-		return ret;
+	sections_to_remove = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION;
+	for (i = 0; i < sections_to_remove; i++) {
+		unsigned long pfn = start_pfn + i * PAGES_PER_SECTION;
+		ret = __remove_pages(zone, start_pfn,  PAGES_PER_SECTION);
+		if (ret)
+			return ret;
+	}
 
 	/*
 	 * Update memory regions for memory remove
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e74a01d..2353887 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -358,11 +358,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
 	BUG_ON(nr_pages % PAGES_PER_SECTION);
 
+	release_mem_region(phys_start_pfn << PAGE_SHIFT,  nr_pages * PAGE_SIZE);
+
 	sections_to_remove = nr_pages / PAGES_PER_SECTION;
 	for (i = 0; i < sections_to_remove; i++) {
 		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
-		release_mem_region(pfn << PAGE_SHIFT,
-				   PAGES_PER_SECTION << PAGE_SHIFT);
 		ret = __remove_section(zone, __pfn_to_section(pfn));
 		if (ret)
 			break;
-- 
1.7.1

^ permalink raw reply related

* [RFC v9 PATCH 03/21] memory-hotplug: store the node id in acpi_memory_device
From: wency @ 2012-09-05  9:25 UTC (permalink / raw)
  To: x86, linux-mm, linux-kernel, linuxppc-dev, linux-acpi, linux-s390,
	linux-sh, linux-ia64, cmetcalf, sparclinux
  Cc: len.brown, Wen Congyang, isimatu.yasuaki, paulus, minchan.kim,
	kosaki.motohiro, rientjes, cl, akpm, liuj97
In-Reply-To: <1346837155-534-1-git-send-email-wency@cn.fujitsu.com>

From: Wen Congyang <wency@cn.fujitsu.com>

The memory device has only one node id. Store the node id when
enable the memory device, and we can reuse it when removing the
memory device.

CC: David Rientjes <rientjes@google.com>
CC: Jiang Liu <liuj97@gmail.com>
CC: Len Brown <len.brown@intel.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Christoph Lameter <cl@linux.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
CC: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
---
 drivers/acpi/acpi_memhotplug.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 2a7beac..7873832 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -83,6 +83,7 @@ struct acpi_memory_info {
 struct acpi_memory_device {
 	struct acpi_device * device;
 	unsigned int state;	/* State of the memory device */
+	int nid;
 	struct list_head res_list;
 };
 
@@ -256,6 +257,9 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 		info->enabled = 1;
 		num_enabled++;
 	}
+
+	mem_device->nid = node;
+
 	if (!num_enabled) {
 		printk(KERN_ERR PREFIX "add_memory failed\n");
 		mem_device->state = MEMORY_INVALID_STATE;
-- 
1.7.1

^ permalink raw reply related

* [RFC v9 PATCH 07/21] memory-hotplug: call acpi_bus_remove() to remove memory device
From: wency @ 2012-09-05  9:25 UTC (permalink / raw)
  To: x86, linux-mm, linux-kernel, linuxppc-dev, linux-acpi, linux-s390,
	linux-sh, linux-ia64, cmetcalf, sparclinux
  Cc: len.brown, Wen Congyang, isimatu.yasuaki, paulus, minchan.kim,
	kosaki.motohiro, rientjes, cl, akpm, liuj97
In-Reply-To: <1346837155-534-1-git-send-email-wency@cn.fujitsu.com>

From: Wen Congyang <wency@cn.fujitsu.com>

The memory device has been ejected and powoffed, so we can call
acpi_bus_remove() to remove the memory device from acpi bus.

CC: David Rientjes <rientjes@google.com>
CC: Jiang Liu <liuj97@gmail.com>
CC: Len Brown <len.brown@intel.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Christoph Lameter <cl@linux.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
CC: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 drivers/acpi/acpi_memhotplug.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 9d47458..b152767 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -425,8 +425,9 @@ static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data)
 		}
 
 		/*
-		 * TBD: Invoke acpi_bus_remove to cleanup data structures
+		 * Invoke acpi_bus_remove() to remove memory device
 		 */
+		acpi_bus_remove(device, 1);
 
 		/* _EJ0 succeeded; _OST is not necessary */
 		return;
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH v2] powerpc: fix personality handling in ppc64_personality()
From: Jiri Kosina @ 2012-09-05  8:56 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Paul Mackerras, Andreas Schwab, linuxppc-dev, linux-kernel
In-Reply-To: <1346813643.2257.24.camel@pasglop>

On Wed, 5 Sep 2012, Benjamin Herrenschmidt wrote:

> > Directly comparing current->personality against PER_LINUX32 doesn't work
> > in cases when any of the personality flags stored in the top three bytes
> > are used.
> > 
> > Directly forcefully setting personality to PER_LINUX32 or PER_LINUX
> > discards any flags stored in the top three bytes
> > 
> > Use personality() macro to compare only PER_MASK bytes and make sure that
> > we are setting only the bits that should be set, instead of
> > overwriting the whole value.
> > 
> > Signed-off-by: Jiri Kosina <jkosina@suse.cz>
> > ---
> > 
> > changed since v1: fix the bit ops to reflect the fact that PER_LINUX is 
> > actually 0
> 
> Had already merged v1 (oops.. didn't spot the issue with PER_LINUX being
> 0). Can you send an incremental fixup ?

Hi Benjamin,

actually commit 7256a5d2da56 seems to contain the correct PER_LINUX 
handling, so seems like you picked the right one :)

Thanks,

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply

* RE: [PATCH] DMA/RaidEngine: Enable FSL RaidEngine
From: Shi Xuelin-B29237 @ 2012-09-05  8:42 UTC (permalink / raw)
  To: dan.j.williams@gmail.com
  Cc: Rai Harninder-B01044, iws@ovro.caltech.edu, vinod.koul@intel.com,
	linux-kernel@vger.kernel.org, Burmi Naveen-B16502,
	Shi Xuelin-B29237, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <1345616661-8550-1-git-send-email-b29237@freescale.com>

Hi Dan,

Do you have any comments about this RaidEngine patch?

Thanks,
Forrest

-----Original Message-----
From: Linuxppc-dev [mailto:linuxppc-dev-bounces+qiang.liu=3Dfreescale.com@l=
ists.ozlabs.org] On Behalf Of Shi Xuelin-B29237
Sent: 2012=1B$BG/=1B(B8=1B$B7n=1B(B22=1B$BF|=1B(B 14:24
To: dan.j.williams@gmail.com; vinod.koul@intel.com; linuxppc-dev@lists.ozla=
bs.org; linux-kernel@vger.kernel.org
Cc: Rai Harninder-B01044; Rai Harninder-B01044; iws@ovro.caltech.edu; Burmi=
 Naveen-B16502; Burmi Naveen-B16502; Shi Xuelin-B29237
Subject: [PATCH] DMA/RaidEngine: Enable FSL RaidEngine

From: Xuelin Shi <b29237@freescale.com>

The RaidEngine is a new FSL hardware that used as hardware acceration for R=
AID5/6.

This patch enables the RaidEngine functionality and provides hardware offlo=
ading capability for memcpy, xor and raid6 pq computation. It works under d=
maengine control with async_layer interface.

Signed-off-by: Harninder Rai <harninder.rai@freescale.com>
Signed-off-by: Naveen Burmi <naveenburmi@freescale.com>
Signed-off-by: Xuelin Shi <b29237@freescale.com>
---
 arch/powerpc/boot/dts/fsl/p5020si-post.dtsi    |    1 +
 arch/powerpc/boot/dts/fsl/p5020si-pre.dtsi     |    6 +
 arch/powerpc/boot/dts/fsl/qoriq-raid1.0-0.dtsi |   85 ++
 drivers/dma/Kconfig                            |   14 +
 drivers/dma/Makefile                           |    1 +
 drivers/dma/fsl_raid.c                         | 1090 ++++++++++++++++++++=
++++
 drivers/dma/fsl_raid.h                         |  294 +++++++
 7 files changed, 1491 insertions(+), 0 deletions(-)  create mode 100644 ar=
ch/powerpc/boot/dts/fsl/qoriq-raid1.0-0.dtsi
 create mode 100644 drivers/dma/fsl_raid.c  create mode 100644 drivers/dma/=
fsl_raid.h

diff --git a/arch/powerpc/boot/dts/fsl/p5020si-post.dtsi b/arch/powerpc/boo=
t/dts/fsl/p5020si-post.dtsi
index 64b6abe..5d7205b 100644
--- a/arch/powerpc/boot/dts/fsl/p5020si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p5020si-post.dtsi
@@ -354,4 +354,5 @@
 /include/ "qoriq-sata2-0.dtsi"
 /include/ "qoriq-sata2-1.dtsi"
 /include/ "qoriq-sec4.2-0.dtsi"
+/include/ "qoriq-raid1.0-0.dtsi"
 };
diff --git a/arch/powerpc/boot/dts/fsl/p5020si-pre.dtsi b/arch/powerpc/boot=
/dts/fsl/p5020si-pre.dtsi
index ae823a4..d54cd90 100644
--- a/arch/powerpc/boot/dts/fsl/p5020si-pre.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p5020si-pre.dtsi
@@ -70,6 +70,12 @@
 		rtic_c =3D &rtic_c;
 		rtic_d =3D &rtic_d;
 		sec_mon =3D &sec_mon;
+
+		raideng =3D &raideng;
+		raideng_jr0 =3D &raideng_jr0;
+		raideng_jr1 =3D &raideng_jr1;
+		raideng_jr2 =3D &raideng_jr2;
+		raideng_jr3 =3D &raideng_jr3;
 	};
=20
 	cpus {
diff --git a/arch/powerpc/boot/dts/fsl/qoriq-raid1.0-0.dtsi b/arch/powerpc/=
boot/dts/fsl/qoriq-raid1.0-0.dtsi
new file mode 100644
index 0000000..8d2e8aa
--- /dev/null
+++ b/arch/powerpc/boot/dts/fsl/qoriq-raid1.0-0.dtsi
@@ -0,0 +1,85 @@
+/*
+ * QorIQ RAID 1.0 device tree stub [ controller @ offset 0x320000 ]
+ *
+ * Copyright 2012 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are =
met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in t=
he
+ *       documentation and/or other materials provided with the distributi=
on.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote produ=
cts
+ *       derived from this software without specific prior written permiss=
ion.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of=20
+the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND=20
+ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE=20
+IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE=20
+ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR=20
+ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL=20
+DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR=20
+SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER=20
+CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,=20
+OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE=20
+USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+raideng: raideng@320000 {
+	compatible =3D "fsl,raideng-v1.0";
+	#address-cells =3D <1>;
+	#size-cells =3D <1>;
+	reg =3D <0x320000 0x10000>;
+	ranges =3D <0 0x320000 0x10000>;
+
+	raideng_jq0@1000 {
+		compatible =3D "fsl,raideng-v1.0-job-queue";
+		#address-cells =3D <1>;
+		#size-cells =3D <1>;
+		reg =3D <0x1000 0x1000>;
+		ranges =3D <0x0 0x1000 0x1000>;
+
+		raideng_jr0: jr@0 {
+			compatible =3D "fsl,raideng-v1.0-job-ring", "fsl,raideng-v1.0-hp-ring";
+			reg =3D <0x0 0x400>;
+			interrupts =3D <139 2 0 0>;
+			interrupt-parent =3D <&mpic>;
+		};
+
+		raideng_jr1: jr@400 {
+			compatible =3D "fsl,raideng-v1.0-job-ring", "fsl,raideng-v1.0-lp-ring";
+			reg =3D <0x400 0x400>;
+			interrupts =3D <140 2 0 0>;
+			interrupt-parent =3D <&mpic>;
+		};
+	};
+
+	raideng_jq1@2000 {
+		compatible =3D "fsl,raideng-v1.0-job-queue";
+		#address-cells =3D <1>;
+		#size-cells =3D <1>;
+		reg =3D <0x2000 0x1000>;
+		ranges =3D <0x0 0x2000 0x1000>;
+
+		raideng_jr2: jr@0 {
+			compatible =3D "fsl,raideng-v1.0-job-ring", "fsl,raideng-v1.0-hp-ring";
+			reg =3D <0x0 0x400>;
+			interrupts =3D <141 2 0 0>;
+			interrupt-parent =3D <&mpic>;
+		};
+
+		raideng_jr3: jr@400 {
+			compatible =3D "fsl,raideng-v1.0-job-ring", "fsl,raideng-v1.0-lp-ring";
+			reg =3D <0x400 0x400>;
+			interrupts =3D <142 2 0 0>;
+			interrupt-parent =3D <&mpic>;
+		};
+	};
+};
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index d06ea29..1b94c=
8e 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -292,6 +292,20 @@ config DMA_OMAP
 	select DMA_ENGINE
 	select DMA_VIRTUAL_CHANNELS
=20
+config FSL_RAID
+        tristate "Freescale RAID Engine Device Driver"
+        depends on FSL_SOC && !FSL_DMA
+        select DMA_ENGINE
+        select ASYNC_TX_ENABLE_CHANNEL_SWITCH
+        select ASYNC_MEMCPY
+        select ASYNC_XOR
+        select ASYNC_PQ
+        ---help---
+          Enable support for Freescale RAID Engine. RAID Engine is
+          available on some QorIQ SoCs (like P5020). It has
+          the capability to offload RAID5/RAID6 operations from CPU.
+          RAID5 is XOR and memcpy. RAID6 is P/Q and memcpy
+
 config DMA_ENGINE
 	bool
=20
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 4cf6b12..b3e=
7702 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -32,3 +32,4 @@ obj-$(CONFIG_EP93XX_DMA) +=3D ep93xx_dma.o
 obj-$(CONFIG_DMA_SA11X0) +=3D sa11x0-dma.o
 obj-$(CONFIG_MMP_TDMA) +=3D mmp_tdma.o
 obj-$(CONFIG_DMA_OMAP) +=3D omap-dma.o
+obj-$(CONFIG_FSL_RAID) +=3D fsl_raid.o
diff --git a/drivers/dma/fsl_raid.c b/drivers/dma/fsl_raid.c new file mode =
100644 index 0000000..ec860fd
--- /dev/null
+++ b/drivers/dma/fsl_raid.c
@@ -0,0 +1,1090 @@
+/*
+ * drivers/dma/fsl_raid.c
+ *
+ * Freescale RAID Engine device driver
+ *
+ * Author:
+ *	Harninder Rai <harninder.rai@freescale.com>
+ *	Naveen Burmi <naveenburmi@freescale.com>
+ *	Xuelin Shi <b29237@freescale.com>
+ *
+ * Copyright (c) 2010-2012 Freescale Semiconductor, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are =
met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in t=
he
+ *       documentation and/or other materials provided with the distributi=
on.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote produ=
cts
+ *       derived from this software without specific prior written permiss=
ion.
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of=20
+the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND=20
+ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE=20
+IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE=20
+ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR=20
+ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL=20
+DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR=20
+SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER=20
+CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,=20
+OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE=20
+USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Theory of operation:
+ *
+ * General capabilities:
+ *	RAID Engine (RE) block is capable of offloading XOR, memcpy and P/Q
+ *	calculations required in RAID5 and RAID6 operations. RE driver
+ *	registers with Linux's ASYNC layer as dma driver. RE hardware
+ *	maintains strict ordering of the requests through chained
+ *	command queueing.
+ *
+ * Data flow:
+ *	Software RAID layer of Linux (MD layer) maintains RAID partitions,
+ *	strips, stripes etc. It sends requests to the underlying AYSNC layer
+ *	which further passes it to RE driver. ASYNC layer decides which request
+ *	goes to which job ring of RE hardware. For every request processed by
+ *	RAID Engine, driver gets an interrupt unless coalescing is set. The
+ *	per job ring interrupt handler checks the status register for errors,
+ *	clears the interrupt and schedules a tasklet. Main request processing
+ *	is done in tasklet. A software shadow copy of the HW ring is kept to
+ *	maintain virtual to physical translation. Based on the internal indexes
+ *	maintained, the tasklet picks the descriptor address from shadow copy,
+ *	updates the corresponding cookie, updates the outbound ring job removed
+ *	register in RE hardware and eventually calls the callback function. Thi=
s
+ *	callback function gets passed as part of request from MD layer.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/of_platform.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/dmaengine.h>
+#include <linux/io.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include "fsl_raid.h"
+
+#define MAX_RE_JRS		4
+#define MAX_XOR_SRCS		16
+#define MAX_PQ_SRCS		16
+#define MAX_INITIAL_DESCS	256
+#define FRAME_FORMAT		0x1
+#define MAX_DATA_LENGTH		(1024*1024)
+
+#define to_fsl_re_dma_desc(tx) container_of(tx, \
+		struct fsl_re_dma_async_tx_desc, async_tx)
+
+struct re_drv_private {
+	struct device *dev;
+	u8 total_jrs;
+	struct dma_device dma_dev;
+	struct re_ctrl *re_regs;
+	struct re_jr *re_jrs[MAX_RE_JRS];
+};
+
+static struct kmem_cache *re_sw_desc_cache;
+
+/* Add descriptors into per jr software queue - submit_q */ static=20
+dma_cookie_t re_jr_tx_submit(struct dma_async_tx_descriptor *tx) {
+	struct fsl_re_dma_async_tx_desc *desc =3D NULL;
+	struct re_jr *jr =3D NULL;
+	dma_cookie_t cookie;
+
+	desc =3D container_of(tx, struct fsl_re_dma_async_tx_desc, async_tx);
+	jr =3D container_of(tx->chan, struct re_jr, chan);
+
+	spin_lock_bh(&jr->desc_lock);
+
+	jr->timer.data =3D (unsigned long)tx->chan;
+	cookie =3D jr->chan.cookie + 1;
+	if (cookie < 0)
+		cookie =3D 1;
+
+	desc->async_tx.cookie =3D cookie;
+	jr->chan.cookie =3D desc->async_tx.cookie;
+	list_add_tail(&desc->node, &jr->submit_q);
+
+	if (!timer_pending(&jr->timer))
+		add_timer(&jr->timer);
+
+	spin_unlock_bh(&jr->desc_lock);
+
+	return cookie;
+}
+
+static void re_jr_unmap_dest_src(struct fsl_re_dma_async_tx_desc *desc)=20
+{
+	int i, j;
+	struct cmpnd_frame *cf;
+	dma_addr_t dest1 =3D 0, dest2 =3D 0, src;
+	struct device *dev;
+	enum dma_ctrl_flags flags;
+	enum dma_data_direction dir;
+
+	BUG_ON(!desc);
+	cf =3D desc->cf_addr;
+	dest1 =3D cf[1].address;
+	j =3D 2;
+	if (desc->dest_cnt =3D=3D 2) {
+		dest2 =3D cf[2].address;
+		j =3D 3;
+	}
+	dev =3D desc->jr->chan.device->dev;
+	flags =3D desc->async_tx.flags;
+	if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
+		if (desc->cdb_opcode =3D=3D RE_MOVE_OPCODE)
+			dir =3D DMA_FROM_DEVICE;
+		else
+			dir =3D DMA_BIDIRECTIONAL;
+
+		dma_unmap_page(dev, dest1, desc->dma_len, dir);
+
+		if (dest2)
+			dma_unmap_page(dev, dest2, desc->dma_len, dir);
+	}
+
+	if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+		dir =3D DMA_TO_DEVICE;
+		for (i =3D j; i < desc->src_cnt+j; i++) {
+			src =3D cf[i].address;
+			if (src =3D=3D dest1 || src =3D=3D dest2)
+				continue;
+			dma_unmap_page(dev, src, desc->dma_len, dir);
+		}
+	}
+}
+
+static void re_jr_desc_done(struct fsl_re_dma_async_tx_desc *desc) {
+	struct re_jr *dma_jr =3D desc->jr;
+	dma_async_tx_callback callback;
+	void *callback_param;
+
+	dma_run_dependencies(&desc->async_tx);
+
+	spin_lock_bh(&dma_jr->desc_lock);
+	if (dma_jr->completed_cookie < desc->async_tx.cookie) {
+		dma_jr->completed_cookie =3D desc->async_tx.cookie;
+		if (dma_jr->completed_cookie =3D=3D DMA_MAX_COOKIE)
+			dma_jr->completed_cookie =3D DMA_MIN_COOKIE;
+	}
+	spin_unlock_bh(&dma_jr->desc_lock);
+
+	callback =3D desc->async_tx.callback;
+	callback_param =3D desc->async_tx.callback_param;
+
+	re_jr_unmap_dest_src(desc);
+
+	/* Free the cf/cdb address stored in descs
+	 * unconditionally. These pointers are only
+	 * required for RE driver's housekeeping
+	 */
+	dma_pool_free(dma_jr->desc_pool, desc->cf_addr, desc->cf_paddr);
+	dma_pool_free(dma_jr->desc_pool, desc->cdb_addr, desc->cdb_paddr);
+
+	if (callback)
+		callback(callback_param);
+}
+
+/*
+ * Get the virtual address of software desc from virt_addr.
+ * Storing the address of software desc like this makes the
+ * order of alogorithm as O(1)
+ */
+static void re_jr_dequeue(unsigned long data) {
+	struct device *dev =3D (struct device *)data;
+	struct re_jr *jr =3D dev_get_drvdata(dev);
+	struct fsl_re_dma_async_tx_desc *desc;
+	unsigned int count =3D 0;
+	struct fsl_re_dma_async_tx_desc *ack_desc =3D NULL, *_ack_desc =3D NULL;
+
+	while ((count =3D
+		RE_JR_OUB_SLOT_FULL(in_be32(&jr->jrregs->oubring_slot_full)))) {
+		while (count--) {
+			spin_lock_bh(&jr->desc_lock);
+
+			/* Wrap around index */
+			jr->oub_count &=3D RING_SIZE-1;
+
+			desc =3D (struct fsl_re_dma_async_tx_desc *)
+				jr->virt_arry[jr->oub_count].virt_addr;
+
+			/* Re-initialise so that it can be reused */
+			jr->virt_arry[jr->oub_count].virt_addr =3D 0;
+			jr->virt_arry[jr->oub_count++].phys_addr =3D 0;
+
+			/* One job processed */
+			out_be32(&jr->jrregs->oubring_job_rmvd,
+						RE_JR_OUB_JOB_REMOVE);
+
+			list_add_tail(&desc->node, &jr->ack_q);
+
+			spin_unlock_bh(&jr->desc_lock);
+
+			re_jr_desc_done(desc);
+		}
+	}
+
+	/* To save memory, parse the ack_q and free up descs */
+	list_for_each_entry_safe(ack_desc, _ack_desc, &jr->ack_q, node) {
+		if (async_tx_test_ack(&ack_desc->async_tx)) {
+			list_del(&ack_desc->node);
+			kmem_cache_free(re_sw_desc_cache, ack_desc);
+		}
+	}
+}
+
+/* Per Job Ring interrupt handler */
+static irqreturn_t re_jr_interrupt(int irq, void *data) {
+	struct device *dev =3D data;
+	struct re_jr *jr =3D dev_get_drvdata(dev);
+	u32 irqstate;
+
+	irqstate =3D in_be32(&jr->jrregs->jr_interrupt_status);
+	if (!irqstate)
+		return IRQ_NONE;
+
+	/*
+	 * There's no way in upper layer (read MD layer) to recover from
+	 * error conditions except restart everything. In long term we
+	 * need to do something more than just crashing
+	 */
+	if (irqstate & RE_JR_ERROR) {
+		dev_err(dev, "%s: jr error irqstate: %x\n", __func__,
+			irqstate);
+		BUG();
+	}
+
+	/* Clear interrupt */
+	out_be32(&jr->jrregs->jr_interrupt_status, RE_JR_CLEAR_INT);
+
+	tasklet_schedule(&jr->irqtask);
+
+	return IRQ_HANDLED;
+}
+
+static int re_jr_alloc_chan_resources(struct dma_chan *chan) {
+	struct re_jr *jr =3D container_of(chan, struct re_jr, chan);
+
+	jr->desc_pool =3D dma_pool_create("re_jr_desc_pool", jr->dev,
+					RE_CF_CDB_SIZE, RE_CF_CDB_ALIGN, 0);
+
+	if (!jr->desc_pool) {
+		pr_err("%s:No memory for re_jr_desc_pool\n", __func__);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* This function is just to please the ASYNC layer */ static void=20
+re_jr_free_chan_resources(struct dma_chan *chan) {
+	struct re_jr *jr =3D container_of(chan, struct re_jr, chan);
+	dma_pool_destroy(jr->desc_pool);
+}
+
+static enum dma_status re_jr_tx_status(struct dma_chan *chan,
+		dma_cookie_t cookie, struct dma_tx_state *txstate) {
+	struct re_jr *jr =3D NULL;
+	dma_cookie_t last_used;
+	dma_cookie_t last_complete;
+
+	jr =3D container_of(chan, struct re_jr, chan);
+	last_used =3D chan->cookie;
+	smp_mb();
+	last_complete =3D jr->completed_cookie;
+
+	dma_set_tx_state(txstate, last_complete, last_used, 0);
+
+	return dma_async_is_complete(cookie, last_complete, last_used); }
+
+
+/* Copy descriptor from per jr software queue into hardware job ring */=20
+void re_jr_issue_pending(struct dma_chan *chan) {
+	struct re_jr *jr =3D NULL;
+	struct fsl_re_dma_async_tx_desc *desc, *_desc;
+
+	jr =3D container_of(chan, struct re_jr, chan);
+	if (timer_pending(&jr->timer))
+		del_timer_sync(&jr->timer);
+
+	spin_lock_bh(&jr->desc_lock);
+
+	list_for_each_entry_safe(desc, _desc, &jr->submit_q, node) {
+
+		if (!in_be32(&jr->jrregs->inbring_slot_avail))
+			goto out_unlock;
+
+		/* Wrap around ring index */
+		jr->inb_count &=3D RING_SIZE-1;
+
+		if (jr->virt_arry[jr->inb_count].phys_addr !=3D 0)
+			goto out_unlock;
+
+		jr->virt_arry[jr->inb_count].phys_addr =3D
+			(phys_addr_t)desc->hwdesc.address;
+		jr->virt_arry[jr->inb_count++].virt_addr =3D (unsigned long)desc;
+
+		/* Wrap around ring index */
+		jr->inb_ring_index &=3D RING_SIZE-1;
+
+		/* Copy frame descriptor into job ring */
+		memcpy(&jr->inb_ring_virt_addr[jr->inb_ring_index],
+			&desc->hwdesc, sizeof(struct jr_hw_desc));
+
+		smp_wmb();
+
+		jr->inb_ring_index++;
+		list_del_init(&desc->node);
+
+		wmb();
+
+		/* One job has been added into job ring */
+		out_be32(&jr->jrregs->inbring_add_job, RE_JR_INB_JOB_ADD);
+	}
+out_unlock:
+	spin_unlock_bh(&jr->desc_lock);
+}
+
+/* Per Job Ring timer handler */
+static void raide_timer_handler(unsigned long data) {
+	struct dma_chan *chan =3D NULL;
+	chan =3D (struct dma_chan *)data;
+
+	re_jr_issue_pending(chan);
+}
+
+static void fill_cfd_frame(struct cmpnd_frame *cf, u8 index,
+		size_t length, dma_addr_t addr, bool final) {
+	cf[index].extension =3D 0;
+	cf[index].final =3D final;
+	cf[index].rsvd1 =3D 0;
+	cf[index].rsvd3 =3D 0;
+	cf[index].rsvd4 =3D 0;
+	cf[index].rsvd5 =3D 0;
+	cf[index].length =3D length;
+	cf[index].offset =3D 0;
+	cf[index].address =3D addr;
+
+}
+
+static struct fsl_re_dma_async_tx_desc *re_jr_alloc_desc(struct re_jr *jr,
+					int cf_num, unsigned long flags)
+{
+	struct fsl_re_dma_async_tx_desc *desc =3D NULL;
+	struct jr_hw_desc *hw_desc =3D NULL;
+	struct cmpnd_frame *cf;
+	dma_addr_t paddr;
+
+	desc =3D kmem_cache_alloc(re_sw_desc_cache, GFP_KERNEL);
+	if (!desc)
+		return NULL;
+
+	memset(desc, 0, sizeof(*desc));
+	desc->async_tx.tx_submit =3D re_jr_tx_submit;
+	dma_async_tx_descriptor_init(&desc->async_tx, &jr->chan);
+	INIT_LIST_HEAD(&desc->node);
+	hw_desc =3D &desc->hwdesc;
+	hw_desc->format =3D FRAME_FORMAT;
+
+	cf =3D dma_pool_alloc(jr->desc_pool, GFP_ATOMIC, &paddr);
+	if (!cf) {
+		kmem_cache_free(re_sw_desc_cache, desc);
+		return NULL;
+	}
+
+	hw_desc->address =3D paddr;
+	desc->cf_addr =3D cf;
+	desc->cf_paddr =3D paddr;
+	desc->cf_len =3D sizeof(struct cmpnd_frame)*cf_num;
+	memset(cf, 0, desc->cf_len);
+
+	desc->jr =3D jr;
+	desc->async_tx.cookie =3D -EBUSY;
+	desc->async_tx.flags =3D flags;
+
+	return desc;
+}
+
+static void re_jr_free_desc(struct fsl_re_dma_async_tx_desc *desc) {
+	struct re_jr *jr =3D NULL;
+
+	if (desc =3D=3D NULL)
+		return;
+
+	jr =3D desc->jr;
+
+	dma_pool_free(jr->desc_pool, desc->cf_addr, desc->cf_paddr);
+	kmem_cache_free(re_sw_desc_cache, desc);
+
+}
+
+static struct dma_async_tx_descriptor *re_jr_prep_genq(
+		struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
+		unsigned int src_cnt, const unsigned char *scf, size_t len,
+		unsigned long flags)
+{
+	struct re_jr *jr =3D NULL;
+	struct fsl_re_dma_async_tx_desc *desc =3D NULL;
+	struct xor_cdb *xor =3D NULL;
+	unsigned int cfs_reqd =3D src_cnt + 2; /* CDB+dest+src_cnt */
+	struct cmpnd_frame *cf;
+	int i =3D 0, j =3D 0;
+	dma_addr_t paddr;
+
+	if (len > MAX_DATA_LENGTH) {
+		pr_err("%s: Length greater than %d not supported\n",
+				__func__, MAX_DATA_LENGTH);
+		return NULL;
+	}
+	jr =3D container_of(chan, struct re_jr, chan);
+	desc =3D re_jr_alloc_desc(jr, cfs_reqd, flags);
+	if (!desc)
+		return ERR_PTR(-ENOMEM);
+
+	desc->dma_len =3D len;
+	desc->dest_cnt =3D 1;
+	desc->src_cnt =3D src_cnt;
+
+	/* Allocate memory for GenQ CDB */
+	xor =3D dma_pool_alloc(jr->desc_pool, GFP_ATOMIC, &paddr);
+	if (!xor) {
+		re_jr_free_desc(desc);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	desc->cdb_addr =3D xor;
+	desc->cdb_paddr =3D paddr;
+	desc->cdb_len =3D sizeof(struct xor_cdb);
+	memset(xor, 0, desc->cdb_len);
+	desc->cdb_opcode =3D RE_XOR_OPCODE;
+
+	/* Filling xor CDB */
+	xor->opcode =3D RE_XOR_OPCODE;
+	xor->nrcs =3D (src_cnt - 1);
+	xor->blk_size =3D RE_BLOCK_SIZE;
+	xor->cache_attrib =3D CACHEABLE_INPUT_OUTPUT;
+	xor->buffer_attrib =3D BUFFERABLE_OUTPUT;
+	xor->error_attrib =3D INTERRUPT_ON_ERROR;
+	xor->data_depend =3D DATA_DEPENDENCY;
+	xor->dpi =3D ENABLE_DPI;
+
+	if (scf !=3D NULL) {
+		/* compute q =3D src0*coef0^src1*coef1^..., * is GF(8) mult */
+		for (i =3D 0; i < src_cnt; i++)
+			xor->gfm[i] =3D scf[i];
+	} else {
+		/* compute P, that is XOR all srcs */
+		for (i =3D 0; i < src_cnt; i++)
+			xor->gfm[i] =3D 1;
+	}
+
+	/* Filling frame 0 of compound frame descriptor with CDB */
+	cf =3D desc->cf_addr;
+	fill_cfd_frame(cf, 0, desc->cdb_len, paddr, 0);
+
+	/* Fill CFD's 1st frame with dest buffer */
+	fill_cfd_frame(cf, 1, len, dest, 0);
+
+	/* Fill CFD's rest of the frames with source buffers */
+	for (i =3D 2, j =3D 0; j < src_cnt; i++, j++)
+		fill_cfd_frame(cf, i, len, src[j], 0);
+
+	/* Setting the final bit in the last source buffer frame in CFD */
+	cf[i - 1].final =3D 1;
+
+	return &desc->async_tx;
+}
+
+/*
+ * Prep function for P parity calculation.In RAID Engine terminology,
+ * XOR calculation is called GenQ calculation done through GenQ command =20
+*/ static struct dma_async_tx_descriptor *re_jr_prep_dma_xor(
+		struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
+		unsigned int src_cnt, size_t len, unsigned long flags) {
+	/* NULL let genq take all coef as 1 */
+	return re_jr_prep_genq(chan, dest, src, src_cnt, NULL, len, flags); }
+
+/*
+ * Prep function for P/Q parity calculation.In RAID Engine terminology,
+ * P/Q calculation is called GenQQ done through GenQQ command  */=20
+static struct dma_async_tx_descriptor *re_jr_prep_pq(
+		struct dma_chan *chan, dma_addr_t *dest, dma_addr_t *src,
+		unsigned int src_cnt, const unsigned char *scf, size_t len,
+		unsigned long flags)
+{
+	struct re_jr *jr =3D NULL;
+	struct fsl_re_dma_async_tx_desc *desc =3D NULL;
+	struct pq_cdb *pq =3D NULL;
+	u8 cfs_reqd =3D src_cnt + 3; /* CDB+P+Q+src_cnt */
+	struct cmpnd_frame *cf;
+	u8 *p;
+	int gfmq_len, i, j;
+	dma_addr_t paddr;
+
+	if (len > MAX_DATA_LENGTH) {
+		pr_err("%s: Length greater than %d not supported\n",
+				__func__, MAX_DATA_LENGTH);
+		return NULL;
+	}
+
+	/*
+	 * RE requires at least 2 sources, if given only one source, we pass
+	 * the second source same as the first one.
+	 * With only one source, generate P is meaningless, only care generate
+	 * Q.
+	 */
+	if (src_cnt =3D=3D 1) {
+		struct dma_async_tx_descriptor *tx =3D NULL;
+		dma_addr_t dma_src[2];
+		unsigned char coef[2];
+		dma_src[0] =3D *src;
+		coef[0] =3D *scf;
+		dma_src[1] =3D *src;
+		coef[1] =3D 0;
+		tx =3D re_jr_prep_genq(chan, dest[1], dma_src, 2, coef, len,
+				flags);
+		if (tx) {
+			desc =3D to_fsl_re_dma_desc(tx);
+			desc->src_cnt =3D 1;
+		}
+		return tx;
+	}
+
+	/*
+	 * During RAID6 array creation, Linux's MD layer gets P and Q
+	 * calculated separately in two steps. But our RAID Engine has
+	 * the capability to calculate both P and Q with a single command
+	 * Hence to merge well with MD layer, we need to provide a hook
+	 * here and call re_jq_prep_genq() function
+	 */
+
+	if (flags & DMA_PREP_PQ_DISABLE_P)
+		return re_jr_prep_genq(chan, dest[1], src, src_cnt,
+				scf, len, flags);
+
+	jr =3D container_of(chan, struct re_jr, chan);
+	desc =3D re_jr_alloc_desc(jr, cfs_reqd, flags);
+	if (!desc)
+		return ERR_PTR(-ENOMEM);
+
+	desc->dma_len =3D len;
+	desc->dest_cnt =3D 2;
+	desc->src_cnt =3D src_cnt;
+
+	/* Filling frame 0 of compound frame descriptor with CDB */
+	pq =3D dma_pool_alloc(jr->desc_pool, GFP_ATOMIC, &paddr);
+	if (!pq) {
+		re_jr_free_desc(desc);
+		return ERR_PTR(-ENOMEM);
+	}
+	desc->cdb_addr =3D pq;
+	desc->cdb_paddr =3D paddr;
+	desc->cdb_len =3D sizeof(struct pq_cdb);
+	memset(pq, 0, desc->cdb_len);
+	desc->cdb_opcode =3D RE_PQ_OPCODE;
+
+	/* Filling GenQQ CDB */
+	pq->opcode =3D RE_PQ_OPCODE;
+	pq->excl_enable =3D 0x0; /* Don't exclude for Q1/Q2 parity */
+	pq->excl_q1 =3D 0x0;
+	pq->excl_q2 =3D 0x0;
+	pq->blk_size =3D RE_BLOCK_SIZE;
+	pq->cache_attrib =3D CACHEABLE_INPUT_OUTPUT;
+	pq->buffer_attrib =3D BUFFERABLE_OUTPUT;
+	pq->error_attrib =3D INTERRUPT_ON_ERROR;
+	pq->data_depend =3D DATA_DEPENDENCY;
+	pq->dpi =3D ENABLE_DPI;
+	pq->nrcs =3D (src_cnt - 1);
+
+	p =3D pq->gfm_q1;
+	/* Init gfm_q1[] */
+	for (i =3D 0; i < src_cnt; i++)
+		p[i] =3D 1;
+
+	/* Align gfm[] to 32bit */
+	gfmq_len =3D ((src_cnt+3)/4)*4;
+
+	/* Init gfm_q2[] */
+	p +=3D gfmq_len;
+	for (i =3D 0; i < src_cnt; i++)
+		p[i] =3D scf[i];
+
+	/* Filling frame 0 of compound frame descriptor with CDB */
+	cf =3D desc->cf_addr;
+	fill_cfd_frame(cf, 0, desc->cdb_len, paddr, 0);
+
+	/* Fill CFD's 1st & 2nd frame with dest buffers */
+	for (i =3D 1, j =3D 0; i < 3; i++, j++)
+		fill_cfd_frame(cf, i, len, dest[j], 0);
+
+	/* Fill CFD's rest of the frames with source buffers */
+	for (i =3D 3, j =3D 0; j < src_cnt; i++, j++)
+		fill_cfd_frame(cf, i, len, src[j], 0);
+
+	/* Setting the final bit in the last source buffer frame in CFD */
+	cf[i - 1].final =3D 1;
+
+	return &desc->async_tx;
+}
+
+/*
+ * Prep function for memcpy. In RAID Engine, memcpy is done through=20
+MOVE
+ * command. Logic of this function will need to be modified once=20
+multipage
+ * support is added in Linux's MD/ASYNC Layer  */ static struct=20
+dma_async_tx_descriptor *re_jr_prep_memcpy(
+		struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
+		size_t len, unsigned long flags)
+{
+	struct re_jr *jr =3D NULL;
+	struct fsl_re_dma_async_tx_desc *desc =3D NULL;
+	size_t length =3D 0;
+	unsigned int cfs_reqd =3D 3; /* CDB+dest+src */
+	struct cmpnd_frame *cf =3D NULL;
+	struct move_cdb *move =3D NULL;
+	dma_addr_t paddr;
+
+	jr =3D container_of(chan, struct re_jr, chan);
+
+	if (len > MAX_DATA_LENGTH) {
+		pr_err("%s: Length greater than %d not supported\n",
+				__func__, MAX_DATA_LENGTH);
+		return NULL;
+	}
+
+	desc =3D re_jr_alloc_desc(jr, cfs_reqd, flags);
+	if (!desc)
+		return ERR_PTR(-ENOMEM);
+
+	desc->dma_len =3D len;
+	desc->src_cnt =3D 1;
+	desc->dest_cnt =3D 1;
+
+	move =3D dma_pool_alloc(jr->desc_pool, GFP_ATOMIC, &paddr);
+	if (!move) {
+		re_jr_free_desc(desc);
+		return ERR_PTR(-ENOMEM);
+	}
+	desc->cdb_addr =3D move;
+	desc->cdb_paddr =3D paddr;
+	desc->cdb_len =3D sizeof(struct move_cdb);
+	memset(move, 0, desc->cdb_len);
+	desc->cdb_opcode =3D RE_MOVE_OPCODE;
+
+	/* Filling move CDB */
+	move->opcode =3D RE_MOVE_OPCODE; /* Unicast move */
+	move->blk_size =3D RE_BLOCK_SIZE;
+	move->cache_attrib =3D CACHEABLE_INPUT_OUTPUT;
+	move->buffer_attrib =3D BUFFERABLE_OUTPUT;
+	move->error_attrib =3D INTERRUPT_ON_ERROR;
+	move->data_depend =3D DATA_DEPENDENCY;
+	move->dpi =3D ENABLE_DPI;
+
+	/* Filling frame 0 of CFD with move CDB */
+	cf =3D desc->cf_addr;
+	fill_cfd_frame(cf, 0, desc->cdb_len, paddr, 0);
+
+	length =3D min_t(size_t, len, MAX_DATA_LENGTH);
+
+	/* Fill CFD's 1st frame with dest buffer */
+	fill_cfd_frame(cf, 1, length, dest, 0);
+
+	/* Fill CFD's 2nd frame with src buffer */
+	fill_cfd_frame(cf, 2, length, src, 1);
+
+	return &desc->async_tx;
+}
+
+/*
+ * Job ring probe function. This function gets called for each detected
+ * job ring in the system
+ */
+int re_jr_probe(struct platform_device *ofdev,
+		struct device_node *np, u8 q, u32 *off) {
+	struct device *dev =3D NULL;
+	struct re_drv_private *repriv =3D NULL;
+	struct re_jr *jr =3D NULL;
+	struct dma_device *dma_dev =3D NULL;
+	u32 *ptr =3D NULL;
+	u32 status;
+	int ret =3D 0;
+	int k =3D 0;
+	struct platform_device *jr_ofdev =3D NULL;
+
+	dev =3D &ofdev->dev;
+	repriv =3D dev_get_drvdata(dev);
+	dma_dev =3D &repriv->dma_dev;
+
+	jr =3D kzalloc(sizeof(struct re_jr), GFP_KERNEL);
+	if (!jr) {
+		dev_err(dev, "%s: No free memory for allocating JR struct\n",
+			__func__);
+		return -ENOMEM;
+	}
+
+	jr_ofdev =3D of_platform_device_create(np, NULL, dev);
+	if (jr_ofdev =3D=3D NULL) {
+		dev_err(dev, "%s: Not able to create ofdev for jr %d\n",
+			__func__, q);
+		ret =3D -EINVAL;
+		goto err_free_2;
+	}
+	dev_set_drvdata(&jr_ofdev->dev, jr);
+
+	ptr =3D (u32 *)of_get_property(np, "reg", NULL);
+	if (!ptr) {
+		dev_err(dev, "%s: Reg property not found in JR number %d\n",
+			__func__, q);
+		ret =3D -ENODEV;
+		goto err_free_2;
+	}
+
+	jr->jrregs =3D (struct jr_config_regs *)((u8 *)repriv->re_regs +
+			*off + *ptr);
+
+	jr->irq =3D irq_of_parse_and_map(np, 0);
+	if (jr->irq =3D=3D NO_IRQ) {
+		dev_err(dev, "%s: No IRQ defined for JR %d\n", __func__, q);
+		ret =3D -ENODEV;
+		goto err_free_2;
+	}
+
+	tasklet_init(&jr->irqtask, re_jr_dequeue,
+			(unsigned long)&jr_ofdev->dev);
+	ret =3D request_irq(jr->irq, re_jr_interrupt, 0, "re-jr", &jr_ofdev->dev)=
;
+	if (ret) {
+		dev_err(dev, "%s: Unable to register JR interrupt for JR %d\n",
+			__func__, q);
+		ret =3D -EINVAL;
+		goto err_free_2;
+	}
+
+	repriv->re_jrs[q] =3D jr;
+	jr->chan.device =3D dma_dev;
+	jr->dev =3D &ofdev->dev;
+	jr->chan.private =3D jr;
+
+	INIT_LIST_HEAD(&jr->submit_q);
+	INIT_LIST_HEAD(&jr->ack_q);
+	spin_lock_init(&jr->desc_lock);
+
+	init_timer(&jr->timer);
+	jr->timer.expires =3D jiffies + 10*HZ;
+	jr->timer.function =3D raide_timer_handler;
+
+	list_add_tail(&jr->chan.device_node, &dma_dev->channels);
+	dma_dev->chancnt++;
+
+	jr->inb_desc_pool =3D dma_pool_create("re_jr_inb_desc_pool", jr->dev,
+			sizeof(struct jr_hw_desc) * RING_SIZE,
+			FRAME_DESC_ALIGNMENT, 0);
+	if (!jr->inb_desc_pool) {
+		dev_err(dev, "%s:No memory for re_jr_inb_desc_pool\n",
+			__func__);
+		ret =3D -ENOMEM;
+		goto err_free_2;
+	}
+
+	jr->inb_ring_virt_addr =3D dma_pool_alloc(jr->inb_desc_pool, GFP_ATOMIC,
+			&jr->inb_phys_addr);
+	if (!jr->inb_ring_virt_addr) {
+		dev_err(dev, "%s:No dma_pool mem for inb_ring_virt_addr\n",
+			__func__);
+		ret =3D -ENOMEM;
+		goto pool_destroy;
+	}
+
+	jr->oub_desc_pool =3D dma_pool_create("re_jr_oub_desc_pool", jr->dev,
+			sizeof(struct jr_hw_desc) * RING_SIZE,
+			FRAME_DESC_ALIGNMENT, 0);
+	if (!jr->oub_desc_pool) {
+		dev_err(dev, "%s:No memory for re_jr_oub_desc_pool\n",
+			__func__);
+		ret =3D -ENOMEM;
+		goto pool_free;
+	}
+
+	jr->oub_ring_virt_addr =3D dma_pool_alloc(jr->oub_desc_pool, GFP_ATOMIC,
+			&jr->oub_phys_addr);
+	if (!jr->inb_ring_virt_addr) {
+		dev_err(dev, "%s:No dma_pool mem for oub_ring_virt_addr\n",
+			__func__);
+		ret =3D -ENOMEM;
+		goto pool_destroy_2;
+	}
+
+	jr->desc_pool =3D dma_pool_create("re_jr_desc_pool", jr->dev,
+			RE_CF_CDB_SIZE, RE_CF_CDB_ALIGN, 0);
+
+	if (!jr->desc_pool) {
+		dev_err(dev, "%s:No memory for re_jr_desc_pool\n",
+			__func__);
+		ret =3D -ENOMEM;
+		goto err_free_2;
+	}
+
+	jr->inb_ring_index =3D 0;
+	jr->inb_count =3D 0;
+	jr->oub_count =3D 0;
+
+	status =3D in_be32(&jr->jrregs->jr_status);
+
+	if (status & RE_JR_PAUSE) {
+		dev_info(dev, "%s: JR is in paused state...enable it\n",
+			__func__);
+	} else {
+		dev_err(dev, "%s: Error:- JR shud be in paused state\n",
+			__func__);
+		ret =3D -EINVAL;
+		goto pool_free_2;
+	}
+
+	/* Program the Inbound/Outbound ring base addresses and size */
+	out_be32(&jr->jrregs->inbring_base_h,
+			jr->inb_phys_addr & RE_JR_ADDRESS_BIT_MASK);
+	out_be32(&jr->jrregs->oubring_base_h,
+			jr->oub_phys_addr & RE_JR_ADDRESS_BIT_MASK);
+	out_be32(&jr->jrregs->inbring_base_l,
+			jr->inb_phys_addr >> RE_JR_ADDRESS_BIT_SHIFT);
+	out_be32(&jr->jrregs->oubring_base_l,
+			jr->oub_phys_addr >> RE_JR_ADDRESS_BIT_SHIFT);
+	out_be32(&jr->jrregs->inbring_size, RING_SIZE << RING_SIZE_SHIFT);
+	out_be32(&jr->jrregs->oubring_size, RING_SIZE << RING_SIZE_SHIFT);
+
+	/* Read LIODN value from u-boot */
+	status =3D in_be32(&jr->jrregs->jr_config_1) & RE_JR_REG_LIODN_MASK;
+
+	/* Program the CFG reg */
+	out_be32(&jr->jrregs->jr_config_1,
+			RE_JR_CFG1_CBSI | RE_JR_CFG1_CBS0 | status);
+
+	/* Enable RE/JR */
+	out_be32(&jr->jrregs->jr_command, RE_JR_ENABLE);
+
+	/* Zero'ing the array */
+	for (k =3D 0; k < RING_SIZE; k++) {
+		jr->virt_arry[k].virt_addr =3D 0;
+		jr->virt_arry[k].phys_addr =3D 0;
+	}
+
+	return 0;
+
+pool_free_2:
+	dma_pool_free(jr->oub_desc_pool, jr->oub_ring_virt_addr,
+			jr->oub_phys_addr);
+pool_destroy_2:
+	dma_pool_destroy(jr->oub_desc_pool);
+pool_free:
+	dma_pool_free(jr->inb_desc_pool, jr->inb_ring_virt_addr,
+			jr->inb_phys_addr);
+pool_destroy:
+	dma_pool_destroy(jr->inb_desc_pool);
+err_free_2:
+	kfree(jr);
+	return ret;
+}
+
+/* Probe function for RAID Engine */
+static int __devinit raide_probe(struct platform_device *ofdev) {
+	struct re_drv_private *repriv =3D NULL;
+	struct device *dev =3D NULL;
+	struct device_node *np =3D NULL;
+	struct device_node *child =3D NULL;
+	u32 *off =3D NULL;
+	u8 ridx =3D 0;
+	struct dma_device *dma_dev =3D NULL;
+
+	dev_info(&ofdev->dev, "Freescale RAID Engine driver\n");
+
+	repriv =3D kzalloc(sizeof(struct re_drv_private), GFP_KERNEL);
+	if (!repriv) {
+		dev_err(dev, "%s: No memory for repriv\n", __func__);
+		return -ENOMEM;
+	}
+
+	dev =3D &ofdev->dev;
+	dev_set_drvdata(dev, repriv);
+
+	/* IOMAP the entire RAID Engine region */
+	repriv->re_regs =3D of_iomap(ofdev->dev.of_node, 0);
+	if (repriv->re_regs =3D=3D NULL) {
+		dev_err(dev, "%s: of_iomap failed\n", __func__);
+		kfree(repriv);
+		return -ENOMEM;
+	}
+
+
+	re_sw_desc_cache =3D kmem_cache_create("re_desc_cache",
+				sizeof(struct fsl_re_dma_async_tx_desc), 0,
+				SLAB_HWCACHE_ALIGN, NULL);
+	if (re_sw_desc_cache =3D=3D NULL) {
+		dev_err(dev, "%s: No memory for re_sw_desc_cache\n", __func__);
+		iounmap(repriv->re_regs);
+		kfree(repriv);
+		return -ENOMEM;
+	}
+
+	/* Print the RE version to make sure RE is alive */
+	dev_info(dev, "Ver =3D %x\n", in_be32(&repriv->re_regs->re_version_id));
+
+	/* Program the RE mode */
+	out_be32(&repriv->re_regs->global_config, RE_NON_DPAA_MODE);
+	dev_info(dev, "%s:RE mode is %x\n", __func__,
+			in_be32(&repriv->re_regs->global_config));
+
+	/* Program Galois Field polymomial */
+	out_be32(&repriv->re_regs->galois_field_config, RE_GFM_POLY);
+	dev_info(dev, "%s:Galois Field Polynomial is %x\n", __func__,
+			in_be32(&repriv->re_regs->galois_field_config));
+
+	dma_dev =3D &repriv->dma_dev;
+	dma_dev->dev =3D dev;
+	INIT_LIST_HEAD(&dma_dev->channels);
+
+	dma_dev->device_alloc_chan_resources =3D re_jr_alloc_chan_resources;
+	dma_dev->device_tx_status =3D re_jr_tx_status;
+	dma_dev->device_issue_pending =3D re_jr_issue_pending;
+
+	dma_dev->max_xor =3D MAX_XOR_SRCS;
+	dma_dev->device_prep_dma_xor =3D re_jr_prep_dma_xor;
+	dma_cap_set(DMA_XOR, dma_dev->cap_mask);
+
+	dma_dev->max_pq =3D MAX_PQ_SRCS;
+	dma_dev->device_prep_dma_pq =3D re_jr_prep_pq;
+	dma_cap_set(DMA_PQ, dma_dev->cap_mask);
+
+	dma_dev->device_prep_dma_memcpy =3D re_jr_prep_memcpy;
+	dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask);
+
+	dma_dev->device_free_chan_resources =3D re_jr_free_chan_resources;
+
+	repriv->total_jrs =3D 0;
+
+	/* Parse Device tree to find out the total number of JQs present */
+	for_each_compatible_node(np, NULL, "fsl,raideng-v1.0-job-queue") {
+		off =3D (u32 *)of_get_property(np, "reg", NULL);
+		if (!off) {
+			dev_err(dev, "%s: Reg property not found in JQ node\n",
+				__func__);
+			return -ENODEV;
+		}
+
+		/* Find out the Job Rings present under each JQ */
+		for_each_child_of_node(np, child) {
+			if (of_device_is_compatible(child,
+				"fsl,raideng-v1.0-job-ring")) {
+				re_jr_probe(ofdev, child, ridx++, off);
+				repriv->total_jrs++;
+			}
+		}
+	}
+
+	dma_async_device_register(dma_dev);
+
+	return 0;
+}
+
+static void release_jr(struct re_jr *jr) {
+	/* Free the memory allocated from DMA pools and destroy them */
+	dma_pool_free(jr->oub_desc_pool, jr->oub_ring_virt_addr,
+			jr->oub_phys_addr);
+	dma_pool_destroy(jr->oub_desc_pool);
+	dma_pool_free(jr->inb_desc_pool, jr->inb_ring_virt_addr,
+			jr->inb_phys_addr);
+	dma_pool_destroy(jr->inb_desc_pool);
+
+	kfree(jr);
+}
+
+static int raide_remove(struct platform_device *ofdev) {
+	struct re_drv_private *repriv =3D NULL;
+	struct device *dev =3D NULL;
+	int i;
+
+	dev =3D &ofdev->dev;
+	repriv =3D dev_get_drvdata(dev);
+
+	/* Cleanup JR related memory areas */
+	for (i =3D 0; i < repriv->total_jrs; i++)
+		release_jr(repriv->re_jrs[i]);
+
+	kmem_cache_destroy(re_sw_desc_cache);
+
+	/* Unregister the driver */
+	dma_async_device_unregister(&repriv->dma_dev);
+
+	/* Unmap the RAID Engine region */
+	iounmap(repriv->re_regs);
+
+	kfree(repriv);
+
+	return 0;
+}
+
+static struct of_device_id raide_ids[] =3D {
+	{ .compatible =3D "fsl,raideng-v1.0", },
+	{}
+};
+
+static struct platform_driver raide_driver =3D {
+	.driver =3D {
+		.name =3D "fsl-raideng",
+		.owner =3D THIS_MODULE,
+		.of_match_table =3D raide_ids,
+	},
+	.probe =3D raide_probe,
+	.remove =3D raide_remove,
+};
+
+static __init int raide_init(void)
+{
+	int ret =3D 0;
+
+	ret =3D platform_driver_register(&raide_driver);
+	if (ret)
+		pr_err("fsl-raid: Failed to register platform driver\n");
+
+	return ret;
+}
+
+static void __exit raide_exit(void)
+{
+	platform_driver_unregister(&raide_driver);
+}
+
+subsys_initcall(raide_init);
+module_exit(raide_exit);
+
+MODULE_AUTHOR("Harninder Rai <harninder.rai@freescale.com>");=20
+MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("Freescale RAID Engine=20
+Device Driver");
diff --git a/drivers/dma/fsl_raid.h b/drivers/dma/fsl_raid.h new file mode =
100644 index 0000000..0f39b8d
--- /dev/null
+++ b/drivers/dma/fsl_raid.h
@@ -0,0 +1,294 @@
+/*
+ * drivers/dma/fsl_raid.h
+ *
+ * Freescale RAID Engine device driver
+ *
+ * Author:
+ *	Harninder Rai <harninder.rai@freescale.com>
+ *	Naveen Burmi <naveenburmi@freescale.com>
+ *	Xuelin Shi <b29237@freescale.com>
+ *
+ * Copyright (c) 2010-2012 Freescale Semiconductor, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are =
met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in t=
he
+ *       documentation and/or other materials provided with the distributi=
on.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote produ=
cts
+ *       derived from this software without specific prior written permiss=
ion.
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of=20
+the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND=20
+ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE=20
+IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE=20
+ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR=20
+ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL=20
+DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR=20
+SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER=20
+CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,=20
+OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE=20
+USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define RE_DPAA_MODE		(1 << 30)
+#define RE_NON_DPAA_MODE	(1 << 31)
+#define RE_GFM_POLY		(0x1d000000)
+#define RE_JR_INB_JOB_ADD	(1 << 16)
+#define RE_JR_OUB_JOB_REMOVE	(1 << 16)
+#define RE_JR_CFG1_CBSI		0x08000000
+#define RE_JR_CFG1_CBS0		0x00080000
+#define RE_JR_OUB_SLOT_FULL_SHIFT	8
+#define RE_JR_OUB_SLOT_FULL(x)	((x) >> RE_JR_OUB_SLOT_FULL_SHIFT)
+#define RE_PQ_OPCODE		0x1B
+#define RE_XOR_OPCODE		0x1A
+#define RE_MOVE_OPCODE		0x8
+#define FRAME_DESC_ALIGNMENT	16
+#define RE_BLOCK_SIZE		0x3 /* 4096 bytes */
+#define CACHEABLE_INPUT_OUTPUT	0x0
+#define BUFFERABLE_OUTPUT	0x0
+#define INTERRUPT_ON_ERROR	0x1
+#define DATA_DEPENDENCY		0x1
+#define ENABLE_DPI		0x0
+#define RING_SIZE		0x1000
+#define RING_SIZE_SHIFT		8
+#define RE_JR_ADDRESS_BIT_SHIFT	4
+#define RE_JR_ADDRESS_BIT_MASK	((1 << RE_JR_ADDRESS_BIT_SHIFT) - 1)
+#define RE_JR_ERROR		0x40000000
+#define RE_JR_INTERRUPT		0x80000000
+#define RE_JR_CLEAR_INT		0x80000000
+#define RE_JR_PAUSE		0x80000000
+#define RE_JR_ENABLE		0x80000000
+
+#define RE_JR_REG_LIODN_MASK	0x00000fff
+#define RE_CF_CDB_ALIGN		64
+
+/*
+ * max size is 19*sizeof(struct cmpnd_frame), 64 bytes align to 320
+ * here 19 =3D 1(cdb)+2(dest)+16(src)
+ */
+#define RE_CF_CDB_SIZE		320
+
+struct re_ctrl {
+	/* General Configuration Registers */
+	__be32 global_config;	/* Global Configuration Register */
+	u8     rsvd1[4];
+	__be32 galois_field_config; /* Galois Field Configuration Register */
+	u8     rsvd2[4];
+	__be32 jq_wrr_config;   /* WRR Configuration register */
+	u8     rsvd3[4];
+	__be32 crc_config;	/* CRC Configuration register */
+	u8     rsvd4[228];
+	__be32 system_reset;	/* System Reset Register */
+	u8     rsvd5[252];
+	__be32 global_status;	/* Global Status Register */
+	u8     rsvd6[832];
+	__be32 re_liodn_base;	/* LIODN Base Register */
+	u8     rsvd7[1712];
+	__be32 re_version_id;	/* Version ID register of RE */
+	__be32 re_version_id_2; /* Version ID 2 register of RE */
+	u8     rsvd8[512];
+	__be32 host_config;	/* Host I/F Configuration Register */
+};
+
+struct jr_config_regs {
+	/* Registers for JR interface */
+	__be32 jr_config_0;	/* Job Queue Configuration 0 Register */
+	__be32 jr_config_1;	/* Job Queue Configuration 1 Register */
+	__be32 jr_interrupt_status; /* Job Queue Interrupt Status Register */
+	u8     rsvd1[4];
+	__be32 jr_command;	/* Job Queue Command Register */
+	u8     rsvd2[4];
+	__be32 jr_status;	/* Job Queue Status Register */
+	u8     rsvd3[228];
+
+	/* Input Ring */
+	__be32 inbring_base_h;	/* Inbound Ring Base Address Register - High */
+	__be32 inbring_base_l;	/* Inbound Ring Base Address Register - Low */
+	__be32 inbring_size;	/* Inbound Ring Size Register */
+	u8     rsvd4[4];
+	__be32 inbring_slot_avail; /* Inbound Ring Slot Available Register */
+	u8     rsvd5[4];
+	__be32 inbring_add_job;	/* Inbound Ring Add Job Register */
+	u8     rsvd6[4];
+	__be32 inbring_cnsmr_indx; /* Inbound Ring Consumer Index Register */
+	u8     rsvd7[220];
+
+	/* Output Ring */
+	__be32 oubring_base_h;	/* Outbound Ring Base Address Register - High */
+	__be32 oubring_base_l;	/* Outbound Ring Base Address Register - Low */
+	__be32 oubring_size;	/* Outbound Ring Size Register */
+	u8     rsvd8[4];
+	__be32 oubring_job_rmvd; /* Outbound Ring Job Removed Register */
+	u8     rsvd9[4];
+	__be32 oubring_slot_full; /* Outbound Ring Slot Full Register */
+	u8     rsvd10[4];
+	__be32 oubring_prdcr_indx; /* Outbound Ring Producer Index */ };
+
+/*
+ * Command Descriptor Block (CDB) for unicast move command.
+ * In RAID Engine terms, memcpy is done through move command  */ struct=20
+move_cdb {
+	u32 opcode:5;
+	u32 rsvd1:11;
+	u32 blk_size:2;
+	u32 cache_attrib:2;
+	u32 buffer_attrib:1;
+	u32 error_attrib:1;
+	u32 rsvd2:6;
+	u32 data_depend:1;
+	u32 dpi:1;
+	u32 rsvd3:2;
+} __packed;
+
+/* Data protection/integrity related fields */ struct dpi_related {
+	u32 apps_mthd:2;
+	u32 ref_mthd:2;
+	u32 guard_mthd:2;
+	u32 dpi_attr:2;
+	u32 rsvd1:8;
+	u32 meta_tag:16;
+	u32 ref_tag:32;
+} __packed;
+
+/*
+ * CDB for GenQ command. In RAID Engine terminology, XOR is
+ * done through this command
+ */
+struct xor_cdb {
+	u32 opcode:5;
+	u32 rsvd1:11;
+	u32 blk_size:2;
+	u32 cache_attrib:2;
+	u32 buffer_attrib:1;
+	u32 error_attrib:1;
+	u32 nrcs:4;
+	u32 rsvd2:2;
+	u32 data_depend:1;
+	u32 dpi:1;
+	u32 rsvd3:2;
+	u8 gfm[16];
+	struct dpi_related dpi_dest_spec;
+	struct dpi_related dpi_src_spec[16];
+} __packed;
+
+/* CDB for no-op command */
+struct noop_cdb {
+	u32 opcode:5;
+	u32 rsvd1:23;
+	u32 dependency:1;
+	u32 rsvd2:3;
+} __packed;
+
+/*
+ * CDB for GenQQ command. In RAID Engine terminology, P/Q is
+ * done through this command
+ */
+struct pq_cdb {
+	u32 opcode:5;
+	u32 rsvd1:1;
+	u32 excl_enable:2;
+	u32 excl_q1:4;
+	u32 excl_q2:4;
+	u32 blk_size:2;
+	u32 cache_attrib:2;
+	u32 buffer_attrib:1;
+	u32 error_attrib:1;
+	u32 nrcs:4;
+	u32 rsvd2:2;
+	u32 data_depend:1;
+	u32 dpi:1;
+	u32 rsvd3:2;
+	u8 gfm_q1[16];
+	u8 gfm_q2[16];
+	struct dpi_related dpi_dest_spec[2];
+	struct dpi_related dpi_src_spec[16];
+} __packed;
+
+/* Compound frame */
+struct cmpnd_frame {
+	u64 rsvd1:24;
+	u64 address:40;
+	u32 extension:1;
+	u32 final:1;
+	u32 rsvd3:10;
+	u32 length:20;
+	u32 rsvd4:8;
+	u32 bpid:8;
+	u32 rsvd5:3;
+	u32 offset:13;
+} __packed;
+
+/* Frame descriptor */
+struct jr_hw_desc {
+	u64 debug:2;
+	u64 liodn_off:6;
+	u64 bpid:8;
+	u64 eliodn_off:4;
+	u64 rsvd1:4;
+	u64 address:40;
+	u64 format:3;
+	u64 rsvd2:29;
+	u64 status:32;
+} __packed;
+
+/* Array to store the virtual/physical address of descriptors */ struct=20
+virt_struct {
+	unsigned long virt_addr;
+	phys_addr_t phys_addr;
+};
+
+/* Per job ring data structure */
+struct re_jr {
+	dma_cookie_t completed_cookie;
+	spinlock_t desc_lock;
+	struct list_head submit_q;
+	struct list_head ack_q;
+	struct device *dev;
+	struct dma_chan chan;
+	struct jr_config_regs *jrregs;
+	int irq;
+	struct tasklet_struct irqtask;
+	dma_addr_t inb_phys_addr;
+	dma_addr_t oub_phys_addr;
+	struct dma_pool *inb_desc_pool;
+	struct dma_pool *oub_desc_pool;
+	struct jr_hw_desc *inb_ring_virt_addr;
+	struct jr_hw_desc *oub_ring_virt_addr;
+	u32 inb_ring_index;
+	u32 inb_count;
+	u32 oub_count;
+	struct virt_struct virt_arry[RING_SIZE];
+	struct timer_list timer;
+	struct dma_pool *desc_pool;
+};
+
+/* Async transaction descriptor */
+struct fsl_re_dma_async_tx_desc {
+	struct dma_async_tx_descriptor async_tx;
+	struct list_head node;
+	struct list_head tx_list;
+	struct jr_hw_desc hwdesc;
+	struct re_jr *jr;
+	void *cf_addr;
+	int dma_len;
+	int dest_cnt;
+	int src_cnt;
+	dma_addr_t cf_paddr;
+	int cf_len;
+	void *cdb_addr;
+	u32 cdb_opcode;
+	dma_addr_t cdb_paddr;
+	int cdb_len;
+};
--
1.7.0.4


_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

^ permalink raw reply related

* Re: [PATCH -V7 04/12] arch/powerpc: Convert virtual address to vpn
From: Paul Mackerras @ 2012-09-05  8:26 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <1346749289-16986-5-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Tue, Sep 04, 2012 at 02:31:21PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> This patch convert different functions to take virtual page number
> instead of virtual address. Virtual page number is virtual address
> shifted right by VPN_SHIFT (12) bits. This enable us to have an
> address range of upto 76 bits.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

A few comments below...

> diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
> index 1c65a59..d3a1139 100644
> --- a/arch/powerpc/include/asm/mmu-hash64.h
> +++ b/arch/powerpc/include/asm/mmu-hash64.h
> @@ -15,6 +15,10 @@
>  #include <asm/asm-compat.h>
>  #include <asm/page.h>
>  
> +#ifndef __ASSEMBLY__
> +#include <linux/bug.h>
> +#endif
> +

This is unnecessary, since you haven't added any BUG_ONs or WARN_ONs
in this file.

> @@ -233,13 +276,19 @@ static inline unsigned long hpt_va(unsigned long ea, unsigned long vsid,
>  static inline unsigned long hpt_hash(unsigned long va, unsigned int shift,
>  				     int ssize)
>  {
> +	int mask;
>  	unsigned long hash, vsid;
>  
> +	/* VPN_SHIFT can be atmost 12 */
>  	if (ssize == MMU_SEGSIZE_256M) {
> -		hash = (va >> 28) ^ ((va & 0x0fffffffUL) >> shift);
> +		mask = (1ul << (SID_SHIFT - VPN_SHIFT)) - 1;
> +		hash = ((va >> (SID_SHIFT - VPN_SHIFT)) & 0x0000007fffffffff) ^

You have added the "& 0x0000007fffffffff" part, which is unnecessary
since the result is anded with that same value before being returned.

> +			(((va & mask) >> (shift - VPN_SHIFT)) & 0xffff);

Similarly the "& 0xffff" is completely redundant, since you have anded
the va (really vpn) with the mask already.

>  	} else {
> -		vsid = va >> 40;
> -		hash = vsid ^ (vsid << 25) ^ ((va & 0xffffffffffUL) >> shift);
> +		mask = (1ul << (SID_SHIFT_1T - VPN_SHIFT)) - 1;
> +		vsid = va >> (SID_SHIFT_1T - VPN_SHIFT);
> +		hash = (vsid & 0xffffff) ^ ((vsid << 25) & 0x7fffffffff) ^

Here the "vsid & 0xffffff" is actually wrong, since the architecture
says that this term takes the whole VSID, not just the bottom 24 bits.
I realise you were copying what happened before, where the VSID was
restricted to 24 bits anyway because the VA was restricted to 64 bits,
but now that you are extending the VA size you need to include the
whole VSID.

As in the previous case, the "& 0x7fffffffff" part is redundant.

Also, it's very confusing to review this patch with variables called
"va" containing vpns.  It would actually be easier to review if you
combined this patch and the following one (that renames va to vpn).

> +			(((va & mask) >> (shift - VPN_SHIFT)) & 0xfffffff);

Once again the "& 0xfffffff" is redundant because the "& mask" has
already removed any bits that the second and would remove.

Paul.

^ permalink raw reply

* Re: [PATCH 13/25] macintosh/mediabay: add a const qualifier
From: Uwe Kleine-König @ 2012-09-05  8:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Arnd Bergmann, Rob Herring, kernel, linuxppc-dev,
	linux-arm-kernel
In-Reply-To: <1346812817.2257.23.camel@pasglop>

Hello,

On Wed, Sep 05, 2012 at 12:40:17PM +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2012-07-23 at 11:13 +0200, Uwe Kleine-König wrote:
> > This prepares *of_device_id.data becoming const. Without this change
> > the following warning would occur:
> > 
> > 	drivers/macintosh/mediabay.c: In function 'media_bay_attach':
> > 	drivers/macintosh/mediabay.c:589:11: warning: assignment discards 'const' qualifier from pointer target type [enabled by default]
> > 
> > Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
> > ---
> 
> Ack all of these assuming you test built (I didn't).
"all" means the two mediabay patches? And yes, they are all built
tested.

>                                                      Do you need me to
> carry any of this via the powerpc tree ?
The patch that adds the const to of_device_id depends[1] on this patch.
And the other mediabay is only valid after that const is added. So the
easiest is if the series is pulled in one go. Arnd intended to let Linus
pull it for 3.6-rc1 but it was missed because of a communication
problem. So I intend to get it in during the 3.7 merge window.

Best regards
Uwe

[1] it's only a soft depend, because the result is "only" a warning, but
still.

-- 
Pengutronix e.K.                           | Uwe Kleine-König            |
Industrial Linux Solutions                 | http://www.pengutronix.de/  |

^ permalink raw reply

* [PATCH 19/21] ppc/eeh: move stats to PE
From: Gavin Shan @ 2012-09-05  6:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1346825696-13960-1-git-send-email-shangw@linux.vnet.ibm.com>

The patch removes the eeh related statistics for eeh device since
they have been maintained by the corresponding eeh PE. Also, the
flags used to trace the state of eeh device and PE have been reworked
for a little bit.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h              |    9 +--------
 arch/powerpc/platforms/pseries/eeh.c        |   13 +++----------
 arch/powerpc/platforms/pseries/eeh_cache.c  |    3 +--
 arch/powerpc/platforms/pseries/eeh_driver.c |    6 +++---
 arch/powerpc/platforms/pseries/eeh_sysfs.c  |    9 ---------
 5 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index e07ece1..b7ac3f7 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -77,20 +77,13 @@ struct eeh_pe {
  * another tree except the currently existing tree of PCI
  * buses and PCI devices
  */
-#define EEH_MODE_SUPPORTED	(1<<0)	/* EEH supported on the device	*/
-#define EEH_MODE_NOCHECK	(1<<1)	/* EEH check should be skipped	*/
-#define EEH_MODE_ISOLATED	(1<<2)	/* The device has been isolated	*/
-#define EEH_MODE_RECOVERING	(1<<3)	/* Recovering the device	*/
-#define EEH_MODE_IRQ_DISABLED	(1<<4)	/* Interrupt disabled		*/
+#define EEH_DEV_IRQ_DISABLED	(1<<0)	/* Interrupt disabled		*/
 
 struct eeh_dev {
 	int mode;			/* EEH mode			*/
 	int class_code;			/* Class code of the device	*/
 	int config_addr;		/* Config address		*/
 	int pe_config_addr;		/* PE config address		*/
-	int check_count;		/* Times of ignored error	*/
-	int freeze_count;		/* Times of froze up		*/
-	int false_positives;		/* Times of reported #ff's	*/
 	u32 config_space[16];		/* Saved PCI config space	*/
 	struct eeh_pe *pe;		/* Associated PE		*/
 	struct list_head list;		/* Form link list in the PE	*/
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index d855c20..1438c4e 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -537,7 +537,7 @@ static void eeh_reset_pe_once(struct eeh_pe *pe)
 	 * pci slot reset line is dropped. Make sure we don't miss
 	 * these, and clear the flag now.
 	 */
-	eeh_pe_state_clear(pe, EEH_MODE_ISOLATED);
+	eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
 
 	eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
 
@@ -625,9 +625,6 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 
 	edev->class_code = 0;
 	edev->mode = 0;
-	edev->check_count = 0;
-	edev->freeze_count = 0;
-	edev->false_positives = 0;
 
 	if (!of_device_is_available(dn))
 		return NULL;
@@ -637,10 +634,8 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 		return NULL;
 
 	/* There is nothing to check on PCI to ISA bridges */
-	if (dn->type && !strcmp(dn->type, "isa")) {
-		edev->mode |= EEH_MODE_NOCHECK;
+	if (dn->type && !strcmp(dn->type, "isa"))
 		return NULL;
-	}
 	edev->class_code = *class_code;
 
 	/* Ok... see if this device supports EEH.  Some do, some don't,
@@ -679,7 +674,6 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 
 		if (enable) {
 			eeh_subsystem_enabled = 1;
-			edev->mode |= EEH_MODE_SUPPORTED;
 
 			eeh_pe_create(edev);
 
@@ -692,9 +686,8 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 			 * EEH parent, in which case we mark it as supported.
 			 */
 			if (dn->parent && of_node_to_eeh_dev(dn->parent) &&
-			    (of_node_to_eeh_dev(dn->parent)->mode & EEH_MODE_SUPPORTED)) {
+			    of_node_to_eeh_dev(dn->parent)->pe) {
 				/* Parent supports EEH. */
-				edev->mode |= EEH_MODE_SUPPORTED;
 				edev->config_addr = of_node_to_eeh_dev(dn->parent)->config_addr;
 				edev->pe_config_addr = of_node_to_eeh_dev(dn->parent)->pe_config_addr;
 
diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c
index f50b717..a191057 100644
--- a/arch/powerpc/platforms/pseries/eeh_cache.c
+++ b/arch/powerpc/platforms/pseries/eeh_cache.c
@@ -192,8 +192,7 @@ static void __pci_addr_cache_insert_device(struct pci_dev *dev)
 	}
 
 	/* Skip any devices for which EEH is not enabled. */
-	if (!(edev->mode & EEH_MODE_SUPPORTED) ||
-	    edev->mode & EEH_MODE_NOCHECK) {
+	if (!edev->pe) {
 #ifdef DEBUG
 		pr_info("PCI: skip building address cache for=%s - %s\n",
 			pci_name(dev), dn->full_name);
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
index 343c807..8370ce7 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -93,7 +93,7 @@ static void eeh_disable_irq(struct pci_dev *dev)
 	if (!irq_has_action(dev->irq))
 		return;
 
-	edev->mode |= EEH_MODE_IRQ_DISABLED;
+	edev->mode |= EEH_DEV_IRQ_DISABLED;
 	disable_irq_nosync(dev->irq);
 }
 
@@ -108,8 +108,8 @@ static void eeh_enable_irq(struct pci_dev *dev)
 {
 	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
 
-	if ((edev->mode) & EEH_MODE_IRQ_DISABLED) {
-		edev->mode &= ~EEH_MODE_IRQ_DISABLED;
+	if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
+		edev->mode &= ~EEH_DEV_IRQ_DISABLED;
 		enable_irq(dev->irq);
 	}
 }
diff --git a/arch/powerpc/platforms/pseries/eeh_sysfs.c b/arch/powerpc/platforms/pseries/eeh_sysfs.c
index 243b351..d377083 100644
--- a/arch/powerpc/platforms/pseries/eeh_sysfs.c
+++ b/arch/powerpc/platforms/pseries/eeh_sysfs.c
@@ -53,9 +53,6 @@ static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL);
 EEH_SHOW_ATTR(eeh_mode,            mode,            "0x%x");
 EEH_SHOW_ATTR(eeh_config_addr,     config_addr,     "0x%x");
 EEH_SHOW_ATTR(eeh_pe_config_addr,  pe_config_addr,  "0x%x");
-EEH_SHOW_ATTR(eeh_check_count,     check_count,     "%d"  );
-EEH_SHOW_ATTR(eeh_freeze_count,    freeze_count,    "%d"  );
-EEH_SHOW_ATTR(eeh_false_positives, false_positives, "%d"  );
 
 void eeh_sysfs_add_device(struct pci_dev *pdev)
 {
@@ -64,9 +61,6 @@ void eeh_sysfs_add_device(struct pci_dev *pdev)
 	rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode);
 	rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr);
 	rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
-	rc += device_create_file(&pdev->dev, &dev_attr_eeh_check_count);
-	rc += device_create_file(&pdev->dev, &dev_attr_eeh_false_positives);
-	rc += device_create_file(&pdev->dev, &dev_attr_eeh_freeze_count);
 
 	if (rc)
 		printk(KERN_WARNING "EEH: Unable to create sysfs entries\n");
@@ -77,8 +71,5 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev)
 	device_remove_file(&pdev->dev, &dev_attr_eeh_mode);
 	device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr);
 	device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
-	device_remove_file(&pdev->dev, &dev_attr_eeh_check_count);
-	device_remove_file(&pdev->dev, &dev_attr_eeh_false_positives);
-	device_remove_file(&pdev->dev, &dev_attr_eeh_freeze_count);
 }
 
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 21/21] ppc/eeh: trace eeh device from I/O cache
From: Gavin Shan @ 2012-09-05  6:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1346825696-13960-1-git-send-email-shangw@linux.vnet.ibm.com>

The idea comes from Benjamin Herrenschmidt. The eeh cache helps
fetching the pci device according to the given I/O address. Since
the eeh cache is serving for eeh, it's reasonable for eeh cache
to trace eeh device except pci device.

The patch make eeh cache to trace eeh device. Also, the major
eeh entry function eeh_dn_check_failure has been renamed to
eeh_dev_check_failure since it will take eeh device as input
parameter.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h             |    7 +----
 arch/powerpc/include/asm/pci-bridge.h      |    2 +
 arch/powerpc/include/asm/ppc-pci.h         |    2 +-
 arch/powerpc/kernel/rtas_pci.c             |    2 +-
 arch/powerpc/platforms/pseries/eeh.c       |   33 +++++++++++----------------
 arch/powerpc/platforms/pseries/eeh_cache.c |   14 ++++++-----
 6 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 91c38b7..4d59da0 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -196,7 +196,7 @@ int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
 unsigned long eeh_check_failure(const volatile void __iomem *token,
 				unsigned long val);
-int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev);
+int eeh_dev_check_failure(struct eeh_dev *edev);
 void __init pci_addr_cache_build(void);
 void eeh_add_device_tree_early(struct device_node *);
 void eeh_add_device_tree_late(struct pci_bus *);
@@ -231,10 +231,7 @@ static inline unsigned long eeh_check_failure(const volatile void __iomem *token
 	return val;
 }
 
-static inline int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
-{
-	return 0;
-}
+#define eeh_dev_check_failure(x) (0)
 
 static inline void pci_addr_cache_build(void) { }
 
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 8cccbee..973df4d 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -184,6 +184,8 @@ static inline struct eeh_dev *of_node_to_eeh_dev(struct device_node *dn)
 {
 	return PCI_DN(dn)->edev;
 }
+#else
+#define of_node_to_eeh_dev(x) (NULL)
 #endif
 
 /** Find the bus corresponding to the indicated device node */
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 56d55c7..962a902 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -50,7 +50,7 @@ extern int rtas_setup_phb(struct pci_controller *phb);
 void pci_addr_cache_build(void);
 void pci_addr_cache_insert_device(struct pci_dev *dev);
 void pci_addr_cache_remove_device(struct pci_dev *dev);
-struct pci_dev *pci_addr_cache_get_device(unsigned long addr);
+struct eeh_dev *pci_addr_cache_get_device(unsigned long addr);
 void eeh_slot_error_detail(struct eeh_pe *pe, int severity);
 int eeh_pci_enable(struct eeh_pe *pe, int function);
 int eeh_reset_pe(struct eeh_pe *);
diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c
index 140735c..6de63e3 100644
--- a/arch/powerpc/kernel/rtas_pci.c
+++ b/arch/powerpc/kernel/rtas_pci.c
@@ -81,7 +81,7 @@ int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val)
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
 	if (returnval == EEH_IO_ERROR_VALUE(size) &&
-	    eeh_dn_check_failure (pdn->node, NULL))
+	    eeh_dev_check_failure(of_node_to_eeh_dev(pdn->node)))
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
 	return PCIBIOS_SUCCESSFUL;
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index b2caf84..81e8c8e 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -270,9 +270,8 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
 }
 
 /**
- * eeh_dn_check_failure - Check if all 1's data is due to EEH slot freeze
- * @dn: device node
- * @dev: pci device, if known
+ * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @edev: eeh device
  *
  * Check for an EEH failure for the given device node.  Call this
  * routine if the result of a read was all 0xff's and you want to
@@ -284,12 +283,13 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
  *
  * It is safe to call this routine in an interrupt context.
  */
-int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
+int eeh_dev_check_failure(struct eeh_dev *edev)
 {
 	int ret;
 	unsigned long flags;
+	struct device_node *dn;
+	struct pci_dev *dev;
 	struct eeh_pe *pe;
-	struct eeh_dev *edev;
 	int rc = 0;
 	const char *location;
 
@@ -298,15 +298,12 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	if (!eeh_subsystem_enabled)
 		return 0;
 
-	if (dn) {
-		edev = of_node_to_eeh_dev(dn);
-	} else if (dev) {
-		edev = pci_dev_to_eeh_dev(dev);
-		dn = pci_device_to_OF_node(dev);
-	} else {
+	if (!edev) {
 		eeh_stats.no_dn++;
 		return 0;
 	}
+	dn = eeh_dev_to_of_node(edev);
+	dev = eeh_dev_to_pci_dev(edev);
 	pe = edev->pe;
 
 	/* Access to IO BARs might get this far and still not want checking. */
@@ -393,7 +390,7 @@ dn_unlock:
 	return rc;
 }
 
-EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
+EXPORT_SYMBOL_GPL(eeh_dev_check_failure);
 
 /**
  * eeh_check_failure - Check if all 1's data is due to EEH slot freeze
@@ -410,21 +407,19 @@ EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
 unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
 {
 	unsigned long addr;
-	struct pci_dev *dev;
-	struct device_node *dn;
+	struct eeh_dev *edev;
 
 	/* Finding the phys addr + pci device; this is pretty quick. */
 	addr = eeh_token_to_phys((unsigned long __force) token);
-	dev = pci_addr_cache_get_device(addr);
-	if (!dev) {
+	edev = pci_addr_cache_get_device(addr);
+	if (!edev) {
 		eeh_stats.no_device++;
 		return val;
 	}
 
-	dn = pci_device_to_OF_node(dev);
-	eeh_dn_check_failure(dn, dev);
+	eeh_dev_check_failure(edev);
 
-	pci_dev_put(dev);
+	pci_dev_put(eeh_dev_to_pci_dev(edev));
 	return val;
 }
 
diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c
index a191057..6c5ef75 100644
--- a/arch/powerpc/platforms/pseries/eeh_cache.c
+++ b/arch/powerpc/platforms/pseries/eeh_cache.c
@@ -50,6 +50,7 @@ struct pci_io_addr_range {
 	struct rb_node rb_node;
 	unsigned long addr_lo;
 	unsigned long addr_hi;
+	struct eeh_dev *edev;
 	struct pci_dev *pcidev;
 	unsigned int flags;
 };
@@ -59,7 +60,7 @@ static struct pci_io_addr_cache {
 	spinlock_t piar_lock;
 } pci_io_addr_cache_root;
 
-static inline struct pci_dev *__pci_addr_cache_get_device(unsigned long addr)
+static inline struct eeh_dev *__pci_addr_cache_get_device(unsigned long addr)
 {
 	struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node;
 
@@ -74,7 +75,7 @@ static inline struct pci_dev *__pci_addr_cache_get_device(unsigned long addr)
 				n = n->rb_right;
 			} else {
 				pci_dev_get(piar->pcidev);
-				return piar->pcidev;
+				return piar->edev;
 			}
 		}
 	}
@@ -92,15 +93,15 @@ static inline struct pci_dev *__pci_addr_cache_get_device(unsigned long addr)
  * from zero (that is, they do *not* have pci_io_addr added in).
  * It is safe to call this function within an interrupt.
  */
-struct pci_dev *pci_addr_cache_get_device(unsigned long addr)
+struct eeh_dev *pci_addr_cache_get_device(unsigned long addr)
 {
-	struct pci_dev *dev;
+	struct eeh_dev *edev;
 	unsigned long flags;
 
 	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
-	dev = __pci_addr_cache_get_device(addr);
+	edev = __pci_addr_cache_get_device(addr);
 	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
-	return dev;
+	return edev;
 }
 
 #ifdef DEBUG
@@ -158,6 +159,7 @@ pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
 	pci_dev_get(dev);
 	piar->addr_lo = alo;
 	piar->addr_hi = ahi;
+	piar->edev = pci_dev_to_eeh_dev(dev);
 	piar->pcidev = dev;
 	piar->flags = flags;
 
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 20/21] ppc/eeh: probe mode support
From: Gavin Shan @ 2012-09-05  6:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1346825696-13960-1-git-send-email-shangw@linux.vnet.ibm.com>

While EEH module is installed, PCI devices is checked one by one
to see if it supports eeh. That is done based on OF nodes or
PCI device referred by "struct pci_dev". In order to distinguish
the case, global variable "eeh_probe_mode" is introduced.

The patch implements the support to eeh probe mode. Also, the
EEH on pseries has set it into EEH_PROBE_MODE_FDT. That means
the probe will be done based on OF nodes on pSeries platform.

In addition, On pSeries platform, it's done by OF nodes. The patch
moves the the probe function to platform dependent backend and do
some cleanup.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h               |   21 ++++
 arch/powerpc/include/asm/ppc-pci.h           |    1 +
 arch/powerpc/platforms/pseries/eeh.c         |  131 +++++---------------------
 arch/powerpc/platforms/pseries/eeh_pseries.c |   96 +++++++++++++++++++
 4 files changed, 140 insertions(+), 109 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index b7ac3f7..91c38b7 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -129,6 +129,8 @@ static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
 struct eeh_ops {
 	char *name;
 	int (*init)(void);
+	void* (*of_probe)(struct device_node *dn, void *flag);
+	void* (*dev_probe)(struct pci_dev *dev, void *flag);
 	int (*set_option)(struct eeh_pe *pe, int option);
 	int (*get_pe_addr)(struct eeh_pe *pe);
 	int (*get_state)(struct eeh_pe *pe, int *state);
@@ -143,6 +145,25 @@ struct eeh_ops {
 extern struct eeh_ops *eeh_ops;
 extern int eeh_subsystem_enabled;
 extern struct mutex eeh_mutex;
+extern int eeh_probe_mode;
+
+#define EEH_PROBE_MODE_DEV	(1<<0)	/* From PCI device	*/
+#define EEH_PROBE_MODE_FDT	(1<<1)	/* From FDT		*/
+
+static inline void eeh_probe_mode_set(int flag)
+{
+	eeh_probe_mode = flag;
+}
+
+static inline int eeh_probe_mode_fdt(void)
+{
+	return (eeh_probe_mode == EEH_PROBE_MODE_FDT);
+}
+
+static inline int eeh_probe_mode_dev(void)
+{
+	return (eeh_probe_mode == EEH_PROBE_MODE_DEV);
+}
 
 static inline void eeh_lock(void)
 {
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 2a80f08..56d55c7 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -54,6 +54,7 @@ struct pci_dev *pci_addr_cache_get_device(unsigned long addr);
 void eeh_slot_error_detail(struct eeh_pe *pe, int severity);
 int eeh_pci_enable(struct eeh_pe *pe, int function);
 int eeh_reset_pe(struct eeh_pe *);
+void eeh_save_bars(struct eeh_dev *edev);
 int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
 void eeh_pe_state_mark(struct eeh_pe *pe, int state);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 1438c4e..b2caf84 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -92,6 +92,17 @@ struct eeh_ops *eeh_ops = NULL;
 int eeh_subsystem_enabled;
 EXPORT_SYMBOL(eeh_subsystem_enabled);
 
+/*
+ * EEH probe mode support. The intention is to support multiple
+ * platforms for EEH. Some platforms like pSeries do PCI emunation
+ * based on FDT (Flat Device Tree). However, other platforms like
+ * powernv probe PCI devices from hardware. The flag is used to
+ * distinguish that. In addition, struct eeh_ops::probe would be
+ * invoked for particular OF node or PCI device so that the
+ * corresponding PE would be created there.
+ */
+int eeh_probe_mode;
+
 /* Global EEH mutex */
 DEFINE_MUTEX(eeh_mutex);
 
@@ -590,7 +601,7 @@ int eeh_reset_pe(struct eeh_pe *pe)
  * PCI devices are added individually; but, for the restore,
  * an entire slot is reset at a time.
  */
-static void eeh_save_bars(struct eeh_dev *edev)
+void eeh_save_bars(struct eeh_dev *edev)
 {
 	int i;
 	struct device_node *dn;
@@ -604,108 +615,6 @@ static void eeh_save_bars(struct eeh_dev *edev)
 }
 
 /**
- * eeh_early_enable - Early enable EEH on the indicated device
- * @dn: device node
- * @data: BUID
- *
- * Enable EEH functionality on the specified PCI device. The function
- * is expected to be called before real PCI probing is done. However,
- * the PHBs have been initialized at this point.
- */
-static void *eeh_early_enable(struct device_node *dn, void *data)
-{
-	int ret;
-	const u32 *class_code = of_get_property(dn, "class-code", NULL);
-	const u32 *vendor_id = of_get_property(dn, "vendor-id", NULL);
-	const u32 *device_id = of_get_property(dn, "device-id", NULL);
-	const u32 *regs;
-	int enable;
-	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
-	struct eeh_pe pe;
-
-	edev->class_code = 0;
-	edev->mode = 0;
-
-	if (!of_device_is_available(dn))
-		return NULL;
-
-	/* Ignore bad nodes. */
-	if (!class_code || !vendor_id || !device_id)
-		return NULL;
-
-	/* There is nothing to check on PCI to ISA bridges */
-	if (dn->type && !strcmp(dn->type, "isa"))
-		return NULL;
-	edev->class_code = *class_code;
-
-	/* Ok... see if this device supports EEH.  Some do, some don't,
-	 * and the only way to find out is to check each and every one.
-	 */
-	regs = of_get_property(dn, "reg", NULL);
-	if (regs) {
-		/* Initialize the fake PE */
-		memset(&pe, 0, sizeof(struct eeh_pe));
-		pe.phb = edev->phb;
-		pe.config_addr = regs[0];
-
-		/* First register entry is addr (00BBSS00)  */
-		/* Try to enable eeh */
-		ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
-
-		enable = 0;
-		if (ret == 0) {
-			edev->config_addr = regs[0];
-
-			/* If the newer, better, ibm,get-config-addr-info is supported, 
-			 * then use that instead.
-			 */
-			edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
-			pe.addr = edev->pe_config_addr;
-
-			/* Some older systems (Power4) allow the
-			 * ibm,set-eeh-option call to succeed even on nodes
-			 * where EEH is not supported. Verify support
-			 * explicitly.
-			 */
-			ret = eeh_ops->get_state(&pe, NULL);
-			if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
-				enable = 1;
-		}
-
-		if (enable) {
-			eeh_subsystem_enabled = 1;
-
-			eeh_pe_create(edev);
-
-			pr_debug("EEH: %s: eeh enabled, config=%x pe_config=%x\n",
-				 dn->full_name, edev->config_addr,
-				 edev->pe_config_addr);
-		} else {
-
-			/* This device doesn't support EEH, but it may have an
-			 * EEH parent, in which case we mark it as supported.
-			 */
-			if (dn->parent && of_node_to_eeh_dev(dn->parent) &&
-			    of_node_to_eeh_dev(dn->parent)->pe) {
-				/* Parent supports EEH. */
-				edev->config_addr = of_node_to_eeh_dev(dn->parent)->config_addr;
-				edev->pe_config_addr = of_node_to_eeh_dev(dn->parent)->pe_config_addr;
-
-				eeh_pe_create(edev);
-
-				return NULL;
-			}
-		}
-	} else {
-		printk(KERN_WARNING "EEH: %s: unable to get reg property.\n",
-		       dn->full_name);
-	}
-
-	eeh_save_bars(edev);
-	return NULL;
-}
-
-/**
  * eeh_ops_register - Register platform dependent EEH operations
  * @ops: platform dependent EEH operations
  *
@@ -790,15 +699,18 @@ static int __init eeh_init(void)
 	raw_spin_lock_init(&confirm_error_lock);
 
 	/* Enable EEH for all adapters */
-	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
-		phb = hose->dn;
-		traverse_pci_devices(phb, eeh_early_enable, NULL);
+	if (eeh_probe_mode_fdt()) {
+		list_for_each_entry_safe(hose, tmp,
+			&hose_list, list_node) {
+			phb = hose->dn;
+			traverse_pci_devices(phb, eeh_ops->of_probe, NULL);
+		}
 	}
 
 	if (eeh_subsystem_enabled)
-		printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n");
+		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
 	else
-		printk(KERN_WARNING "EEH: No capable adapters found\n");
+		pr_warning("EEH: No capable adapters found\n");
 
 	return ret;
 }
@@ -829,7 +741,8 @@ static void eeh_add_device_early(struct device_node *dn)
 	if (NULL == phb || 0 == phb->buid)
 		return;
 
-	eeh_early_enable(dn, NULL);
+	/* FIXME: hotplug support on POWERNV */
+	eeh_ops->of_probe(dn, NULL);
 }
 
 /**
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 90a4f20..7578f83 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -129,10 +129,104 @@ static int pseries_eeh_init(void)
 		eeh_error_buf_size = RTAS_ERROR_LOG_MAX;
 	}
 
+	/* Set EEH probe mode */
+	eeh_probe_mode_set(EEH_PROBE_MODE_FDT);
+
 	return 0;
 }
 
 /**
+ * pseries_eeh_of_probe - EEH probe on the given device
+ * @dn: OF node
+ * @flag: Unused
+ *
+ * When EEH module is installed during system boot, all PCI devices
+ * are checked one by one to see if it supports EEH. The function
+ * is introduced for the purpose.
+ */
+static void *pseries_eeh_of_probe(struct device_node *dn, void *flag)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe pe;
+	const u32 *class_code, *vendor_id, *device_id;
+	const u32 *regs;
+	int enable = 0;
+	int ret;
+
+	/* Retrieve OF node and eeh device */
+	edev = of_node_to_eeh_dev(dn);
+	if (!of_device_is_available(dn))
+		return NULL;
+
+	/* Retrieve class/vendor/device IDs */
+	class_code = of_get_property(dn, "class-code", NULL);
+	vendor_id  = of_get_property(dn, "vendor-id", NULL);
+	device_id  = of_get_property(dn, "device-id", NULL);
+
+	/* Skip for bad OF node or PCI-ISA bridge */
+	if (!class_code || !vendor_id || !device_id)
+		return NULL;
+	if (dn->type && !strcmp(dn->type, "isa"))
+		return NULL;
+
+	/* Update class code and mode of eeh device */
+	edev->class_code = *class_code;
+	edev->mode = 0;
+
+	/* Retrieve the device address */
+	regs = of_get_property(dn, "reg", NULL);
+	if (!regs) {
+		pr_warning("%s: OF node property %s::reg not found\n",
+			__func__, dn->full_name);
+		return NULL;
+	}
+
+	/* Initialize the fake PE */
+	memset(&pe, 0, sizeof(struct eeh_pe));
+	pe.phb = edev->phb;
+	pe.config_addr = regs[0];
+
+	/* Enable EEH on the device */
+	ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
+	if (!ret) {
+		edev->config_addr = regs[0];
+		/* Retrieve PE address */
+		edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
+		pe.addr = edev->pe_config_addr;
+
+		/* Some older systems (Power4) allow the ibm,set-eeh-option
+		 * call to succeed even on nodes where EEH is not supported.
+		 * Verify support explicitly.
+		 */
+		ret = eeh_ops->get_state(&pe, NULL);
+		if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
+			enable = 1;
+
+		if (enable) {
+			eeh_subsystem_enabled = 1;
+			eeh_pe_create(edev);
+
+			pr_debug("%s: EEH enabled on %s PHB#%d-PE#%x, config addr#%x\n",
+				__func__, dn->full_name, pe.phb->global_number,
+				pe.addr, pe.config_addr);
+		} else if (dn->parent && of_node_to_eeh_dev(dn->parent) &&
+			   (of_node_to_eeh_dev(dn->parent))->pe) {
+			/* This device doesn't support EEH, but it may have an
+			 * EEH parent, in which case we mark it as supported.
+			 */
+			edev->config_addr = of_node_to_eeh_dev(dn->parent)->config_addr;
+			edev->pe_config_addr = of_node_to_eeh_dev(dn->parent)->pe_config_addr;
+			eeh_pe_create(edev);
+		}
+	}
+
+	/* Save memory bars */
+	eeh_save_bars(edev);
+
+	return NULL;
+}
+
+/**
  * pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable
  * @pe: EEH PE
  * @option: operation to be issued
@@ -523,6 +617,8 @@ static int pseries_eeh_write_config(struct device_node *dn, int where, int size,
 static struct eeh_ops pseries_eeh_ops = {
 	.name			= "pseries",
 	.init			= pseries_eeh_init,
+	.of_probe		= pseries_eeh_of_probe,
+	.dev_probe		= NULL,
 	.set_option		= pseries_eeh_set_option,
 	.get_pe_addr		= pseries_eeh_get_pe_addr,
 	.get_state		= pseries_eeh_get_state,
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 18/21] ppc/eeh: handle EEH error based on PE
From: Gavin Shan @ 2012-09-05  6:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1346825696-13960-1-git-send-email-shangw@linux.vnet.ibm.com>

The patch reworks the current implementation so that the eeh errors
will be handled basing on PE instead of eeh device.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h              |    1 +
 arch/powerpc/include/asm/eeh_event.h        |    2 +-
 arch/powerpc/platforms/pseries/eeh_driver.c |  229 +++++++++++----------------
 arch/powerpc/platforms/pseries/eeh_event.c  |    2 +-
 arch/powerpc/platforms/pseries/eeh_pe.c     |   27 +++
 5 files changed, 124 insertions(+), 137 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 9a9fe28..e07ece1 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -174,6 +174,7 @@ int eeh_pe_remove(struct eeh_dev *edev);
 void *eeh_pe_dev_traverse(struct eeh_pe *root,
 		eeh_traverse_func fn, void *flag);
 void eeh_pe_restore_bars(struct eeh_pe *pe);
+struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
 void * __devinit eeh_dev_init(struct device_node *dn, void *data);
 void __devinit eeh_dev_phb_init_dynamic(struct pci_controller *phb);
diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h
index dc722b5..de67d83 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -32,7 +32,7 @@ struct eeh_event {
 };
 
 int eeh_send_failure_event(struct eeh_pe *pe);
-struct eeh_dev *handle_eeh_events(struct eeh_event *);
+void eeh_handle_event(struct eeh_pe *pe);
 
 #endif /* __KERNEL__ */
 #endif /* ASM_POWERPC_EEH_EVENT_H */
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
index baf92cd..343c807 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -116,28 +116,35 @@ static void eeh_enable_irq(struct pci_dev *dev)
 
 /**
  * eeh_report_error - Report pci error to each device driver
- * @dev: PCI device
+ * @data: eeh device
  * @userdata: return value
  * 
  * Report an EEH error to each device driver, collect up and 
  * merge the device driver responses. Cumulative response 
  * passed back in "userdata".
  */
-static int eeh_report_error(struct pci_dev *dev, void *userdata)
+static void *eeh_report_error(void *data, void *userdata)
 {
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
 	enum pci_ers_result rc, *res = userdata;
 	struct pci_driver *driver = dev->driver;
 
+	/* We might not have the associated PCI device,
+	 * then we should continue for next one.
+	 */
+	if (!dev) return NULL;
+
 	dev->error_state = pci_channel_io_frozen;
 
 	if (!driver)
-		return 0;
+		return NULL;
 
 	eeh_disable_irq(dev);
 
 	if (!driver->err_handler ||
 	    !driver->err_handler->error_detected)
-		return 0;
+		return NULL;
 
 	rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
 
@@ -145,27 +152,31 @@ static int eeh_report_error(struct pci_dev *dev, void *userdata)
 	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
 
-	return 0;
+	return NULL;
 }
 
 /**
  * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
- * @dev: PCI device
+ * @data: eeh device
  * @userdata: return value
  *
  * Tells each device driver that IO ports, MMIO and config space I/O
  * are now enabled. Collects up and merges the device driver responses.
  * Cumulative response passed back in "userdata".
  */
-static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
+static void *eeh_report_mmio_enabled(void *data, void *userdata)
 {
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
 	enum pci_ers_result rc, *res = userdata;
-	struct pci_driver *driver = dev->driver;
+	struct pci_driver *driver;
 
-	if (!driver ||
+	if (!dev) return NULL;
+
+	if (!(driver = dev->driver) ||
 	    !driver->err_handler ||
 	    !driver->err_handler->mmio_enabled)
-		return 0;
+		return NULL;
 
 	rc = driver->err_handler->mmio_enabled(dev);
 
@@ -173,12 +184,12 @@ static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
 	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
 
-	return 0;
+	return NULL;
 }
 
 /**
  * eeh_report_reset - Tell device that slot has been reset
- * @dev: PCI device
+ * @data: eeh device
  * @userdata: return value
  *
  * This routine must be called while EEH tries to reset particular
@@ -186,13 +197,15 @@ static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
  * some actions, usually to save data the driver needs so that the
  * driver can work again while the device is recovered.
  */
-static int eeh_report_reset(struct pci_dev *dev, void *userdata)
+static void *eeh_report_reset(void *data, void *userdata)
 {
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
 	enum pci_ers_result rc, *res = userdata;
-	struct pci_driver *driver = dev->driver;
+	struct pci_driver *driver;
 
-	if (!driver)
-		return 0;
+	if (!dev || !(driver = dev->driver))
+		return NULL;
 
 	dev->error_state = pci_channel_io_normal;
 
@@ -200,7 +213,7 @@ static int eeh_report_reset(struct pci_dev *dev, void *userdata)
 
 	if (!driver->err_handler ||
 	    !driver->err_handler->slot_reset)
-		return 0;
+		return NULL;
 
 	rc = driver->err_handler->slot_reset(dev);
 	if ((*res == PCI_ERS_RESULT_NONE) ||
@@ -208,82 +221,89 @@ static int eeh_report_reset(struct pci_dev *dev, void *userdata)
 	if (*res == PCI_ERS_RESULT_DISCONNECT &&
 	     rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 
-	return 0;
+	return NULL;
 }
 
 /**
  * eeh_report_resume - Tell device to resume normal operations
- * @dev: PCI device
+ * @data: eeh device
  * @userdata: return value
  *
  * This routine must be called to notify the device driver that it
  * could resume so that the device driver can do some initialization
  * to make the recovered device work again.
  */
-static int eeh_report_resume(struct pci_dev *dev, void *userdata)
+static void *eeh_report_resume(void *data, void *userdata)
 {
-	struct pci_driver *driver = dev->driver;
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	struct pci_driver *driver;
+
+	if (!dev) return NULL;
 
 	dev->error_state = pci_channel_io_normal;
 
-	if (!driver)
-		return 0;
+	if (!(driver = dev->driver))
+		return NULL;
 
 	eeh_enable_irq(dev);
 
 	if (!driver->err_handler ||
 	    !driver->err_handler->resume)
-		return 0;
+		return NULL;
 
 	driver->err_handler->resume(dev);
 
-	return 0;
+	return NULL;
 }
 
 /**
  * eeh_report_failure - Tell device driver that device is dead.
- * @dev: PCI device
+ * @data: eeh device
  * @userdata: return value
  *
  * This informs the device driver that the device is permanently
  * dead, and that no further recovery attempts will be made on it.
  */
-static int eeh_report_failure(struct pci_dev *dev, void *userdata)
+static void *eeh_report_failure(void *data, void *userdata)
 {
-	struct pci_driver *driver = dev->driver;
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	struct pci_driver *driver;
+
+	if (!dev) return NULL;
 
 	dev->error_state = pci_channel_io_perm_failure;
 
-	if (!driver)
-		return 0;
+	if (!(driver = dev->driver))
+		return NULL;
 
 	eeh_disable_irq(dev);
 
 	if (!driver->err_handler ||
 	    !driver->err_handler->error_detected)
-		return 0;
+		return NULL;
 
 	driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
 
-	return 0;
+	return NULL;
 }
 
 /**
  * eeh_reset_device - Perform actual reset of a pci slot
- * @edev: PE associated EEH device
+ * @pe: EEH PE
  * @bus: PCI bus corresponding to the isolcated slot
  *
  * This routine must be called to do reset on the indicated PE.
  * During the reset, udev might be invoked because those affected
  * PCI devices will be removed and then added.
  */
-static int eeh_reset_device(struct eeh_dev *edev, struct pci_bus *bus)
+static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
 {
-	struct device_node *dn;
 	int cnt, rc;
 
 	/* pcibios will clear the counter; save the value */
-	cnt = edev->freeze_count;
+	cnt = pe->freeze_count;
 
 	if (bus)
 		pcibios_remove_pci_devices(bus);
@@ -292,25 +312,13 @@ static int eeh_reset_device(struct eeh_dev *edev, struct pci_bus *bus)
 	 * Reconfigure bridges and devices. Don't try to bring the system
 	 * up if the reset failed for some reason.
 	 */
-	rc = eeh_reset_pe(edev);
+	rc = eeh_reset_pe(pe);
 	if (rc)
 		return rc;
 
-	/* Walk over all functions on this device. */
-	dn = eeh_dev_to_of_node(edev);
-	if (!pcibios_find_pci_bus(dn) && of_node_to_eeh_dev(dn->parent))
-		dn = dn->parent->child;
-
-	while (dn) {
-		struct eeh_dev *pedev = of_node_to_eeh_dev(dn);
-
-		/* On Power4, always true because eeh_pe_config_addr=0 */
-		if (edev->pe_config_addr == pedev->pe_config_addr) {
-			eeh_ops->configure_bridge(dn);
-			eeh_restore_bars(pedev);
- 		}
-		dn = dn->sibling;
-	}
+	/* Restore PE */
+	eeh_ops->configure_bridge(pe);
+	eeh_pe_restore_bars(pe);
 
 	/* Give the system 5 seconds to finish running the user-space
 	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes, 
@@ -322,7 +330,7 @@ static int eeh_reset_device(struct eeh_dev *edev, struct pci_bus *bus)
 		ssleep(5);
 		pcibios_add_pci_devices(bus);
 	}
-	edev->freeze_count = cnt;
+	pe->freeze_count = cnt;
 
 	return 0;
 }
@@ -334,7 +342,7 @@ static int eeh_reset_device(struct eeh_dev *edev, struct pci_bus *bus)
 
 /**
  * eeh_handle_event - Reset a PCI device after hard lockup.
- * @event: EEH event
+ * @pe: EEH PE
  *
  * While PHB detects address or data parity errors on particular PCI
  * slot, the associated PE will be frozen. Besides, DMA's occurring
@@ -349,69 +357,24 @@ static int eeh_reset_device(struct eeh_dev *edev, struct pci_bus *bus)
  * drivers (which cause a second set of hotplug events to go out to
  * userspace).
  */
-struct eeh_dev *handle_eeh_events(struct eeh_event *event)
+void eeh_handle_event(struct eeh_pe *pe)
 {
-	struct device_node *frozen_dn;
-	struct eeh_dev *frozen_edev;
 	struct pci_bus *frozen_bus;
 	int rc = 0;
 	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
-	const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str;
-
-	frozen_dn = eeh_find_device_pe(eeh_dev_to_of_node(event->edev));
-	if (!frozen_dn) {
-		location = of_get_property(eeh_dev_to_of_node(event->edev), "ibm,loc-code", NULL);
-		location = location ? location : "unknown";
-		printk(KERN_ERR "EEH: Error: Cannot find partition endpoint "
-		                "for location=%s pci addr=%s\n",
-			location, eeh_pci_name(eeh_dev_to_pci_dev(event->edev)));
-		return NULL;
-	}
-
-	frozen_bus = pcibios_find_pci_bus(frozen_dn);
-	location = of_get_property(frozen_dn, "ibm,loc-code", NULL);
-	location = location ? location : "unknown";
-
-	/* There are two different styles for coming up with the PE.
-	 * In the old style, it was the highest EEH-capable device
-	 * which was always an EADS pci bridge.  In the new style,
-	 * there might not be any EADS bridges, and even when there are,
-	 * the firmware marks them as "EEH incapable". So another
-	 * two-step is needed to find the pci bus..
-	 */
-	if (!frozen_bus)
-		frozen_bus = pcibios_find_pci_bus(frozen_dn->parent);
 
+	frozen_bus = eeh_pe_bus_get(pe);
 	if (!frozen_bus) {
-		printk(KERN_ERR "EEH: Cannot find PCI bus "
-		        "for location=%s dn=%s\n",
-		        location, frozen_dn->full_name);
-		return NULL;
+		pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n",
+			__func__, pe->phb->global_number, pe->addr);
+		return;
 	}
 
-	frozen_edev = of_node_to_eeh_dev(frozen_dn);
-	frozen_edev->freeze_count++;
-	pci_str = eeh_pci_name(eeh_dev_to_pci_dev(event->edev));
-	drv_str = eeh_pcid_name(eeh_dev_to_pci_dev(event->edev));
-
-	if (frozen_edev->freeze_count > EEH_MAX_ALLOWED_FREEZES)
+	pe->freeze_count++;
+	if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
 		goto excess_failures;
-
-	printk(KERN_WARNING
-	   "EEH: This PCI device has failed %d times in the last hour:\n",
-		frozen_edev->freeze_count);
-
-	if (frozen_edev->pdev) {
-		bus_pci_str = pci_name(frozen_edev->pdev);
-		bus_drv_str = eeh_pcid_name(frozen_edev->pdev);
-		printk(KERN_WARNING
-			"EEH: Bus location=%s driver=%s pci addr=%s\n",
-			location, bus_drv_str, bus_pci_str);
-	}
-
-	printk(KERN_WARNING
-		"EEH: Device location=%s driver=%s pci addr=%s\n",
-		location, drv_str, pci_str);
+	pr_warning("EEH: This PCI device has failed %d times in the last hour\n",
+		pe->freeze_count);
 
 	/* Walk the various device drivers attached to this slot through
 	 * a reset sequence, giving each an opportunity to do what it needs
@@ -419,12 +382,12 @@ struct eeh_dev *handle_eeh_events(struct eeh_event *event)
 	 * status ... if any child can't handle the reset, then the entire
 	 * slot is dlpar removed and added.
 	 */
-	pci_walk_bus(frozen_bus, eeh_report_error, &result);
+	eeh_pe_dev_traverse(pe, eeh_report_error, &result);
 
 	/* Get the current PCI slot state. This can take a long time,
 	 * sometimes over 3 seconds for certain systems.
 	 */
-	rc = eeh_ops->wait_state(eeh_dev_to_of_node(frozen_edev), MAX_WAIT_FOR_RECOVERY*1000);
+	rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
 	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
 		printk(KERN_WARNING "EEH: Permanent failure\n");
 		goto hard_fail;
@@ -434,14 +397,14 @@ struct eeh_dev *handle_eeh_events(struct eeh_event *event)
 	 * don't post the error log until after all dev drivers
 	 * have been informed.
 	 */
-	eeh_slot_error_detail(frozen_edev, EEH_LOG_TEMP);
+	eeh_slot_error_detail(pe, EEH_LOG_TEMP);
 
 	/* If all device drivers were EEH-unaware, then shut
 	 * down all of the device drivers, and hope they
 	 * go down willingly, without panicing the system.
 	 */
 	if (result == PCI_ERS_RESULT_NONE) {
-		rc = eeh_reset_device(frozen_edev, frozen_bus);
+		rc = eeh_reset_device(pe, frozen_bus);
 		if (rc) {
 			printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
 			goto hard_fail;
@@ -450,7 +413,7 @@ struct eeh_dev *handle_eeh_events(struct eeh_event *event)
 
 	/* If all devices reported they can proceed, then re-enable MMIO */
 	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
-		rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_MMIO);
+		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
 
 		if (rc < 0)
 			goto hard_fail;
@@ -458,13 +421,13 @@ struct eeh_dev *handle_eeh_events(struct eeh_event *event)
 			result = PCI_ERS_RESULT_NEED_RESET;
 		} else {
 			result = PCI_ERS_RESULT_NONE;
-			pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result);
+			eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
 		}
 	}
 
 	/* If all devices reported they can proceed, then re-enable DMA */
 	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
-		rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_DMA);
+		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
 
 		if (rc < 0)
 			goto hard_fail;
@@ -482,13 +445,13 @@ struct eeh_dev *handle_eeh_events(struct eeh_event *event)
 
 	/* If any device called out for a reset, then reset the slot */
 	if (result == PCI_ERS_RESULT_NEED_RESET) {
-		rc = eeh_reset_device(frozen_edev, NULL);
+		rc = eeh_reset_device(pe, NULL);
 		if (rc) {
 			printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
 			goto hard_fail;
 		}
 		result = PCI_ERS_RESULT_NONE;
-		pci_walk_bus(frozen_bus, eeh_report_reset, &result);
+		eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
 	}
 
 	/* All devices should claim they have recovered by now. */
@@ -499,9 +462,9 @@ struct eeh_dev *handle_eeh_events(struct eeh_event *event)
 	}
 
 	/* Tell all device drivers that they can resume operations */
-	pci_walk_bus(frozen_bus, eeh_report_resume, NULL);
+	eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
 
-	return frozen_edev;
+	return;
 	
 excess_failures:
 	/*
@@ -509,30 +472,26 @@ excess_failures:
 	 * are due to poorly seated PCI cards. Only 10% or so are
 	 * due to actual, failed cards.
 	 */
-	printk(KERN_ERR
-	   "EEH: PCI device at location=%s driver=%s pci addr=%s\n"
-		"has failed %d times in the last hour "
-		"and has been permanently disabled.\n"
-		"Please try reseating this device or replacing it.\n",
-		location, drv_str, pci_str, frozen_edev->freeze_count);
+	pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n"
+	       "last hour and has been permanently disabled.\n"
+	       "Please try reseating or replacing it.\n",
+		pe->phb->global_number, pe->addr,
+		pe->freeze_count);
 	goto perm_error;
 
 hard_fail:
-	printk(KERN_ERR
-	   "EEH: Unable to recover from failure of PCI device "
-	   "at location=%s driver=%s pci addr=%s\n"
-	   "Please try reseating this device or replacing it.\n",
-		location, drv_str, pci_str);
+	pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n"
+	       "Please try reseating or replacing it\n",
+		pe->phb->global_number, pe->addr);
 
 perm_error:
-	eeh_slot_error_detail(frozen_edev, EEH_LOG_PERM);
+	eeh_slot_error_detail(pe, EEH_LOG_PERM);
 
 	/* Notify all devices that they're about to go down. */
-	pci_walk_bus(frozen_bus, eeh_report_failure, NULL);
+	eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
 
 	/* Shut down the device drivers for good. */
-	pcibios_remove_pci_devices(frozen_bus);
-
-	return NULL;
+	if (frozen_bus)
+		pcibios_remove_pci_devices(frozen_bus);
 }
 
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
index ba7005a..51faaac 100644
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -82,7 +82,7 @@ static int eeh_event_handler(void * dummy)
 		pe->phb->global_number, pe->addr);
 
 	set_current_state(TASK_INTERRUPTIBLE);	/* Don't add to load average */
-	handle_eeh_events(event);
+	eeh_handle_event(pe);
 	eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
 
 	kfree(event);
diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c
index 8bae0f6..d70a7e4 100644
--- a/arch/powerpc/platforms/pseries/eeh_pe.c
+++ b/arch/powerpc/platforms/pseries/eeh_pe.c
@@ -554,3 +554,30 @@ void eeh_pe_restore_bars(struct eeh_pe *pe)
 	eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL);
 }
 
+/**
+ * eeh_pe_bus_get - Retrieve PCI bus according to the given PE
+ * @pe: EEH PE
+ *
+ * Retrieve the PCI bus according to the given PE. Basically,
+ * there're 3 types of PEs: PHB/Bus/Device. For PHB PE, the
+ * primary PCI bus will be retrieved. The parent bus will be
+ * returned for BUS PE. However, we don't have associated PCI
+ * bus for DEVICE PE.
+ */
+struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
+{
+	struct pci_bus *bus = NULL;
+	struct eeh_dev *edev;
+	struct pci_dev *pdev;
+
+	if (pe->type == EEH_PE_PHB) {
+		bus = pe->phb->bus;
+	} else if (pe->type == EEH_PE_BUS) {
+		edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
+		pdev = eeh_dev_to_pci_dev(edev);
+		if (pdev)
+			bus = pdev->bus;
+	}
+
+	return bus;
+}
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 17/21] ppc/eeh: make EEH handler PE sensitive
From: Gavin Shan @ 2012-09-05  6:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1346825696-13960-1-git-send-email-shangw@linux.vnet.ibm.com>

Once eeh error is found, eeh event will be created and put it into
the global linked list. At the mean while, kernel thread will be
started to process it. The handler for the kernel thread originally
was eeh device sensitive.

The patch reworks the handler of the kernel thread so that it's PE
sensitive.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/eeh_event.c |   25 ++++++++++---------------
 1 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
index 7f89f1e..ba7005a 100644
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -57,7 +57,7 @@ static int eeh_event_handler(void * dummy)
 {
 	unsigned long flags;
 	struct eeh_event *event;
-	struct eeh_dev *edev;
+	struct eeh_pe *pe;
 
 	set_task_comm(current, "eehd");
 
@@ -76,28 +76,23 @@ static int eeh_event_handler(void * dummy)
 
 	/* Serialize processing of EEH events */
 	mutex_lock(&eeh_event_mutex);
-	edev = event->edev;
-	eeh_mark_slot(eeh_dev_to_of_node(edev), EEH_MODE_RECOVERING);
-
-	printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
-	       eeh_pci_name(edev->pdev));
+	pe = event->pe;
+	eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+	pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n",
+		pe->phb->global_number, pe->addr);
 
 	set_current_state(TASK_INTERRUPTIBLE);	/* Don't add to load average */
-	edev = handle_eeh_events(event);
-
-	if (edev) {
-		eeh_clear_slot(eeh_dev_to_of_node(edev), EEH_MODE_RECOVERING);
-		pci_dev_put(edev->pdev);
-	}
+	handle_eeh_events(event);
+	eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
 
 	kfree(event);
 	mutex_unlock(&eeh_event_mutex);
 
 	/* If there are no new errors after an hour, clear the counter. */
-	if (edev && edev->freeze_count>0) {
+	if (pe && pe->freeze_count > 0) {
 		msleep_interruptible(3600*1000);
-		if (edev->freeze_count>0)
-			edev->freeze_count--;
+		if (pe->freeze_count > 0)
+			pe->freeze_count--;
 
 	}
 
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 16/21] ppc/eeh: do reset based on PE
From: Gavin Shan @ 2012-09-05  6:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1346825696-13960-1-git-send-email-shangw@linux.vnet.ibm.com>

The patch implements reset based on PE instead of eeh device. Also,
The functions used to retrieve the reset type, either hot or fundamental
reset, have been reworked for a little bit. More specificly, it's
implemented based the the eeh device traverse function.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/ppc-pci.h   |    2 +-
 arch/powerpc/platforms/pseries/eeh.c |   91 +++++++++++++---------------------
 2 files changed, 35 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 5e34b10..2a80f08 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -53,7 +53,7 @@ void pci_addr_cache_remove_device(struct pci_dev *dev);
 struct pci_dev *pci_addr_cache_get_device(unsigned long addr);
 void eeh_slot_error_detail(struct eeh_pe *pe, int severity);
 int eeh_pci_enable(struct eeh_pe *pe, int function);
-int eeh_reset_pe(struct eeh_dev *);
+int eeh_reset_pe(struct eeh_pe *);
 int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
 void eeh_pe_state_mark(struct eeh_pe *pe, int state);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 031935d..d855c20 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -455,17 +455,24 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
  */
 int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
 {
-	struct device_node *dn = pci_device_to_OF_node(dev);
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+	struct eeh_pe *pe = edev->pe;
+
+	if (!pe) {
+		pr_err("%s: No PE found on PCI device %s\n",
+			__func__, pci_name(dev));
+		return -EINVAL;
+	}
 
 	switch (state) {
 	case pcie_deassert_reset:
-		eeh_ops->reset(dn, EEH_RESET_DEACTIVATE);
+		eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
 		break;
 	case pcie_hot_reset:
-		eeh_ops->reset(dn, EEH_RESET_HOT);
+		eeh_ops->reset(pe, EEH_RESET_HOT);
 		break;
 	case pcie_warm_reset:
-		eeh_ops->reset(dn, EEH_RESET_FUNDAMENTAL);
+		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
 		break;
 	default:
 		return -EINVAL;
@@ -475,66 +482,37 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
 }
 
 /**
- * __eeh_set_pe_freset - Check the required reset for child devices
- * @parent: parent device
- * @freset: return value
- *
- * Each device might have its preferred reset type: fundamental or
- * hot reset. The routine is used to collect the information from
- * the child devices so that they could be reset accordingly.
- */
-void __eeh_set_pe_freset(struct device_node *parent, unsigned int *freset)
-{
-	struct device_node *dn;
-
-	for_each_child_of_node(parent, dn) {
-		if (of_node_to_eeh_dev(dn)) {
-			struct pci_dev *dev = of_node_to_eeh_dev(dn)->pdev;
-
-			if (dev && dev->driver)
-				*freset |= dev->needs_freset;
-
-			__eeh_set_pe_freset(dn, freset);
-		}
-	}
-}
-
-/**
- * eeh_set_pe_freset - Check the required reset for the indicated device and its children
- * @dn: parent device
- * @freset: return value
+ * eeh_set_pe_freset - Check the required reset for the indicated device
+ * @data: EEH device
+ * @flag: return value
  *
  * Each device might have its preferred reset type: fundamental or
  * hot reset. The routine is used to collected the information for
  * the indicated device and its children so that the bunch of the
  * devices could be reset properly.
  */
-void eeh_set_pe_freset(struct device_node *dn, unsigned int *freset)
+static void *eeh_set_dev_freset(void *data, void *flag)
 {
 	struct pci_dev *dev;
-	dn = eeh_find_device_pe(dn);
-
-	/* Back up one, since config addrs might be shared */
-	if (!pcibios_find_pci_bus(dn) && of_node_to_eeh_dev(dn->parent))
-		dn = dn->parent;
+	unsigned int *freset = (unsigned int *)flag;
+	struct eeh_dev *edev = (struct eeh_dev *)data;
 
-	dev = of_node_to_eeh_dev(dn)->pdev;
+	dev = eeh_dev_to_pci_dev(edev);
 	if (dev)
 		*freset |= dev->needs_freset;
 
-	__eeh_set_pe_freset(dn, freset);
+	return NULL;
 }
 
 /**
  * eeh_reset_pe_once - Assert the pci #RST line for 1/4 second
- * @edev: pci device node to be reset.
+ * @pe: EEH PE
  *
  * Assert the PCI #RST line for 1/4 second.
  */
-static void eeh_reset_pe_once(struct eeh_dev *edev)
+static void eeh_reset_pe_once(struct eeh_pe *pe)
 {
 	unsigned int freset = 0;
-	struct device_node *dn = eeh_dev_to_of_node(edev);
 
 	/* Determine type of EEH reset required for
 	 * Partitionable Endpoint, a hot-reset (1)
@@ -542,12 +520,12 @@ static void eeh_reset_pe_once(struct eeh_dev *edev)
 	 * A fundamental reset required by any device under
 	 * Partitionable Endpoint trumps hot-reset.
   	 */
-	eeh_set_pe_freset(dn, &freset);
+	eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset);
 
 	if (freset)
-		eeh_ops->reset(dn, EEH_RESET_FUNDAMENTAL);
+		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
 	else
-		eeh_ops->reset(dn, EEH_RESET_HOT);
+		eeh_ops->reset(pe, EEH_RESET_HOT);
 
 	/* The PCI bus requires that the reset be held high for at least
 	 * a 100 milliseconds. We wait a bit longer 'just in case'.
@@ -559,9 +537,9 @@ static void eeh_reset_pe_once(struct eeh_dev *edev)
 	 * pci slot reset line is dropped. Make sure we don't miss
 	 * these, and clear the flag now.
 	 */
-	eeh_clear_slot(dn, EEH_MODE_ISOLATED);
+	eeh_pe_state_clear(pe, EEH_MODE_ISOLATED);
 
-	eeh_ops->reset(dn, EEH_RESET_DEACTIVATE);
+	eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
 
 	/* After a PCI slot has been reset, the PCI Express spec requires
 	 * a 1.5 second idle time for the bus to stabilize, before starting
@@ -573,32 +551,31 @@ static void eeh_reset_pe_once(struct eeh_dev *edev)
 
 /**
  * eeh_reset_pe - Reset the indicated PE
- * @edev: PCI device associated EEH device
+ * @pe: EEH PE
  *
  * This routine should be called to reset indicated device, including
  * PE. A PE might include multiple PCI devices and sometimes PCI bridges
  * might be involved as well.
  */
-int eeh_reset_pe(struct eeh_dev *edev)
+int eeh_reset_pe(struct eeh_pe *pe)
 {
 	int i, rc;
-	struct device_node *dn = eeh_dev_to_of_node(edev);
 
 	/* Take three shots at resetting the bus */
 	for (i=0; i<3; i++) {
-		eeh_reset_pe_once(edev);
+		eeh_reset_pe_once(pe);
 
-		rc = eeh_ops->wait_state(dn, PCI_BUS_RESET_WAIT_MSEC);
+		rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
 		if (rc == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE))
 			return 0;
 
 		if (rc < 0) {
-			printk(KERN_ERR "EEH: unrecoverable slot failure %s\n",
-			       dn->full_name);
+			pr_err("%s: Unrecoverable slot failure on PHB#%d-PE#%x",
+				__func__, pe->phb->global_number, pe->addr);
 			return -1;
 		}
-		printk(KERN_ERR "EEH: bus reset %d failed on slot %s, rc=%d\n",
-		       i+1, dn->full_name, rc);
+		pr_err("EEH: bus reset %d failed on PHB#%d-PE#%x, rc=%d\n",
+			i+1, pe->phb->global_number, pe->addr, rc);
 	}
 
 	return -1;
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 14/21] ppc/eeh: device bars restore based on PE
From: Gavin Shan @ 2012-09-05  6:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1346825696-13960-1-git-send-email-shangw@linux.vnet.ibm.com>

The patch introduces the function to traverse the devices of the
specified PE and its child PEs. Also, the restore on device bars
is implemented based on the traverse function.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h          |    3 +
 arch/powerpc/include/asm/ppc-pci.h      |    1 -
 arch/powerpc/platforms/pseries/eeh.c    |   79 --------------------------
 arch/powerpc/platforms/pseries/eeh_pe.c |   93 +++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+), 80 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 96451b7..9a9fe28 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -171,6 +171,9 @@ typedef void *(*eeh_traverse_func)(void *data, void *flag);
 int __devinit eeh_phb_pe_create(struct pci_controller *phb);
 int eeh_pe_create(struct eeh_dev *edev);
 int eeh_pe_remove(struct eeh_dev *edev);
+void *eeh_pe_dev_traverse(struct eeh_pe *root,
+		eeh_traverse_func fn, void *flag);
+void eeh_pe_restore_bars(struct eeh_pe *pe);
 
 void * __devinit eeh_dev_init(struct device_node *dn, void *data);
 void __devinit eeh_dev_phb_init_dynamic(struct pci_controller *phb);
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 3e301b1..5cbe3f2 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -54,7 +54,6 @@ struct pci_dev *pci_addr_cache_get_device(unsigned long addr);
 void eeh_slot_error_detail(struct eeh_dev *edev, int severity);
 int eeh_pci_enable(struct eeh_dev *edev, int function);
 int eeh_reset_pe(struct eeh_dev *);
-void eeh_restore_bars(struct eeh_dev *);
 int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
 void eeh_pe_state_mark(struct eeh_pe *pe, int state);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 636413f..28d0c04 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -610,85 +610,6 @@ int eeh_reset_pe(struct eeh_dev *edev)
 	return -1;
 }
 
-/** Save and restore of PCI BARs
- *
- * Although firmware will set up BARs during boot, it doesn't
- * set up device BAR's after a device reset, although it will,
- * if requested, set up bridge configuration. Thus, we need to
- * configure the PCI devices ourselves.  
- */
-
-/**
- * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
- * @edev: PCI device associated EEH device
- *
- * Loads the PCI configuration space base address registers,
- * the expansion ROM base address, the latency timer, and etc.
- * from the saved values in the device node.
- */
-static inline void eeh_restore_one_device_bars(struct eeh_dev *edev)
-{
-	int i;
-	u32 cmd;
-	struct device_node *dn = eeh_dev_to_of_node(edev);
-
-	if (!edev->phb)
-		return;
-
-	for (i=4; i<10; i++) {
-		eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
-	}
-
-	/* 12 == Expansion ROM Address */
-	eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]);
-
-#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
-#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
-
-	eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
-	            SAVED_BYTE(PCI_CACHE_LINE_SIZE));
-
-	eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
-	            SAVED_BYTE(PCI_LATENCY_TIMER));
-
-	/* max latency, min grant, interrupt pin and line */
-	eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
-
-	/* Restore PERR & SERR bits, some devices require it,
-	 * don't touch the other command bits
-	 */
-	eeh_ops->read_config(dn, PCI_COMMAND, 4, &cmd);
-	if (edev->config_space[1] & PCI_COMMAND_PARITY)
-		cmd |= PCI_COMMAND_PARITY;
-	else
-		cmd &= ~PCI_COMMAND_PARITY;
-	if (edev->config_space[1] & PCI_COMMAND_SERR)
-		cmd |= PCI_COMMAND_SERR;
-	else
-		cmd &= ~PCI_COMMAND_SERR;
-	eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd);
-}
-
-/**
- * eeh_restore_bars - Restore the PCI config space info
- * @edev: EEH device
- *
- * This routine performs a recursive walk to the children
- * of this device as well.
- */
-void eeh_restore_bars(struct eeh_dev *edev)
-{
-	struct device_node *dn;
-	if (!edev)
-		return;
-	
-	if ((edev->mode & EEH_MODE_SUPPORTED) && !IS_BRIDGE(edev->class_code))
-		eeh_restore_one_device_bars(edev);
-
-	for_each_child_of_node(eeh_dev_to_of_node(edev), dn)
-		eeh_restore_bars(of_node_to_eeh_dev(dn));
-}
-
 /**
  * eeh_save_bars - Save device bars
  * @edev: PCI device associated EEH device
diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c
index 3041e32..8bae0f6 100644
--- a/arch/powerpc/platforms/pseries/eeh_pe.c
+++ b/arch/powerpc/platforms/pseries/eeh_pe.c
@@ -154,6 +154,38 @@ static void *eeh_pe_traverse(struct eeh_pe *root,
 }
 
 /**
+ * eeh_pe_dev_traverse - Traverse the devices from the PE
+ * @root: EEH PE
+ * @fn: function callback
+ * @flag: extra parameter to callback
+ *
+ * The function is used to traverse the devices of the specified
+ * PE and its child PEs.
+ */
+void *eeh_pe_dev_traverse(struct eeh_pe *root,
+		eeh_traverse_func fn, void *flag)
+{
+	struct eeh_pe *pe;
+	struct eeh_dev *edev;
+	void *ret;
+
+	if (!root) {
+		pr_warning("%s: Invalid PE %p\n", __func__, root);
+		return NULL;
+	}
+
+	/* Traverse root PE */
+	for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
+		eeh_pe_for_each_dev(pe, edev) {
+			ret = fn(edev, flag);
+			if (ret) return ret;
+		}
+	}
+
+	return NULL;
+}
+
+/**
  * __eeh_pe_get - Check the PE address
  * @data: EEH PE
  * @flag: EEH device
@@ -461,3 +493,64 @@ void eeh_pe_state_clear(struct eeh_pe *pe, int state)
 	eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
 }
 
+/**
+ * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
+ * @data: EEH device
+ * @flag: Unused
+ *
+ * Loads the PCI configuration space base address registers,
+ * the expansion ROM base address, the latency timer, and etc.
+ * from the saved values in the device node.
+ */
+static void *eeh_restore_one_device_bars(void *data, void *flag)
+{
+	int i;
+	u32 cmd;
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct device_node *dn = eeh_dev_to_of_node(edev);
+
+	for (i = 4; i < 10; i++)
+                eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
+	/* 12 == Expansion ROM Address */
+	eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]);
+
+#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
+#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
+
+	eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
+			SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+	eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
+		SAVED_BYTE(PCI_LATENCY_TIMER));
+
+	/* max latency, min grant, interrupt pin and line */
+	eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
+
+	/* Restore PERR & SERR bits, some devices require it,
+	 * don't touch the other command bits
+	 */
+	eeh_ops->read_config(dn, PCI_COMMAND, 4, &cmd);
+	if (edev->config_space[1] & PCI_COMMAND_PARITY)
+		cmd |= PCI_COMMAND_PARITY;
+	else
+		cmd &= ~PCI_COMMAND_PARITY;
+	if (edev->config_space[1] & PCI_COMMAND_SERR)
+		cmd |= PCI_COMMAND_SERR;
+	else
+		cmd &= ~PCI_COMMAND_SERR;
+	eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd);
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_restore_bars - Restore the PCI config space info
+ * @pe: EEH PE
+ *
+ * This routine performs a recursive walk to the children
+ * of this device as well.
+ */
+void eeh_pe_restore_bars(struct eeh_pe *pe)
+{
+	eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL);
+}
+
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 12/21] ppc/eeh: trace error based on PE from beginning
From: Gavin Shan @ 2012-09-05  6:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1346825696-13960-1-git-send-email-shangw@linux.vnet.ibm.com>

There're 2 conditions to trigger EEH error detection: invalid value
returned from reading I/O or config space. On each case, the function
eeh_dn_check_failure will be called to initialize EEH event and put
it into the poll for further processing.

The patch changes the function for a little bit so that the EEH error
will be traced based on PE instead of EEH device any more. Also, the
function eeh_find_device_pe() has been removed since the eeh device
is tracing the PE by struct eeh_dev::pe.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/ppc-pci.h   |    1 -
 arch/powerpc/platforms/pseries/eeh.c |   51 +++++++++++++--------------------
 arch/powerpc/platforms/pseries/msi.c |    6 +++-
 3 files changed, 25 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index c7e5bd6..3e301b1 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -59,7 +59,6 @@ int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
 void eeh_pe_state_mark(struct eeh_pe *pe, int state);
 void eeh_pe_state_clear(struct eeh_pe *pe, int state);
-struct device_node *eeh_find_device_pe(struct device_node *dn);
 
 void eeh_sysfs_add_device(struct pci_dev *pdev);
 void eeh_sysfs_remove_device(struct pci_dev *pdev);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index c527c46..341ba1a 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -264,21 +264,6 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
 }
 
 /**
- * eeh_find_device_pe - Retrieve the PE for the given device
- * @dn: device node
- *
- * Return the PE under which this device lies
- */
-struct device_node *eeh_find_device_pe(struct device_node *dn)
-{
-	while (dn->parent && of_node_to_eeh_dev(dn->parent) &&
-	       (of_node_to_eeh_dev(dn->parent)->mode & EEH_MODE_SUPPORTED)) {
-		dn = dn->parent;
-	}
-	return dn;
-}
-
-/**
  * eeh_dn_check_failure - Check if all 1's data is due to EEH slot freeze
  * @dn: device node
  * @dev: pci device, if known
@@ -297,6 +282,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 {
 	int ret;
 	unsigned long flags;
+	struct eeh_pe *pe;
 	struct eeh_dev *edev;
 	int rc = 0;
 	const char *location;
@@ -306,23 +292,26 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	if (!eeh_subsystem_enabled)
 		return 0;
 
-	if (!dn) {
+	if (dn) {
+		edev = of_node_to_eeh_dev(dn);
+	} else if (dev) {
+		edev = pci_dev_to_eeh_dev(dev);
+		dn = pci_device_to_OF_node(dev);
+	} else {
 		eeh_stats.no_dn++;
 		return 0;
 	}
-	dn = eeh_find_device_pe(dn);
-	edev = of_node_to_eeh_dev(dn);
+	pe = edev->pe;
 
 	/* Access to IO BARs might get this far and still not want checking. */
-	if (!(edev->mode & EEH_MODE_SUPPORTED) ||
-	    edev->mode & EEH_MODE_NOCHECK) {
+	if (!pe) {
 		eeh_stats.ignored_check++;
-		pr_debug("EEH: Ignored check (%x) for %s %s\n",
-			edev->mode, eeh_pci_name(dev), dn->full_name);
+		pr_debug("EEH: Ignored check for %s %s\n",
+			eeh_pci_name(dev), dn->full_name);
 		return 0;
 	}
 
-	if (!edev->config_addr && !edev->pe_config_addr) {
+	if (!pe->addr && !pe->config_addr) {
 		eeh_stats.no_cfg_addr++;
 		return 0;
 	}
@@ -335,13 +324,13 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	 */
 	raw_spin_lock_irqsave(&confirm_error_lock, flags);
 	rc = 1;
-	if (edev->mode & EEH_MODE_ISOLATED) {
-		edev->check_count++;
-		if (edev->check_count % EEH_MAX_FAILS == 0) {
+	if (pe->state & EEH_PE_ISOLATED) {
+		pe->check_count++;
+		if (pe->check_count % EEH_MAX_FAILS == 0) {
 			location = of_get_property(dn, "ibm,loc-code", NULL);
 			printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
 				"location=%s driver=%s pci addr=%s\n",
-				edev->check_count, location,
+				pe->check_count, location,
 				eeh_driver_name(dev), eeh_pci_name(dev));
 			printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
 				eeh_driver_name(dev));
@@ -357,7 +346,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	 * function zero of a multi-function device.
 	 * In any case they must share a common PHB.
 	 */
-	ret = eeh_ops->get_state(dn, NULL);
+	ret = eeh_ops->get_state(pe, NULL);
 
 	/* Note that config-io to empty slots may fail;
 	 * they are empty when they don't have children.
@@ -370,7 +359,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	    (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
 	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
 		eeh_stats.false_positives++;
-		edev->false_positives ++;
+		pe->false_positives++;
 		rc = 0;
 		goto dn_unlock;
 	}
@@ -381,10 +370,10 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	 * with other functions on this device, and functions under
 	 * bridges.
 	 */
-	eeh_mark_slot(dn, EEH_MODE_ISOLATED);
+	eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
 	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
 
-	eeh_send_failure_event(edev);
+	eeh_send_failure_event(pe);
 
 	/* Most EEH events are due to device driver bugs.  Having
 	 * a stack trace will help the device-driver authors figure
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index 109fdb7..c8534fa 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -210,6 +210,7 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
 static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
 {
 	struct device_node *dn;
+	struct eeh_dev *edev;
 
 	/* Found our PE and assume 8 at that point. */
 
@@ -217,7 +218,10 @@ static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
 	if (!dn)
 		return NULL;
 
-	dn = eeh_find_device_pe(dn);
+	/* Get the top level device in the PE */
+	edev = of_node_to_eeh_dev(dn);
+	edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list);
+	dn = eeh_dev_to_of_node(edev);
 	if (!dn)
 		return NULL;
 
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 15/21] ppc/eeh: I/O enable and log retrival based on PE
From: Gavin Shan @ 2012-09-05  6:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1346825696-13960-1-git-send-email-shangw@linux.vnet.ibm.com>

The patch refactors the original implementation in order to enable
I/O and do log retrieval based on PE.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/ppc-pci.h   |    4 +-
 arch/powerpc/platforms/pseries/eeh.c |   44 ++++++++++++++-------------------
 2 files changed, 21 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 5cbe3f2..5e34b10 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -51,8 +51,8 @@ void pci_addr_cache_build(void);
 void pci_addr_cache_insert_device(struct pci_dev *dev);
 void pci_addr_cache_remove_device(struct pci_dev *dev);
 struct pci_dev *pci_addr_cache_get_device(unsigned long addr);
-void eeh_slot_error_detail(struct eeh_dev *edev, int severity);
-int eeh_pci_enable(struct eeh_dev *edev, int function);
+void eeh_slot_error_detail(struct eeh_pe *pe, int severity);
+int eeh_pci_enable(struct eeh_pe *pe, int function);
 int eeh_reset_pe(struct eeh_dev *);
 int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 28d0c04..031935d 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -207,22 +207,12 @@ static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len)
 		}
 	}
 
-	/* Gather status on devices under the bridge */
-	if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) {
-		struct device_node *child;
-
-		for_each_child_of_node(dn, child) {
-			if (of_node_to_eeh_dev(child))
-				n += eeh_gather_pci_data(of_node_to_eeh_dev(child), buf+n, len-n);
-		}
-	}
-
 	return n;
 }
 
 /**
  * eeh_slot_error_detail - Generate combined log including driver log and error log
- * @edev: device to report error log for
+ * @pe: EEH PE
  * @severity: temporary or permanent error log
  *
  * This routine should be called to generate the combined log, which
@@ -230,17 +220,22 @@ static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len)
  * out from the config space of the corresponding PCI device, while
  * the error log is fetched through platform dependent function call.
  */
-void eeh_slot_error_detail(struct eeh_dev *edev, int severity)
+void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
 {
 	size_t loglen = 0;
-	pci_regs_buf[0] = 0;
+	struct eeh_dev *edev;
 
-	eeh_pci_enable(edev, EEH_OPT_THAW_MMIO);
-	eeh_ops->configure_bridge(eeh_dev_to_of_node(edev));
-	eeh_restore_bars(edev);
-	loglen = eeh_gather_pci_data(edev, pci_regs_buf, EEH_PCI_REGS_LOG_LEN);
+	eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+	eeh_ops->configure_bridge(pe);
+	eeh_pe_restore_bars(pe);
 
-	eeh_ops->get_log(eeh_dev_to_of_node(edev), severity, pci_regs_buf, loglen);
+	pci_regs_buf[0] = 0;
+	eeh_pe_for_each_dev(pe, edev) {
+		loglen += eeh_gather_pci_data(edev, pci_regs_buf,
+				EEH_PCI_REGS_LOG_LEN);
+        }
+
+	eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
 }
 
 /**
@@ -427,23 +422,22 @@ EXPORT_SYMBOL(eeh_check_failure);
 
 /**
  * eeh_pci_enable - Enable MMIO or DMA transfers for this slot
- * @edev: pci device node
+ * @pe: EEH PE
  *
  * This routine should be called to reenable frozen MMIO or DMA
  * so that it would work correctly again. It's useful while doing
  * recovery or log collection on the indicated device.
  */
-int eeh_pci_enable(struct eeh_dev *edev, int function)
+int eeh_pci_enable(struct eeh_pe *pe, int function)
 {
 	int rc;
-	struct device_node *dn = eeh_dev_to_of_node(edev);
 
-	rc = eeh_ops->set_option(dn, function);
+	rc = eeh_ops->set_option(pe, function);
 	if (rc)
-		printk(KERN_WARNING "EEH: Unexpected state change %d, err=%d dn=%s\n",
-		        function, rc, dn->full_name);
+		pr_warning("%s: Unexpected state change %d on PHB#%d-PE#%x, err=%d\n",
+			__func__, function, pe->phb->global_number, pe->addr, rc);
 
-	rc = eeh_ops->wait_state(dn, PCI_BUS_RESET_WAIT_MSEC);
+	rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
 	if (rc > 0 && (rc & EEH_STATE_MMIO_ENABLED) &&
 	   (function == EEH_OPT_THAW_MMIO))
 		return 0;
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 13/21] ppc/eeh: eeh options based on PE
From: Gavin Shan @ 2012-09-05  6:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1346825696-13960-1-git-send-email-shangw@linux.vnet.ibm.com>

Originally, all the EEH options were implemented based on OF node.
Actually, it explicitly breaks the rules that the operation target
is PE instead of device. Therefore, the patch makes all the operations
based on PE instead of device.

Unfortunately, the backend for config space has to be kept as original
because it doesn't depend on PE actually.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h               |   14 ++--
 arch/powerpc/platforms/pseries/eeh.c         |   13 ++-
 arch/powerpc/platforms/pseries/eeh_pseries.c |  133 +++++++++++---------------
 3 files changed, 74 insertions(+), 86 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 493dc7c..96451b7 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -136,13 +136,13 @@ static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
 struct eeh_ops {
 	char *name;
 	int (*init)(void);
-	int (*set_option)(struct device_node *dn, int option);
-	int (*get_pe_addr)(struct device_node *dn);
-	int (*get_state)(struct device_node *dn, int *state);
-	int (*reset)(struct device_node *dn, int option);
-	int (*wait_state)(struct device_node *dn, int max_wait);
-	int (*get_log)(struct device_node *dn, int severity, char *drv_log, unsigned long len);
-	int (*configure_bridge)(struct device_node *dn);
+	int (*set_option)(struct eeh_pe *pe, int option);
+	int (*get_pe_addr)(struct eeh_pe *pe);
+	int (*get_state)(struct eeh_pe *pe, int *state);
+	int (*reset)(struct eeh_pe *pe, int option);
+	int (*wait_state)(struct eeh_pe *pe, int max_wait);
+	int (*get_log)(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len);
+	int (*configure_bridge)(struct eeh_pe *pe);
 	int (*read_config)(struct device_node *dn, int where, int size, u32 *val);
 	int (*write_config)(struct device_node *dn, int where, int size, u32 val);
 };
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 341ba1a..636413f 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -729,6 +729,7 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 	const u32 *regs;
 	int enable;
 	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+	struct eeh_pe pe;
 
 	edev->class_code = 0;
 	edev->mode = 0;
@@ -755,9 +756,14 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 	 */
 	regs = of_get_property(dn, "reg", NULL);
 	if (regs) {
+		/* Initialize the fake PE */
+		memset(&pe, 0, sizeof(struct eeh_pe));
+		pe.phb = edev->phb;
+		pe.config_addr = regs[0];
+
 		/* First register entry is addr (00BBSS00)  */
 		/* Try to enable eeh */
-		ret = eeh_ops->set_option(dn, EEH_OPT_ENABLE);
+		ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
 
 		enable = 0;
 		if (ret == 0) {
@@ -766,14 +772,15 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 			/* If the newer, better, ibm,get-config-addr-info is supported, 
 			 * then use that instead.
 			 */
-			edev->pe_config_addr = eeh_ops->get_pe_addr(dn);
+			edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
+			pe.addr = edev->pe_config_addr;
 
 			/* Some older systems (Power4) allow the
 			 * ibm,set-eeh-option call to succeed even on nodes
 			 * where EEH is not supported. Verify support
 			 * explicitly.
 			 */
-			ret = eeh_ops->get_state(dn, NULL);
+			ret = eeh_ops->get_state(&pe, NULL);
 			if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
 				enable = 1;
 		}
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 46616c8..90a4f20 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -134,22 +134,18 @@ static int pseries_eeh_init(void)
 
 /**
  * pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable
- * @dn: device node
+ * @pe: EEH PE
  * @option: operation to be issued
  *
  * The function is used to control the EEH functionality globally.
  * Currently, following options are support according to PAPR:
  * Enable EEH, Disable EEH, Enable MMIO and Enable DMA
  */
-static int pseries_eeh_set_option(struct device_node *dn, int option)
+static int pseries_eeh_set_option(struct eeh_pe *pe, int option)
 {
 	int ret = 0;
-	struct eeh_dev *edev;
-	const u32 *reg;
 	int config_addr;
 
-	edev = of_node_to_eeh_dev(dn);
-
 	/*
 	 * When we're enabling or disabling EEH functioality on
 	 * the particular PE, the PE config address is possibly
@@ -159,15 +155,11 @@ static int pseries_eeh_set_option(struct device_node *dn, int option)
 	switch (option) {
 	case EEH_OPT_DISABLE:
 	case EEH_OPT_ENABLE:
-		reg = of_get_property(dn, "reg", NULL);
-		config_addr = reg[0];
-		break;
-
 	case EEH_OPT_THAW_MMIO:
 	case EEH_OPT_THAW_DMA:
-		config_addr = edev->config_addr;
-		if (edev->pe_config_addr)
-			config_addr = edev->pe_config_addr;
+		config_addr = pe->config_addr;
+		if (pe->addr)
+			config_addr = pe->addr;
 		break;
 
 	default:
@@ -177,15 +169,15 @@ static int pseries_eeh_set_option(struct device_node *dn, int option)
 	}
 
 	ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
-			config_addr, BUID_HI(edev->phb->buid),
-			BUID_LO(edev->phb->buid), option);
+			config_addr, BUID_HI(pe->phb->buid),
+			BUID_LO(pe->phb->buid), option);
 
 	return ret;
 }
 
 /**
  * pseries_eeh_get_pe_addr - Retrieve PE address
- * @dn: device node
+ * @pe: EEH PE
  *
  * Retrieve the assocated PE address. Actually, there're 2 RTAS
  * function calls dedicated for the purpose. We need implement
@@ -196,14 +188,11 @@ static int pseries_eeh_set_option(struct device_node *dn, int option)
  * It's notable that zero'ed return value means invalid PE config
  * address.
  */
-static int pseries_eeh_get_pe_addr(struct device_node *dn)
+static int pseries_eeh_get_pe_addr(struct eeh_pe *pe)
 {
-	struct eeh_dev *edev;
 	int ret = 0;
 	int rets[3];
 
-	edev = of_node_to_eeh_dev(dn);
-
 	if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
 		/*
 		 * First of all, we need to make sure there has one PE
@@ -211,18 +200,18 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn)
 		 * meaningless.
 		 */
 		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
-				edev->config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid), 1);
+				pe->config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), 1);
 		if (ret || (rets[0] == 0))
 			return 0;
 
 		/* Retrieve the associated PE config address */
 		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
-				edev->config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid), 0);
+				pe->config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), 0);
 		if (ret) {
-			pr_warning("%s: Failed to get PE address for %s\n",
-				__func__, dn->full_name);
+			pr_warning("%s: Failed to get address for PHB#%d-PE#%x\n",
+				__func__, pe->phb->global_number, pe->config_addr);
 			return 0;
 		}
 
@@ -231,11 +220,11 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn)
 
 	if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
 		ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
-				edev->config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid), 0);
+				pe->config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), 0);
 		if (ret) {
-			pr_warning("%s: Failed to get PE address for %s\n",
-				__func__, dn->full_name);
+			pr_warning("%s: Failed to get address for PHB#%d-PE#%x\n",
+				__func__, pe->phb->global_number, pe->config_addr);
 			return 0;
 		}
 
@@ -247,7 +236,7 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn)
 
 /**
  * pseries_eeh_get_state - Retrieve PE state
- * @dn: PE associated device node
+ * @pe: EEH PE
  * @state: return value
  *
  * Retrieve the state of the specified PE. On RTAS compliant
@@ -258,30 +247,28 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn)
  * RTAS calls for the purpose, we need to try the new one and back
  * to the old one if the new one couldn't work properly.
  */
-static int pseries_eeh_get_state(struct device_node *dn, int *state)
+static int pseries_eeh_get_state(struct eeh_pe *pe, int *state)
 {
-	struct eeh_dev *edev;
 	int config_addr;
 	int ret;
 	int rets[4];
 	int result;
 
 	/* Figure out PE config address if possible */
-	edev = of_node_to_eeh_dev(dn);
-	config_addr = edev->config_addr;
-	if (edev->pe_config_addr)
-		config_addr = edev->pe_config_addr;
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
 
 	if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
 		ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets,
-				config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid));
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
 	} else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) {
 		/* Fake PE unavailable info */
 		rets[2] = 0;
 		ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets,
-				config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid));
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
 	} else {
 		return EEH_STATE_NOT_SUPPORT;
 	}
@@ -333,34 +320,32 @@ static int pseries_eeh_get_state(struct device_node *dn, int *state)
 
 /**
  * pseries_eeh_reset - Reset the specified PE
- * @dn: PE associated device node
+ * @pe: EEH PE
  * @option: reset option
  *
  * Reset the specified PE
  */
-static int pseries_eeh_reset(struct device_node *dn, int option)
+static int pseries_eeh_reset(struct eeh_pe *pe, int option)
 {
-	struct eeh_dev *edev;
 	int config_addr;
 	int ret;
 
 	/* Figure out PE address */
-	edev = of_node_to_eeh_dev(dn);
-	config_addr = edev->config_addr;
-	if (edev->pe_config_addr)
-		config_addr = edev->pe_config_addr;
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
 
 	/* Reset PE through RTAS call */
 	ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
-			config_addr, BUID_HI(edev->phb->buid),
-			BUID_LO(edev->phb->buid), option);
+			config_addr, BUID_HI(pe->phb->buid),
+			BUID_LO(pe->phb->buid), option);
 
 	/* If fundamental-reset not supported, try hot-reset */
 	if (option == EEH_RESET_FUNDAMENTAL &&
 	    ret == -8) {
 		ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
-				config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid), EEH_RESET_HOT);
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), EEH_RESET_HOT);
 	}
 
 	return ret;
@@ -368,13 +353,13 @@ static int pseries_eeh_reset(struct device_node *dn, int option)
 
 /**
  * pseries_eeh_wait_state - Wait for PE state
- * @dn: PE associated device node
+ * @pe: EEH PE
  * @max_wait: maximal period in microsecond
  *
  * Wait for the state of associated PE. It might take some time
  * to retrieve the PE's state.
  */
-static int pseries_eeh_wait_state(struct device_node *dn, int max_wait)
+static int pseries_eeh_wait_state(struct eeh_pe *pe, int max_wait)
 {
 	int ret;
 	int mwait;
@@ -391,7 +376,7 @@ static int pseries_eeh_wait_state(struct device_node *dn, int max_wait)
 #define EEH_STATE_MAX_WAIT_TIME	(300 * 1000)
 
 	while (1) {
-		ret = pseries_eeh_get_state(dn, &mwait);
+		ret = pseries_eeh_get_state(pe, &mwait);
 
 		/*
 		 * If the PE's state is temporarily unavailable,
@@ -426,7 +411,7 @@ static int pseries_eeh_wait_state(struct device_node *dn, int max_wait)
 
 /**
  * pseries_eeh_get_log - Retrieve error log
- * @dn: device node
+ * @pe: EEH PE
  * @severity: temporary or permanent error log
  * @drv_log: driver log to be combined with retrieved error log
  * @len: length of driver log
@@ -435,24 +420,22 @@ static int pseries_eeh_wait_state(struct device_node *dn, int max_wait)
  * Actually, the error will be retrieved through the dedicated
  * RTAS call.
  */
-static int pseries_eeh_get_log(struct device_node *dn, int severity, char *drv_log, unsigned long len)
+static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len)
 {
-	struct eeh_dev *edev;
 	int config_addr;
 	unsigned long flags;
 	int ret;
 
-	edev = of_node_to_eeh_dev(dn);
 	spin_lock_irqsave(&slot_errbuf_lock, flags);
 	memset(slot_errbuf, 0, eeh_error_buf_size);
 
 	/* Figure out the PE address */
-	config_addr = edev->config_addr;
-	if (edev->pe_config_addr)
-		config_addr = edev->pe_config_addr;
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
 
 	ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr,
-			BUID_HI(edev->phb->buid), BUID_LO(edev->phb->buid),
+			BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid),
 			virt_to_phys(drv_log), len,
 			virt_to_phys(slot_errbuf), eeh_error_buf_size,
 			severity);
@@ -465,40 +448,38 @@ static int pseries_eeh_get_log(struct device_node *dn, int severity, char *drv_l
 
 /**
  * pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE
- * @dn: PE associated device node
+ * @pe: EEH PE
  *
  * The function will be called to reconfigure the bridges included
  * in the specified PE so that the mulfunctional PE would be recovered
  * again.
  */
-static int pseries_eeh_configure_bridge(struct device_node *dn)
+static int pseries_eeh_configure_bridge(struct eeh_pe *pe)
 {
-	struct eeh_dev *edev;
 	int config_addr;
 	int ret;
 
 	/* Figure out the PE address */
-	edev = of_node_to_eeh_dev(dn);
-	config_addr = edev->config_addr;
-	if (edev->pe_config_addr)
-		config_addr = edev->pe_config_addr;
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
 
 	/* Use new configure-pe function, if supported */
 	if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) {
 		ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
-				config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid));
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
 	} else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE) {
 		ret = rtas_call(ibm_configure_bridge, 3, 1, NULL,
-				config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid));
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
 	} else {
 		return -EFAULT;
 	}
 
 	if (ret)
-		pr_warning("%s: Unable to configure bridge %d for %s\n",
-			__func__, ret, dn->full_name);
+		pr_warning("%s: Unable to configure bridge PHB#%d-PE#%x (%d)\n",
+			__func__, pe->phb->global_number, pe->addr, ret);
 
 	return ret;
 }
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 02/21] ppc/eeh: use slab to allocate eeh devices
From: Gavin Shan @ 2012-09-05  6:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1346825696-13960-1-git-send-email-shangw@linux.vnet.ibm.com>

The EEH initialization functions have been postponed until slab/slub
are ready. So we use slab/slub to allocate the memory chunks for
newly creatd EEH devices. That would save lots of memory.

The patch also does cleanup to replace "kmalloc" with "kzalloc" so
that we needn't clear the allocated memory chunk explicitly.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/eeh_cache.c |    2 +-
 arch/powerpc/platforms/pseries/eeh_dev.c   |    2 +-
 arch/powerpc/platforms/pseries/eeh_event.c |    2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c
index e5ae1c6..f50b717 100644
--- a/arch/powerpc/platforms/pseries/eeh_cache.c
+++ b/arch/powerpc/platforms/pseries/eeh_cache.c
@@ -151,7 +151,7 @@ pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
 			return piar;
 		}
 	}
-	piar = kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
+	piar = kzalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
 	if (!piar)
 		return NULL;
 
diff --git a/arch/powerpc/platforms/pseries/eeh_dev.c b/arch/powerpc/platforms/pseries/eeh_dev.c
index ab68c59..8e3443b 100644
--- a/arch/powerpc/platforms/pseries/eeh_dev.c
+++ b/arch/powerpc/platforms/pseries/eeh_dev.c
@@ -55,7 +55,7 @@ void * __devinit eeh_dev_init(struct device_node *dn, void *data)
 	struct eeh_dev *edev;
 
 	/* Allocate EEH device */
-	edev = zalloc_maybe_bootmem(sizeof(*edev), GFP_KERNEL);
+	edev = kzalloc(sizeof(*edev), GFP_KERNEL);
 	if (!edev) {
 		pr_warning("%s: out of memory\n", __func__);
 		return NULL;
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
index fb50631..6132772 100644
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -139,7 +139,7 @@ int eeh_send_failure_event(struct eeh_dev *edev)
 		printk(KERN_ERR "EEH: PCI location = %s\n", location);
 		return 1;
 	}
-	event = kmalloc(sizeof(*event), GFP_ATOMIC);
+	event = kzalloc(sizeof(*event), GFP_ATOMIC);
 	if (event == NULL) {
 		printk(KERN_ERR "EEH: out of memory, event not handled\n");
 		return 1;
-- 
1.7.5.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox