LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] vfio: enabled and supported on power (v7)
From: Alexey Kardashevskiy @ 2012-09-04  7:33 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, David Gibson
  Cc: Alexey Kardashevskiy, linuxppc-dev, Alex Williamson,
	Paul Mackerras
In-Reply-To: <20120821113534.GS29724@truffula.fritz.box>

Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h    |    3 +
 drivers/iommu/Kconfig               |    8 +
 drivers/vfio/Kconfig                |    6 +
 drivers/vfio/Makefile               |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  440 +++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                |   29 +++
 6 files changed, 487 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 957a83f..c64bce7 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -66,6 +66,9 @@ struct iommu_table {
 	unsigned long  it_halfpoint; /* Breaking point for small/large allocs */
 	spinlock_t     it_lock;      /* Protects it_map */
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 3bd9fff..19cf2d9 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -162,4 +162,12 @@ config TEGRA_IOMMU_SMMU
 	  space through the SMMU (System Memory Management Unit)
 	  hardware included on Tegra SoCs.
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_PSERIES
+	select IOMMU_API
+	help
+	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
+	  still not implemented.
+
 endif # IOMMU_SUPPORT
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..21f1909
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,440 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_x86.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <linux/spinlock.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+
+/*
+ * SPAPR TCE API
+ */
+static void tce_free(struct iommu_table *tbl, unsigned long entry,
+		unsigned long tce)
+{
+	struct page *page = pfn_to_page(tce >> PAGE_SHIFT);
+
+	WARN_ON(!page);
+	if (page) {
+		if (tce & VFIO_SPAPR_TCE_WRITE)
+			SetPageDirty(page);
+		put_page(page);
+	}
+	ppc_md.tce_free(tbl, entry, 1);
+}
+
+static long tce_put(struct iommu_table *tbl,
+		unsigned long entry, uint64_t tce, uint32_t flags)
+{
+	int ret;
+	unsigned long oldtce, kva, offset;
+	struct page *page = NULL;
+	enum dma_data_direction direction = DMA_NONE;
+
+	switch (flags & VFIO_SPAPR_TCE_PUT_MASK) {
+	case VFIO_SPAPR_TCE_READ:
+		direction = DMA_TO_DEVICE;
+		break;
+	case VFIO_SPAPR_TCE_WRITE:
+		direction = DMA_FROM_DEVICE;
+		break;
+	case VFIO_SPAPR_TCE_BIDIRECTIONAL:
+		direction = DMA_BIDIRECTIONAL;
+		break;
+	}
+
+	oldtce = ppc_md.tce_get(tbl, entry);
+
+	/* Free page if still allocated */
+	if (oldtce & VFIO_SPAPR_TCE_PUT_MASK)
+		tce_free(tbl, entry, oldtce);
+
+	/* Map new TCE */
+	if (direction != DMA_NONE) {
+		offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+				direction != DMA_TO_DEVICE, &page);
+		BUG_ON(ret > 1);
+		if (ret < 1) {
+			printk(KERN_ERR "tce_vfio: get_user_pages_fast failed "
+					"tce=%llx ioba=%lx ret=%d\n",
+					tce, entry << IOMMU_PAGE_SHIFT, ret);
+			if (!ret)
+				ret = -EFAULT;
+			goto unlock_exit;
+		}
+
+		kva = (unsigned long) page_address(page);
+		kva += offset;
+		BUG_ON(!kva);
+		if (WARN_ON(kva & ~IOMMU_PAGE_MASK))
+			return -EINVAL;
+
+		/* Preserve access bits */
+		kva |= flags & VFIO_SPAPR_TCE_PUT_MASK;
+
+		/* tce_build receives a virtual address */
+		entry += tbl->it_offset;	/* Offset into real TCE table */
+		ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+		/* tce_build() only returns non-zero for transient errors */
+		if (unlikely(ret)) {
+			printk(KERN_ERR "tce_vfio: Failed to add TCE\n");
+			ret = -EIO;
+			goto unlock_exit;
+		}
+	}
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+unlock_exit:
+	if (ret && page)
+		put_page(page);
+
+	if (ret)
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx "
+				"ioba=%lx kva=%lx\n", tce,
+				entry << IOMMU_PAGE_SHIFT, kva);
+	return ret;
+}
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+	struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		printk(KERN_ERR "tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = container->tbl;
+	unsigned long i, tce;
+
+	/* Unmap leftovers */
+	spin_lock_irq(&tbl->it_lock);
+	for (i = tbl->it_offset; i < tbl->it_offset + tbl->it_size; ++i) {
+		tce = ppc_md.tce_get(tbl, i);
+		if (tce & VFIO_SPAPR_TCE_PUT_MASK)
+			tce_free(tbl, i, tce);
+	}
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+
+	spin_unlock_irq(&tbl->it_lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION: {
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+	}
+	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma64_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		if (!tbl)
+			return -ENXIO;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.dma64_window_start = 0;
+		info.dma64_window_size = 0;
+		info.flags = 0;
+
+		return copy_to_user((void __user *)arg, &info, minsz);
+	}
+	case VFIO_IOMMU_SPAPR_TCE_PUT: {
+		struct vfio_iommu_spapr_tce_put par;
+		struct iommu_table *tbl = container->tbl;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_put, tce);
+
+		if (copy_from_user(&par, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (par.argsz < minsz)
+			return -EINVAL;
+
+		if (!tbl) {
+			return -ENXIO;
+		}
+
+		spin_lock_irq(&tbl->it_lock);
+		ret = tce_put(tbl, par.ioba >> IOMMU_PAGE_SHIFT,
+				par.tce, par.flags);
+		spin_unlock_irq(&tbl->it_lock);
+
+		return ret;
+	}
+	default:
+		printk(KERN_WARNING "tce_vfio: unexpected cmd %x\n", cmd);
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	printk(KERN_DEBUG "tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+	if (container->tbl) {
+		printk(KERN_WARNING "tce_vfio: Only one group per IOMMU "
+				"container is allowed, "
+				"existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		return -EBUSY;
+	}
+
+	container->tbl = tbl;
+
+	return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	if (tbl != container->tbl) {
+		printk(KERN_WARNING "tce_vfio: detaching group #%u, expected "
+				"group is #%u\n", iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+		return;
+	}
+	printk(KERN_DEBUG "tce_vfio: detaching group #%u from iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+/*
+ * Add/delete devices support (hotplug, module_init, module_exit)
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (dev->iommu_group) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu "
+				"group %d, skipping\n", dev->kobj.name,
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		printk(KERN_DEBUG "tce_vfio: skipping device %s with no tbl\n",
+				dev->kobj.name);
+		return 0;
+	}
+
+	printk(KERN_DEBUG "tce_vfio: adding %s to iommu group %d\n",
+			dev->kobj.name, iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev->kobj.name, ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* If the current platform does not support tce_get
+	   we are unable to clean TCE table properly and
+	   therefore it is better not to touch it at all */
+	if (!ppc_md.tce_get) {
+		printk(KERN_ERR "tce_vfio: ppc_md.tce_get isn't implemented\n");
+		return -EOPNOTSUPP;
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Allocate and initialize VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return -EFAULT;
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..2c0a927 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -442,4 +443,32 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/* -------- API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;
+	__u32 dma32_window_start;
+	__u32 dma32_window_size;
+	__u64 dma64_window_start;
+	__u64 dma64_window_size;
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+struct vfio_iommu_spapr_tce_put {
+	__u32 argsz;
+	__u32 flags;
+#define VFIO_SPAPR_TCE_READ		1
+#define VFIO_SPAPR_TCE_WRITE		2
+#define VFIO_SPAPR_TCE_BIDIRECTIONAL	(VFIO_SPAPR_TCE_READ|VFIO_SPAPR_TCE_WRITE)
+#define VFIO_SPAPR_TCE_PUT_MASK		VFIO_SPAPR_TCE_BIDIRECTIONAL
+	__u64 ioba;
+	__u64 tce;
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_PUT	_IO(VFIO_TYPE, VFIO_BASE + 13)
+
+/* ***************************************************************** */
+
 #endif /* VFIO_H */
-- 
1.7.10.4

^ permalink raw reply related

* Re: 3.5+: yaboot, Invalid memory access
From: Michael Ellerman @ 2012-09-04  6:51 UTC (permalink / raw)
  To: Christian Kujau; +Cc: Steven Rostedt, linuxppc-dev
In-Reply-To: <alpine.DEB.2.01.1209032310220.25392@trent.utfs.org>

On Mon, 2012-09-03 at 23:18 -0700, Christian Kujau wrote:
> On Mon, 30 Jul 2012 at 22:46, Christian Kujau wrote:
> > when trying to upgrade from 3.5 (final) to today's git checkout from 
> > Linus' tree, yaboot cannot boot and the following is printed:
> > 
> >   [...]
> >   returning from prom_init
> >   Invalid memory access at %SRR0: 00c62fd4  %SRR1: 00003030
> 
> Finally got around to do a bisection:
> 
> -----------------------------
> b6e3796834faefe4b6e9a2aedfe12665cd51fbc5 is the first bad commit
> commit b6e3796834faefe4b6e9a2aedfe12665cd51fbc5
> Author: Steven Rostedt <srostedt@redhat.com>
> Date:   Thu Apr 26 08:31:18 2012 +0000
> 
>     powerpc: Have patch_instruction detect faults
>     
>     For ftrace to use the patch_instruction code, it needs to check for
>     faults on write. Ftrace updates code all over the kernel, and we need 
>     to know if code is updated or not due to protections that are placed 
>     on some portions of the kernel. If ftrace does not detect a fault, it 
>     will error later on, and it will be much more difficult to find the 
>     problem.
>     
>     By changing patch_instruction() to detect faults, then ftrace will be
>     able to make use of it too.
>     
>     Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
>     Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> 
> :040000 040000 83fb0e420524db452c07425e4dc402041428e696 0d1a01acd863237cf388045946ad4446a28df50c M      arch
> -----------------------------
> 
> The changeset looked pretty harmless to me but I tested with a current 
> 3.6+ git checkout and the kernel would only boot when this changeset was
> reverted.
> 
> Thoughts?

My guess would be we're calling that quite early and the __put_user()
check is getting confused and failing. That means we'll have left some
code unpatched, which then fails.

Can you try with the patch applied, but instead of returning if the
__put_user() fails, just continue on anyway.

That will isolate if it's something in the __put_user() (I doubt it), or
just that the __put_user() is failing and leaving the code unpatched.

cheers

^ permalink raw reply

* Re: 3.5+: yaboot, Invalid memory access
From: Christian Kujau @ 2012-09-04  6:18 UTC (permalink / raw)
  To: Steven Rostedt, Benjamin Herrenschmidt; +Cc: linuxppc-dev
In-Reply-To: <alpine.DEB.2.01.1207302235170.26923@trent.utfs.org>

On Mon, 30 Jul 2012 at 22:46, Christian Kujau wrote:
> when trying to upgrade from 3.5 (final) to today's git checkout from 
> Linus' tree, yaboot cannot boot and the following is printed:
> 
>   [...]
>   returning from prom_init
>   Invalid memory access at %SRR0: 00c62fd4  %SRR1: 00003030

Finally got around to do a bisection:

-----------------------------
b6e3796834faefe4b6e9a2aedfe12665cd51fbc5 is the first bad commit
commit b6e3796834faefe4b6e9a2aedfe12665cd51fbc5
Author: Steven Rostedt <srostedt@redhat.com>
Date:   Thu Apr 26 08:31:18 2012 +0000

    powerpc: Have patch_instruction detect faults
    
    For ftrace to use the patch_instruction code, it needs to check for
    faults on write. Ftrace updates code all over the kernel, and we need 
    to know if code is updated or not due to protections that are placed 
    on some portions of the kernel. If ftrace does not detect a fault, it 
    will error later on, and it will be much more difficult to find the 
    problem.
    
    By changing patch_instruction() to detect faults, then ftrace will be
    able to make use of it too.
    
    Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
    Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

:040000 040000 83fb0e420524db452c07425e4dc402041428e696 0d1a01acd863237cf388045946ad4446a28df50c M      arch
-----------------------------

The changeset looked pretty harmless to me but I tested with a current 
3.6+ git checkout and the kernel would only boot when this changeset was
reverted.

Thoughts?

Thanks,
Christian.

> The whole message: http://nerdbynature.de/bits/3.5.0/yaboot/
> 
> The Yaboot version is 1.3.16 (from Debian/testing), I haven't tried 1.3.17 
> yet, its changelog does not say anything about "newer kernel support" so 
> I'm not sure if this would help here.
> 
> Any other ideas?
> 
> Thanks,
> Christian.

-- 
BOFH excuse #154:

You can tune a file system, but you can't tune a fish (from most tunefs man pages)

^ permalink raw reply

* Re: [RFC v8 PATCH 13/20] memory-hotplug: check page type in get_page_bootmem
From: Wen Congyang @ 2012-09-04  3:46 UTC (permalink / raw)
  To: Andrew Morton, isimatu.yasuaki
  Cc: linux-s390, linux-ia64, len.brown, linux-acpi, linux-sh, x86,
	linux-kernel, cmetcalf, linux-mm, paulus, minchan.kim,
	kosaki.motohiro, rientjes, sparclinux, cl, linuxppc-dev, liuj97
In-Reply-To: <20120831143032.1343e99a.akpm@linux-foundation.org>

Hi, isimatu-san

At 09/01/2012 05:30 AM, Andrew Morton Wrote:
> On Tue, 28 Aug 2012 18:00:20 +0800
> wency@cn.fujitsu.com wrote:
> 
>> From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
>>
>> There is a possibility that get_page_bootmem() is called to the same page many
>> times. So when get_page_bootmem is called to the same page, the function only
>> increments page->_count.
> 
> I really don't understand this explanation, even after having looked at
> the code.  Can you please have another attempt at the changelog?

What is the problem that you want to fix? The function get_page_bootmem()
may be called to the same page more than once, but I don't find any problem
about current implementation.

Thanks
Wen Congyang

> 
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -95,10 +95,17 @@ static void release_memory_resource(struct resource *res)
>>  static void get_page_bootmem(unsigned long info,  struct page *page,
>>  			     unsigned long type)
>>  {
>> -	page->lru.next = (struct list_head *) type;
>> -	SetPagePrivate(page);
>> -	set_page_private(page, info);
>> -	atomic_inc(&page->_count);
>> +	unsigned long page_type;
>> +
>> +	page_type = (unsigned long) page->lru.next;
>> +	if (page_type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
>> +	    page_type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE){
>> +		page->lru.next = (struct list_head *) type;
>> +		SetPagePrivate(page);
>> +		set_page_private(page, info);
>> +		atomic_inc(&page->_count);
>> +	} else
>> +		atomic_inc(&page->_count);
>>  }
> 
> And a code comment which explains what is going on would be good.  As
> is always the case ;)
> 
> 

^ permalink raw reply

* [PATCH 4/4] powerpc: Restore correct DSCR in context switch
From: Anton Blanchard @ 2012-09-04  2:51 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev
In-Reply-To: <20120904124756.4ddeb535@kryten>


During a context switch we always restore the per thread DSCR value.
If we aren't doing explicit DSCR management
(ie thread.dscr_inherit == 0) and the default DSCR changed while
the process has been sleeping we end up with the wrong value.

Check thread.dscr_inherit and select the default DSCR or per thread
DSCR as required.

This was found with the following test case, when running with
more threads than CPUs (ie forcing context switching):

http://ozlabs.org/~anton/junkcode/dscr_default_test.c

With the four patches applied I can run a combination of all
test cases successfully at the same time:

http://ozlabs.org/~anton/junkcode/dscr_default_test.c
http://ozlabs.org/~anton/junkcode/dscr_explicit_test.c
http://ozlabs.org/~anton/junkcode/dscr_inherit_test.c

Signed-off-by: Anton Blanchard <anton@samba.org>  
Cc: <stable@kernel.org> # 3.0+
---

Index: b/arch/powerpc/kernel/entry_64.S
===================================================================
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -370,6 +370,12 @@ _GLOBAL(ret_from_fork)
 	li	r3,0
 	b	syscall_exit
 
+	.section	".toc","aw"
+DSCR_DEFAULT:
+	.tc dscr_default[TC],dscr_default
+
+	.section	".text"
+
 /*
  * This routine switches between two different tasks.  The process
  * state of one is saved on its kernel stack.  Then the state
@@ -509,9 +515,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEG
 	mr	r1,r8		/* start using new stack pointer */
 	std	r7,PACAKSAVE(r13)
 
-	ld	r6,_CCR(r1)
-	mtcrf	0xFF,r6
-
 #ifdef CONFIG_ALTIVEC
 BEGIN_FTR_SECTION
 	ld	r0,THREAD_VRSAVE(r4)
@@ -520,14 +523,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif /* CONFIG_ALTIVEC */
 #ifdef CONFIG_PPC64
 BEGIN_FTR_SECTION
+	lwz	r6,THREAD_DSCR_INHERIT(r4)
+	ld	r7,DSCR_DEFAULT@toc(2)
 	ld	r0,THREAD_DSCR(r4)
-	cmpd	r0,r25
-	beq	1f
+	cmpwi	r6,0
+	bne	1f
+	ld	r0,0(r7)
+1:	cmpd	r0,r25
+	beq	2f
 	mtspr	SPRN_DSCR,r0
-1:	
+2:
 END_FTR_SECTION_IFSET(CPU_FTR_DSCR)
 #endif
 
+	ld	r6,_CCR(r1)
+	mtcrf	0xFF,r6
+
 	/* r3-r13 are destroyed -- Cort */
 	REST_8GPRS(14, r1)
 	REST_10GPRS(22, r1)
Index: b/arch/powerpc/kernel/asm-offsets.c
===================================================================
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -76,6 +76,7 @@ int main(void)
 	DEFINE(SIGSEGV, SIGSEGV);
 	DEFINE(NMI_MASK, NMI_MASK);
 	DEFINE(THREAD_DSCR, offsetof(struct thread_struct, dscr));
+	DEFINE(THREAD_DSCR_INHERIT, offsetof(struct thread_struct, dscr_inherit));
 #else
 	DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
 #endif /* CONFIG_PPC64 */

^ permalink raw reply

* [PATCH 3/4] powerpc: Fix DSCR inheritance in copy_thread()
From: Anton Blanchard @ 2012-09-04  2:49 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev
In-Reply-To: <20120904124756.4ddeb535@kryten>


If the default DSCR is non zero we set thread.dscr_inherit in
copy_thread() meaning the new thread and all its children will ignore
future updates to the default DSCR. This is not intended and is
a change in behaviour that a number of our users have hit.

We just need to inherit thread.dscr and thread.dscr_inherit from
the parent which ends up being much simpler.

This was found with the following test case:

http://ozlabs.org/~anton/junkcode/dscr_default_test.c

Signed-off-by: Anton Blanchard <anton@samba.org>  
Cc: <stable@kernel.org> # 3.0+
---

Index: b/arch/powerpc/kernel/process.c
===================================================================
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -802,16 +802,8 @@ int copy_thread(unsigned long clone_flag
 #endif /* CONFIG_PPC_STD_MMU_64 */
 #ifdef CONFIG_PPC64 
 	if (cpu_has_feature(CPU_FTR_DSCR)) {
-		if (current->thread.dscr_inherit) {
-			p->thread.dscr_inherit = 1;
-			p->thread.dscr = current->thread.dscr;
-		} else if (0 != dscr_default) {
-			p->thread.dscr_inherit = 1;
-			p->thread.dscr = dscr_default;
-		} else {
-			p->thread.dscr_inherit = 0;
-			p->thread.dscr = 0;
-		}
+		p->thread.dscr_inherit = current->thread.dscr_inherit;
+		p->thread.dscr = current->thread.dscr;
 	}
 #endif
 

^ permalink raw reply

* [PATCH 2/4] powerpc: Keep thread.dscr and thread.dscr_inherit in sync
From: Anton Blanchard @ 2012-09-04  2:48 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev
In-Reply-To: <20120904124756.4ddeb535@kryten>


When we update the DSCR either via emulation of mtspr(DSCR) or via
a change to dscr_default in sysfs we don't update thread.dscr.
We will eventually update it at context switch time but there is
a period where thread.dscr is incorrect.

If we fork at this point we will copy the old value of thread.dscr
into the child. To avoid this, always keep thread.dscr in sync with
reality.

This issue was found with the following testcase:

http://ozlabs.org/~anton/junkcode/dscr_inherit_test.c

Signed-off-by: Anton Blanchard <anton@samba.org>  
Cc: <stable@kernel.org> # 3.0+
---

Index: b/arch/powerpc/kernel/traps.c
===================================================================
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -972,8 +972,9 @@ static int emulate_instruction(struct pt
 			cpu_has_feature(CPU_FTR_DSCR)) {
 		PPC_WARN_EMULATED(mtdscr, regs);
 		rd = (instword >> 21) & 0x1f;
-		mtspr(SPRN_DSCR, regs->gpr[rd]);
+		current->thread.dscr = regs->gpr[rd];
 		current->thread.dscr_inherit = 1;
+		mtspr(SPRN_DSCR, current->thread.dscr);
 		return 0;
 	}
 #endif
Index: b/arch/powerpc/kernel/sysfs.c
===================================================================
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -196,8 +196,10 @@ static ssize_t show_dscr_default(struct
 
 static void update_dscr(void *dummy)
 {
-	if (!current->thread.dscr_inherit)
+	if (!current->thread.dscr_inherit) {
+		current->thread.dscr = dscr_default;
 		mtspr(SPRN_DSCR, dscr_default);
+	}
 }
 
 static ssize_t __used store_dscr_default(struct device *dev,

^ permalink raw reply

* [PATCH 1/4] powerpc: Update DSCR on all CPUs when writing sysfs dscr_default
From: Anton Blanchard @ 2012-09-04  2:47 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev


Writing to dscr_default in sysfs doesn't actually change the DSCR -
we rely on a context switch on each CPU to do the work. There is no
guarantee we will get a context switch in a reasonable amount of time
so fire off an IPI to force an immediate change.

This issue was found with the following test case:

http://ozlabs.org/~anton/junkcode/dscr_explicit_test.c

Signed-off-by: Anton Blanchard <anton@samba.org>  
Cc: <stable@kernel.org> # 3.0+
---

Index: b/arch/powerpc/kernel/sysfs.c
===================================================================
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -194,6 +194,12 @@ static ssize_t show_dscr_default(struct
 	return sprintf(buf, "%lx\n", dscr_default);
 }
 
+static void update_dscr(void *dummy)
+{
+	if (!current->thread.dscr_inherit)
+		mtspr(SPRN_DSCR, dscr_default);
+}
+
 static ssize_t __used store_dscr_default(struct device *dev,
 		struct device_attribute *attr, const char *buf,
 		size_t count)
@@ -206,6 +212,8 @@ static ssize_t __used store_dscr_default
 		return -EINVAL;
 	dscr_default = val;
 
+	on_each_cpu(update_dscr, NULL, 1);
+
 	return count;
 }
 

^ permalink raw reply

* [PATCH 2/2] powerpc/pci: Use PCIe IP block revision register instead of compatible
From: Roy Zang @ 2012-09-03  9:22 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <1346664130-7896-1-git-send-email-tie-fei.zang@freescale.com>

Freescale PCIe IP block revision bigger than rev2.2 will also need
redefine the sequence of inbound windows. So change to use IP block
revision instead of compatible for the judgment.

Signed-off-by: Roy Zang <tie-fei.zang@freescale.com>
---

 arch/powerpc/sysdev/fsl_pci.c |   14 ++++++++------
 1 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index a7b2a60..bce48e6 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -143,18 +143,20 @@ static void __init setup_pci_atmu(struct pci_controller *hose,
 	pr_debug("PCI memory map start 0x%016llx, size 0x%016llx\n",
 		 (u64)rsrc->start, (u64)resource_size(rsrc));
 
-	if (of_device_is_compatible(hose->dn, "fsl,qoriq-pcie-v2.2")) {
-		win_idx = 2;
-		start_idx = 0;
-		end_idx = 3;
-	}
-
 	pci = ioremap(rsrc->start, resource_size(rsrc));
 	if (!pci) {
 	    dev_err(hose->parent, "Unable to map ATMU registers\n");
 	    return;
 	}
 
+	if (early_find_capability(hose, 0, 0, PCI_CAP_ID_EXP)) {
+		if (in_be32(&pci->block_rev1) >= PCIE_IP_REV_2_2) {
+			win_idx = 2;
+			start_idx = 0;
+			end_idx = 3;
+		}
+	}
+
 	/* Disable all windows (except powar0 since it's ignored) */
 	for(i = 1; i < 5; i++)
 		out_be32(&pci->pow[i].powar, 0);
-- 
1.7.8.1

^ permalink raw reply related

* [PATCH 1/2] powerpc/pci: Add IP revision register define for Freescale PCIe controller
From: Roy Zang @ 2012-09-03  9:22 UTC (permalink / raw)
  To: linuxppc-dev

Signed-off-by: Roy Zang <tie-fei.zang@freescale.com>
---
 arch/powerpc/sysdev/fsl_pci.h |    5 ++++-
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_pci.h b/arch/powerpc/sysdev/fsl_pci.h
index baa0fd1..7192932 100644
--- a/arch/powerpc/sysdev/fsl_pci.h
+++ b/arch/powerpc/sysdev/fsl_pci.h
@@ -16,6 +16,7 @@
 
 #define PCIE_LTSSM	0x0404		/* PCIE Link Training and Status */
 #define PCIE_LTSSM_L0	0x16		/* L0 state */
+#define PCIE_IP_REV_2_2		0x02080202 /* PCIE IP block version Rev2.2 */
 #define PIWAR_EN		0x80000000	/* Enable */
 #define PIWAR_PF		0x20000000	/* prefetch */
 #define PIWAR_TGI_LOCAL		0x00f00000	/* target - local memory */
@@ -57,7 +58,9 @@ struct ccsr_pci {
 	__be32	pex_pme_mes_disr;	/* 0x.024 - PCIE PME and message disable register */
 	__be32	pex_pme_mes_ier;	/* 0x.028 - PCIE PME and message interrupt enable register */
 	__be32	pex_pmcr;		/* 0x.02c - PCIE power management command register */
-	u8	res3[3024];
+	u8	res3[3016];
+	__be32	block_rev1;	/* 0x.bf8 - PCIE Block Revision register 1 */
+	__be32	block_rev2;	/* 0x.bfc - PCIE Block Revision register 2 */
 
 /* PCI/PCI Express outbound window 0-4
  * Window 0 is the default window and is the only window enabled upon reset.
-- 
1.7.8.1

^ permalink raw reply related

* Re: [RFC v8 PATCH 08/20] memory-hotplug: remove /sys/firmware/memmap/X sysfs
From: Wen Congyang @ 2012-09-03  7:31 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-s390, linux-ia64, len.brown, linux-acpi, linux-sh, x86,
	linux-kernel, cmetcalf, linux-mm, isimatu.yasuaki, paulus,
	minchan.kim, kosaki.motohiro, rientjes, sparclinux, cl,
	linuxppc-dev, liuj97
In-Reply-To: <20120831140623.8d13bd2c.akpm@linux-foundation.org>

At 09/01/2012 05:06 AM, Andrew Morton Wrote:
> On Tue, 28 Aug 2012 18:00:15 +0800
> wency@cn.fujitsu.com wrote:
> 
>> From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
>>
>> When (hot)adding memory into system, /sys/firmware/memmap/X/{end, start, type}
>> sysfs files are created. But there is no code to remove these files. The patch
>> implements the function to remove them.
>>
>> Note : The code does not free firmware_map_entry since there is no way to free
>>        memory which is allocated by bootmem.
>>
>> ....
>>
>> +#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj)
> 
> It would be better to implement this as an inlined C function.  That
> has improved type safety and improved readability.

Hmm, this macro is not a new macro. It is defined after the function
release_firmware_map_entry(). We just moved it here because we
need it in the function release_firmware_map_entry().

> 
>> +static void release_firmware_map_entry(struct kobject *kobj)
>> +{
>> +	struct firmware_map_entry *entry = to_memmap_entry(kobj);
>> +	struct page *page;
>> +
>> +	page = virt_to_page(entry);
>> +	if (PageSlab(page) || PageCompound(page))
> 
> That PageCompound() test looks rather odd.  Why is this done?
> 
>> +		kfree(entry);
>> +
>> +	/* There is no way to free memory allocated from bootmem*/
>> +}
> 
> This function is a bit ugly - poking around in page flags to determine
> whether or not the memory came from bootmem.  It would be cleaner to
> use a separate boolean.  Although I guess we can live with it as you
> have it here.
> 
>>  static struct kobj_type memmap_ktype = {
>> +	.release	= release_firmware_map_entry,
>>  	.sysfs_ops	= &memmap_attr_ops,
>>  	.default_attrs	= def_attrs,
>>  };
>> @@ -123,6 +139,16 @@ static int firmware_map_add_entry(u64 start, u64 end,
>>  	return 0;
>>  }
>>  
>> +/**
>> + * firmware_map_remove_entry() - Does the real work to remove a firmware
>> + * memmap entry.
>> + * @entry: removed entry.
>> + **/
>> +static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
>> +{
>> +	list_del(&entry->list);
>> +}
> 
> Is there no locking  to protect that list?

OK, I will add a lock to protect it.

Thanks
Wen Congyang

> 
>>  /*
>>   * Add memmap entry on sysfs
>>   */
>> @@ -144,6 +170,31 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry)
>>  	return 0;
>>  }
>>  
>> +/*
>> + * Remove memmap entry on sysfs
>> + */
>> +static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
>> +{
>> +	kobject_put(&entry->kobj);
>> +}
>> +
>> +/*
>> + * Search memmap entry
>> + */
>> +
>> +struct firmware_map_entry * __meminit
>> +find_firmware_map_entry(u64 start, u64 end, const char *type)
> 
> A better name would be firmware_map_find_entry().  To retain the (good)
> convention that symbols exported from here all start with
> "firmware_map_".
> 
>> +{
>> +	struct firmware_map_entry *entry;
>> +
>> +	list_for_each_entry(entry, &map_entries, list)
>> +		if ((entry->start == start) && (entry->end == end) &&
>> +		    (!strcmp(entry->type, type)))
>> +			return entry;
>> +
>> +	return NULL;
>> +}
>> +
>>  /**
>>   * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
>>   * memory hotplug.
>> @@ -196,6 +247,32 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
>>  	return firmware_map_add_entry(start, end, type, entry);
>>  }
>>  
>> +/**
>> + * firmware_map_remove() - remove a firmware mapping entry
>> + * @start: Start of the memory range.
>> + * @end:   End of the memory range.
>> + * @type:  Type of the memory range.
>> + *
>> + * removes a firmware mapping entry.
>> + *
>> + * Returns 0 on success, or -EINVAL if no entry.
>> + **/
>> +int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
>> +{
>> +	struct firmware_map_entry *entry;
>> +
>> +	entry = find_firmware_map_entry(start, end - 1, type);
>> +	if (!entry)
>> +		return -EINVAL;
>> +
>> +	firmware_map_remove_entry(entry);
>> +
>> +	/* remove the memmap entry */
>> +	remove_sysfs_fw_map_entry(entry);
>> +
>> +	return 0;
>> +}
> 
> Again, the lack of locking looks bad.
> 
>> ...
>>
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1052,9 +1052,9 @@ int offline_memory(u64 start, u64 size)
>>  	return 0;
>>  }
>>  
>> -int remove_memory(int nid, u64 start, u64 size)
>> +int __ref remove_memory(int nid, u64 start, u64 size)
> 
> Why was __ref added?
> 
>>  {
>> -	int ret = -EBUSY;
>> +	int ret = 0;
>>  	lock_memory_hotplug();
>>  	/*
>>  	 * The memory might become online by other task, even if you offine it.
>>
>> ...
>>
> 

^ permalink raw reply

* Re: [RFC v8 PATCH 08/20] memory-hotplug: remove /sys/firmware/memmap/X sysfs
From: Wen Congyang @ 2012-09-03  5:51 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-s390, linux-ia64, len.brown, linux-acpi, linux-sh, x86,
	linux-kernel, cmetcalf, linux-mm, isimatu.yasuaki, paulus,
	minchan.kim, kosaki.motohiro, rientjes, sparclinux, cl,
	linuxppc-dev, liuj97
In-Reply-To: <20120831140623.8d13bd2c.akpm@linux-foundation.org>

At 09/01/2012 05:06 AM, Andrew Morton Wrote:
> On Tue, 28 Aug 2012 18:00:15 +0800
> wency@cn.fujitsu.com wrote:
> 
>> From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
>>
>> When (hot)adding memory into system, /sys/firmware/memmap/X/{end, start, type}
>> sysfs files are created. But there is no code to remove these files. The patch
>> implements the function to remove them.
>>
>> Note : The code does not free firmware_map_entry since there is no way to free
>>        memory which is allocated by bootmem.
>>
>> ....
>>
>> +#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj)
> 
> It would be better to implement this as an inlined C function.  That
> has improved type safety and improved readability.
> 
>> +static void release_firmware_map_entry(struct kobject *kobj)
>> +{
>> +	struct firmware_map_entry *entry = to_memmap_entry(kobj);
>> +	struct page *page;
>> +
>> +	page = virt_to_page(entry);
>> +	if (PageSlab(page) || PageCompound(page))
> 
> That PageCompound() test looks rather odd.  Why is this done?

Liu Jiang and Christoph Lameter discussed how to find slab page
in this mail:
https://lkml.org/lkml/2012/7/6/333.

> 
>> +		kfree(entry);
>> +
>> +	/* There is no way to free memory allocated from bootmem*/
>> +}
> 
> This function is a bit ugly - poking around in page flags to determine
> whether or not the memory came from bootmem.  It would be cleaner to
> use a separate boolean.  Although I guess we can live with it as you
> have it here.
> 
>>  static struct kobj_type memmap_ktype = {
>> +	.release	= release_firmware_map_entry,
>>  	.sysfs_ops	= &memmap_attr_ops,
>>  	.default_attrs	= def_attrs,
>>  };
>> @@ -123,6 +139,16 @@ static int firmware_map_add_entry(u64 start, u64 end,
>>  	return 0;
>>  }
>>  
>> +/**
>> + * firmware_map_remove_entry() - Does the real work to remove a firmware
>> + * memmap entry.
>> + * @entry: removed entry.
>> + **/
>> +static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
>> +{
>> +	list_del(&entry->list);
>> +}
> 
> Is there no locking  to protect that list?
> 
>>  /*
>>   * Add memmap entry on sysfs
>>   */
>> @@ -144,6 +170,31 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry)
>>  	return 0;
>>  }
>>  
>> +/*
>> + * Remove memmap entry on sysfs
>> + */
>> +static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
>> +{
>> +	kobject_put(&entry->kobj);
>> +}
>> +
>> +/*
>> + * Search memmap entry
>> + */
>> +
>> +struct firmware_map_entry * __meminit
>> +find_firmware_map_entry(u64 start, u64 end, const char *type)
> 
> A better name would be firmware_map_find_entry().  To retain the (good)
> convention that symbols exported from here all start with
> "firmware_map_".

OK.

> 
>> +{
>> +	struct firmware_map_entry *entry;
>> +
>> +	list_for_each_entry(entry, &map_entries, list)
>> +		if ((entry->start == start) && (entry->end == end) &&
>> +		    (!strcmp(entry->type, type)))
>> +			return entry;
>> +
>> +	return NULL;
>> +}
>> +
>>  /**
>>   * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
>>   * memory hotplug.
>> @@ -196,6 +247,32 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
>>  	return firmware_map_add_entry(start, end, type, entry);
>>  }
>>  
>> +/**
>> + * firmware_map_remove() - remove a firmware mapping entry
>> + * @start: Start of the memory range.
>> + * @end:   End of the memory range.
>> + * @type:  Type of the memory range.
>> + *
>> + * removes a firmware mapping entry.
>> + *
>> + * Returns 0 on success, or -EINVAL if no entry.
>> + **/
>> +int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
>> +{
>> +	struct firmware_map_entry *entry;
>> +
>> +	entry = find_firmware_map_entry(start, end - 1, type);
>> +	if (!entry)
>> +		return -EINVAL;
>> +
>> +	firmware_map_remove_entry(entry);
>> +
>> +	/* remove the memmap entry */
>> +	remove_sysfs_fw_map_entry(entry);
>> +
>> +	return 0;
>> +}
> 
> Again, the lack of locking looks bad.
> 
>> ...
>>
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1052,9 +1052,9 @@ int offline_memory(u64 start, u64 size)
>>  	return 0;
>>  }
>>  
>> -int remove_memory(int nid, u64 start, u64 size)
>> +int __ref remove_memory(int nid, u64 start, u64 size)
> 
> Why was __ref added?

Hmm, firmware_map_remove() was put in meminit section, and we call it
in this function, so __ref is added here.

Thanks
Wen Congyang

> 
>>  {
>> -	int ret = -EBUSY;
>> +	int ret = 0;
>>  	lock_memory_hotplug();
>>  	/*
>>  	 * The memory might become online by other task, even if you offine it.
>>
>> ...
>>
> 

^ permalink raw reply

* Re: [RFC v8 PATCH 04/20] memory-hotplug: offline and remove memory when removing the memory device
From: Wen Congyang @ 2012-09-03  1:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-s390, linux-ia64, len.brown, linux-acpi, linux-sh, x86,
	linux-kernel, cmetcalf, linux-mm, isimatu.yasuaki, paulus,
	minchan.kim, kosaki.motohiro, rientjes, sparclinux, cl,
	linuxppc-dev, liuj97
In-Reply-To: <20120831135514.2a2dc0d4.akpm@linux-foundation.org>

At 09/01/2012 04:55 AM, Andrew Morton Wrote:
> On Tue, 28 Aug 2012 18:00:11 +0800
> wency@cn.fujitsu.com wrote:
> 
>> +int remove_memory(int nid, u64 start, u64 size)
>> +{
>> +	int ret = -EBUSY;
>> +	lock_memory_hotplug();
>> +	/*
>> +	 * The memory might become online by other task, even if you offine it.
>> +	 * So we check whether the cpu has been onlined or not.
> 
> I think you meant "memory", not "cpu".

Yes. I will fix it.

Thanks
Wen Congyang

> 
> Actually, "check whether any part of this memory range has been
> onlined" would be better.  If that is accurate ;)
> 
>> +	 */
>> +	if (!is_memblk_offline(start, size)) {
>> +		pr_warn("memory removing [mem %#010llx-%#010llx] failed, "
>> +			"because the memmory range is online\n",
>> +			start, start + size);
>> +		ret = -EAGAIN;
>> +	}
>> +
>> +	unlock_memory_hotplug();
>> +	return ret;
>> +
>> +}
>> +EXPORT_SYMBOL_GPL(remove_memory);
> 

^ permalink raw reply

* Re: [PATCH] i2c-mpc: Wait for STOP to hit the bus
From: Joakim Tjernlund @ 2012-09-02 14:22 UTC (permalink / raw)
  To: Tabi Timur-B04825
  Cc: linuxppc-dev@lists.ozlabs.org, linux-i2c@vger.kernel.org
In-Reply-To: <6AE080B68D46FC4BA2D2769E68D765B708019E97@039-SN2MPN1-022.039d.mgd.msft.net>

Tabi Timur-B04825 <B04825@freescale.com> wrote on 2012/09/02 04:48:01:
> On Thu, Aug 30, 2012 at 5:40 AM, Joakim Tjernlund
> <Joakim.Tjernlund@transmode.se> wrote:
>
> > -       mpc_i2c_stop(i2c);
> > +       mpc_i2c_stop(i2c); /* Initiate STOP */
> > +       orig_jiffies = jiffies;
> > +       /* Wait until STOP is seen, allow up to 1 s */
> > +       while (readb(i2c->base + MPC_I2C_SR) & CSR_MBB) {
> > +               if (time_after(jiffies, orig_jiffies + HZ)) {
> > +                       u8 status = readb(i2c->base + MPC_I2C_SR);
> > +
> > +                       dev_dbg(i2c->dev, "timeout\n");
> > +                       if ((status & (CSR_MCF | CSR_MBB | CSR_RXAK)) != 0) {
> > +                               writeb(status & ~CSR_MAL,
> > +                                      i2c->base + MPC_I2C_SR);
> > +                               mpc_i2c_fixup(i2c);
> > +                       }
> > +                       return -EIO;
> > +               }
> > +               cond_resched();
> > +       }
>
> Shouldn't the while-loop be inside mpc_i2c_stop() itself?

Possibly but I choosed to do it this way as there is a similar loop in the beginning of mpc_xfer().
I figured it has better visibility if it is in the same function.

 Jocke

^ permalink raw reply

* Re: Build regressions/improvements in v3.6-rc4
From: Geert Uytterhoeven @ 2012-09-02  9:33 UTC (permalink / raw)
  To: linux-kernel; +Cc: linuxppc-dev, linux-input
In-Reply-To: <1346577700-8156-1-git-send-email-geert@linux-m68k.org>

On Sun, Sep 2, 2012 at 11:21 AM, Geert Uytterhoeven
<geert@linux-m68k.org> wrote:
> JFYI, when comparing v3.6-rc4 to v3.6-rc3[3], the summaries are:
>   - build errors: +6/-13

6 regressions:
  + drivers/input/touchscreen/edt-ft5x06.c: error: 'struct
edt_ft5x06_ts_data' has no member named 'raw_buffer':  => 846:14

powerpc-randconfig, fix reported as queued by Dmitry.

  + error: eni.c: relocation truncated to fit: R_PPC64_REL24 against
symbol `_restgpr0_25' defined in .text.save.restore section in
arch/powerpc/lib/built-in.o:  => (.text+0x1ff96e4), (.text+0x1ff9438),
(.text+0x1ff8e74)
  + error: eni.c: relocation truncated to fit: R_PPC64_REL24 against
symbol `_restgpr0_28' defined in .text.save.restore section in
arch/powerpc/lib/built-in.o:  => (.text+0x1ff9848)
  + error: eni.c: relocation truncated to fit: R_PPC64_REL24 against
symbol `_restgpr0_30' defined in .text.save.restore section in
arch/powerpc/lib/built-in.o:  => (.text+0x1ff8ea8), (.text+0x1ff8edc)
  + error: eni.c: relocation truncated to fit: R_PPC64_REL24 against
symbol `_savegpr0_29' defined in .text.save.restore section in
arch/powerpc/lib/built-in.o:  => (.text+0x1ff9cf0)
  + error: fore200e.c: relocation truncated to fit: R_PPC64_REL24
against symbol `_restgpr0_22' defined in .text.save.restore section in
arch/powerpc/lib/built-in.o:  => (.text+0x1ff8608), (.text+0x1ff7e5c)

powerpc-allyesconfig

> [1] http://kisskb.ellerman.id.au/kisskb/head/5381/ (all 116 configs)
> [3] http://kisskb.ellerman.id.au/kisskb/head/5359/ (all 116 configs)

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH v7 6/8] fsl-dma: use spin_lock_bh to instead of spin_lock_irqsave
From: Dan Williams @ 2012-09-02  8:41 UTC (permalink / raw)
  To: qiang.liu
  Cc: arnd, vinod.koul, gregkh, linux-kernel, Timur Tabi, herbert,
	linux-crypto, linuxppc-dev, davem
In-Reply-To: <1344500582-11110-1-git-send-email-qiang.liu@freescale.com>

On Thu, Aug 9, 2012 at 1:23 AM,  <qiang.liu@freescale.com> wrote:
> From: Qiang Liu <qiang.liu@freescale.com>
>
> The use of spin_lock_irqsave() is a stronger locking mechanism than is
> required throughout the driver. The minimum locking required should be
> used instead. Interrupts will be turned off and context will be saved,
> there is needless to use irqsave.
>
> Change all instances of spin_lock_irqsave() to spin_lock_bh().
> All manipulation of protected fields is done using tasklet context or
> weaker, which makes spin_lock_bh() the correct choice.

It seems you are coordinating fsl-dma copy and talitos xor operations.
 It looks like fsl-dma will be called through
talitos_process_pending()->dma_run_dependencies(), which is
potentially called in hard irq context.

This all comes back to the need to fix raid offload to manage the
channels explicitly rather than the current dependency chains.

--
Dan

^ permalink raw reply

* Re: [PATCH v7 1/8] Talitos: Support for async_tx XOR offload
From: Dan Williams @ 2012-09-02  8:12 UTC (permalink / raw)
  To: qiang.liu
  Cc: arnd, vinod.koul, gregkh, linux-kernel, herbert, linux-crypto,
	dan.j.williams, linuxppc-dev, davem
In-Reply-To: <1344500448-10927-1-git-send-email-qiang.liu@freescale.com>

On Thu, Aug 9, 2012 at 1:20 AM,  <qiang.liu@freescale.com> wrote:
> From: Qiang Liu <qiang.liu@freescale.com>
>
> Expose Talitos's XOR functionality to be used for RAID parity
> calculation via the Async_tx layer.
>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: David S. Miller <davem@davemloft.net>
> Signed-off-by: Dipen Dudhat <Dipen.Dudhat@freescale.com>
> Signed-off-by: Maneesh Gupta <Maneesh.Gupta@freescale.com>
> Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
> Signed-off-by: Vishnu Suresh <Vishnu@freescale.com>
> Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
> ---
>  drivers/crypto/Kconfig   |    9 +
>  drivers/crypto/talitos.c |  413 ++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/crypto/talitos.h |   53 ++++++
>  3 files changed, 475 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
> index be6b2ba..f0a7c29 100644
> --- a/drivers/crypto/Kconfig
> +++ b/drivers/crypto/Kconfig
> @@ -222,6 +222,15 @@ config CRYPTO_DEV_TALITOS
>           To compile this driver as a module, choose M here: the module
>           will be called talitos.
>
> +config CRYPTO_DEV_TALITOS_RAIDXOR
> +       bool "Talitos RAID5 XOR Calculation Offload"
> +       default y

No, default y.  The user should explicitly enable this.

> +       select DMA_ENGINE
> +       depends on CRYPTO_DEV_TALITOS
> +       help
> +         Say 'Y' here to use the Freescale Security Engine (SEC) to
> +         offload RAID XOR parity Calculation
> +

Is it faster than cpu xor?

Will this engine be coordinating with another to handle memory copies?
 The dma mapping code for async_tx/raid is broken when dma mapping
requests overlap or cross dma device boundaries [1].

[1]: http://marc.info/?l=linux-arm-kernel&m=129407269402930&w=2

> +static void talitos_process_pending(struct talitos_xor_chan *xor_chan)
> +{
> +       struct talitos_xor_desc *desc, *_desc;
> +       unsigned long flags;
> +       int status;
> +       struct talitos_private *priv;
> +       int ch;
> +
> +       priv = dev_get_drvdata(xor_chan->dev);
> +       ch = atomic_inc_return(&priv->last_chan) &
> +                                 (priv->num_channels - 1);

Maybe a comment about why this this is duplicated from
talitos_cra_init()?  It sticks out here and I had to go looking to
find out why this channel number increments on submit.


> +       spin_lock_irqsave(&xor_chan->desc_lock, flags);
> +
> +       list_for_each_entry_safe(desc, _desc, &xor_chan->pending_q, node) {
> +               status = talitos_submit(xor_chan->dev, ch, &desc->hwdesc,
> +                                       talitos_release_xor, desc);
> +               if (status != -EINPROGRESS)
> +                       break;
> +
> +               list_del(&desc->node);
> +               list_add_tail(&desc->node, &xor_chan->in_progress_q);
> +       }
> +
> +       spin_unlock_irqrestore(&xor_chan->desc_lock, flags);
> +}
> +
> +static void talitos_xor_run_tx_complete_actions(struct talitos_xor_desc *desc,
> +               struct talitos_xor_chan *xor_chan)
> +{
> +       struct device *dev = xor_chan->dev;
> +       dma_addr_t dest, addr;
> +       unsigned int src_cnt = desc->unmap_src_cnt;
> +       unsigned int len = desc->unmap_len;
> +       enum dma_ctrl_flags flags = desc->async_tx.flags;
> +       struct dma_async_tx_descriptor *tx = &desc->async_tx;
> +
> +       /* unmap dma addresses */
> +       dest = desc->hwdesc.ptr[6].ptr;
> +       if (likely(!(flags & DMA_COMPL_SKIP_DEST_UNMAP)))
> +               dma_unmap_page(dev, dest, len, DMA_BIDIRECTIONAL);
> +
> +       desc->idx = 6 - src_cnt;
> +       if (likely(!(flags & DMA_COMPL_SKIP_SRC_UNMAP))) {
> +               while(desc->idx < 6) {

Checkpatch says:
ERROR: space required before the open parenthesis '('


> +                       addr = desc->hwdesc.ptr[desc->idx++].ptr;
> +                       if (addr == dest)
> +                               continue;
> +                       dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
> +               }
> +       }
> +
> +       /* run dependent operations */
> +       dma_run_dependencies(tx);

Here is where we run into problems if another engine accesses these
same buffers, especially on ARM v6+.

> +}
> +
> +static void talitos_release_xor(struct device *dev, struct talitos_desc *hwdesc,
> +                               void *context, int error)
> +{
> +       struct talitos_xor_desc *desc = context;
> +       struct talitos_xor_chan *xor_chan;
> +       dma_async_tx_callback callback;
> +       void *callback_param;
> +
> +       if (unlikely(error))
> +               dev_err(dev, "xor operation: talitos error %d\n", error);
> +
> +       xor_chan = container_of(desc->async_tx.chan, struct talitos_xor_chan,
> +                               common);
> +       spin_lock_bh(&xor_chan->desc_lock);
> +       if (xor_chan->completed_cookie < desc->async_tx.cookie)
> +               xor_chan->completed_cookie = desc->async_tx.cookie;
> +

Use dma_cookie_complete().

> +       callback = desc->async_tx.callback;
> +       callback_param = desc->async_tx.callback_param;
> +
> +       if (callback) {
> +               spin_unlock_bh(&xor_chan->desc_lock);
> +               callback(callback_param);
> +               spin_lock_bh(&xor_chan->desc_lock);

As mentioned you'll either need to ensure that
talitos_process_pending() is only called from the tasklet, or upgrade
these locks to hardirq safe.

> +       }
> +
> +       talitos_xor_run_tx_complete_actions(desc, xor_chan);
> +
> +       list_del(&desc->node);
> +       list_add_tail(&desc->node, &xor_chan->free_desc);
> +       spin_unlock_bh(&xor_chan->desc_lock);
> +       if (!list_empty(&xor_chan->pending_q))
> +               talitos_process_pending(xor_chan);
> +}
> +
> +/**
> + * talitos_issue_pending - move the descriptors in submit
> + * queue to pending queue and submit them for processing
> + * @chan: DMA channel
> + */
> +static void talitos_issue_pending(struct dma_chan *chan)
> +{
> +       struct talitos_xor_chan *xor_chan;
> +
> +       xor_chan = container_of(chan, struct talitos_xor_chan, common);
> +       spin_lock_bh(&xor_chan->desc_lock);
> +       list_splice_tail_init(&xor_chan->submit_q,
> +                                &xor_chan->pending_q);
> +       spin_unlock_bh(&xor_chan->desc_lock);
> +       talitos_process_pending(xor_chan);
> +}
> +
> +static dma_cookie_t talitos_async_tx_submit(struct dma_async_tx_descriptor *tx)
> +{
> +       struct talitos_xor_desc *desc;
> +       struct talitos_xor_chan *xor_chan;
> +       dma_cookie_t cookie;
> +
> +       desc = container_of(tx, struct talitos_xor_desc, async_tx);
> +       xor_chan = container_of(tx->chan, struct talitos_xor_chan, common);
> +
> +       spin_lock_bh(&xor_chan->desc_lock);
> +
> +       cookie = xor_chan->common.cookie + 1;
> +       if (cookie < 0)
> +               cookie = 1;

Should use the new dma_cookie_assign() helper.

> +
> +       desc->async_tx.cookie = cookie;
> +       xor_chan->common.cookie = desc->async_tx.cookie;
> +
> +       list_splice_tail_init(&desc->tx_list,
> +                                &xor_chan->submit_q);
> +
> +       spin_unlock_bh(&xor_chan->desc_lock);
> +
> +       return cookie;
> +}
> +
> +static struct talitos_xor_desc *talitos_xor_alloc_descriptor(
> +                               struct talitos_xor_chan *xor_chan, gfp_t flags)
> +{
> +       struct talitos_xor_desc *desc;
> +
> +       desc = kmalloc(sizeof(*desc), flags);
> +       if (desc) {
> +               xor_chan->total_desc++;
> +               desc->async_tx.tx_submit = talitos_async_tx_submit;
> +       }
> +
> +       return desc;
> +}
> +
[..]
> +
> +static struct dma_async_tx_descriptor *talitos_prep_dma_xor(
> +                       struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
> +                       unsigned int src_cnt, size_t len, unsigned long flags)
> +{
> +       struct talitos_xor_chan *xor_chan;
> +       struct talitos_xor_desc *new;
> +       struct talitos_desc *desc;
> +       int i, j;
> +
> +       BUG_ON(len > TALITOS_MAX_DATA_LEN);
> +
> +       xor_chan = container_of(chan, struct talitos_xor_chan, common);
> +
> +       spin_lock_bh(&xor_chan->desc_lock);
> +       if (!list_empty(&xor_chan->free_desc)) {
> +               new = container_of(xor_chan->free_desc.next,
> +                                  struct talitos_xor_desc, node);
> +               list_del(&new->node);
> +       } else {
> +                new = talitos_xor_alloc_descriptor(xor_chan, GFP_KERNEL | GFP_DMA);

You can't hold a spin_lock over a GFP_KERNEL allocation.

> +       }
> +       spin_unlock_bh(&xor_chan->desc_lock);
> +
> +       if (!new) {
> +               dev_err(xor_chan->common.device->dev,
> +                       "No free memory for XOR DMA descriptor\n");
> +               return NULL;
> +       }
> +       dma_async_tx_descriptor_init(&new->async_tx, &xor_chan->common);
> +
> +       INIT_LIST_HEAD(&new->node);
> +       INIT_LIST_HEAD(&new->tx_list);
> +
> +       desc = &new->hwdesc;
> +       /* Set destination: Last pointer pair */
> +       to_talitos_ptr(&desc->ptr[6], dest);
> +       desc->ptr[6].len = cpu_to_be16(len);
> +       desc->ptr[6].j_extent = 0;
> +       new->unmap_src_cnt = src_cnt;
> +       new->unmap_len = len;
> +
> +       /* Set Sources: End loading from second-last pointer pair */
> +       for (i = 5, j = 0; j < src_cnt && i >= 0; i--, j++) {
> +               to_talitos_ptr(&desc->ptr[i], src[j]);
> +               desc->ptr[i].len = cpu_to_be16(len);
> +               desc->ptr[i].j_extent = 0;
> +       }
> +
> +       /*
> +        * documentation states first 0 ptr/len combo marks end of sources
> +        * yet device produces scatter boundary error unless all subsequent
> +        * sources are zeroed out
> +        */
> +       for (; i >= 0; i--) {
> +               to_talitos_ptr(&desc->ptr[i], 0);
> +               desc->ptr[i].len = 0;
> +               desc->ptr[i].j_extent = 0;
> +       }
> +
> +       desc->hdr = DESC_HDR_SEL0_AESU | DESC_HDR_MODE0_AESU_XOR |
> +               DESC_HDR_TYPE_RAID_XOR;
> +
> +       new->async_tx.parent = NULL;
> +       new->async_tx.next = NULL;
> +       new->async_tx.cookie = 0;
> +       async_tx_ack(&new->async_tx);
> +
> +       list_add_tail(&new->node, &new->tx_list);
> +
> +       new->async_tx.flags = flags;
> +       new->async_tx.cookie = -EBUSY;
> +
> +       return &new->async_tx;
> +}
> +
> +static void talitos_unregister_async_xor(struct device *dev)
> +{
> +       struct talitos_private *priv = dev_get_drvdata(dev);
> +       struct talitos_xor_chan *xor_chan;
> +       struct dma_chan *chan, *_chan;
> +
> +       if (priv->dma_dev_common.chancnt)
> +               dma_async_device_unregister(&priv->dma_dev_common);
> +
> +       list_for_each_entry_safe(chan, _chan, &priv->dma_dev_common.channels,
> +                               device_node) {
> +               xor_chan = container_of(chan, struct talitos_xor_chan,
> +                                       common);
> +               list_del(&chan->device_node);
> +               priv->dma_dev_common.chancnt--;
> +               kfree(xor_chan);
> +       }
> +}
> +
> +/**
> + * talitos_register_dma_async - Initialize the Freescale XOR ADMA device
> + * It is registered as a DMA device with the capability to perform
> + * XOR operation with the Async_tx layer.
> + * The various queues and channel resources are also allocated.
> + */
> +static int talitos_register_async_tx(struct device *dev, int max_xor_srcs)
> +{
> +       struct talitos_private *priv = dev_get_drvdata(dev);
> +       struct dma_device *dma_dev = &priv->dma_dev_common;
> +       struct talitos_xor_chan *xor_chan;
> +       int err;
> +
> +       xor_chan = kzalloc(sizeof(struct talitos_xor_chan), GFP_KERNEL);
> +       if (!xor_chan) {
> +               dev_err(dev, "unable to allocate xor channel\n");
> +               return -ENOMEM;
> +       }
> +
> +       dma_dev->dev = dev;
> +       dma_dev->device_alloc_chan_resources = talitos_alloc_chan_resources;
> +       dma_dev->device_free_chan_resources = talitos_free_chan_resources;
> +       dma_dev->device_prep_dma_xor = talitos_prep_dma_xor;
> +       dma_dev->max_xor = max_xor_srcs;
> +       dma_dev->device_tx_status = talitos_is_tx_complete;
> +       dma_dev->device_issue_pending = talitos_issue_pending;
> +       INIT_LIST_HEAD(&dma_dev->channels);
> +       dma_cap_set(DMA_XOR, dma_dev->cap_mask);
> +
> +       xor_chan->dev = dev;
> +       xor_chan->common.device = dma_dev;
> +       xor_chan->total_desc = 0;
> +       INIT_LIST_HEAD(&xor_chan->submit_q);
> +       INIT_LIST_HEAD(&xor_chan->pending_q);
> +       INIT_LIST_HEAD(&xor_chan->in_progress_q);
> +       INIT_LIST_HEAD(&xor_chan->free_desc);
> +       spin_lock_init(&xor_chan->desc_lock);
> +
> +       list_add_tail(&xor_chan->common.device_node, &dma_dev->channels);
> +       dma_dev->chancnt++;
> +
> +       err = dma_async_device_register(dma_dev);
> +       if (err) {
> +               dev_err(dev, "Unable to register XOR with Async_tx\n");
> +               goto err_out;
> +       }
> +
> +       return err;
> +
> +err_out:
> +       talitos_unregister_async_xor(dev);
> +       return err;
> +}
> +#endif
> +
>  /*
>   * crypto alg
>   */
> @@ -2891,6 +3284,26 @@ static int talitos_probe(struct platform_device *ofdev)
>                         dev_info(dev, "hwrng\n");
>         }
>
> +#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR

Kill these ifdefs in the C file.  Do something like: if
(xor_enabled()) { } around the code sections that are optional so you
always get compile coverage.  Where xor_enabled() is a shorter form of
IS_ENABLED(CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR).

> +       /*
> +        * register with async_tx xor, if capable
> +        * SEC 2.x support up to 3 RAID sources,
> +        * SEC 3.x support up to 6
> +        */
> +       if (hw_supports(dev, DESC_HDR_SEL0_AESU | DESC_HDR_TYPE_RAID_XOR)) {
> +               int max_xor_srcs = 3;
> +               if (of_device_is_compatible(np, "fsl,sec3.0"))
> +                       max_xor_srcs = 6;
> +               err = talitos_register_async_tx(dev, max_xor_srcs);
> +               if (err) {
> +                       dev_err(dev, "failed to register async_tx xor: %d\n",
> +                                       err);
> +                       goto err_out;
> +               }
> +               dev_info(dev, "max_xor_srcs %d\n", max_xor_srcs);
> +       }
> +#endif
> +
>         /* register crypto algorithms the device supports */
>         for (i = 0; i < ARRAY_SIZE(driver_algs); i++) {
>                 if (hw_supports(dev, driver_algs[i].desc_hdr_template)) {
> diff --git a/drivers/crypto/talitos.h b/drivers/crypto/talitos.h
> index 61a1405..fc9d125 100644
> --- a/drivers/crypto/talitos.h
> +++ b/drivers/crypto/talitos.h
> @@ -30,6 +30,7 @@
>
>  #define TALITOS_TIMEOUT 100000
>  #define TALITOS_MAX_DATA_LEN 65535
> +#define TALITOS_MAX_DESCRIPTOR_NR 256
>
>  #define DESC_TYPE(desc_hdr) ((be32_to_cpu(desc_hdr) >> 3) & 0x1f)
>  #define PRIMARY_EU(desc_hdr) ((be32_to_cpu(desc_hdr) >> 28) & 0xf)
> @@ -131,7 +132,57 @@ struct talitos_private {
>
>         /* hwrng device */
>         struct hwrng rng;
> +
> +#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
> +       /* XOR Device */
> +       struct dma_device dma_dev_common;
> +#endif
> +};
> +
> +#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
> +/**
> + * talitos_xor_chan - context management for the async_tx channel
> + * @completed_cookie: the last completed cookie
> + * @desc_lock: lock for tx queue
> + * @total_desc: number of descriptors allocated
> + * @submit_q: queue of submitted descriptors
> + * @pending_q: queue of pending descriptors
> + * @in_progress_q: queue of descriptors in progress
> + * @free_desc: queue of unused descriptors
> + * @dev: talitos device implementing this channel
> + * @common: the corresponding xor channel in async_tx
> + */
> +struct talitos_xor_chan {
> +       dma_cookie_t completed_cookie;
> +       spinlock_t desc_lock;
> +       unsigned int total_desc;
> +       struct list_head submit_q;
> +       struct list_head pending_q;
> +       struct list_head in_progress_q;
> +       struct list_head free_desc;
> +       struct device *dev;
> +       struct dma_chan common;
> +};
> +
> +/**
> + * talitos_xor_desc - software xor descriptor
> + * @async_tx: the referring async_tx descriptor
> + * @node:
> + * @hwdesc: h/w descriptor
> + * @unmap_src_cnt: number of xor sources
> + * @unmap_len: transaction byte count
> + * @idx: index of xor sources
> + */
> +struct talitos_xor_desc {
> +       struct dma_async_tx_descriptor async_tx;
> +       struct list_head tx_list;
> +       struct list_head node;
> +       struct talitos_desc hwdesc;
> +       unsigned int unmap_src_cnt;
> +       unsigned int unmap_len;
> +       unsigned int idx;
>  };
> +#endif
>
>  extern int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
>                           void (*callback)(struct device *dev,
> @@ -284,6 +335,7 @@ extern int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
>  /* primary execution unit mode (MODE0) and derivatives */
>  #define        DESC_HDR_MODE0_ENCRYPT          cpu_to_be32(0x00100000)
>  #define        DESC_HDR_MODE0_AESU_CBC         cpu_to_be32(0x00200000)
> +#define        DESC_HDR_MODE0_AESU_XOR         cpu_to_be32(0x0c600000)
>  #define        DESC_HDR_MODE0_DEU_CBC          cpu_to_be32(0x00400000)
>  #define        DESC_HDR_MODE0_DEU_3DES         cpu_to_be32(0x00200000)
>  #define        DESC_HDR_MODE0_MDEU_CONT        cpu_to_be32(0x08000000)
> @@ -344,6 +396,7 @@ extern int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
>  #define DESC_HDR_TYPE_IPSEC_ESP                        cpu_to_be32(1 << 3)
>  #define DESC_HDR_TYPE_COMMON_NONSNOOP_NO_AFEU  cpu_to_be32(2 << 3)
>  #define DESC_HDR_TYPE_HMAC_SNOOP_NO_AFEU       cpu_to_be32(4 << 3)
> +#define DESC_HDR_TYPE_RAID_XOR                  cpu_to_be32(21 << 3)
>
>  /* link table extent field bits */
>  #define DESC_PTR_LNKTBL_JUMP                   0x80

^ permalink raw reply

* Re: [PATCH v7 1/8] Talitos: Support for async_tx XOR offload
From: Dan Williams @ 2012-09-02  6:47 UTC (permalink / raw)
  To: Geanta Neag Horia Ioan-B05471
  Cc: Li Yang-R58472, arnd@arndb.de, vinod.koul@intel.com,
	Liu Qiang-B32616, linux-kernel@vger.kernel.org,
	Phillips Kim-R1AAHA, herbert@gondor.hengli.com.au,
	linux-crypto@vger.kernel.org, gregkh@linuxfoundation.org,
	dan.j.williams@intel.com, linuxppc-dev@lists.ozlabs.org,
	davem@davemloft.net
In-Reply-To: <FD18E5D573A8AB48A365D4D78185DE992E5C0D@039-SN1MPN1-001.039d.mgd.msft.net>

On Thu, Aug 30, 2012 at 7:23 AM, Geanta Neag Horia Ioan-B05471
<B05471@freescale.com> wrote:
>
> Besides these:
> 1. As Dan Williams mentioned, you should explain why you are using
> both spin_lock_bh and spin_lock_irqsave on the same lock.

It looks like talitos_process_pending() can be called from hardirq
context via talitos_error().  So spin_lock_bh is not sufficient.

^ permalink raw reply

* Re: PPC64 & DMA zone
From: Benjamin Herrenschmidt @ 2012-09-02  3:12 UTC (permalink / raw)
  To: Tabi Timur-B04825; +Cc: linuxppc-dev@ozlabs.org, Aaro Koskinen
In-Reply-To: <6AE080B68D46FC4BA2D2769E68D765B708019EB2@039-SN2MPN1-022.039d.mgd.msft.net>

On Sun, 2012-09-02 at 02:54 +0000, Tabi Timur-B04825 wrote:
> On Fri, Aug 17, 2012 at 3:40 PM, Benjamin Herrenschmidt
> <benh@kernel.crashing.org> wrote:
> > On Fri, 2012-08-17 at 23:04 +0300, Aaro Koskinen wrote:
> >> Hi,
> >>
> >> Is there a way to reduce/limit DMA zone with PPC64 kernel (3.6-rcX)
> >> on G5 Mac? I have tried to search Kconfig or command line options,
> >> but can't find anything.
> >
> > No.
> 
> What about Shaohui patches for ZONE_DMA?  That's meant to solve
> exactly this problem?  Granted, it sets the DMA zone to 31 bits, but
> it's simple to hack that to 30 bits.

Unnecessary for this specific problem. The right fix to the iommu code
has already been done.

Cheers,
Ben.

^ permalink raw reply

* Re: PPC64 & DMA zone
From: Tabi Timur-B04825 @ 2012-09-02  2:54 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev@ozlabs.org, Aaro Koskinen
In-Reply-To: <1345236056.11781.1.camel@pasglop>

On Fri, Aug 17, 2012 at 3:40 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Fri, 2012-08-17 at 23:04 +0300, Aaro Koskinen wrote:
>> Hi,
>>
>> Is there a way to reduce/limit DMA zone with PPC64 kernel (3.6-rcX)
>> on G5 Mac? I have tried to search Kconfig or command line options,
>> but can't find anything.
>
> No.

What about Shaohui patches for ZONE_DMA?  That's meant to solve
exactly this problem?  Granted, it sets the DMA zone to 31 bits, but
it's simple to hack that to 30 bits.

--=20
Timur Tabi
Linux kernel developer at Freescale=

^ permalink raw reply

* Re: [PATCH] QE USB host fix for mpc832x
From: Tabi Timur-B04825 @ 2012-09-02  2:50 UTC (permalink / raw)
  To: Ben Dubb; +Cc: linuxppc-dev@ozlabs.org
In-Reply-To: <CACLP1513KDxU95cY0_tpMqenwFmc8WZyuQMbaZbvU_pZdVqkSg@mail.gmail.com>

On Wed, Aug 22, 2012 at 10:54 PM, Ben Dubb <ben.dubb@gmail.com> wrote:
> This patch should be used to get QE USB host mode working on mpc832x and
> mpc8360
> based devices. It fixes the following issues:
>
> - BRG divisor shall not be an add number greater than 3.
> - USB param block in multi-user ram can't be accessed at default location=
 on
> mpc832x
>   device. Allocate it dynamically.

I think this should be two separate patches.

--=20
Timur Tabi
Linux kernel developer at Freescale=

^ permalink raw reply

* Re: [PATCH] i2c-mpc: Wait for STOP to hit the bus
From: Tabi Timur-B04825 @ 2012-09-02  2:48 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-dev@lists.ozlabs.org, linux-i2c@vger.kernel.org
In-Reply-To: <1346323204-19210-1-git-send-email-Joakim.Tjernlund@transmode.se>

On Thu, Aug 30, 2012 at 5:40 AM, Joakim Tjernlund
<Joakim.Tjernlund@transmode.se> wrote:

> -       mpc_i2c_stop(i2c);
> +       mpc_i2c_stop(i2c); /* Initiate STOP */
> +       orig_jiffies =3D jiffies;
> +       /* Wait until STOP is seen, allow up to 1 s */
> +       while (readb(i2c->base + MPC_I2C_SR) & CSR_MBB) {
> +               if (time_after(jiffies, orig_jiffies + HZ)) {
> +                       u8 status =3D readb(i2c->base + MPC_I2C_SR);
> +
> +                       dev_dbg(i2c->dev, "timeout\n");
> +                       if ((status & (CSR_MCF | CSR_MBB | CSR_RXAK)) !=
=3D 0) {
> +                               writeb(status & ~CSR_MAL,
> +                                      i2c->base + MPC_I2C_SR);
> +                               mpc_i2c_fixup(i2c);
> +                       }
> +                       return -EIO;
> +               }
> +               cond_resched();
> +       }

Shouldn't the while-loop be inside mpc_i2c_stop() itself?

--=20
Timur Tabi
Linux kernel developer at Freescale=

^ permalink raw reply

* Re: [RFC v8 PATCH 13/20] memory-hotplug: check page type in get_page_bootmem
From: Andrew Morton @ 2012-08-31 21:30 UTC (permalink / raw)
  To: wency
  Cc: linux-s390, linux-ia64, len.brown, linux-acpi, linux-sh, x86,
	linux-kernel, cmetcalf, linux-mm, isimatu.yasuaki, paulus,
	minchan.kim, kosaki.motohiro, rientjes, sparclinux, cl,
	linuxppc-dev, liuj97
In-Reply-To: <1346148027-24468-14-git-send-email-wency@cn.fujitsu.com>

On Tue, 28 Aug 2012 18:00:20 +0800
wency@cn.fujitsu.com wrote:

> From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
> 
> There is a possibility that get_page_bootmem() is called to the same page many
> times. So when get_page_bootmem is called to the same page, the function only
> increments page->_count.

I really don't understand this explanation, even after having looked at
the code.  Can you please have another attempt at the changelog?

> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -95,10 +95,17 @@ static void release_memory_resource(struct resource *res)
>  static void get_page_bootmem(unsigned long info,  struct page *page,
>  			     unsigned long type)
>  {
> -	page->lru.next = (struct list_head *) type;
> -	SetPagePrivate(page);
> -	set_page_private(page, info);
> -	atomic_inc(&page->_count);
> +	unsigned long page_type;
> +
> +	page_type = (unsigned long) page->lru.next;
> +	if (page_type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
> +	    page_type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE){
> +		page->lru.next = (struct list_head *) type;
> +		SetPagePrivate(page);
> +		set_page_private(page, info);
> +		atomic_inc(&page->_count);
> +	} else
> +		atomic_inc(&page->_count);
>  }

And a code comment which explains what is going on would be good.  As
is always the case ;)

^ permalink raw reply

* Re: [RFC v8 PATCH 08/20] memory-hotplug: remove /sys/firmware/memmap/X sysfs
From: Andrew Morton @ 2012-08-31 21:06 UTC (permalink / raw)
  To: wency
  Cc: linux-s390, linux-ia64, len.brown, linux-acpi, linux-sh, x86,
	linux-kernel, cmetcalf, linux-mm, isimatu.yasuaki, paulus,
	minchan.kim, kosaki.motohiro, rientjes, sparclinux, cl,
	linuxppc-dev, liuj97
In-Reply-To: <1346148027-24468-9-git-send-email-wency@cn.fujitsu.com>

On Tue, 28 Aug 2012 18:00:15 +0800
wency@cn.fujitsu.com wrote:

> From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
> 
> When (hot)adding memory into system, /sys/firmware/memmap/X/{end, start, type}
> sysfs files are created. But there is no code to remove these files. The patch
> implements the function to remove them.
> 
> Note : The code does not free firmware_map_entry since there is no way to free
>        memory which is allocated by bootmem.
> 
> ....
>
> +#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj)

It would be better to implement this as an inlined C function.  That
has improved type safety and improved readability.

> +static void release_firmware_map_entry(struct kobject *kobj)
> +{
> +	struct firmware_map_entry *entry = to_memmap_entry(kobj);
> +	struct page *page;
> +
> +	page = virt_to_page(entry);
> +	if (PageSlab(page) || PageCompound(page))

That PageCompound() test looks rather odd.  Why is this done?

> +		kfree(entry);
> +
> +	/* There is no way to free memory allocated from bootmem*/
> +}

This function is a bit ugly - poking around in page flags to determine
whether or not the memory came from bootmem.  It would be cleaner to
use a separate boolean.  Although I guess we can live with it as you
have it here.

>  static struct kobj_type memmap_ktype = {
> +	.release	= release_firmware_map_entry,
>  	.sysfs_ops	= &memmap_attr_ops,
>  	.default_attrs	= def_attrs,
>  };
> @@ -123,6 +139,16 @@ static int firmware_map_add_entry(u64 start, u64 end,
>  	return 0;
>  }
>  
> +/**
> + * firmware_map_remove_entry() - Does the real work to remove a firmware
> + * memmap entry.
> + * @entry: removed entry.
> + **/
> +static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
> +{
> +	list_del(&entry->list);
> +}

Is there no locking  to protect that list?

>  /*
>   * Add memmap entry on sysfs
>   */
> @@ -144,6 +170,31 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry)
>  	return 0;
>  }
>  
> +/*
> + * Remove memmap entry on sysfs
> + */
> +static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
> +{
> +	kobject_put(&entry->kobj);
> +}
> +
> +/*
> + * Search memmap entry
> + */
> +
> +struct firmware_map_entry * __meminit
> +find_firmware_map_entry(u64 start, u64 end, const char *type)

A better name would be firmware_map_find_entry().  To retain the (good)
convention that symbols exported from here all start with
"firmware_map_".

> +{
> +	struct firmware_map_entry *entry;
> +
> +	list_for_each_entry(entry, &map_entries, list)
> +		if ((entry->start == start) && (entry->end == end) &&
> +		    (!strcmp(entry->type, type)))
> +			return entry;
> +
> +	return NULL;
> +}
> +
>  /**
>   * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
>   * memory hotplug.
> @@ -196,6 +247,32 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
>  	return firmware_map_add_entry(start, end, type, entry);
>  }
>  
> +/**
> + * firmware_map_remove() - remove a firmware mapping entry
> + * @start: Start of the memory range.
> + * @end:   End of the memory range.
> + * @type:  Type of the memory range.
> + *
> + * removes a firmware mapping entry.
> + *
> + * Returns 0 on success, or -EINVAL if no entry.
> + **/
> +int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
> +{
> +	struct firmware_map_entry *entry;
> +
> +	entry = find_firmware_map_entry(start, end - 1, type);
> +	if (!entry)
> +		return -EINVAL;
> +
> +	firmware_map_remove_entry(entry);
> +
> +	/* remove the memmap entry */
> +	remove_sysfs_fw_map_entry(entry);
> +
> +	return 0;
> +}

Again, the lack of locking looks bad.

> ...
>
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1052,9 +1052,9 @@ int offline_memory(u64 start, u64 size)
>  	return 0;
>  }
>  
> -int remove_memory(int nid, u64 start, u64 size)
> +int __ref remove_memory(int nid, u64 start, u64 size)

Why was __ref added?

>  {
> -	int ret = -EBUSY;
> +	int ret = 0;
>  	lock_memory_hotplug();
>  	/*
>  	 * The memory might become online by other task, even if you offine it.
>
> ...
>

^ permalink raw reply

* Re: [RFC v8 PATCH 04/20] memory-hotplug: offline and remove memory when removing the memory device
From: Andrew Morton @ 2012-08-31 20:55 UTC (permalink / raw)
  To: wency
  Cc: linux-s390, linux-ia64, len.brown, linux-acpi, linux-sh, x86,
	linux-kernel, cmetcalf, linux-mm, isimatu.yasuaki, paulus,
	minchan.kim, kosaki.motohiro, rientjes, sparclinux, cl,
	linuxppc-dev, liuj97
In-Reply-To: <1346148027-24468-5-git-send-email-wency@cn.fujitsu.com>

On Tue, 28 Aug 2012 18:00:11 +0800
wency@cn.fujitsu.com wrote:

> +int remove_memory(int nid, u64 start, u64 size)
> +{
> +	int ret = -EBUSY;
> +	lock_memory_hotplug();
> +	/*
> +	 * The memory might become online by other task, even if you offine it.
> +	 * So we check whether the cpu has been onlined or not.

I think you meant "memory", not "cpu".

Actually, "check whether any part of this memory range has been
onlined" would be better.  If that is accurate ;)

> +	 */
> +	if (!is_memblk_offline(start, size)) {
> +		pr_warn("memory removing [mem %#010llx-%#010llx] failed, "
> +			"because the memmory range is online\n",
> +			start, start + size);
> +		ret = -EAGAIN;
> +	}
> +
> +	unlock_memory_hotplug();
> +	return ret;
> +
> +}
> +EXPORT_SYMBOL_GPL(remove_memory);

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox