LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH] vfio powerpc: enabled on powernv platform
From: Alex Williamson @ 2012-12-12 23:30 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: kvm, linux-kernel, Paul Mackerras, linuxppc-dev, David Gibson
In-Reply-To: <1355315657-31153-1-git-send-email-aik@ozlabs.ru>

On Wed, 2012-12-12 at 23:34 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h     |   10 ++
>  arch/powerpc/kernel/iommu.c          |  329 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.c |  134 ++++++++++++++
>  drivers/iommu/Kconfig                |    8 +
>  4 files changed, 481 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..3c861ae 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>  	struct iommu_pool large_pool;
>  	struct iommu_pool pools[IOMMU_NR_POOLS];
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,12 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern void iommu_reset_table(struct iommu_table *tbl, bool restore);
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
> +		unsigned long size);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long size);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..f3bb2e7 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -36,6 +36,7 @@
>  #include <linux/hash.h>
>  #include <linux/fault-inject.h>
>  #include <linux/pci.h>
> +#include <linux/uaccess.h>
>  #include <asm/io.h>
>  #include <asm/prom.h>
>  #include <asm/iommu.h>
> @@ -44,6 +45,7 @@
>  #include <asm/kdump.h>
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
> +#include <asm/tce.h>
>  
>  #define DBG(...)
>  
> @@ -856,3 +858,330 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>  		free_pages((unsigned long)vaddr, get_order(size));
>  	}
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +
> +struct vwork {
> +	struct mm_struct	*mm;
> +	long			npage;
> +	struct work_struct	work;
> +};
> +
> +/* delayed decrement/increment for locked_vm */
> +static void lock_acct_bg(struct work_struct *work)
> +{
> +	struct vwork *vwork = container_of(work, struct vwork, work);
> +	struct mm_struct *mm;
> +
> +	mm = vwork->mm;
> +	down_write(&mm->mmap_sem);
> +	mm->locked_vm += vwork->npage;
> +	up_write(&mm->mmap_sem);
> +	mmput(mm);
> +	kfree(vwork);
> +}
> +
> +static void lock_acct(long npage)
> +{
> +	struct vwork *vwork;
> +	struct mm_struct *mm;
> +
> +	if (!current->mm)
> +		return; /* process exited */
> +
> +	if (down_write_trylock(&current->mm->mmap_sem)) {
> +		current->mm->locked_vm += npage;
> +		up_write(&current->mm->mmap_sem);
> +		return;
> +	}
> +
> +	/*
> +	 * Couldn't get mmap_sem lock, so must setup to update
> +	 * mm->locked_vm later. If locked_vm were atomic, we
> +	 * wouldn't need this silliness
> +	 */
> +	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> +	if (!vwork)
> +		return;
> +	mm = get_task_mm(current);
> +	if (!mm) {
> +		kfree(vwork);
> +		return;
> +	}
> +	INIT_WORK(&vwork->work, lock_acct_bg);
> +	vwork->mm = mm;
> +	vwork->npage = npage;
> +	schedule_work(&vwork->work);
> +}

Locked page accounting in this version is very, very broken.  How do
powerpc folks feel about seemingly generic kernel iommu interfaces
messing with the current task mm?  Besides that, more problems below...

> +
> +/*
> + * iommu_reset_table is called when it started/stopped being used.
> + *
> + * restore==true says to bring the iommu_table into the state as it was
> + * before being used by VFIO.
> + */
> +void iommu_reset_table(struct iommu_table *tbl, bool restore)
> +{
> +	/* Page#0 is marked as used in iommu_init_table, so we clear it... */
> +	if (!restore && (tbl->it_offset == 0))
> +		clear_bit(0, tbl->it_map);
> +
> +	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);

This does locked page accounting and unpins pages, even on startup when
the pages aren't necessarily pinned or accounted against the current
process.

> +
> +	/* ... or restore  */
> +	if (restore && (tbl->it_offset == 0))
> +		set_bit(0, tbl->it_map);
> +}
> +EXPORT_SYMBOL_GPL(iommu_reset_table);
> +
> +/*
> + * Returns the number of used IOMMU pages (4K) within
> + * the same system page (4K or 64K).
> + *
> + * syspage_weight_zero is optimized for expected case == 0
> + * syspage_weight_one is optimized for expected case > 1
> + * Other case are not used in this file.
> + */
> +#if PAGE_SIZE == IOMMU_PAGE_SIZE
> +
> +#define syspage_weight_zero(map, offset)	test_bit((map), (offset))
> +#define syspage_weight_one(map, offset)		test_bit((map), (offset))
> +
> +#elif PAGE_SIZE/IOMMU_PAGE_SIZE == 16
> +
> +static int syspage_weight_zero(unsigned long *map, unsigned long offset)
> +{
> +	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> +	return 0xffffUL & (map[BIT_WORD(offset)] >>
> +			(offset & (BITS_PER_LONG-1)));
> +}

I would have expected these to be bools and return true if the weight
matches the value.

If you replaced 0xffff above w/ this, would you need the #error below?

(1UL << (PAGE_SIZE/IOMMU_PAGE_SIZE)) - 1)

> +
> +static int syspage_weight_one(unsigned long *map, unsigned long offset)
> +{
> +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
> +
> +	/* Aligns TCE entry number to system page boundary */
> +	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> +
> +	/* Count used 4K pages */
> +	while (nbits && (ret < 2)) {

Don't you have a ffs()?  Could also be used for _zero.  Surely there are
some bitops helpers that could help here even on big endian.  hweight
really doesn't work?

> +		if (test_bit(offset, map))
> +			++ret;
> +
> +		--nbits;
> +		++offset;
> +	}
> +
> +	return ret;
> +}
> +#else
> +#error TODO: support other page size
> +#endif
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number of system pages
> + * which it called put_page() on
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int i, retpages = 0, clr;
> +	unsigned long oldtce, oldweight;
> +	struct page *page;
> +
> +	for (i = 0; i < pages; ++i, ++entry) {
> +		if (!test_bit(entry - tbl->it_offset, tbl->it_map))
> +			continue;
> +
> +		oldtce = ppc_md.tce_get(tbl, entry);
> +		ppc_md.tce_free(tbl, entry, 1);
> +
> +		oldweight = syspage_weight_one(tbl->it_map,
> +				entry - tbl->it_offset);
> +		clr = __test_and_clear_bit(entry - tbl->it_offset,
> +				tbl->it_map);
> +
> +		if (WARN_ON(!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))))
> +			continue;
> +
> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> +		if (WARN_ON(!page))
> +			continue;
> +
> +		if (oldtce & TCE_PCI_WRITE)
> +			SetPageDirty(page);
> +
> +		put_page(page);
> +
> +		/* That was the last IOMMU page within the system page */
> +		if ((oldweight == 1) && clr)
> +			++retpages;
> +	}
> +
> +	return retpages;
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number
> + * of released system pages
> + */
> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
> +		unsigned long size)
> +{
> +	int ret;
> +	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
> +	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	if ((size & ~IOMMU_PAGE_MASK) || (ioba & ~IOMMU_PAGE_MASK))
> +		return -EINVAL;
> +
> +	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
> +			<< IOMMU_PAGE_SHIFT))
> +		return -EINVAL;
> +
> +	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
> +		return -EINVAL;
> +
> +	spin_lock(&(pool->lock));
> +	ret = clear_tces_nolock(tbl, entry, npages);
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	if (ret > 0) {
> +		lock_acct(-ret);
> +		return 0;
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction)
> +{
> +	int ret;
> +	struct page *page = NULL;
> +	unsigned long kva, offset, oldweight;
> +
> +	/* Map new TCE */
> +	offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +			direction != DMA_TO_DEVICE, &page);
> +	if (ret != 1) {
> +		pr_err("tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> +		return -EFAULT;
> +	}
> +
> +	kva = (unsigned long) page_address(page);
> +	kva += offset;
> +
> +	/* tce_build receives a virtual address */
> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +	/* tce_build() only returns non-zero for transient errors */
> +	if (unlikely(ret)) {
> +		pr_err("tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> +		put_page(page);
> +		return -EIO;
> +	}
> +
> +	/* Calculate if new system page has been locked */
> +	oldweight = syspage_weight_zero(tbl->it_map, entry - tbl->it_offset);
> +	__set_bit(entry - tbl->it_offset, tbl->it_map);
> +
> +	return (oldweight == 0) ? 1 : 0;
> +}
> +
> +/*
> + * iommu_put_tces builds tces and returned the number of actually
> + * locked system pages
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long size)
> +{
> +	int i, ret = 0, retpages = 0;
> +	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
> +	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +	unsigned long locked, lock_limit;
> +
> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +	BUG_ON(direction == DMA_NONE);
> +
> +	if ((size & ~IOMMU_PAGE_MASK) ||
> +			(ioba & ~IOMMU_PAGE_MASK) ||
> +			(tce & ~IOMMU_PAGE_MASK))
> +		return -EINVAL;
> +
> +	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
> +			 << IOMMU_PAGE_SHIFT))
> +		return -EINVAL;
> +
> +	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
> +		return -EINVAL;
> +
> +	/* Account for locked pages */
> +	locked = current->mm->locked_vm +
> +		(_ALIGN_UP(size, PAGE_SIZE) >> PAGE_SHIFT);

Looks like we just over penalize upfront and correct when mapped, that's
better, but not great.

> +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> +		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
> +				rlimit(RLIMIT_MEMLOCK));
> +		return -ENOMEM;
> +	}
> +
> +	spin_lock(&(pool->lock));
> +
> +	/* Check if any is in use */
> +	for (i = 0; i < npages; ++i) {
> +		if (test_bit(entry + i - tbl->it_offset, tbl->it_map)) {
> +			spin_unlock(&(pool->lock));
> +			return -EBUSY;
> +		}
> +	}
> +
> +	/* Put tces to the table */
> +	for (i = 0; (i < npages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
> +		ret = put_tce(tbl, entry + i, tce, direction);
> +		if (ret == 1)
> +			++retpages;
> +	}
> +
> +	/*
> +	 * If failed, release locked pages, otherwise return the number
> +	 * of locked system pages
> +	 */
> +	if (ret < 0) {
> +		clear_tces_nolock(tbl, entry, i);
> +	} else {
> +		if (retpages)
> +			lock_acct(retpages);
> +		ret = 0;
> +	}

Bug, if it fails we clear, which decrements our locked pages, but we
haven't incremented them yet.  Thanks,

Alex

> +
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..1b970bf 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
>  #include <linux/irq.h>
>  #include <linux/io.h>
>  #include <linux/msi.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
> @@ -613,3 +614,136 @@ void __init pnv_pci_init(void)
>  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>  #endif
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> +	struct iommu_table *tbl;
> +	int ret = 0;
> +
> +	if (WARN_ON(dev->iommu_group)) {
> +		pr_warn("tce_vfio: device %s is already in iommu group %d, skipping\n",
> +				dev_name(dev),
> +				iommu_group_id(dev->iommu_group));
> +		return -EBUSY;
> +	}
> +
> +	tbl = get_iommu_table_base(dev);
> +	if (!tbl) {
> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> +				dev_name(dev));
> +		return 0;
> +	}
> +
> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> +			dev_name(dev), iommu_group_id(tbl->it_group));
> +
> +	ret = iommu_group_add_device(tbl->it_group, dev);
> +	if (ret < 0)
> +		pr_err("tce_vfio: %s has not been added, ret=%d\n",
> +				dev_name(dev), ret);
> +
> +	return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +	iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +			      unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		return add_device(dev);
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		del_device(dev);
> +		return 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +	.notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> +	struct iommu_table *tbl = iommu_data;
> +	tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp;
> +
> +	/* Allocate and initialize IOMMU groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +
> +		/* Skip already initialized */
> +		if (tbl->it_group)
> +			continue;
> +
> +		grp = iommu_group_alloc();
> +		if (IS_ERR(grp)) {
> +			pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
> +					PTR_ERR(grp));
> +			return PTR_ERR(grp);
> +		}
> +		tbl->it_group = grp;
> +		iommu_group_set_iommudata(grp, tbl, group_release);
> +	}
> +
> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Add PCI devices to VFIO groups */
> +	for_each_pci_dev(pdev)
> +		add_device(&pdev->dev);
> +
> +	return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp = NULL;
> +
> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Delete PCI devices from VFIO groups */
> +	for_each_pci_dev(pdev)
> +		del_device(&pdev->dev);
> +
> +	/* Release VFIO groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +		grp = tbl->it_group;
> +
> +		/* Skip (already) uninitialized */
> +		if (!grp)
> +			continue;
> +
> +		/* Do actual release, group_release() is expected to work */
> +		iommu_group_put(grp);
> +		BUG_ON(tbl->it_group);
> +	}
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>  
>  	  Say N unless you need kernel log message for IOMMU debugging
>  
> +config SPAPR_TCE_IOMMU
> +	bool "sPAPR TCE IOMMU Support"
> +	depends on PPC_POWERNV
> +	select IOMMU_API
> +	help
> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
> +	  still not implemented.
> +
>  endif # IOMMU_SUPPORT

^ permalink raw reply

* [PATCH v3] powerpc: fix wii_memory_fixups() compile error on 3.0.y tree
From: Shuah Khan @ 2012-12-12 23:27 UTC (permalink / raw)
  To: Ben Hutchings, benh, paulus, Greg KH; +Cc: linuxppc-dev, shuahkhan, stable
In-Reply-To: <1355270698.2616.44.camel@lorien2>

Fix wii_memory_fixups() the following compile error on 3.0.y tree with
wii_defconfig on 3.0.y tree.

  CC      arch/powerpc/platforms/embedded6xx/wii.o
arch/powerpc/platforms/embedded6xx/wii.c: In function ‘wii_memory_fixups’:
arch/powerpc/platforms/embedded6xx/wii.c:88:2: error: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 2 has type ‘phys_addr_t’ [-Werror=format]
arch/powerpc/platforms/embedded6xx/wii.c:88:2: error: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 3 has type ‘phys_addr_t’ [-Werror=format]
arch/powerpc/platforms/embedded6xx/wii.c:90:2: error: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 2 has type ‘phys_addr_t’ [-Werror=format]
arch/powerpc/platforms/embedded6xx/wii.c:90:2: error: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 3 has type ‘phys_addr_t’ [-Werror=format]
cc1: all warnings being treated as errors
make[2]: *** [arch/powerpc/platforms/embedded6xx/wii.o] Error 1
make[1]: *** [arch/powerpc/platforms/embedded6xx] Error 2
make: *** [arch/powerpc/platforms] Error 2

Signed-off-by: Shuah Khan <shuah.khan@hp.com>
CC: stable@vger.kernel.org 3.0.y
---
 arch/powerpc/platforms/embedded6xx/wii.c |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c
index 1b5dc1a..daf793b 100644
--- a/arch/powerpc/platforms/embedded6xx/wii.c
+++ b/arch/powerpc/platforms/embedded6xx/wii.c
@@ -85,9 +85,11 @@ void __init wii_memory_fixups(void)
 	wii_hole_start = p[0].base + p[0].size;
 	wii_hole_size = p[1].base - wii_hole_start;
 
-	pr_info("MEM1: <%08llx %08llx>\n", p[0].base, p[0].size);
+	pr_info("MEM1: <%08llx %08llx>\n",
+		(unsigned long long) p[0].base, (unsigned long long) p[0].size);
 	pr_info("HOLE: <%08lx %08lx>\n", wii_hole_start, wii_hole_size);
-	pr_info("MEM2: <%08llx %08llx>\n", p[1].base, p[1].size);
+	pr_info("MEM2: <%08llx %08llx>\n",
+		(unsigned long long) p[1].base, (unsigned long long) p[1].size);
 
 	p[0].size += wii_hole_size + p[1].size;
 
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH] Revert "crypto: caam - Updated SEC-4.0 device tree binding for ERA information."
From: Kumar Gala @ 2012-12-12 21:29 UTC (permalink / raw)
  To: Vakul Garg; +Cc: linuxppc-dev, devicetree-discuss, linux-crypto
In-Reply-To: <1354870649-3656-1-git-send-email-vakul@freescale.com>


On Dec 7, 2012, at 2:57 AM, Vakul Garg wrote:

> This reverts commit a2c0911c09190125f52c9941b9d187f601c2f7be.
> 
> Signed-off-by: Vakul Garg <vakul@freescale.com>
> ---
> Instead of adding SEC era information in crypto node's compatible, a new
> property 'fsl,sec-era' is being introduced into crypto node.
> 
> .../devicetree/bindings/crypto/fsl-sec4.txt        |    5 ++---
> 1 files changed, 2 insertions(+), 3 deletions(-)

What tree do you think this has been applied to?

- k

^ permalink raw reply

* [TRIVIAL PATCH 11/26] powerpc: Convert print_symbol to %pSR
From: Joe Perches @ 2012-12-12 18:19 UTC (permalink / raw)
  To: Jiri Kosina, Arnd Bergmann
  Cc: cbe-oss-dev, Paul Mackerras, linuxppc-dev, linux-kernel
In-Reply-To: <cover.1355335227.git.joe@perches.com>

Use the new vsprintf extension to avoid any possible
message interleaving.

Convert the #ifdef DEBUG block to a single pr_debug.

Signed-off-by: Joe Perches <joe@perches.com>
---
 arch/powerpc/platforms/cell/spu_callbacks.c |   12 +++++-------
 1 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c
index 75d6133..c5fe6d2 100644
--- a/arch/powerpc/platforms/cell/spu_callbacks.c
+++ b/arch/powerpc/platforms/cell/spu_callbacks.c
@@ -60,13 +60,11 @@ long spu_sys_callback(struct spu_syscall_block *s)
 
 	syscall = spu_syscall_table[s->nr_ret];
 
-#ifdef DEBUG
-	print_symbol(KERN_DEBUG "SPU-syscall %s:", (unsigned long)syscall);
-	printk("syscall%ld(%lx, %lx, %lx, %lx, %lx, %lx)\n",
-			s->nr_ret,
-			s->parm[0], s->parm[1], s->parm[2],
-			s->parm[3], s->parm[4], s->parm[5]);
-#endif
+	pr_debug("SPU-syscall %pSR:syscall%ld(%lx, %lx, %lx, %lx, %lx, %lx)\n",
+		 syscall,
+		 s->nr_ret,
+		 s->parm[0], s->parm[1], s->parm[2],
+		 s->parm[3], s->parm[4], s->parm[5]);
 
 	return syscall(s->parm[0], s->parm[1], s->parm[2],
 		       s->parm[3], s->parm[4], s->parm[5]);
-- 
1.7.8.112.g3fd21

^ permalink raw reply related

* [TRIVIAL PATCH 00/26] treewide: Add and use vsprintf extension %pSR
From: Joe Perches @ 2012-12-12 18:18 UTC (permalink / raw)
  To: Jiri Kosina, linux-doc, linuxppc-dev, cbe-oss-dev, linux-edac,
	user-mode-linux-devel, user-mode-linux-user, cluster-devel,
	linux-mm
  Cc: linux-m32r-ja, linux-s390, linux-m32r, linux-ia64, linux-c6x-dev,
	linux-sh, linux-xtensa, linux, linux-kernel, linux-am33-list,
	linux-kernel, linux-alpha, linux-arm-kernel

Remove the somewhat awkward uses of print_symbol and convert all the
existing uses to a new vsprintf pointer type of %pSR.

print_symbol can be interleaved when it is used in a sequence like:

	printk("something: ...");
	print_symbol("%s", addr);
	printk("\n");

Instead use:

	printk("something: %pSR\n", (void *)addr);

Add a new %p[SsFf]R vsprintf extension that can perform the same
symbol function/address/offset formatting as print_symbol to
reduce the number and styles of message logging functions.

print_symbol used __builtin_extract_return_addr for those architectures
like S/390 and SPARC that have offset or masked addressing.
%p[FfSs]R uses the same gcc __builtin

Joe Perches (26):
  vsprintf: Add extension %pSR - print_symbol replacement
  alpha: Convert print_symbol to %pSR
  arm: Convert print_symbol to %pSR
  arm64: Convert print_symbol to %pSR
  avr32: Convert print_symbol to %pSR
  c6x: Convert print_symbol to %pSR
  ia64: Convert print_symbol to %pSR
  m32r: Convert print_symbol to %pSR
  mn10300: Convert print_symbol to %pSR
  openrisc: Convert print_symbol to %pSR
  powerpc: Convert print_symbol to %pSR
  s390: Convert print_symbol to %pSR
  sh: Convert print_symbol to %pSR
  um: Convert print_symbol to %pSR
  unicore32: Convert print_symbol to %pSR
  x86: Convert print_symbol to %pSR
  xtensa: Convert print_symbol to %pSR
  drivers: base: Convert print_symbol to %pSR
  gfs2: Convert print_symbol to %pSR
  sysfs: Convert print_symbol to %pSR
  irq: Convert print_symbol to %pSR
  smp_processor_id: Convert print_symbol to %pSR
  mm: Convert print_symbol to %pSR
  xtensa: Convert print_symbol to %pSR
  x86: head_64.S: Use vsprintf extension %pSR not print_symbol
  kallsyms: Remove print_symbol

 Documentation/filesystems/sysfs.txt         |    4 +-
 Documentation/printk-formats.txt            |    2 +
 Documentation/zh_CN/filesystems/sysfs.txt   |    4 +-
 arch/alpha/kernel/traps.c                   |    8 ++----
 arch/arm/kernel/process.c                   |    4 +-
 arch/arm64/kernel/process.c                 |    4 +-
 arch/avr32/kernel/process.c                 |   25 ++++++-----------------
 arch/c6x/kernel/traps.c                     |    3 +-
 arch/ia64/kernel/process.c                  |   13 ++++-------
 arch/m32r/kernel/traps.c                    |    6 +---
 arch/mn10300/kernel/traps.c                 |    8 +++---
 arch/openrisc/kernel/traps.c                |    7 +----
 arch/powerpc/platforms/cell/spu_callbacks.c |   12 ++++------
 arch/s390/kernel/traps.c                    |   28 +++++++++++++++-----------
 arch/sh/kernel/process_32.c                 |    4 +-
 arch/um/kernel/sysrq.c                      |    6 +---
 arch/unicore32/kernel/process.c             |    5 ++-
 arch/x86/kernel/cpu/mcheck/mce.c            |   13 ++++++-----
 arch/x86/kernel/dumpstack.c                 |    5 +--
 arch/x86/kernel/head_64.S                   |    4 +-
 arch/x86/kernel/process_32.c                |    2 +-
 arch/x86/mm/mmio-mod.c                      |    4 +-
 arch/x86/um/sysrq_32.c                      |    9 ++-----
 arch/xtensa/kernel/traps.c                  |    6 +---
 drivers/base/core.c                         |    4 +-
 fs/gfs2/glock.c                             |    4 +-
 fs/gfs2/trans.c                             |    3 +-
 fs/sysfs/file.c                             |    4 +-
 include/linux/kallsyms.h                    |   18 -----------------
 kernel/irq/debug.h                          |   15 ++++++-------
 kernel/kallsyms.c                           |   11 ----------
 lib/smp_processor_id.c                      |    2 +-
 lib/vsprintf.c                              |   18 ++++++++++++----
 mm/memory.c                                 |    8 +++---
 mm/slab.c                                   |    8 ++----
 35 files changed, 117 insertions(+), 164 deletions(-)

-- 
1.7.8.112.g3fd21

^ permalink raw reply

* Re: pci and pcie device-tree binding - range No cells
From: Rob Herring @ 2012-12-12 17:29 UTC (permalink / raw)
  To: Thomas Petazzoni
  Cc: monstr, Arnd Bergmann, linux-pci, devicetree-discuss,
	Thierry Reding, Rob Herring, David Laight, linuxppc-dev
In-Reply-To: <20121212171651.346b81ab@skate>

On 12/12/2012 10:16 AM, Thomas Petazzoni wrote:
> Dear Rob Herring,
> 
> On Mon, 10 Dec 2012 17:24:44 -0600, Rob Herring wrote:
> 
>>> Marvell SoCs have up to 20 configurable address windows, which allow
>>> you, at run time, to say "I would like the range from physical
>>> address 0xYYYYYYYY to 0xZZZZZZZZ to correspond to the PCIe device
>>> in port 1, lane 2, or to the NAND, or to this or that device".
>>> Therefore, in the PCIe driver I proposed for the Armada 370/XP SoCs
>>> [1], there is no need to encode all those ranges statically in the
>>> DT.
>>
>> That's not a unique feature. I'm not sure if any powerpc systems do
>> that though.
> 
> Yes, probably not an unique feature.
> 
>>> The only "ranges" property I'm using is to allow the DT sub-nodes
>>> describing each PCIe port/lane to access the CPU registers that
>>> allow to see if the PCIe link is up or down, access the PCI
>>> configuration space and so on. So all ranges in my "ranges"
>>> property correspond to normal CPU registers, like the one you would
>>> put in the "reg" property for any device. The fact that those
>>> devices are PCIe is really orthogonal here.
>>
>> That doesn't really sound right.
> 
> Very likely, but I still don't get what is "the right way".
> 
>> I don't think deviating from the normal binding is the right approach.
>> Perhaps the host driver should fill in the ranges property with the
>> addresses it uses. Then any child devices will get the right address
>> translation.
> 
> I don't really understand what you mean here. If you look at the host
> driver code (arch/arm/mach-mvebu/pcie.c), for each PCIe interface
> is simply does:
> 
>  * Create an address decoding window for the memory BAR
>  * Create an address decoding window for the I/O BAR
>  * Associate the memory BAR window address and the I/O bar window
>    address with the PCIe interface
> 
> And that's it. See
> https://github.com/MISL-EBU-System-SW/mainline-public/blob/marvell-pcie-v1/arch/arm/mach-mvebu/pcie.c#L107.
> 
> So this driver is both "deciding" of the physical addresses for each
> PCIe interface, and "associating" them with the PCIe interfaces. How is
> it useful to feed some addresses back into the Device Tree?

I'm not completely sure for PCI, but the ranges is necessary to
translate addresses of child nodes.

If you don't need ranges then you could omit it. If you need ranges,
then you should follow the PCI binding whether it is put in the DTS or
you dynamically fill it in. This could be filled in by the bootloader as
well if you have PCI devices you need to boot from.

>> Also, while the h/w may support practically any config, there are
>> practical constraints of what Linux will use like there's no reason to
>> support more than 64K i/o space. PCI memory addresses generally start
>> at 0x100000. You probably don't need more than 1 memory window per
>> root complex (although prefetchable memory may also be needed).
> 
> I allocate one 64K I/O window and one memory window per PCIe interface
> whose link is up (i.e a PCIe device is connected).
> 
>> You could let the DT settings drive the address window configuration.
> 
> No, because I don't want to have absolute addresses for the windows: I
> have 10 PCIe interfaces, but often, only a few of them are used. So I
> don't want in the Device Tree to over-allocate hundreds of MB of
> physical address space if it's not useful.

How many you have is probably board dependent and not probe-able, right?
So you would at least know the subset of root complexes that you are
using. I know you want to find the size of all the cards up front and
size windows based on that, but I don't think that is going to be possible.

> 
> PCIe is dynamic, address window configuration is dynamic. And we should
> hardcode all this configuration statically in the DT? Doesn't seem like
> the right solution.

I'm just throwing out ideas. There are many cases of flexibility in h/w
designs which are never used. H/w is often designed in a vacuum without
s/w input. Not saying that is the case here, but you do have to consider
that.

Rob

> 
> Best regards,
> 
> Thomas
> 

^ permalink raw reply

* Re: pci and pcie device-tree binding - range No cells
From: Grant Likely @ 2012-12-12 17:22 UTC (permalink / raw)
  To: Thomas Petazzoni
  Cc: linux-pci, devicetree-discuss, Rob Herring, David Laight,
	Rob Herring, linuxppc-dev
In-Reply-To: <20121212171651.346b81ab@skate>

On Wed, Dec 12, 2012 at 4:16 PM, Thomas Petazzoni
<thomas.petazzoni@free-electrons.com> wrote:
> Dear Rob Herring,
>
> On Mon, 10 Dec 2012 17:24:44 -0600, Rob Herring wrote:
>
>> > Marvell SoCs have up to 20 configurable address windows, which allow
>> > you, at run time, to say "I would like the range from physical
>> > address 0xYYYYYYYY to 0xZZZZZZZZ to correspond to the PCIe device
>> > in port 1, lane 2, or to the NAND, or to this or that device".
>> > Therefore, in the PCIe driver I proposed for the Armada 370/XP SoCs
>> > [1], there is no need to encode all those ranges statically in the
>> > DT.
>>
>> That's not a unique feature. I'm not sure if any powerpc systems do
>> that though.
>
> Yes, probably not an unique feature.
>
>> > The only "ranges" property I'm using is to allow the DT sub-nodes
>> > describing each PCIe port/lane to access the CPU registers that
>> > allow to see if the PCIe link is up or down, access the PCI
>> > configuration space and so on. So all ranges in my "ranges"
>> > property correspond to normal CPU registers, like the one you would
>> > put in the "reg" property for any device. The fact that those
>> > devices are PCIe is really orthogonal here.
>>
>> That doesn't really sound right.
>
> Very likely, but I still don't get what is "the right way".

Hi Thomas,

I just went and looked at your binding. Here's the snippet I found interesting:

pcie-controller {
+ compatible = "marvell,armada-370-xp-pcie";
+ status = "disabled";
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges = <0       0xd0040000 0x2000 /* port0x1_port0 */
+  0x2000  0xd0042000 0x2000 /* port2x1_port0 */
+  0x4000  0xd0044000 0x2000 /* port0x1_port1 */
+  0x8000  0xd0048000 0x2000 /* port0x1_port2 */
+  0xC000  0xd004C000 0x2000 /* port0x1_port3 */
+  0x10000 0xd0080000 0x2000 /* port1x1_port0 */
+  0x12000 0xd0082000 0x2000 /* port3x1_port0 */
+  0x14000 0xd0084000 0x2000 /* port1x1_port1 */
+  0x18000 0xd0088000 0x2000 /* port1x1_port2 */
+  0x1C000 0xd008C000 0x2000 /* port1x1_port3 */>;
+
+ pcie0.0 at 0xd0040000 {
+ reg = <0x0 0x2000>;
+ interrupts = <58>;
+ clocks = <&gateclk 5>;
+ marvell,pcie-port = <0>;
+ marvell,pcie-lane = <0>;
+ status = "disabled";
+ };
+
+ pcie0.1 at 0xd0044000 {
+ reg = <0x4000 0x2000>;
+ interrupts = <59>;
+ clocks = <&gateclk 5>;
+ marvell,pcie-port = <0>;
+ marvell,pcie-lane = <1>;
+ status = "disabled";
+ };
[... rest trimmed for berevity]

You're right, if you're doing dynamic allocation of windows, then you
really don't need to have a ranges property. However, the way the PCI
node is set up definitely looks incorrect.

PCI already has a very specific binding for pci host controller nodes.
First, #address-cells=<3>; #size-cells=<2>; and device_type="pcie"
must be there. You don't want to break this. You can find details on
the pci and pci-express binding here:
http://www.openfirmware.org/1275/bindings/pci/pci2_1.pdf
http://www.openfirmware.org/1275/bindings/pci/pci-express.txt

For the child nodes, PCI is a discoverable bus, so normally I wouldn't
expect to see child nodes at all when using a dtb. The only time nodes
should be populated is when a device has non-discoverable
charactersitics. In your example above you do have some additional
data, but I don't know enough about pci-express to say how best to
encode them or whether they are needed at all. Ben might have some
thoughts on this.

When the PCI child nodes are present, it is important to stick with
the established PCI addressing scheme which uses 3 cells for
addressing. The first entry in the reg property must represent the
configuration space so that DT nodes can be matched up with discovered
devices. There is no requirement to include mappings for the memory
and io regions if the host controller can assign them dynamically.

I don't think you should need a ranges property at all for what you're
doing. Access to config space is generally managed by the PCI host
controller drivers and subsystem, and PCI device drivers don't
typically use of_ calls directly.

g.

^ permalink raw reply

* Re: pci and pcie device-tree binding - range No cells
From: Michal Simek @ 2012-12-12 16:55 UTC (permalink / raw)
  To: Grant Likely
  Cc: linux-pci, devicetree-discuss, Thierry Reding, Rob Herring,
	Rob Herring, linuxppc-dev
In-Reply-To: <CACxGe6udeSbEfibC53v0wu8vFRE6K3Fd_P6EX-3JS9F2M=wUeA@mail.gmail.com>

On 12/12/2012 11:49 AM, Grant Likely wrote:
> On Wed, Dec 12, 2012 at 10:37 AM, Michal Simek <monstr@monstr.eu> wrote:
>> On 12/10/2012 10:41 PM, Grant Likely wrote:
>>> drivers/pci/pci-of.c would be good. I'd also accept drivers/of/pci.c
>>> which might actually be a good idea in the short term so that it gets
>>> appropriate supervision while being generalized before being moved into
>>> the pci directory.
>>
>> Ben: Are you willing to move that ppc code to this location?
>> It is probably not good idea that I should do it when I even don't have
>> hardware available for testing (Asking someone else).
>
> You're a clever guy, you are more than capable of crafting the patch,
> even if you can't test on hardware. :-)
>
> I refactored most of the OF support code without having access to most
> of the affected hardware. Once I got the changes out there for review
> I also asked for spot testing before getting it into linux-next for
> even more testing.

Fair enough. :-)

Good time to start to look for how to work with board farm.

Thanks,
Michal


-- 
Michal Simek, Ing. (M.Eng)
w: www.monstr.eu p: +42-0-721842854
Maintainer of Linux kernel 2.6 Microblaze Linux - http://www.monstr.eu/fdt/
Microblaze U-BOOT custodian

^ permalink raw reply

* Re: pci and pcie device-tree binding - range No cells
From: Andrew Murray @ 2012-12-12 16:44 UTC (permalink / raw)
  To: Thierry Reding
  Cc: Michal Simek, linux-pci@vger.kernel.org, devicetree-discuss,
	Liviu.Dudau, rob.herring@calxeda.com, Rob Herring, linuxppc-dev
In-Reply-To: <20121212133424.GA26280@avionic-0098.adnet.avionic-design.de>

On Wed, Dec 12, 2012 at 01:34:24PM +0000, Thierry Reding wrote:
> On Wed, Dec 12, 2012 at 12:19:12PM +0000, Andrew Murray wrote:
> > I've been working on a relatively architecture agnostic PCI host bridge=
 driver
> > and also wanted to avoid duplicating more generic DT parsing code for P=
CI
> > bindings.
> >=20
> > I've ended up with a patch which provides an iterator for returning res=
ources
> > based on the the typical 'ranges' binding. This has ended up living in
> > drivers/of/address.c. I originally started out in drivers/of/pci.c and
> > drivers/pci/pci-of.c but found there were good (and static) implementat=
ions in
> > drivers/of/address.c which can be reused (e.g. of_bus_pci_get_flags,
> > bus->count_cells).
> >=20
> > I'm not just ready to post it - but can do before early next week if yo=
u can
> > wait.
>=20
> I already posted a similar patch[0] as part of a larger series to bring
> DT support to Tegra PCIe back in July. I suppose what you have must be
> something pretty close to that. Most of the stuff that had me occupied
> since then should be done soon and I was planning on resurrecting the
> series one of these days.

Thanks for the reference. I've submitted my patch, it's along the lines of =
your
existing patch.

I'm happy to take the best bits from both, drop mine, etc.

Andrew Murray

^ permalink raw reply

* [PATCH] pci: Provide support for parsing PCI DT ranges property
From: Andrew Murray @ 2012-12-12 16:37 UTC (permalink / raw)
  To: linux-pci
  Cc: Michal Simek, devicetree-discuss, Thierry Reding, Liviu Dudau,
	Rob Herring, Rob Herring, linuxppc-dev

DT bindings for PCI host bridges often use the ranges property to describe
memory and IO ranges - this binding tends to be the same across architectur=
es
yet several parsing implementations exist, e.g. arch/mips/pci/pci.c,
arch/powerpc/kernel/pci-common.c, arch/sparc/kernel/pci.c and
arch/microblaze/pci/pci-common.c (clone of PPC). Some of these duplicate
functionality provided by drivers/of/address.c.

This patch provides a common iterator-based parser for the ranges property,=
 it
is hoped this will reduce DT representation differences between architectur=
es
and that architectures will migrate in part to this new parser.

It is also hoped (and the motativation for the patch) that this patch will
reduce duplication of code when writing host bridge drivers that are suppor=
ted
by multiple architectures.

This patch provides struct resources from a device tree node, e.g.:

=09u32 *last =3D NULL;
=09struct resource res;
=09while ((last =3D of_pci_process_ranges(np, res, last))) {
=09=09//do something with res
=09}

Platforms with quirks can then do what they like with the resource or migra=
te
common quirk handling to the parser. In an ideal world drivers can just req=
uest
the obtained resources and pass them on (e.g. pci_add_resource_offset).

Signed-off-by: Andrew Murray <Andrew.Murray@arm.com>
Signed-off-by: Liviu Dudau <Liviu.Dudau@arm.com>
---
 drivers/of/address.c       |   53 ++++++++++++++++++++++++++++++++++++++++=
+++-
 include/linux/of_address.h |    7 +++++
 2 files changed, 59 insertions(+), 1 deletions(-)

diff --git a/drivers/of/address.c b/drivers/of/address.c
index 7e262a6..03bfe61 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -219,6 +219,57 @@ int of_pci_address_to_resource(struct device_node *dev=
, int bar,
 =09return __of_address_to_resource(dev, addrp, size, flags, NULL, r);
 }
 EXPORT_SYMBOL_GPL(of_pci_address_to_resource);
+
+const __be32 *of_pci_process_ranges(struct device_node *node,
+=09=09=09=09    struct resource *res, const __be32 *from)
+{
+=09const __be32 *start, *end;
+=09int na, ns, np, pna;
+=09int rlen;
+=09struct of_bus *bus;
+=09WARN_ON(!res);
+
+=09bus =3D of_match_bus(node);
+=09bus->count_cells(node, &na, &ns);
+=09if (!OF_CHECK_COUNTS(na, ns)) {
+=09=09pr_err("Bad cell count for %s\n", node->full_name);
+=09=09return NULL;
+=09}
+
+=09pna =3D of_n_addr_cells(node);
+=09np =3D pna + na + ns;
+
+=09start =3D of_get_property(node, "ranges", &rlen);
+=09if (start =3D=3D NULL)
+=09=09return NULL;
+
+=09end =3D start + rlen;
+
+=09if (!from)
+=09=09from =3D start;
+
+=09while (from + np <=3D end) {
+=09=09u64 cpu_addr, size;
+
+=09=09cpu_addr =3D of_translate_address(node, from + na);
+=09=09size =3D of_read_number(from + na + pna, ns);
+=09=09res->flags =3D bus->get_flags(from);
+=09=09from +=3D np;
+
+=09=09if (cpu_addr =3D=3D OF_BAD_ADDR || size =3D=3D 0)
+=09=09=09continue;
+
+=09=09res->name =3D node->full_name;
+=09=09res->start =3D cpu_addr;
+=09=09res->end =3D res->start + size - 1;
+=09=09res->parent =3D res->child =3D res->sibling =3D NULL;
+=09=09return from;
+=09}
+
+=09return NULL;
+}
+EXPORT_SYMBOL_GPL(of_pci_process_ranges);
+
 #endif /* CONFIG_PCI */
=20
 /*
@@ -421,7 +472,7 @@ u64 __of_translate_address(struct device_node *dev, con=
st __be32 *in_addr,
 =09=09goto bail;
 =09bus =3D of_match_bus(parent);
=20
-=09/* Cound address cells & copy address locally */
+=09/* Count address cells & copy address locally */
 =09bus->count_cells(dev, &na, &ns);
 =09if (!OF_CHECK_COUNTS(na, ns)) {
 =09=09printk(KERN_ERR "prom_parse: Bad cell count for %s\n",
diff --git a/include/linux/of_address.h b/include/linux/of_address.h
index 01b925a..4582b20 100644
--- a/include/linux/of_address.h
+++ b/include/linux/of_address.h
@@ -26,6 +26,8 @@ static inline unsigned long pci_address_to_pio(phys_addr_=
t addr) { return -1; }
 #define pci_address_to_pio pci_address_to_pio
 #endif
=20
+const __be32 *of_pci_process_ranges(struct device_node *node,
+=09=09=09=09    struct resource *res, const __be32 *from);
 #else /* CONFIG_OF_ADDRESS */
 static inline int of_address_to_resource(struct device_node *dev, int inde=
x,
 =09=09=09=09=09 struct resource *r)
@@ -48,6 +50,11 @@ static inline const u32 *of_get_address(struct device_no=
de *dev, int index,
 {
 =09return NULL;
 }
+const __be32 *of_pci_process_ranges(struct device_node *node,
+=09=09=09=09    struct resource *res, const __be32 *from)
+{
+=09return NULL;
+}
 #endif /* CONFIG_OF_ADDRESS */
=20
=20
--=20
1.7.0.4

^ permalink raw reply related

* Re: pci and pcie device-tree binding - range No cells
From: Thomas Petazzoni @ 2012-12-12 16:16 UTC (permalink / raw)
  To: Rob Herring
  Cc: monstr, Arnd Bergmann, linux-pci, devicetree-discuss,
	Thierry Reding, Rob Herring, David Laight, linuxppc-dev
In-Reply-To: <50C66F3C.3010204@gmail.com>

Dear Rob Herring,

On Mon, 10 Dec 2012 17:24:44 -0600, Rob Herring wrote:

> > Marvell SoCs have up to 20 configurable address windows, which allow
> > you, at run time, to say "I would like the range from physical
> > address 0xYYYYYYYY to 0xZZZZZZZZ to correspond to the PCIe device
> > in port 1, lane 2, or to the NAND, or to this or that device".
> > Therefore, in the PCIe driver I proposed for the Armada 370/XP SoCs
> > [1], there is no need to encode all those ranges statically in the
> > DT.
> 
> That's not a unique feature. I'm not sure if any powerpc systems do
> that though.

Yes, probably not an unique feature.

> > The only "ranges" property I'm using is to allow the DT sub-nodes
> > describing each PCIe port/lane to access the CPU registers that
> > allow to see if the PCIe link is up or down, access the PCI
> > configuration space and so on. So all ranges in my "ranges"
> > property correspond to normal CPU registers, like the one you would
> > put in the "reg" property for any device. The fact that those
> > devices are PCIe is really orthogonal here.
> 
> That doesn't really sound right.

Very likely, but I still don't get what is "the right way".

> I don't think deviating from the normal binding is the right approach.
> Perhaps the host driver should fill in the ranges property with the
> addresses it uses. Then any child devices will get the right address
> translation.

I don't really understand what you mean here. If you look at the host
driver code (arch/arm/mach-mvebu/pcie.c), for each PCIe interface
is simply does:

 * Create an address decoding window for the memory BAR
 * Create an address decoding window for the I/O BAR
 * Associate the memory BAR window address and the I/O bar window
   address with the PCIe interface

And that's it. See
https://github.com/MISL-EBU-System-SW/mainline-public/blob/marvell-pcie-v1/arch/arm/mach-mvebu/pcie.c#L107.

So this driver is both "deciding" of the physical addresses for each
PCIe interface, and "associating" them with the PCIe interfaces. How is
it useful to feed some addresses back into the Device Tree?

> Also, while the h/w may support practically any config, there are
> practical constraints of what Linux will use like there's no reason to
> support more than 64K i/o space. PCI memory addresses generally start
> at 0x100000. You probably don't need more than 1 memory window per
> root complex (although prefetchable memory may also be needed).

I allocate one 64K I/O window and one memory window per PCIe interface
whose link is up (i.e a PCIe device is connected).

> You could let the DT settings drive the address window configuration.

No, because I don't want to have absolute addresses for the windows: I
have 10 PCIe interfaces, but often, only a few of them are used. So I
don't want in the Device Tree to over-allocate hundreds of MB of
physical address space if it's not useful.

PCIe is dynamic, address window configuration is dynamic. And we should
hardcode all this configuration statically in the DT? Doesn't seem like
the right solution.

Best regards,

Thomas
-- 
Thomas Petazzoni, Free Electrons
Kernel, drivers, real-time and embedded Linux
development, consulting, training and support.
http://free-electrons.com

^ permalink raw reply

* Re: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
From: Alex Williamson @ 2012-12-12 14:36 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: kvm, linux-kernel, Paul Mackerras, linuxppc-dev, David Gibson
In-Reply-To: <50C82B3C.1060006@ozlabs.ru>

On Wed, 2012-12-12 at 17:59 +1100, Alexey Kardashevskiy wrote:
> On 08/12/12 04:01, Alex Williamson wrote:
> >> +	case VFIO_IOMMU_MAP_DMA: {
> >> +		vfio_iommu_spapr_tce_dma_map param;
> >> +		struct iommu_table *tbl = container->tbl;
> >> +		enum dma_data_direction direction;
> >> +		unsigned long locked, lock_limit;
> >> +
> >> +		if (WARN_ON(!tbl))
> >> +			return -ENXIO;
> >> +
> >> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> >> +
> >> +		if (copy_from_user(&param, (void __user *)arg, minsz))
> >> +			return -EFAULT;
> >> +
> >> +		if (param.argsz < minsz)
> >> +			return -EINVAL;
> >> +
> >> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> >> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
> >> +			direction = DMA_BIDIRECTIONAL;
> >> +		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
> >> +			direction = DMA_TO_DEVICE;
> >> +		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> >> +			direction = DMA_FROM_DEVICE;
> >> +		else
> >> +			return -EINVAL;
> >
> > flags needs to be sanitized too.  Return EINVAL if any unknown bit is
> > set or else sloppy users may make it very difficult to make use of those
> > flag bits later.
> 
> 
> It already returns -EINVAL on any bit set except READ/WRITE, no?

No.  I could pass flags ~0 through there to get a read/write mapping and
cause you problems if you later want to define another bit.  Thanks,

Alex

^ permalink raw reply

* Re: [PATCH] vfio powerpc: enabled on powernv platform
From: Alex Williamson @ 2012-12-12 14:34 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: kvm, linux-kernel, Paul Mackerras, linuxppc-dev, David Gibson
In-Reply-To: <50C820AF.8050804@ozlabs.ru>

On Wed, 2012-12-12 at 17:14 +1100, Alexey Kardashevskiy wrote:
> On 08/12/12 04:38, Alex Williamson wrote:
> >> +static int __init tce_iommu_init(void)
> >> +{
> >> +	struct pci_dev *pdev = NULL;
> >> +	struct iommu_table *tbl;
> >> +	struct iommu_group *grp;
> >> +
> >> +	/* Allocate and initialize IOMMU groups */
> >> +	for_each_pci_dev(pdev) {
> >> +		tbl = get_iommu_table_base(&pdev->dev);
> >> +		if (!tbl)
> >> +			continue;
> >> +
> >> +		/* Skip already initialized */
> >> +		if (tbl->it_group)
> >> +			continue;
> >> +
> >> +		grp = iommu_group_alloc();
> >> +		if (IS_ERR(grp)) {
> >> +			pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
> >> +					PTR_ERR(grp));
> >> +			return PTR_ERR(grp);
> >> +		}
> >> +		tbl->it_group = grp;
> >> +		iommu_group_set_iommudata(grp, tbl, group_release);
> >
> > BTW, groups have a name property that shows up in sysfs that can be set
> > with iommu_group_set_name().  IIRC, this was a feature David requested
> > for PEs.  It'd be nice if it was used for PEs...  Thanks,
> 
> 
> 
> But what would I put there?... IOMMU ID is more than enough at the moment 
> and struct iommu_table does not have anything what would have made sense to 
> show in the sysfs...

I believe David mentioned that PEs had user visible names.  Perhaps they
match an enclosure location or something.  Group numbers are rather
arbitrary and really have no guarantee of persistence.  Thanks,

Alex

^ permalink raw reply

* Re: pci and pcie device-tree binding - range No cells
From: Thierry Reding @ 2012-12-12 13:34 UTC (permalink / raw)
  To: Andrew Murray
  Cc: Michal Simek, linux-pci, devicetree-discuss, Rob Herring,
	Rob Herring, linuxppc-dev
In-Reply-To: <20121212121912.GA2776@arm.com>

[-- Attachment #1: Type: text/plain, Size: 2190 bytes --]

On Wed, Dec 12, 2012 at 12:19:12PM +0000, Andrew Murray wrote:
> On Wed, Dec 12, 2012 at 10:49 AM, Grant Likely wrote:
> > On Wed, Dec 12, 2012 at 10:37 AM, Michal Simek <monstr@monstr.eu<mailto:monstr@monstr.eu>> wrote:
> > > On 12/10/2012 10:41 PM, Grant Likely wrote:
> > >> drivers/pci/pci-of.c would be good. I'd also accept drivers/of/pci.c
> > >> which might actually be a good idea in the short term so that it gets
> > >> appropriate supervision while being generalized before being moved into
> > >> the pci directory.
> > >
> > > Ben: Are you willing to move that ppc code to this location?
> > > It is probably not good idea that I should do it when I even don't have
> > > hardware available for testing (Asking someone else).
> > 
> > You're a clever guy, you are more than capable of crafting the patch,
> > even if you can't test on hardware. :-)
> > 
> > I refactored most of the OF support code without having access to most
> > of the affected hardware. Once I got the changes out there for review
> > I also asked for spot testing before getting it into linux-next for
> > even more testing.
> 
> I've been working on a relatively architecture agnostic PCI host bridge driver
> and also wanted to avoid duplicating more generic DT parsing code for PCI
> bindings.
> 
> I've ended up with a patch which provides an iterator for returning resources
> based on the the typical 'ranges' binding. This has ended up living in
> drivers/of/address.c. I originally started out in drivers/of/pci.c and
> drivers/pci/pci-of.c but found there were good (and static) implementations in
> drivers/of/address.c which can be reused (e.g. of_bus_pci_get_flags,
> bus->count_cells).
> 
> I'm not just ready to post it - but can do before early next week if you can
> wait.

I already posted a similar patch[0] as part of a larger series to bring
DT support to Tegra PCIe back in July. I suppose what you have must be
something pretty close to that. Most of the stuff that had me occupied
since then should be done soon and I was planning on resurrecting the
series one of these days.

Thierry

[0]: https://patchwork.kernel.org/patch/1244451/

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH] vfio powerpc: enabled on powernv platform
From: Alexey Kardashevskiy @ 2012-12-12 12:38 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, Alexey Kardashevskiy, linux-kernel, Paul Mackerras,
	linuxppc-dev, David Gibson
In-Reply-To: <1355315657-31153-1-git-send-email-aik@ozlabs.ru>

Hi Alex,

I posted other pair of patches. While debugging and testing my stuff I 
implemented some rough hack to support IOMMU mappings without passing those 
hypercalls to the QEMU, this is why I moved pieces of code around - want to 
support both QEMU-VFIO and kernel optimized H_PUT_TCE handler.



On 12/12/12 23:34, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
>
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
>
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
>
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
>
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---


-- 
Alexey

^ permalink raw reply

* [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
From: Alexey Kardashevskiy @ 2012-12-12 12:35 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, Alexey Kardashevskiy, linux-kernel, Paul Mackerras,
	linuxppc-dev, David Gibson
In-Reply-To: <1354899707.3224.86.camel@bling.home>

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 drivers/vfio/Kconfig                |    6 +
 drivers/vfio/Makefile               |    1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  249 +++++++++++++++++++++++++++++++++++
 include/linux/vfio.h                |   31 +++++
 4 files changed, 287 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
 	depends on VFIO
 	default n
 
+config VFIO_IOMMU_SPAPR_TCE
+	tristate
+	depends on VFIO && SPAPR_TCE_IOMMU
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
+	select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 0000000..714bf57
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,249 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/err.h>
+#include <linux/vfio.h>
+#include <asm/iommu.h>
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "aik@ozlabs.ru"
+#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ *
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+	struct mutex lock;
+	struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+	struct tce_container *container;
+
+	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+		pr_err("tce_vfio: Wrong IOMMU type\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	container = kzalloc(sizeof(*container), GFP_KERNEL);
+	if (!container)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&container->lock);
+
+	return container;
+}
+
+static void tce_iommu_release(void *iommu_data)
+{
+	struct tce_container *container = iommu_data;
+
+	WARN_ON(container->tbl && !container->tbl->it_group);
+	if (container->tbl && container->tbl->it_group)
+		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+
+	mutex_destroy(&container->lock);
+
+	kfree(container);
+}
+
+static long tce_iommu_ioctl(void *iommu_data,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct tce_container *container = iommu_data;
+	unsigned long minsz;
+	long ret;
+
+	switch (cmd) {
+	case VFIO_CHECK_EXTENSION:
+		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
+
+ 	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
+		struct vfio_iommu_spapr_tce_info info;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
+				dma32_window_size);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.flags = 0;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			return -EFAULT;
+
+		return 0;
+	}
+	case VFIO_IOMMU_MAP_DMA: {
+		vfio_iommu_spapr_tce_dma_map param;
+		struct iommu_table *tbl = container->tbl;
+		enum dma_data_direction direction;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
+				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
+			direction = DMA_BIDIRECTIONAL;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
+			direction = DMA_TO_DEVICE;
+		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+			direction = DMA_FROM_DEVICE;
+		else
+			return -EINVAL;
+
+		ret = iommu_put_tces(tbl, param.iova, param.vaddr, direction,
+				param.size);
+
+		return ret;
+	}
+	case VFIO_IOMMU_UNMAP_DMA: {
+		vfio_iommu_spapr_tce_dma_unmap param;
+		struct iommu_table *tbl = container->tbl;
+
+		if (WARN_ON(!tbl))
+			return -ENXIO;
+
+		minsz = offsetofend(vfio_iommu_spapr_tce_dma_unmap, size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		ret = iommu_clear_tces(tbl, param.iova, param.size);
+
+		return ret;
+	}
+	}
+
+	return -ENOTTY;
+}
+
+static int tce_iommu_attach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group);
+	if (container->tbl) {
+		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
+				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(iommu_group));
+		mutex_unlock(&container->lock);
+		return -EBUSY;
+	}
+
+	container->tbl = tbl;
+	iommu_reset_table(tbl, false);
+	mutex_unlock(&container->lock);
+
+	return 0;
+}
+
+static void tce_iommu_detach_group(void *iommu_data,
+		struct iommu_group *iommu_group)
+{
+	struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+
+	BUG_ON(!tbl);
+	mutex_lock(&container->lock);
+	if (tbl != container->tbl) {
+		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
+				iommu_group_id(iommu_group),
+				iommu_group_id(tbl->it_group));
+	} else {
+
+		pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+				iommu_group_id(iommu_group), iommu_group);
+
+		container->tbl = NULL;
+		iommu_reset_table(tbl, true);
+	}
+	mutex_unlock(&container->lock);
+}
+
+const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+	.name		= "iommu-vfio-powerpc",
+	.owner		= THIS_MODULE,
+	.open		= tce_iommu_open,
+	.release	= tce_iommu_release,
+	.ioctl		= tce_iommu_ioctl,
+	.attach_group	= tce_iommu_attach_group,
+	.detach_group	= tce_iommu_detach_group,
+};
+
+static int __init tce_iommu_init(void)
+{
+	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 0a4f180..b97697d 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -99,6 +99,7 @@ extern void vfio_unregister_iommu_driver(
 /* Extensions */
 
 #define VFIO_TYPE1_IOMMU		1
+#define VFIO_SPAPR_TCE_IOMMU		2
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -442,4 +443,34 @@ struct vfio_iommu_type1_dma_unmap {
 
 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
 
+/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
+
+/*
+ * The SPAPR TCE info struct provides the information about the PCI bus
+ * address ranges available for DMA, these values are programmed into
+ * the hardware so the guest has to know that information.
+ *
+ * The DMA 32 bit window start is an absolute PCI bus address.
+ * The IOVA address passed via map/unmap ioctls are absolute PCI bus
+ * addresses too so the window works as a filter rather than an offset
+ * for IOVA addresses.
+ *
+ * A flag will need to be added if other page sizes are supported,
+ * so as defined here, it is always 4k.
+ */
+struct vfio_iommu_spapr_tce_info {
+	__u32 argsz;
+	__u32 flags;			/* reserved for future use */
+	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
+	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+};
+
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/* Reuse type1 map/unmap structs as they are the same at the moment */
+typedef struct vfio_iommu_type1_dma_map vfio_iommu_spapr_tce_dma_map;
+typedef struct vfio_iommu_type1_dma_unmap vfio_iommu_spapr_tce_dma_unmap;
+
+/* ***************************************************************** */
+
 #endif /* VFIO_H */
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH] vfio powerpc: enabled on powernv platform
From: Alexey Kardashevskiy @ 2012-12-12 12:34 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, Alexey Kardashevskiy, linux-kernel, Paul Mackerras,
	linuxppc-dev, David Gibson
In-Reply-To: <1354901926.3224.96.camel@bling.home>

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |   10 ++
 arch/powerpc/kernel/iommu.c          |  329 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  134 ++++++++++++++
 drivers/iommu/Kconfig                |    8 +
 4 files changed, 481 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..3c861ae 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,12 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern void iommu_reset_table(struct iommu_table *tbl, bool restore);
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
+		unsigned long size);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long size);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..f3bb2e7 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -36,6 +36,7 @@
 #include <linux/hash.h>
 #include <linux/fault-inject.h>
 #include <linux/pci.h>
+#include <linux/uaccess.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -44,6 +45,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +858,330 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+
+struct vwork {
+	struct mm_struct	*mm;
+	long			npage;
+	struct work_struct	work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void lock_acct_bg(struct work_struct *work)
+{
+	struct vwork *vwork = container_of(work, struct vwork, work);
+	struct mm_struct *mm;
+
+	mm = vwork->mm;
+	down_write(&mm->mmap_sem);
+	mm->locked_vm += vwork->npage;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	kfree(vwork);
+}
+
+static void lock_acct(long npage)
+{
+	struct vwork *vwork;
+	struct mm_struct *mm;
+
+	if (!current->mm)
+		return; /* process exited */
+
+	if (down_write_trylock(&current->mm->mmap_sem)) {
+		current->mm->locked_vm += npage;
+		up_write(&current->mm->mmap_sem);
+		return;
+	}
+
+	/*
+	 * Couldn't get mmap_sem lock, so must setup to update
+	 * mm->locked_vm later. If locked_vm were atomic, we
+	 * wouldn't need this silliness
+	 */
+	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+	if (!vwork)
+		return;
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(vwork);
+		return;
+	}
+	INIT_WORK(&vwork->work, lock_acct_bg);
+	vwork->mm = mm;
+	vwork->npage = npage;
+	schedule_work(&vwork->work);
+}
+
+/*
+ * iommu_reset_table is called when it started/stopped being used.
+ *
+ * restore==true says to bring the iommu_table into the state as it was
+ * before being used by VFIO.
+ */
+void iommu_reset_table(struct iommu_table *tbl, bool restore)
+{
+	/* Page#0 is marked as used in iommu_init_table, so we clear it... */
+	if (!restore && (tbl->it_offset == 0))
+		clear_bit(0, tbl->it_map);
+
+	iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
+
+	/* ... or restore  */
+	if (restore && (tbl->it_offset == 0))
+		set_bit(0, tbl->it_map);
+}
+EXPORT_SYMBOL_GPL(iommu_reset_table);
+
+/*
+ * Returns the number of used IOMMU pages (4K) within
+ * the same system page (4K or 64K).
+ *
+ * syspage_weight_zero is optimized for expected case == 0
+ * syspage_weight_one is optimized for expected case > 1
+ * Other case are not used in this file.
+ */
+#if PAGE_SIZE == IOMMU_PAGE_SIZE
+
+#define syspage_weight_zero(map, offset)	test_bit((map), (offset))
+#define syspage_weight_one(map, offset)		test_bit((map), (offset))
+
+#elif PAGE_SIZE/IOMMU_PAGE_SIZE == 16
+
+static int syspage_weight_zero(unsigned long *map, unsigned long offset)
+{
+	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+	return 0xffffUL & (map[BIT_WORD(offset)] >>
+			(offset & (BITS_PER_LONG-1)));
+}
+
+static int syspage_weight_one(unsigned long *map, unsigned long offset)
+{
+	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
+
+	/* Aligns TCE entry number to system page boundary */
+	offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+
+	/* Count used 4K pages */
+	while (nbits && (ret < 2)) {
+		if (test_bit(offset, map))
+			++ret;
+
+		--nbits;
+		++offset;
+	}
+
+	return ret;
+}
+#else
+#error TODO: support other page size
+#endif
+
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number of system pages
+ * which it called put_page() on
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int i, retpages = 0, clr;
+	unsigned long oldtce, oldweight;
+	struct page *page;
+
+	for (i = 0; i < pages; ++i, ++entry) {
+		if (!test_bit(entry - tbl->it_offset, tbl->it_map))
+			continue;
+
+		oldtce = ppc_md.tce_get(tbl, entry);
+		ppc_md.tce_free(tbl, entry, 1);
+
+		oldweight = syspage_weight_one(tbl->it_map,
+				entry - tbl->it_offset);
+		clr = __test_and_clear_bit(entry - tbl->it_offset,
+				tbl->it_map);
+
+		if (WARN_ON(!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))))
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+		if (WARN_ON(!page))
+			continue;
+
+		if (oldtce & TCE_PCI_WRITE)
+			SetPageDirty(page);
+
+		put_page(page);
+
+		/* That was the last IOMMU page within the system page */
+		if ((oldweight == 1) && clr)
+			++retpages;
+	}
+
+	return retpages;
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number
+ * of released system pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
+		unsigned long size)
+{
+	int ret;
+	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
+	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	if ((size & ~IOMMU_PAGE_MASK) || (ioba & ~IOMMU_PAGE_MASK))
+		return -EINVAL;
+
+	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
+			<< IOMMU_PAGE_SHIFT))
+		return -EINVAL;
+
+	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
+		return -EINVAL;
+
+	spin_lock(&(pool->lock));
+	ret = clear_tces_nolock(tbl, entry, npages);
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	if (ret > 0) {
+		lock_acct(-ret);
+		return 0;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset, oldweight;
+
+	/* Map new TCE */
+	offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret != 1) {
+		pr_err("tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		return -EFAULT;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		pr_err("tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	/* Calculate if new system page has been locked */
+	oldweight = syspage_weight_zero(tbl->it_map, entry - tbl->it_offset);
+	__set_bit(entry - tbl->it_offset, tbl->it_map);
+
+	return (oldweight == 0) ? 1 : 0;
+}
+
+/*
+ * iommu_put_tces builds tces and returned the number of actually
+ * locked system pages
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long size)
+{
+	int i, ret = 0, retpages = 0;
+	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
+	unsigned long npages = size >> IOMMU_PAGE_SHIFT;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+	unsigned long locked, lock_limit;
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUG_ON(direction == DMA_NONE);
+
+	if ((size & ~IOMMU_PAGE_MASK) ||
+			(ioba & ~IOMMU_PAGE_MASK) ||
+			(tce & ~IOMMU_PAGE_MASK))
+		return -EINVAL;
+
+	if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
+			 << IOMMU_PAGE_SHIFT))
+		return -EINVAL;
+
+	if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
+		return -EINVAL;
+
+	/* Account for locked pages */
+	locked = current->mm->locked_vm +
+		(_ALIGN_UP(size, PAGE_SIZE) >> PAGE_SHIFT);
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+				rlimit(RLIMIT_MEMLOCK));
+		return -ENOMEM;
+	}
+
+	spin_lock(&(pool->lock));
+
+	/* Check if any is in use */
+	for (i = 0; i < npages; ++i) {
+		if (test_bit(entry + i - tbl->it_offset, tbl->it_map)) {
+			spin_unlock(&(pool->lock));
+			return -EBUSY;
+		}
+	}
+
+	/* Put tces to the table */
+	for (i = 0; (i < npages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
+		ret = put_tce(tbl, entry + i, tce, direction);
+		if (ret == 1)
+			++retpages;
+	}
+
+	/*
+	 * If failed, release locked pages, otherwise return the number
+	 * of locked system pages
+	 */
+	if (ret < 0) {
+		clear_tces_nolock(tbl, entry, i);
+	} else {
+		if (retpages)
+			lock_acct(retpages);
+		ret = 0;
+	}
+
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..1b970bf 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,136 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		pr_warn("tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		pr_err("tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
+	  still not implemented.
+
 endif # IOMMU_SUPPORT
-- 
1.7.10.4

^ permalink raw reply related

* Re: pci and pcie device-tree binding - range No cells
From: Andrew Murray @ 2012-12-12 12:19 UTC (permalink / raw)
  To: Grant Likely
  Cc: Michal Simek, linux-pci, devicetree-discuss, Thierry Reding,
	Rob Herring, Rob Herring, linuxppc-dev
In-Reply-To: <CAPcvp5EJH-Q6wd7my+V+FUVE1=hzwMN-yOfHiukGvDmkcoRcsQ@mail.gmail.com>

On Wed, Dec 12, 2012 at 10:49 AM, Grant Likely wrote:
> On Wed, Dec 12, 2012 at 10:37 AM, Michal Simek <monstr@monstr.eu<mailto:m=
onstr@monstr.eu>> wrote:
> > On 12/10/2012 10:41 PM, Grant Likely wrote:
> >> drivers/pci/pci-of.c would be good. I'd also accept drivers/of/pci.c
> >> which might actually be a good idea in the short term so that it gets
> >> appropriate supervision while being generalized before being moved int=
o
> >> the pci directory.
> >
> > Ben: Are you willing to move that ppc code to this location?
> > It is probably not good idea that I should do it when I even don't have
> > hardware available for testing (Asking someone else).
>=20
> You're a clever guy, you are more than capable of crafting the patch,
> even if you can't test on hardware. :-)
>=20
> I refactored most of the OF support code without having access to most
> of the affected hardware. Once I got the changes out there for review
> I also asked for spot testing before getting it into linux-next for
> even more testing.

I've been working on a relatively architecture agnostic PCI host bridge dri=
ver
and also wanted to avoid duplicating more generic DT parsing code for PCI
bindings.

I've ended up with a patch which provides an iterator for returning resourc=
es
based on the the typical 'ranges' binding. This has ended up living in
drivers/of/address.c. I originally started out in drivers/of/pci.c and
drivers/pci/pci-of.c but found there were good (and static) implementations=
 in
drivers/of/address.c which can be reused (e.g. of_bus_pci_get_flags,
bus->count_cells).

I'm not just ready to post it - but can do before early next week if you ca=
n
wait.

Andrew Murray

^ permalink raw reply

* Re: pci and pcie device-tree binding - range No cells
From: Grant Likely @ 2012-12-12 10:49 UTC (permalink / raw)
  To: Michal Simek
  Cc: linux-pci, devicetree-discuss, Thierry Reding, Rob Herring,
	Rob Herring, linuxppc-dev
In-Reply-To: <50C85E7D.5080006@monstr.eu>

On Wed, Dec 12, 2012 at 10:37 AM, Michal Simek <monstr@monstr.eu> wrote:
> On 12/10/2012 10:41 PM, Grant Likely wrote:
>> drivers/pci/pci-of.c would be good. I'd also accept drivers/of/pci.c
>> which might actually be a good idea in the short term so that it gets
>> appropriate supervision while being generalized before being moved into
>> the pci directory.
>
> Ben: Are you willing to move that ppc code to this location?
> It is probably not good idea that I should do it when I even don't have
> hardware available for testing (Asking someone else).

You're a clever guy, you are more than capable of crafting the patch,
even if you can't test on hardware. :-)

I refactored most of the OF support code without having access to most
of the affected hardware. Once I got the changes out there for review
I also asked for spot testing before getting it into linux-next for
even more testing.

g.

^ permalink raw reply

* Re: pci and pcie device-tree binding - range No cells
From: Michal Simek @ 2012-12-12 10:37 UTC (permalink / raw)
  To: Grant Likely
  Cc: linux-pci, devicetree-discuss, Thierry Reding, Rob Herring,
	Rob Herring, linuxppc-dev
In-Reply-To: <20121210214127.D51773E0796@localhost>

On 12/10/2012 10:41 PM, Grant Likely wrote:
> On Mon, 10 Dec 2012 09:21:51 -0600, Rob Herring <robherring2@gmail.com> wrote:
>> On 12/10/2012 09:05 AM, Michal Simek wrote:
>>> On 12/10/2012 03:26 PM, Rob Herring wrote:
>>>> On 12/10/2012 06:20 AM, Michal Simek wrote:
>>>>> Hi Grant and others,
>>>>>
>>>>> I have a question regarding number of cells in ranges property
>>>>> for pci and pcie nodes.
>>>>>
>>>>> Linux pci/pcie powerpc DTSes contain 7 cells (xpedite5370.dts,
>>>>> sequoia.dts, etc)
>>>>> but also 6 cells format too (mpc832x_mds.dts)
>>>>>
>>>>> Here is shown 6 cells ranges format and describe
>>>>> http://devicetree.org/Device_Tree_Usage#PCI_Host_Bridge
>>>>>
>>>>> And also in documentation in the linux
>>>>> Documentation/devicetree/bindings/pci/83xx-512x-pci.txt
>>>>>
>>>>> Both format uses:
>>>>> #size-cells = <2>;
>>>>> #address-cells = <3>;
>>>>>
>>>>> What is valid format?
>>>>
>>>> Both. 7 cells are valid when the host (parent) bus is 64-bit and 6 cells
>>>> are valid when the host bus is 32-bit. The ranges property is <<child
>>>> address> <parent address> <size>>. The parent address #address-cells is
>>>> taken from the parent node.
>>>
>>> Ok. Got it.
>>>
>>> Here is what we use on zynq and microblaze - both 32bit which should be
>>> fine.
>>>
>>>      ps7_axi_interconnect_0: axi@0 {
>>>          #address-cells = <1>;
>>>          #size-cells = <1>;
>>>          axi_pcie_0: axi-pcie@50000000 {
>>>              #address-cells = <3>;
>>>              #size-cells = <2>;
>>>              compatible = "xlnx,axi-pcie-1.05.a";
>>>              ranges = < 0x02000000 0 0x60000000 0x60000000 0 0x10000000 >;
>>>              ...
>>>          }
>>>      }
>>>
>>> What I am wondering is pci_process_bridge_OF_ranges() at
>>> arch/powerpc/kernel/pci-common.c
>>> where there are used some hardcoded values which should be probably
>>> loaded from device-tree.
>>>
>>> For example:
>>> 683         int np = pna + 5;
>>> ...
>>> 702                 pci_addr = of_read_number(ranges + 1, 2);
>>> 703                 cpu_addr = of_translate_address(dev, ranges + 3);
>>> 704                 size = of_read_number(ranges + pna + 3, 2);
>>
>> These would always be correct whether you have 6 or 7 cells. pna is the
>> parent bus address cells size. The pci address is fixed at 3 cells.
>>
>>>
>>>
>>> Unfortunately we have copied it to microblaze.
>>
>> I look at the PCI DT code in powerpc and see a whole bunch of code that
>> seems like it should be common. The different per arch pci structs
>> complicates that. No one has really gotten to looking at PCI DT on ARM
>> yet except you and Thierry for Tegra. We definitely don't want to create
>> a 3rd copy. Starting the process of moving it to something like
>> drivers/pci/pci-of.c would be great.
>
> A lot of it should be common. The microblaze code is a copy of the
> powerpc version. I'll strongly nack any attempt to add a third!  :-)

Yes it. There are some things which we had fixed because that powerpc
port is big endian only and we support PCIe on little endian too.
But changes are really cosmetic.


> drivers/pci/pci-of.c would be good. I'd also accept drivers/of/pci.c
> which might actually be a good idea in the short term so that it gets
> appropriate supervision while being generalized before being moved into
> the pci directory.

Ben: Are you willing to move that ppc code to this location?
It is probably not good idea that I should do it when I even don't have
hardware available for testing (Asking someone else).

Thanks,
Michal

-- 
Michal Simek, Ing. (M.Eng)
w: www.monstr.eu p: +42-0-721842854
Maintainer of Linux kernel 2.6 Microblaze Linux - http://www.monstr.eu/fdt/
Microblaze U-BOOT custodian

^ permalink raw reply

* Re: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO
From: Alexey Kardashevskiy @ 2012-12-12  6:59 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, linux-kernel, Paul Mackerras, linuxppc-dev, David Gibson
In-Reply-To: <1354899707.3224.86.camel@bling.home>

On 08/12/12 04:01, Alex Williamson wrote:
>> +	case VFIO_IOMMU_MAP_DMA: {
>> +		vfio_iommu_spapr_tce_dma_map param;
>> +		struct iommu_table *tbl = container->tbl;
>> +		enum dma_data_direction direction;
>> +		unsigned long locked, lock_limit;
>> +
>> +		if (WARN_ON(!tbl))
>> +			return -ENXIO;
>> +
>> +		minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
>> +
>> +		if (copy_from_user(&param, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (param.argsz < minsz)
>> +			return -EINVAL;
>> +
>> +		if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
>> +				(param.flags & VFIO_DMA_MAP_FLAG_WRITE))
>> +			direction = DMA_BIDIRECTIONAL;
>> +		else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
>> +			direction = DMA_TO_DEVICE;
>> +		else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
>> +			direction = DMA_FROM_DEVICE;
>> +		else
>> +			return -EINVAL;
>
> flags needs to be sanitized too.  Return EINVAL if any unknown bit is
> set or else sloppy users may make it very difficult to make use of those
> flag bits later.


It already returns -EINVAL on any bit set except READ/WRITE, no?


-- 
Alexey

^ permalink raw reply

* Re: [PATCH] vfio powerpc: enabled on powernv platform
From: Alexey Kardashevskiy @ 2012-12-12  6:14 UTC (permalink / raw)
  To: Alex Williamson
  Cc: kvm, linux-kernel, Paul Mackerras, linuxppc-dev, David Gibson
In-Reply-To: <1354901926.3224.96.camel@bling.home>

On 08/12/12 04:38, Alex Williamson wrote:
>> +static int __init tce_iommu_init(void)
>> +{
>> +	struct pci_dev *pdev = NULL;
>> +	struct iommu_table *tbl;
>> +	struct iommu_group *grp;
>> +
>> +	/* Allocate and initialize IOMMU groups */
>> +	for_each_pci_dev(pdev) {
>> +		tbl = get_iommu_table_base(&pdev->dev);
>> +		if (!tbl)
>> +			continue;
>> +
>> +		/* Skip already initialized */
>> +		if (tbl->it_group)
>> +			continue;
>> +
>> +		grp = iommu_group_alloc();
>> +		if (IS_ERR(grp)) {
>> +			pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
>> +					PTR_ERR(grp));
>> +			return PTR_ERR(grp);
>> +		}
>> +		tbl->it_group = grp;
>> +		iommu_group_set_iommudata(grp, tbl, group_release);
>
> BTW, groups have a name property that shows up in sysfs that can be set
> with iommu_group_set_name().  IIRC, this was a feature David requested
> for PEs.  It'd be nice if it was used for PEs...  Thanks,



But what would I put there?... IOMMU ID is more than enough at the moment 
and struct iommu_table does not have anything what would have made sense to 
show in the sysfs...


-- 
Alexey

^ permalink raw reply

* Re: Understanding how kernel updates MMU hash table
From: Pegasus11 @ 2012-12-12  5:10 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <34760537.post@talk.nabble.com>


I cannot see my post at all on the old nabble system...this is just for
testing purposes//...

My last post is fine..but I cannot see my thread on linuxppc-dev@oldnabble
...?? :confused:
-- 
View this message in context: http://old.nabble.com/Understanding-how-kernel-updates-MMU-hash-table-tp34760537p34786958.html
Sent from the linuxppc-dev mailing list archive at Nabble.com.

^ permalink raw reply

* Re: [PATCH] KVM: PPC: Fix SREGS documentation reference
From: Alexander Graf @ 2012-12-12  0:29 UTC (permalink / raw)
  To: Amos Kong; +Cc: Mihai Caraman, linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <CAFeW=pb9Afawnp7hi6Muzs7HU=TVeKDX4RD6Tw0bMkkY9-TTBw@mail.gmail.com>


On 11.12.2012, at 15:22, Amos Kong wrote:

> On Tue, Dec 11, 2012 at 9:38 PM, Mihai Caraman
> <mihai.caraman@freescale.com> wrote:
>>=20
>> Reflect the uapi folder change in SREGS API documentation.
>>=20
>> Signed-off-by: Mihai Caraman <mihai.caraman@freescale.com>
>> ---
>> Documentation/virtual/kvm/api.txt |    2 +-
>> 1 files changed, 1 insertions(+), 1 deletions(-)
>>=20
>> diff --git a/Documentation/virtual/kvm/api.txt =
b/Documentation/virtual/kvm/api.txt
>> index a4df553..9cf591d 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -345,7 +345,7 @@ struct kvm_sregs {
>>        __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
>> };
>>=20
>> -/* ppc -- see arch/powerpc/include/asm/kvm.h */
>> +/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */
>=20
> Trivial fix.
> Reviewed-by: Amos Kong <kongjianjun@gmail.com>

Thanks, applied to kvm-ppc-next.


Alex

^ permalink raw reply

* Re: [PATCH v2] powerpc: fix wii_memory_fixups() compile error on 3.0.y tree
From: Shuah Khan @ 2012-12-12  0:04 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: Greg KH, stable, paulus, shuahkhan, linuxppc-dev
In-Reply-To: <20121210185527.GI13292@decadent.org.uk>

On Mon, 2012-12-10 at 18:55 +0000, Ben Hutchings wrote:
> On Mon, Dec 10, 2012 at 10:23:16AM -0700, Shuah Khan wrote:
> > Fix wii_memory_fixups() the following compile error on 3.0.y tree with 
> > wii_defconfig on 3.0.y tree.
> > 
> >   CC      arch/powerpc/platforms/embedded6xx/wii.o
> > arch/powerpc/platforms/embedded6xx/wii.c: In function ‘wii_memory_fixups’:
> > arch/powerpc/platforms/embedded6xx/wii.c:88:2: error: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 2 has type ‘phys_addr_t’ [-Werror=format]
> > arch/powerpc/platforms/embedded6xx/wii.c:88:2: error: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 3 has type ‘phys_addr_t’ [-Werror=format]
> > arch/powerpc/platforms/embedded6xx/wii.c:90:2: error: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 2 has type ‘phys_addr_t’ [-Werror=format]
> > arch/powerpc/platforms/embedded6xx/wii.c:90:2: error: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 3 has type ‘phys_addr_t’ [-Werror=format]
> > cc1: all warnings being treated as errors
> > make[2]: *** [arch/powerpc/platforms/embedded6xx/wii.o] Error 1
> > make[1]: *** [arch/powerpc/platforms/embedded6xx] Error 2
> > make: *** [arch/powerpc/platforms] Error 2
> > 
> > Signed-off-by: Shuah Khan <shuah.khan@hp.com>
> > CC: stable@vger.kernel.org 3.0.y
> > ---
> >  arch/powerpc/platforms/embedded6xx/wii.c |    6 ++++--
> >  1 file changed, 4 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c
> > index 1b5dc1a..d8ff38f 100644
> > --- a/arch/powerpc/platforms/embedded6xx/wii.c
> > +++ b/arch/powerpc/platforms/embedded6xx/wii.c
> > @@ -85,9 +85,11 @@ void __init wii_memory_fixups(void)
> >  	wii_hole_start = p[0].base + p[0].size;
> >  	wii_hole_size = p[1].base - wii_hole_start;
> >  
> > -	pr_info("MEM1: <%08llx %08llx>\n", p[0].base, p[0].size);
> > +	pr_info("MEM1: <%08ulx %08ulx>\n",
> > +		(phys_addr_t) p[0].base, (phys_addr_t) p[0].size);
> [...]
> 
> This is incorrect in exactly the same way as the last version,
> but with extra redundant casts.
> 

Yes it is. :) That is embarrassing. The lesson is "don't try to work
when not feeling well", at least that is my excuse.

Thanks for catching it. I will really fix it this time and send a new
patch. This bug is in there since Dec 2009, not sure if this is worth
fixing, but might as well.

de32400dd26e743c5d500aa42d8d6818b79edb73
wii: use both mem1 and mem2 as ram

-- Shuah

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox