* [PATCH 08/20] powerpc/dma: remove the unused dma_nommu_ops export
From: Christoph Hellwig @ 2018-07-30 16:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Tony Luck, Fenghua Yu
Cc: Konrad Rzeszutek Wilk, Robin Murphy, linuxppc-dev, iommu,
linux-ia64
In-Reply-To: <20180730163824.10064-1-hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
arch/powerpc/kernel/dma.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index dbfc7056d7df..3939589aab04 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -286,7 +286,6 @@ const struct dma_map_ops dma_nommu_ops = {
.sync_sg_for_device = dma_nommu_sync_sg,
#endif
};
-EXPORT_SYMBOL(dma_nommu_ops);
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
--
2.18.0
^ permalink raw reply related
* [PATCH 07/20] powerpc/dma: remove the unused ARCH_HAS_DMA_MMAP_COHERENT define
From: Christoph Hellwig @ 2018-07-30 16:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Tony Luck, Fenghua Yu
Cc: Konrad Rzeszutek Wilk, Robin Murphy, linuxppc-dev, iommu,
linux-ia64
In-Reply-To: <20180730163824.10064-1-hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
arch/powerpc/include/asm/dma-mapping.h | 2 --
1 file changed, 2 deletions(-)
diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 8fa394520af6..f2a4a7142b1e 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -112,7 +112,5 @@ extern int dma_set_mask(struct device *dev, u64 dma_mask);
extern u64 __dma_get_required_mask(struct device *dev);
-#define ARCH_HAS_DMA_MMAP_COHERENT
-
#endif /* __KERNEL__ */
#endif /* _ASM_DMA_MAPPING_H */
--
2.18.0
^ permalink raw reply related
* [PATCH 06/20] dma-noncoherent: add an optional arch hook for ->get_required_mask
From: Christoph Hellwig @ 2018-07-30 16:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Tony Luck, Fenghua Yu
Cc: Konrad Rzeszutek Wilk, Robin Murphy, linuxppc-dev, iommu,
linux-ia64
In-Reply-To: <20180730163824.10064-1-hch@lst.de>
This is need for powerpc for now. Hopefully we can come up with a clean
generic implementation mid-term.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
include/linux/dma-noncoherent.h | 6 ++++++
kernel/dma/Kconfig | 4 ++++
kernel/dma/noncoherent.c | 1 +
3 files changed, 11 insertions(+)
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index 10b2654d549b..61394c6e56df 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -17,6 +17,12 @@ int arch_dma_mmap(struct device *dev, struct vm_area_struct *vma,
#define arch_dma_mmap NULL
#endif /* CONFIG_DMA_NONCOHERENT_MMAP */
+#ifdef CONFIG_DMA_NONCOHERENT_GET_REQUIRED
+u64 arch_get_required_mask(struct device *dev);
+#else
+#define arch_get_required_mask NULL
+#endif
+
#ifdef CONFIG_DMA_NONCOHERENT_CACHE_SYNC
void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size,
enum dma_data_direction direction);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 9bd54304446f..b523104d6199 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -36,6 +36,10 @@ config DMA_NONCOHERENT_MMAP
bool
depends on DMA_NONCOHERENT_OPS
+config DMA_NONCOHERENT_GET_REQUIRED
+ bool
+ depends on DMA_NONCOHERENT_OPS
+
config DMA_NONCOHERENT_CACHE_SYNC
bool
depends on DMA_NONCOHERENT_OPS
diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c
index 79e9a757387f..cf4ffbe4a09d 100644
--- a/kernel/dma/noncoherent.c
+++ b/kernel/dma/noncoherent.c
@@ -98,5 +98,6 @@ const struct dma_map_ops dma_noncoherent_ops = {
.dma_supported = dma_direct_supported,
.mapping_error = dma_direct_mapping_error,
.cache_sync = arch_dma_cache_sync,
+ .get_required_mask = arch_get_required_mask,
};
EXPORT_SYMBOL(dma_noncoherent_ops);
--
2.18.0
^ permalink raw reply related
* [PATCH 05/20] swiotlb: allow the architecture to provide a get_required_mask hook
From: Christoph Hellwig @ 2018-07-30 16:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Tony Luck, Fenghua Yu
Cc: Konrad Rzeszutek Wilk, Robin Murphy, linuxppc-dev, iommu,
linux-ia64
In-Reply-To: <20180730163824.10064-1-hch@lst.de>
For now this allows consolidating the powerpc code. In the long run
we should grow a generic implementation of dma_get_required_mask that
returns the dma mask required to avoid bounce buffering.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
kernel/dma/swiotlb.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 904541055792..1bb420244753 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1084,5 +1084,9 @@ const struct dma_map_ops swiotlb_dma_ops = {
.map_page = swiotlb_map_page,
.unmap_page = swiotlb_unmap_page,
.dma_supported = dma_direct_supported,
+#ifdef swiotlb_get_required_mask
+ .get_required_mask = swiotlb_get_required_mask,
+#endif
+
};
EXPORT_SYMBOL(swiotlb_dma_ops);
--
2.18.0
^ permalink raw reply related
* [PATCH 04/20] ia64: remove get_required_mask implementation
From: Christoph Hellwig @ 2018-07-30 16:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Tony Luck, Fenghua Yu
Cc: Konrad Rzeszutek Wilk, Robin Murphy, linuxppc-dev, iommu,
linux-ia64
In-Reply-To: <20180730163824.10064-1-hch@lst.de>
ia64 can use the generic implementation in general, and SN2 can just
override it in the dma_map_ops now.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
arch/ia64/include/asm/dma-mapping.h | 2 --
arch/ia64/include/asm/machvec.h | 7 -------
arch/ia64/include/asm/machvec_init.h | 1 -
arch/ia64/include/asm/machvec_sn2.h | 2 --
arch/ia64/pci/pci.c | 26 --------------------------
arch/ia64/sn/pci/pci_dma.c | 4 ++--
6 files changed, 2 insertions(+), 40 deletions(-)
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
index 76e4d6632d68..522745ae67bb 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -10,8 +10,6 @@
#include <linux/scatterlist.h>
#include <linux/dma-debug.h>
-#define ARCH_HAS_DMA_GET_REQUIRED_MASK
-
extern const struct dma_map_ops *dma_ops;
extern struct ia64_machine_vector ia64_mv;
extern void set_iommu_machvec(void);
diff --git a/arch/ia64/include/asm/machvec.h b/arch/ia64/include/asm/machvec.h
index 267f4f170191..5133739966bc 100644
--- a/arch/ia64/include/asm/machvec.h
+++ b/arch/ia64/include/asm/machvec.h
@@ -44,7 +44,6 @@ typedef void ia64_mv_kernel_launch_event_t(void);
/* DMA-mapping interface: */
typedef void ia64_mv_dma_init (void);
-typedef u64 ia64_mv_dma_get_required_mask (struct device *);
typedef const struct dma_map_ops *ia64_mv_dma_get_ops(struct device *);
/*
@@ -127,7 +126,6 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *);
# define platform_global_tlb_purge ia64_mv.global_tlb_purge
# define platform_tlb_migrate_finish ia64_mv.tlb_migrate_finish
# define platform_dma_init ia64_mv.dma_init
-# define platform_dma_get_required_mask ia64_mv.dma_get_required_mask
# define platform_dma_get_ops ia64_mv.dma_get_ops
# define platform_irq_to_vector ia64_mv.irq_to_vector
# define platform_local_vector_to_irq ia64_mv.local_vector_to_irq
@@ -171,7 +169,6 @@ struct ia64_machine_vector {
ia64_mv_global_tlb_purge_t *global_tlb_purge;
ia64_mv_tlb_migrate_finish_t *tlb_migrate_finish;
ia64_mv_dma_init *dma_init;
- ia64_mv_dma_get_required_mask *dma_get_required_mask;
ia64_mv_dma_get_ops *dma_get_ops;
ia64_mv_irq_to_vector *irq_to_vector;
ia64_mv_local_vector_to_irq *local_vector_to_irq;
@@ -211,7 +208,6 @@ struct ia64_machine_vector {
platform_global_tlb_purge, \
platform_tlb_migrate_finish, \
platform_dma_init, \
- platform_dma_get_required_mask, \
platform_dma_get_ops, \
platform_irq_to_vector, \
platform_local_vector_to_irq, \
@@ -286,9 +282,6 @@ extern const struct dma_map_ops *dma_get_ops(struct device *);
#ifndef platform_dma_get_ops
# define platform_dma_get_ops dma_get_ops
#endif
-#ifndef platform_dma_get_required_mask
-# define platform_dma_get_required_mask ia64_dma_get_required_mask
-#endif
#ifndef platform_irq_to_vector
# define platform_irq_to_vector __ia64_irq_to_vector
#endif
diff --git a/arch/ia64/include/asm/machvec_init.h b/arch/ia64/include/asm/machvec_init.h
index 2b32fd06b7c6..2aafb69a3787 100644
--- a/arch/ia64/include/asm/machvec_init.h
+++ b/arch/ia64/include/asm/machvec_init.h
@@ -4,7 +4,6 @@
extern ia64_mv_send_ipi_t ia64_send_ipi;
extern ia64_mv_global_tlb_purge_t ia64_global_tlb_purge;
-extern ia64_mv_dma_get_required_mask ia64_dma_get_required_mask;
extern ia64_mv_irq_to_vector __ia64_irq_to_vector;
extern ia64_mv_local_vector_to_irq __ia64_local_vector_to_irq;
extern ia64_mv_pci_get_legacy_mem_t ia64_pci_get_legacy_mem;
diff --git a/arch/ia64/include/asm/machvec_sn2.h b/arch/ia64/include/asm/machvec_sn2.h
index ece9fa85be88..b5153d300289 100644
--- a/arch/ia64/include/asm/machvec_sn2.h
+++ b/arch/ia64/include/asm/machvec_sn2.h
@@ -55,7 +55,6 @@ extern ia64_mv_readb_t __sn_readb_relaxed;
extern ia64_mv_readw_t __sn_readw_relaxed;
extern ia64_mv_readl_t __sn_readl_relaxed;
extern ia64_mv_readq_t __sn_readq_relaxed;
-extern ia64_mv_dma_get_required_mask sn_dma_get_required_mask;
extern ia64_mv_dma_init sn_dma_init;
extern ia64_mv_migrate_t sn_migrate;
extern ia64_mv_kernel_launch_event_t sn_kernel_launch_event;
@@ -100,7 +99,6 @@ extern ia64_mv_pci_fixup_bus_t sn_pci_fixup_bus;
#define platform_pci_get_legacy_mem sn_pci_get_legacy_mem
#define platform_pci_legacy_read sn_pci_legacy_read
#define platform_pci_legacy_write sn_pci_legacy_write
-#define platform_dma_get_required_mask sn_dma_get_required_mask
#define platform_dma_init sn_dma_init
#define platform_migrate sn_migrate
#define platform_kernel_launch_event sn_kernel_launch_event
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index 7ccc64d5fe3e..5d71800df431 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -568,32 +568,6 @@ static void __init set_pci_dfl_cacheline_size(void)
pci_dfl_cache_line_size = (1 << cci.pcci_line_size) / 4;
}
-u64 ia64_dma_get_required_mask(struct device *dev)
-{
- u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
- u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT));
- u64 mask;
-
- if (!high_totalram) {
- /* convert to mask just covering totalram */
- low_totalram = (1 << (fls(low_totalram) - 1));
- low_totalram += low_totalram - 1;
- mask = low_totalram;
- } else {
- high_totalram = (1 << (fls(high_totalram) - 1));
- high_totalram += high_totalram - 1;
- mask = (((u64)high_totalram) << 32) + 0xffffffff;
- }
- return mask;
-}
-EXPORT_SYMBOL_GPL(ia64_dma_get_required_mask);
-
-u64 dma_get_required_mask(struct device *dev)
-{
- return platform_dma_get_required_mask(dev);
-}
-EXPORT_SYMBOL_GPL(dma_get_required_mask);
-
static int __init pcibios_init(void)
{
set_pci_dfl_cacheline_size();
diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c
index 74c934a997bb..96eb2567718a 100644
--- a/arch/ia64/sn/pci/pci_dma.c
+++ b/arch/ia64/sn/pci/pci_dma.c
@@ -344,11 +344,10 @@ static int sn_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
return 0;
}
-u64 sn_dma_get_required_mask(struct device *dev)
+static u64 sn_dma_get_required_mask(struct device *dev)
{
return DMA_BIT_MASK(64);
}
-EXPORT_SYMBOL_GPL(sn_dma_get_required_mask);
char *sn_pci_get_legacy_mem(struct pci_bus *bus)
{
@@ -473,6 +472,7 @@ static struct dma_map_ops sn_dma_ops = {
.sync_sg_for_device = sn_dma_sync_sg_for_device,
.mapping_error = sn_dma_mapping_error,
.dma_supported = sn_dma_supported,
+ .get_required_mask = sn_dma_get_required_mask,
};
void sn_dma_init(void)
--
2.18.0
^ permalink raw reply related
* [PATCH 03/20] dma-mapping: make the get_required_mask method available unconditionally
From: Christoph Hellwig @ 2018-07-30 16:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Tony Luck, Fenghua Yu
Cc: Konrad Rzeszutek Wilk, Robin Murphy, linuxppc-dev, iommu,
linux-ia64
In-Reply-To: <20180730163824.10064-1-hch@lst.de>
This save some duplication for ia64. In the long run this method will
need some additional work including moving over to kernel/dma, but that
will require some additional prep work, so let's do this minimal change
for now.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/base/platform.c | 11 ++++++++++-
include/linux/dma-mapping.h | 2 --
2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index dff82a3c2caa..921ddb0c051b 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -1180,7 +1180,7 @@ int __init platform_bus_init(void)
}
#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK
-u64 dma_get_required_mask(struct device *dev)
+static u64 default_dma_get_required_mask(struct device *dev)
{
u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT));
@@ -1198,6 +1198,15 @@ u64 dma_get_required_mask(struct device *dev)
}
return mask;
}
+
+u64 dma_get_required_mask(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (ops->get_required_mask)
+ return ops->get_required_mask(dev);
+ return default_dma_get_required_mask(dev);
+}
EXPORT_SYMBOL_GPL(dma_get_required_mask);
#endif
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f9cc309507d9..00d3065e1510 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -130,9 +130,7 @@ struct dma_map_ops {
enum dma_data_direction direction);
int (*mapping_error)(struct device *dev, dma_addr_t dma_addr);
int (*dma_supported)(struct device *dev, u64 mask);
-#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
u64 (*get_required_mask)(struct device *dev);
-#endif
};
extern const struct dma_map_ops dma_direct_ops;
--
2.18.0
^ permalink raw reply related
* [PATCH 01/20] kernel/dma/direct: take DMA offset into account in dma_direct_supported
From: Christoph Hellwig @ 2018-07-30 16:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Tony Luck, Fenghua Yu
Cc: Konrad Rzeszutek Wilk, Robin Murphy, linuxppc-dev, iommu,
linux-ia64
In-Reply-To: <20180730163824.10064-1-hch@lst.de>
When a device has a DMA offset the dma capable result will change due
to the difference between the physical and DMA address. Take that into
account.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
kernel/dma/direct.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 8be8106270c2..d32d4f0d2c0c 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -167,7 +167,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
int dma_direct_supported(struct device *dev, u64 mask)
{
#ifdef CONFIG_ZONE_DMA
- if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
+ if (mask < phys_to_dma(dev, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)))
return 0;
#else
/*
@@ -176,14 +176,14 @@ int dma_direct_supported(struct device *dev, u64 mask)
* memory, or by providing a ZONE_DMA32. If neither is the case, the
* architecture needs to use an IOMMU instead of the direct mapping.
*/
- if (mask < DMA_BIT_MASK(32))
+ if (mask < phys_to_dma(dev, DMA_BIT_MASK(32)))
return 0;
#endif
/*
* Various PCI/PCIe bridges have broken support for > 32bit DMA even
* if the device itself might support it.
*/
- if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32))
+ if (dev->dma_32bit_limit && mask > phys_to_dma(dev, DMA_BIT_MASK(32)))
return 0;
return 1;
}
--
2.18.0
^ permalink raw reply related
* use generic DMA mapping code in powerpc
From: Christoph Hellwig @ 2018-07-30 16:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Tony Luck, Fenghua Yu
Cc: Konrad Rzeszutek Wilk, Robin Murphy, linuxppc-dev, iommu,
linux-ia64
Hi all,
this series switches the powerpc port to use the generic swiotlb
and noncoherent dma ops, and to use more generic code for the
coherent direct mapping, as well as removing dead code.
^ permalink raw reply
* [PATCH 02/20] kernel/dma/direct: refine dma_direct_alloc zone selection
From: Christoph Hellwig @ 2018-07-30 16:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Tony Luck, Fenghua Yu
Cc: Konrad Rzeszutek Wilk, Robin Murphy, linuxppc-dev, iommu,
linux-ia64
In-Reply-To: <20180730163824.10064-1-hch@lst.de>
We need to take the DMA offset and encryption bit into account when selecting
a zone. Add a helper that takes those into account and use it.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
kernel/dma/direct.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index d32d4f0d2c0c..c2c1df8827f2 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -58,6 +58,14 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
return addr + size - 1 <= dev->coherent_dma_mask;
}
+static bool dma_coherent_below(struct device *dev, u64 mask)
+{
+ dma_addr_t addr = force_dma_unencrypted() ?
+ __phys_to_dma(dev, mask) : phys_to_dma(dev, mask);
+
+ return dev->coherent_dma_mask <= addr;
+}
+
void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp, unsigned long attrs)
{
@@ -70,9 +78,9 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp &= ~__GFP_ZERO;
/* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */
- if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
+ if (dma_coherent_below(dev, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)))
gfp |= GFP_DMA;
- if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA))
+ if (dma_coherent_below(dev, DMA_BIT_MASK(32) && !(gfp & GFP_DMA)))
gfp |= GFP_DMA32;
again:
@@ -92,14 +100,14 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
page = NULL;
if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
- dev->coherent_dma_mask < DMA_BIT_MASK(64) &&
+ dma_coherent_below(dev, DMA_BIT_MASK(64)) &&
!(gfp & (GFP_DMA32 | GFP_DMA))) {
gfp |= GFP_DMA32;
goto again;
}
if (IS_ENABLED(CONFIG_ZONE_DMA) &&
- dev->coherent_dma_mask < DMA_BIT_MASK(32) &&
+ dma_coherent_below(dev, DMA_BIT_MASK(32)) &&
!(gfp & GFP_DMA)) {
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
--
2.18.0
^ permalink raw reply related
* Re: [PATCH v2 04/10] powerpc/traps: Use REG_FMT in show_signal_msg()
From: LEROY Christophe @ 2018-07-30 16:30 UTC (permalink / raw)
To: Murilo Opsfelder Araujo
Cc: linux-kernel, Alastair D'Silva, Andrew Donnellan,
Balbir Singh, Benjamin Herrenschmidt, Cyril Bur,
Eric W . Biederman, Michael Ellerman, Michael Neuling,
Nicholas Piggin, Paul Mackerras, Simon Guo, Sukadev Bhattiprolu,
Tobin C . Harding, linuxppc-dev
In-Reply-To: <20180730152838.GA23704@kermit-br-ibm-com>
Murilo Opsfelder Araujo <muriloo@linux.ibm.com> a =C3=A9crit=C2=A0:
> Hi, Christophe.
>
> On Fri, Jul 27, 2018 at 06:40:23PM +0200, LEROY Christophe wrote:
>> Murilo Opsfelder Araujo <muriloo@linux.ibm.com> a =C3=A9crit=C2=A0:
>>
>> > Simplify the message format by using REG_FMT as the register format. =
This
>> > avoids having two different formats and avoids checking for MSR_64BIT.
>>
>> Are you sure it is what we want ?
>
> Yes.
>
>> Won't it change the behaviour for a 32 bits app running on a 64bits kern=
el ?
>
> In fact, this changes how many zeroes are prefixed when displaying=20=20
>=20the registers
> (%016lx vs. %08lx format). For example, 32-bits userspace, 64-bits kerne=
l:
Indeed that's what I suspected. What is the real benefit of this=20=20
change=20? Why not keep the current format for 32bits userspace ? All=20=20
those=20leading zeroes are pointless to me.
>
> before this series:
> [66475.002900] segv[4599]: unhandled signal 11 at 00000000 nip=20=20
>=2010000420 lr 0fe61854 code 1
>
> after this series:
> [ 73.414535] segv[3759]: segfault (11) at 0000000000000000 nip=20=20
>=200000000010000420 lr 000000000fe61854 code 1 in segv[10000000+10000]
> [ 73.414641] segv[3759]: code: 4e800421 80010014 38210010=20=20
>=207c0803a6 4bffff30 9421ffd0 93e1002c 7c3f0b78
> [ 73.414665] segv[3759]: code: 39200000 913f001c 813f001c=20=20
>=2039400001 <91490000> 39200000 7d234b78 397f0030
>
> Have you spotted any other behaviour change?
Not as of today
Christophe
>
> Cheers
> Murilo
^ permalink raw reply
* Re: [RFC PATCH kernel 0/5] powerpc/P9/vfio: Pass through NVIDIA Tesla V100
From: Alex Williamson @ 2018-07-30 16:29 UTC (permalink / raw)
To: Alexey Kardashevskiy
Cc: Benjamin Herrenschmidt, linuxppc-dev, David Gibson, kvm-ppc,
Ram Pai, kvm, Alistair Popple
In-Reply-To: <4089d9ed-0c6b-3ba2-d447-049ce5b98764@ozlabs.ru>
On Mon, 30 Jul 2018 18:58:49 +1000
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> On 11/07/2018 19:26, Alexey Kardashevskiy wrote:
> > On Tue, 10 Jul 2018 16:37:15 -0600
> > Alex Williamson <alex.williamson@redhat.com> wrote:
> >
> >> On Tue, 10 Jul 2018 14:10:20 +1000
> >> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> >>
> >>> On Thu, 7 Jun 2018 23:03:23 -0600
> >>> Alex Williamson <alex.williamson@redhat.com> wrote:
> >>>
> >>>> On Fri, 8 Jun 2018 14:14:23 +1000
> >>>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> >>>>
> >>>>> On 8/6/18 1:44 pm, Alex Williamson wrote:
> >>>>>> On Fri, 8 Jun 2018 13:08:54 +1000
> >>>>>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> >>>>>>
> >>>>>>> On 8/6/18 8:15 am, Alex Williamson wrote:
> >>>>>>>> On Fri, 08 Jun 2018 07:54:02 +1000
> >>>>>>>> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> >>>>>>>>
> >>>>>>>>> On Thu, 2018-06-07 at 11:04 -0600, Alex Williamson wrote:
> >>>>>>>>>>
> >>>>>>>>>> Can we back up and discuss whether the IOMMU grouping of NVLink
> >>>>>>>>>> connected devices makes sense? AIUI we have a PCI view of these
> >>>>>>>>>> devices and from that perspective they're isolated. That's the view of
> >>>>>>>>>> the device used to generate the grouping. However, not visible to us,
> >>>>>>>>>> these devices are interconnected via NVLink. What isolation properties
> >>>>>>>>>> does NVLink provide given that its entire purpose for existing seems to
> >>>>>>>>>> be to provide a high performance link for p2p between devices?
> >>>>>>>>>
> >>>>>>>>> Not entire. On POWER chips, we also have an nvlink between the device
> >>>>>>>>> and the CPU which is running significantly faster than PCIe.
> >>>>>>>>>
> >>>>>>>>> But yes, there are cross-links and those should probably be accounted
> >>>>>>>>> for in the grouping.
> >>>>>>>>
> >>>>>>>> Then after we fix the grouping, can we just let the host driver manage
> >>>>>>>> this coherent memory range and expose vGPUs to guests? The use case of
> >>>>>>>> assigning all 6 GPUs to one VM seems pretty limited. (Might need to
> >>>>>>>> convince NVIDIA to support more than a single vGPU per VM though)
> >>>>>>>
> >>>>>>> These are physical GPUs, not virtual sriov-alike things they are
> >>>>>>> implementing as well elsewhere.
> >>>>>>
> >>>>>> vGPUs as implemented on M- and P-series Teslas aren't SR-IOV like
> >>>>>> either. That's why we have mdev devices now to implement software
> >>>>>> defined devices. I don't have first hand experience with V-series, but
> >>>>>> I would absolutely expect a PCIe-based Tesla V100 to support vGPU.
> >>>>>
> >>>>> So assuming V100 can do vGPU, you are suggesting ditching this patchset and
> >>>>> using mediated vGPUs instead, correct?
> >>>>
> >>>> If it turns out that our PCIe-only-based IOMMU grouping doesn't
> >>>> account for lack of isolation on the NVLink side and we correct that,
> >>>> limiting assignment to sets of 3 interconnected GPUs, is that still a
> >>>> useful feature? OTOH, it's entirely an NVIDIA proprietary decision
> >>>> whether they choose to support vGPU on these GPUs or whether they can
> >>>> be convinced to support multiple vGPUs per VM.
> >>>>
> >>>>>>> My current understanding is that every P9 chip in that box has some NVLink2
> >>>>>>> logic on it so each P9 is directly connected to 3 GPUs via PCIe and
> >>>>>>> 2xNVLink2, and GPUs in that big group are interconnected by NVLink2 links
> >>>>>>> as well.
> >>>>>>>
> >>>>>>> From small bits of information I have it seems that a GPU can perfectly
> >>>>>>> work alone and if the NVIDIA driver does not see these interconnects
> >>>>>>> (because we do not pass the rest of the big 3xGPU group to this guest), it
> >>>>>>> continues with a single GPU. There is an "nvidia-smi -r" big reset hammer
> >>>>>>> which simply refuses to work until all 3 GPUs are passed so there is some
> >>>>>>> distinction between passing 1 or 3 GPUs, and I am trying (as we speak) to
> >>>>>>> get a confirmation from NVIDIA that it is ok to pass just a single GPU.
> >>>>>>>
> >>>>>>> So we will either have 6 groups (one per GPU) or 2 groups (one per
> >>>>>>> interconnected group).
> >>>>>>
> >>>>>> I'm not gaining much confidence that we can rely on isolation between
> >>>>>> NVLink connected GPUs, it sounds like you're simply expecting that
> >>>>>> proprietary code from NVIDIA on a proprietary interconnect from NVIDIA
> >>>>>> is going to play nice and nobody will figure out how to do bad things
> >>>>>> because... obfuscation? Thanks,
> >>>>>
> >>>>> Well, we already believe that a proprietary firmware of a sriov-capable
> >>>>> adapter like Mellanox ConnextX is not doing bad things, how is this
> >>>>> different in principle?
> >>>>
> >>>> It seems like the scope and hierarchy are different. Here we're
> >>>> talking about exposing big discrete devices, which are peers of one
> >>>> another (and have history of being reverse engineered), to userspace
> >>>> drivers. Once handed to userspace, each of those devices needs to be
> >>>> considered untrusted. In the case of SR-IOV, we typically have a
> >>>> trusted host driver for the PF managing untrusted VFs. We do rely on
> >>>> some sanity in the hardware/firmware in isolating the VFs from each
> >>>> other and from the PF, but we also often have source code for Linux
> >>>> drivers for these devices and sometimes even datasheets. Here we have
> >>>> neither of those and perhaps we won't know the extent of the lack of
> >>>> isolation between these devices until nouveau (best case) or some
> >>>> exploit (worst case) exposes it. IOMMU grouping always assumes a lack
> >>>> of isolation between devices unless the hardware provides some
> >>>> indication that isolation exists, for example ACS on PCIe. If NVIDIA
> >>>> wants to expose isolation on NVLink, perhaps they need to document
> >>>> enough of it that the host kernel can manipulate and test for isolation,
> >>>> perhaps even enabling virtualization of the NVLink interconnect
> >>>> interface such that the host can prevent GPUs from interfering with
> >>>> each other. Thanks,
> >>>
> >>>
> >>> So far I got this from NVIDIA:
> >>>
> >>> 1. An NVLink2 state can be controlled via MMIO registers, there is a
> >>> "NVLINK ISOLATION ON MULTI-TENANT SYSTEMS" spec (my copy is
> >>> "confidential" though) from NVIDIA with the MMIO addresses to block if
> >>> we want to disable certain links. In order to NVLink to work it needs to
> >>> be enabled on both sides so by filtering certains MMIO ranges we can
> >>> isolate a GPU.
> >>
> >> Where are these MMIO registers, on the bridge or on the endpoint device?
> >
> > The endpoint GPU device.
> >
> >> I'm wondering when you say block MMIO if these are ranges on the device
> >> that we disallow mmap to and all the overlapping PAGE_SIZE issues that
> >> come with that or if this should essentially be device specific
> >> enable_acs and acs_enabled quirks, and maybe also potentially used by
> >> Logan's disable acs series to allow GPUs to be linked and have grouping
> >> to match.
> >
> > An update, I confused P100 and V100, P100 would need filtering but
> > ours is V100 and it has a couple of registers which we can use to
> > disable particular links and once disabled, the link cannot be
> > re-enabled till the next secondary bus reset.
> >
> >
> >>> 2. We can and should also prohibit the GPU firmware update, this is
> >>> done via MMIO as well. The protocol is not open but at least register
> >>> ranges might be in order to filter these accesses, and there is no
> >>> plan to change this.
> >>
> >> I assume this MMIO is on the endpoint and has all the PAGE_SIZE joys
> >> along with it.
> >
> > Yes, however NVIDIA says there is no performance critical stuff with
> > this 64K page.
> >
> >> Also, there are certainly use cases of updating
> >> firmware for an assigned device, we don't want to impose a policy, but
> >> we should figure out the right place for that policy to be specified by
> >> the admin.
> >
> > May be but NVIDIA is talking about some "out-of-band" command to the GPU
> > to enable firmware update so firmware update is not really supported.
> >
> >
> >>> 3. DMA trafic over the NVLink2 link can be of 2 types: UT=1 for
> >>> PCI-style DMA via our usual TCE tables (one per a NVLink2 link),
> >>> and UT=0 for direct host memory access. UT stands for "use
> >>> translation" and this is a part of the NVLink2 protocol. Only UT=1 is
> >>> possible over the PCIe link.
> >>> This UT=0 trafic uses host physical addresses returned by a nest MMU (a
> >>> piece of NVIDIA logic on a POWER9 chip), this takes LPID (guest id),
> >>> mmu context id (guest userspace mm id), a virtual address and translates
> >>> to the host physical and that result is used for UT=0 DMA, this is
> >>> called "ATS" although it is not PCIe ATS afaict.
> >>> NVIDIA says that the hardware is designed in a way that it can only do
> >>> DMA UT=0 to addresses which ATS translated to, and there is no way to
> >>> override this behavior and this is what guarantees the isolation.
> >>
> >> I'm kinda lost here, maybe we can compare it to PCIe ATS where an
> >> endpoint requests a translation of an IOVA to physical address, the
> >> IOMMU returns a lookup based on PCIe requester ID, and there's an
> >> invalidation protocol to keep things coherent.
> >
> > Yes there is. The current approach is to have an MMU notifier in
> > the kernel which tells an NPU (IBM piece of logic between GPU/NVlink2
> > and NVIDIA nest MMU) to invalidate translations and that in turn pokes
> > the GPU till that confirms that it invalidated tlbs and there is no
> > ongoing DMA.
> >
> >> In the case above, who provides a guest id and mmu context id?
> >
> > We (powerpc/powernv platform) configure NPU to bind specific bus:dev:fn to
> > an LPID (== guest id) and MMU context id comes from the guest. The nest
> > MMU knows where the partition table and this table contains all the
> > pointers needs for the translation.
> >
> >
> >> Additional software
> >> somewhere? Is the virtual address an IOVA or a process virtual
> >> address?
> >
> > A guest kernel or a guest userspace virtual address.
> >
> >> Do we assume some sort of invalidation protocol as well?
> >
> > I am little confused, is this question about the same invalidation
> > protocol as above or different?
> >
> >
> >>> So isolation can be achieved if I do not miss something.
> >>>
> >>> How do we want this to be documented to proceed? I assume if I post
> >>> patches filtering MMIOs, this won't do it, right? If just 1..3 are
> >>> documented, will we take this t&c or we need a GPU API spec (which is
> >>> not going to happen anyway)?
> >>
> >> "t&c"? I think we need what we're actually interacting with to be well
> >> documented, but that could be _thorough_ comments in the code, enough
> >> to understand the theory of operation, as far as I'm concerned. A pdf
> >> lost on a corporate webserver isn't necessarily an improvement over
> >> that, but there needs to be sufficient detail to understand what we're
> >> touching such that we can maintain, adapt, and improve the code over
> >> time. Only item #3 above appears POWER specific, so I'd hope that #1
> >> is done in the PCI subsystem, #2 might be a QEMU option (maybe kernel
> >> vfio-pci, but I'm not sure that's necessary), and I don't know where #3
> >> goes. Thanks,
> >
> > Ok, understood. Thanks!
>
> After some local discussions, it was pointed out that force disabling
> nvlinks won't bring us much as for an nvlink to work, both sides need to
> enable it so malicious guests cannot penetrate good ones (or a host)
> unless a good guest enabled the link but won't happen with a well
> behaving guest. And if two guests became malicious, then can still only
> harm each other, and so can they via other ways such network. This is
> different from PCIe as once PCIe link is unavoidably enabled, a well
> behaving device cannot firewall itself from peers as it is up to the
> upstream bridge(s) now to decide the routing; with nvlink2, a GPU still
> has means to protect itself, just like a guest can run "firewalld" for
> network.
>
> Although it would be a nice feature to have an extra barrier between
> GPUs, is inability to block the links in hypervisor still a blocker for
> V100 pass through?
How is the NVLink configured by the guest, is it 'on'/'off' or are
specific routes configured? If the former, then isn't a non-malicious
guest still susceptible to a malicious guest? If the latter, how is
routing configured by the guest given that the guest view of the
topology doesn't match physical hardware? Are these routes
deconfigured by device reset? Are they part of the save/restore
state? Thanks,
Alex
^ permalink raw reply
* Re: [PATCH 3/3] ptp_qoriq: convert to use module parameters for initialization
From: David Miller @ 2018-07-30 16:25 UTC (permalink / raw)
To: yangbo.lu
Cc: netdev, madalin.bucur, richardcochran, robh+dt, shawnguo,
devicetree, linuxppc-dev, linux-arm-kernel, linux-kernel
In-Reply-To: <20180730100154.27906-3-yangbo.lu@nxp.com>
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Mon, 30 Jul 2018 18:01:54 +0800
> +static unsigned int cksel = DEFAULT_CKSEL;
> +module_param(cksel, uint, 0644);
> +MODULE_PARM_DESC(cksel, "Select reference clock");
> +
> +static unsigned int clk_src;
> +module_param(clk_src, uint, 0644);
> +MODULE_PARM_DESC(clk_src, "Reference clock frequency (if clocks property not provided in dts)");
> +
> +static unsigned int tmr_prsc = 2;
> +module_param(tmr_prsc, uint, 0644);
> +MODULE_PARM_DESC(tmr_prsc, "Output clock division/prescale factor");
> +
> +static unsigned int tmr_fiper1 = 1000000000;
> +module_param(tmr_fiper1, uint, 0644);
> +MODULE_PARM_DESC(tmr_fiper1, "Desired fixed interval pulse period (ns)");
> +
> +static unsigned int tmr_fiper2 = 100000;
> +module_param(tmr_fiper2, uint, 0644);
> +MODULE_PARM_DESC(tmr_fiper2, "Desired fixed interval pulse period (ns)");
Sorry, there is no way I am every applying something like this. Module
parameters are to be avoided at all costs.
And you don't need it here, you have DTS, please use it.
You are required to support the existing DTS cases, in order to
avoid breaking things, anyways.
^ permalink raw reply
* Re: [PATCH v2 04/10] powerpc/traps: Use REG_FMT in show_signal_msg()
From: Murilo Opsfelder Araujo @ 2018-07-30 15:28 UTC (permalink / raw)
To: LEROY Christophe
Cc: linuxppc-dev, Tobin C . Harding, Sukadev Bhattiprolu, Simon Guo,
Paul Mackerras, Nicholas Piggin, Michael Neuling,
Michael Ellerman, Eric W . Biederman, Cyril Bur,
Benjamin Herrenschmidt, Balbir Singh, Andrew Donnellan,
Alastair D'Silva, linux-kernel
In-Reply-To: <20180727184023.Horde.KRXPzZpG18uxt_B9sy_FBg5@messagerie.si.c-s.fr>
Hi, Christophe.
On Fri, Jul 27, 2018 at 06:40:23PM +0200, LEROY Christophe wrote:
> Murilo Opsfelder Araujo <muriloo@linux.ibm.com> a écrit :
>
> > Simplify the message format by using REG_FMT as the register format. This
> > avoids having two different formats and avoids checking for MSR_64BIT.
>
> Are you sure it is what we want ?
Yes.
> Won't it change the behaviour for a 32 bits app running on a 64bits kernel ?
In fact, this changes how many zeroes are prefixed when displaying the registers
(%016lx vs. %08lx format). For example, 32-bits userspace, 64-bits kernel:
before this series:
[66475.002900] segv[4599]: unhandled signal 11 at 00000000 nip 10000420 lr 0fe61854 code 1
after this series:
[ 73.414535] segv[3759]: segfault (11) at 0000000000000000 nip 0000000010000420 lr 000000000fe61854 code 1 in segv[10000000+10000]
[ 73.414641] segv[3759]: code: 4e800421 80010014 38210010 7c0803a6 4bffff30 9421ffd0 93e1002c 7c3f0b78
[ 73.414665] segv[3759]: code: 39200000 913f001c 813f001c 39400001 <91490000> 39200000 7d234b78 397f0030
Have you spotted any other behaviour change?
Cheers
Murilo
^ permalink raw reply
* Re: [PATCH v3 1/1] powerpc/pseries: fix EEH recovery of some IOV devices
From: Bryant G. Ly @ 2018-07-30 15:07 UTC (permalink / raw)
To: Sam Bobroff, linuxppc-dev, linux-pci; +Cc: mpe, bhelgaas, bryantly
In-Reply-To: <e63eb03c87a1a54257aa2bcc384cf07761824a7e.1532915951.git.sbobroff@linux.ibm.com>
On 7/29/18 8:59 PM, Sam Bobroff wrote:
> EEH recovery currently fails on pSeries for some IOV capable PCI
> devices, if CONFIG_PCI_IOV is on and the hypervisor doesn't provide
> certain device tree properties for the device. (Found on an IOV
> capable device using the ipr driver.)
>
> Recovery fails in pci_enable_resources() at the check on r->parent,
> because r->flags is set and r->parent is not. This state is due to
> sriov_init() setting the start, end and flags members of the IOV BARs
> but the parent not being set later in
> pseries_pci_fixup_iov_resources(), because the
> "ibm,open-sriov-vf-bar-info" property is missing.
>
> Correct this by zeroing the resource flags for IOV BARs when they
> can't be configured (this is the same method used by sriov_init() and
> __pci_read_base()).
>
> VFs cleared this way can't be enabled later, because that requires
> another device tree property, "ibm,number-of-configurable-vfs" as well
> as support for the RTAS function "ibm_map_pes". These are all part of
> hypervisor support for IOV and it seems unlikely that a hypervisor
> would ever partially, but not fully, support it. (None are currently
> provided by QEMU/KVM.)
>
> Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
> ---
> Hi,
>
> This is a fix to allow EEH recovery to succeed in a specific situation,
> which I've tried to explain in the commit message.
>
> As with the RFC version, the IOV BARs are disabled by setting the resource
> flags to 0 but the other fields are now left as-is because that is what is done
> elsewhere (see sriov_init() and __pci_read_base()).
>
> I've also examined the concern raised by Bjorn Helgaas, that VFs could be
> enabled later after the BARs are disabled, and it already seems safe: enabling
> VFs (on pseries) depends on another device tree property,
> "ibm,number-of-configurable-vfs" as well as support for the RTAS function
> "ibm_map_pes". Since these are all part of the hypervisor's support for IOV it
> seems unlikely that we would ever see some of them but not all. (None are
> currently provided by QEMU/KVM.) (Additionally, the ipr driver on which the EEH
> recovery failure was discovered doesn't even seem to have SR-IOV support so it
> certainly can't enable VFs.)
>
> Cheers,
> Sam.
>
> Patch set v3:
> Patch 1/1: powerpc/pseries: fix EEH recovery of some IOV devices
> * Moved some useful information from the cover letter to the commit log.
>
> Patch set v2:
> Patch 1/1: powerpc/pseries: fix EEH recovery of some IOV devices
> * Moved the BAR disabling code to a function.
> * Also check in pseries_pci_fixup_resources().
>
> Patch set v1:
> Patch 1/1: powerpc/pseries: fix EEH recovery of IOV devices
>
> arch/powerpc/platforms/pseries/setup.c | 25 +++++++++++++++++--------
> 1 file changed, 17 insertions(+), 8 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index b55ad4286dc7..0a9e4243ae1d 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -645,6 +645,15 @@ void of_pci_parse_iov_addrs(struct pci_dev *dev, const int *indexes)
> }
> }
Reviewed-by: Bryant G. Ly <bryantly@linux.ibm.com>
^ permalink raw reply
* Re: [PATCH 3/3] ptp_qoriq: convert to use module parameters for initialization
From: Richard Cochran @ 2018-07-30 14:30 UTC (permalink / raw)
To: Yangbo Lu
Cc: netdev, madalin.bucur, Rob Herring, Shawn Guo, David S . Miller,
devicetree, linuxppc-dev, linux-arm-kernel, linux-kernel
In-Reply-To: <20180730100154.27906-3-yangbo.lu@nxp.com>
On Mon, Jul 30, 2018 at 06:01:54PM +0800, Yangbo Lu wrote:
> The ptp_qoriq driver initialized the 1588 timer with the
> configurations provided by the properties of device tree
> node. For example,
>
> fsl,tclk-period = <5>;
> fsl,tmr-prsc = <2>;
> fsl,tmr-add = <0xaaaaaaab>;
> fsl,tmr-fiper1 = <999999995>;
> fsl,tmr-fiper2 = <99990>;
> fsl,max-adj = <499999999>;
>
> These things actually were runtime configurations which
> were not proper to be put into dts.
That is debatable. While I agree that the dts isn't ideal for these,
still it is the lesser of two or more evils.
> This patch is to convert
> to use module parameters for 1588 timer initialization, and
> to support initial register values calculation.
It is hard for me to understand how using module parameters improves
the situation.
> If the parameters are not provided, the driver will calculate
> register values with a set of default parameters. With this
> patch, those dts properties are no longer needed for new
> platform to support 1588 timer, and many QorIQ DPAA platforms
> (some P series and T series platforms of PowerPC, and some
> LS series platforms of ARM64) could use this driver for their
> fman ptp timer with default module parameters. However, this
> patch didn't remove the dts method. Because there were still
> many old platforms using the dts method. We need to clean up
> their dts files, verify module parameters on them, and convert
> them to the new method gradually in case of breaking any
> function.
In addition, like it or not, because the dts is an ABI, you must
continue support of the dts values as a legacy option.
Thanks,
Richard
^ permalink raw reply
* Re: [PATCH 3/3] powerpc/pseries/mm: call H_BLOCK_REMOVE
From: Aneesh Kumar K.V @ 2018-07-30 14:22 UTC (permalink / raw)
To: Michael Ellerman, Laurent Dufour, linuxppc-dev, linux-kernel
Cc: benh, paulus, npiggin
In-Reply-To: <877elcj0oa.fsf@concordia.ellerman.id.au>
Michael Ellerman <mpe@ellerman.id.au> writes:
> Hi Laurent,
>
> Just one comment below.
>
> Laurent Dufour <ldufour@linux.vnet.ibm.com> writes:
>> diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platfo=
rms/pseries/lpar.c
>> index 96b8cd8a802d..41ed03245eb4 100644
>> --- a/arch/powerpc/platforms/pseries/lpar.c
>> +++ b/arch/powerpc/platforms/pseries/lpar.c
>> @@ -418,6 +418,73 @@ static void pSeries_lpar_hpte_invalidate(unsigned l=
ong slot, unsigned long vpn,
>> BUG_ON(lpar_rc !=3D H_SUCCESS);
>> }
>>=20=20
>> +
>> +/*
>> + * As defined in the PAPR's section 14.5.4.1.8
>> + * The control mask doesn't include the returned reference and change b=
it from
>> + * the processed PTE.
>> + */
>> +#define HBLKR_AVPN 0x0100000000000000UL
>> +#define HBLKR_CTRL_MASK 0xf800000000000000UL
>> +#define HBLKR_CTRL_SUCCESS 0x8000000000000000UL
>> +#define HBLKR_CTRL_ERRNOTFOUND 0x8800000000000000UL
>> +#define HBLKR_CTRL_ERRBUSY 0xa000000000000000UL
>> +
>> +/**
>> + * H_BLOCK_REMOVE caller.
>> + * @idx should point to the latest @param entry set with a PTEX.
>> + * If PTE cannot be processed because another CPUs has already locked t=
hat
>> + * group, those entries are put back in @param starting at index 1.
>> + * If entries has to be retried and @retry_busy is set to true, these e=
ntries
>> + * are retried until success. If @retry_busy is set to false, the retur=
ned
>> + * is the number of entries yet to process.
>> + */
>> +static unsigned long call_block_remove(unsigned long idx, unsigned long=
*param,
>> + bool retry_busy)
>> +{
>> + unsigned long i, rc, new_idx;
>> + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
>> +
>> +again:
>> + new_idx =3D 0;
>> + BUG_ON((idx < 2) || (idx > PLPAR_HCALL9_BUFSIZE));
>
> I count 1 ..
>
>> + if (idx < PLPAR_HCALL9_BUFSIZE)
>> + param[idx] =3D HBR_END;
>> +
>> + rc =3D plpar_hcall9(H_BLOCK_REMOVE, retbuf,
>> + param[0], /* AVA */
>> + param[1], param[2], param[3], param[4], /* TS0-7 */
>> + param[5], param[6], param[7], param[8]);
>> + if (rc =3D=3D H_SUCCESS)
>> + return 0;
>> +
>> + BUG_ON(rc !=3D H_PARTIAL);
>
> 2 ...
>
>> + /* Check that the unprocessed entries were 'not found' or 'busy' */
>> + for (i =3D 0; i < idx-1; i++) {
>> + unsigned long ctrl =3D retbuf[i] & HBLKR_CTRL_MASK;
>> +
>> + if (ctrl =3D=3D HBLKR_CTRL_ERRBUSY) {
>> + param[++new_idx] =3D param[i+1];
>> + continue;
>> + }
>> +
>> + BUG_ON(ctrl !=3D HBLKR_CTRL_SUCCESS
>> + && ctrl !=3D HBLKR_CTRL_ERRNOTFOUND);
>
> 3 ...
>
> BUG_ON()s.
>
> I know the code in this file is already pretty liberal with the use of
> BUG_ON() but I'd prefer if we don't make it any worse.
>
> Given this is an optimisation it seems like we should be able to fall
> back to the existing implementation in the case of error (which will
> probably then BUG_ON() =F0=9F=98=82)
>
> If there's some reason we can't then I guess I can live with it.
It would be nice to log the error in case we are not expecting the
error return. We recently did
https://marc.info/?i=3D20180629083904.29250-1-aneesh.kumar@linux.ibm.com
-aneesh
^ permalink raw reply
* Re: [PATCH 2/3] powerpc/pseries/mm: factorize PTE slot computation
From: Aneesh Kumar K.V @ 2018-07-30 14:19 UTC (permalink / raw)
To: Laurent Dufour, linuxppc-dev, linux-kernel; +Cc: mpe, benh, paulus, npiggin
In-Reply-To: <1532699493-10883-3-git-send-email-ldufour@linux.vnet.ibm.com>
Laurent Dufour <ldufour@linux.vnet.ibm.com> writes:
> This part of code will be called also when dealing with H_BLOCK_REMOVE.
>
> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
> arch/powerpc/platforms/pseries/lpar.c | 27 ++++++++++++++++++++-------
> 1 file changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
> index 52eeff1297f4..96b8cd8a802d 100644
> --- a/arch/powerpc/platforms/pseries/lpar.c
> +++ b/arch/powerpc/platforms/pseries/lpar.c
> @@ -547,6 +547,24 @@ static int pSeries_lpar_hpte_removebolted(unsigned long ea,
> return 0;
> }
>
> +
> +static inline unsigned long compute_slot(real_pte_t pte,
> + unsigned long vpn,
> + unsigned long index,
> + unsigned long shift,
> + int ssize)
> +{
> + unsigned long slot, hash, hidx;
> +
> + hash = hpt_hash(vpn, shift, ssize);
> + hidx = __rpte_to_hidx(pte, index);
> + if (hidx & _PTEIDX_SECONDARY)
> + hash = ~hash;
> + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
> + slot += hidx & _PTEIDX_GROUP_IX;
> + return slot;
> +}
> +
> /*
> * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
> * lock.
> @@ -559,7 +577,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
> struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
> int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
> unsigned long param[PLPAR_HCALL9_BUFSIZE];
> - unsigned long hash, index, shift, hidx, slot;
> + unsigned long index, shift, slot;
> real_pte_t pte;
> int psize, ssize;
>
> @@ -573,12 +591,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
> vpn = batch->vpn[i];
> pte = batch->pte[i];
> pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
> - hash = hpt_hash(vpn, shift, ssize);
> - hidx = __rpte_to_hidx(pte, index);
> - if (hidx & _PTEIDX_SECONDARY)
> - hash = ~hash;
> - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
> - slot += hidx & _PTEIDX_GROUP_IX;
> + slot = compute_slot(pte, vpn, index, shift, ssize);
> if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
> /*
> * lpar doesn't use the passed actual page size
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH 1/3] powerpc/pseries/mm: Introducing FW_FEATURE_BLOCK_REMOVE
From: Aneesh Kumar K.V @ 2018-07-30 14:18 UTC (permalink / raw)
To: Laurent Dufour, linuxppc-dev, linux-kernel; +Cc: mpe, benh, paulus, npiggin
In-Reply-To: <1532699493-10883-2-git-send-email-ldufour@linux.vnet.ibm.com>
Laurent Dufour <ldufour@linux.vnet.ibm.com> writes:
> This feature tells if the hcall H_BLOCK_REMOVE is available.
>
> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
> Cc: Nicholas Piggin <npiggin@gmail.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
> arch/powerpc/include/asm/firmware.h | 3 ++-
> arch/powerpc/platforms/pseries/firmware.c | 1 +
> 2 files changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
> index 535add3f7791..360ba197f9d2 100644
> --- a/arch/powerpc/include/asm/firmware.h
> +++ b/arch/powerpc/include/asm/firmware.h
> @@ -53,6 +53,7 @@
> #define FW_FEATURE_PRRN ASM_CONST(0x0000000200000000)
> #define FW_FEATURE_DRMEM_V2 ASM_CONST(0x0000000400000000)
> #define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000)
> +#define FW_FEATURE_BLOCK_REMOVE ASM_CONST(0x0000001000000000)
>
> #ifndef __ASSEMBLY__
>
> @@ -70,7 +71,7 @@ enum {
> FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
> FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
> FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
> - FW_FEATURE_DRC_INFO,
> + FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE,
> FW_FEATURE_PSERIES_ALWAYS = 0,
> FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL,
> FW_FEATURE_POWERNV_ALWAYS = 0,
> diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
> index a3bbeb43689e..1624501386f4 100644
> --- a/arch/powerpc/platforms/pseries/firmware.c
> +++ b/arch/powerpc/platforms/pseries/firmware.c
> @@ -65,6 +65,7 @@ hypertas_fw_features_table[] = {
> {FW_FEATURE_SET_MODE, "hcall-set-mode"},
> {FW_FEATURE_BEST_ENERGY, "hcall-best-energy-1*"},
> {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"},
> + {FW_FEATURE_BLOCK_REMOVE, "hcall-block-remove"},
> };
>
> /* Build up the firmware features bitmask using the contents of
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH 2/2] powerpc: Use of_machine_compatible_match()
From: Rob Herring @ 2018-07-30 14:14 UTC (permalink / raw)
To: Michael Ellerman; +Cc: devicetree, Frank Rowand, linuxppc-dev
In-Reply-To: <20180730131516.18406-2-mpe@ellerman.id.au>
On Mon, Jul 30, 2018 at 7:15 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>
> Use of_machine_compatible_match() in platform code rather than open
> coding.
This is good because I want to get rid of of_root being used outside
of drivers/of/. Can you also convert this one:
drivers/clk/clk-qoriq.c: of_device_is_compatible(of_root,
"fsl,ls1021a")) {
>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
> arch/powerpc/platforms/40x/ppc40x_simple.c | 2 +-
> arch/powerpc/platforms/512x/mpc512x_generic.c | 2 +-
> arch/powerpc/platforms/52xx/lite5200.c | 2 +-
> arch/powerpc/platforms/52xx/media5200.c | 2 +-
> arch/powerpc/platforms/52xx/mpc5200_simple.c | 2 +-
> arch/powerpc/platforms/83xx/mpc830x_rdb.c | 2 +-
> arch/powerpc/platforms/83xx/mpc831x_rdb.c | 2 +-
> arch/powerpc/platforms/83xx/mpc837x_rdb.c | 2 +-
> arch/powerpc/platforms/85xx/corenet_generic.c | 2 +-
> arch/powerpc/platforms/85xx/tqm85xx.c | 2 +-
> 10 files changed, 10 insertions(+), 10 deletions(-)
Acked-by: Rob Herring <robh@kernel.org>
Rob
^ permalink raw reply
* Re: [PATCH 1/2] of: Add of_machine_compatible_match()
From: Rob Herring @ 2018-07-30 14:06 UTC (permalink / raw)
To: Michael Ellerman; +Cc: devicetree, Frank Rowand, linuxppc-dev
In-Reply-To: <20180730131516.18406-1-mpe@ellerman.id.au>
On Mon, Jul 30, 2018 at 7:15 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>
> We have of_machine_is_compatible() to check if a machine is compatible
> with a single compatible string. However some code is able to support
> multiple compatible boards, and so wants to check for one of many
> compatible strings.
>
> So add of_machine_compatible_match() which takes a NULL terminated
> array of compatible strings to check against the root node's
> compatible property.
>
> Compared to an open coded match this is slightly more self
> documenting, and also avoids the caller needing to juggle the root
> node either directly or via of_find_node_by_path().
>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
> drivers/of/base.c | 21 +++++++++++++++++++++
> include/linux/of.h | 6 ++++++
> 2 files changed, 27 insertions(+)
>
> diff --git a/drivers/of/base.c b/drivers/of/base.c
> index 848f549164cd..603716ba8513 100644
> --- a/drivers/of/base.c
> +++ b/drivers/of/base.c
> @@ -505,6 +505,27 @@ int of_device_compatible_match(struct device_node *device,
> return score;
> }
>
> +/**
> + * of_machine_compatible_match - Test root of device tree against a compatible array
> + * @compats: NULL terminated array of compatible strings to look for in root node's compatible property.
> + *
> + * Returns true if the root node has any of the given compatible values in its
> + * compatible property.
> + */
> +bool of_machine_compatible_match(const char *const *compats)
> +{
> + struct device_node *root;
> + int rc = 0;
> +
> + root = of_node_get(of_root);
> + if (root) {
> + rc = of_device_compatible_match(root, compats);
> + of_node_put(root);
> + }
> +
> + return rc != 0;
> +}
> +
> /**
> * of_machine_is_compatible - Test root of device tree for a given compatible value
> * @compat: compatible string to look for in root node's compatible property.
> diff --git a/include/linux/of.h b/include/linux/of.h
> index 4d25e4f952d9..05e3e23a3a57 100644
> --- a/include/linux/of.h
> +++ b/include/linux/of.h
> @@ -389,6 +389,7 @@ extern int of_alias_get_id(struct device_node *np, const char *stem);
> extern int of_alias_get_highest_id(const char *stem);
>
> extern int of_machine_is_compatible(const char *compat);
> +extern bool of_machine_compatible_match(const char *const *compats);
Would be nice if of_machine_is_compatible could be implemented in
terms of of_machine_compatible_match like this:
int of_machine_is_compatible(const char *compat)
{
const char *compats[] = { compat, NULL };
return of_machine_is_compatible(compats);
}
Probably can be inline too.
Rob
^ permalink raw reply
* Re: [PATCH 3/3] powerpc/pseries/mm: call H_BLOCK_REMOVE
From: Michael Ellerman @ 2018-07-30 13:47 UTC (permalink / raw)
To: Laurent Dufour, linuxppc-dev, linux-kernel
Cc: aneesh.kumar, benh, paulus, npiggin
In-Reply-To: <1532699493-10883-4-git-send-email-ldufour@linux.vnet.ibm.com>
Hi Laurent,
Just one comment below.
Laurent Dufour <ldufour@linux.vnet.ibm.com> writes:
> diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platfor=
ms/pseries/lpar.c
> index 96b8cd8a802d..41ed03245eb4 100644
> --- a/arch/powerpc/platforms/pseries/lpar.c
> +++ b/arch/powerpc/platforms/pseries/lpar.c
> @@ -418,6 +418,73 @@ static void pSeries_lpar_hpte_invalidate(unsigned lo=
ng slot, unsigned long vpn,
> BUG_ON(lpar_rc !=3D H_SUCCESS);
> }
>=20=20
> +
> +/*
> + * As defined in the PAPR's section 14.5.4.1.8
> + * The control mask doesn't include the returned reference and change bi=
t from
> + * the processed PTE.
> + */
> +#define HBLKR_AVPN 0x0100000000000000UL
> +#define HBLKR_CTRL_MASK 0xf800000000000000UL
> +#define HBLKR_CTRL_SUCCESS 0x8000000000000000UL
> +#define HBLKR_CTRL_ERRNOTFOUND 0x8800000000000000UL
> +#define HBLKR_CTRL_ERRBUSY 0xa000000000000000UL
> +
> +/**
> + * H_BLOCK_REMOVE caller.
> + * @idx should point to the latest @param entry set with a PTEX.
> + * If PTE cannot be processed because another CPUs has already locked th=
at
> + * group, those entries are put back in @param starting at index 1.
> + * If entries has to be retried and @retry_busy is set to true, these en=
tries
> + * are retried until success. If @retry_busy is set to false, the return=
ed
> + * is the number of entries yet to process.
> + */
> +static unsigned long call_block_remove(unsigned long idx, unsigned long =
*param,
> + bool retry_busy)
> +{
> + unsigned long i, rc, new_idx;
> + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
> +
> +again:
> + new_idx =3D 0;
> + BUG_ON((idx < 2) || (idx > PLPAR_HCALL9_BUFSIZE));
I count 1 ..
> + if (idx < PLPAR_HCALL9_BUFSIZE)
> + param[idx] =3D HBR_END;
> +
> + rc =3D plpar_hcall9(H_BLOCK_REMOVE, retbuf,
> + param[0], /* AVA */
> + param[1], param[2], param[3], param[4], /* TS0-7 */
> + param[5], param[6], param[7], param[8]);
> + if (rc =3D=3D H_SUCCESS)
> + return 0;
> +
> + BUG_ON(rc !=3D H_PARTIAL);
2 ...
> + /* Check that the unprocessed entries were 'not found' or 'busy' */
> + for (i =3D 0; i < idx-1; i++) {
> + unsigned long ctrl =3D retbuf[i] & HBLKR_CTRL_MASK;
> +
> + if (ctrl =3D=3D HBLKR_CTRL_ERRBUSY) {
> + param[++new_idx] =3D param[i+1];
> + continue;
> + }
> +
> + BUG_ON(ctrl !=3D HBLKR_CTRL_SUCCESS
> + && ctrl !=3D HBLKR_CTRL_ERRNOTFOUND);
3 ...
BUG_ON()s.
I know the code in this file is already pretty liberal with the use of
BUG_ON() but I'd prefer if we don't make it any worse.
Given this is an optimisation it seems like we should be able to fall
back to the existing implementation in the case of error (which will
probably then BUG_ON() =F0=9F=98=82)
If there's some reason we can't then I guess I can live with it.
cheers
^ permalink raw reply
* Re: [PATCH v4 09/11] macintosh/via-pmu: Replace via-pmu68k driver with via-pmu driver
From: Michael Ellerman @ 2018-07-30 13:32 UTC (permalink / raw)
To: Geert Uytterhoeven
Cc: Finn Thain, Benjamin Herrenschmidt, Michael Schmitz, linux-m68k,
linuxppc-dev, Linux Kernel Mailing List
In-Reply-To: <CAMuHMdURrJ6d22QCBsX9NKvmtaVckWKGayohgidBPpZoxwTrWg@mail.gmail.com>
Geert Uytterhoeven <geert@linux-m68k.org> writes:
> Hi Michael,
>
> On Mon, Jul 30, 2018 at 8:47 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>> Finn Thain <fthain@telegraphics.com.au> writes:
>> > Now that the PowerMac via-pmu driver supports m68k PowerBooks,
>> > switch over to that driver and remove the via-pmu68k driver.
>> >
>> > Cc: Geert Uytterhoeven <geert@linux-m68k.org>
>> > Tested-by: Stan Johnson <userm57@yahoo.com>
>> > Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
>> > ---
>> > arch/m68k/configs/mac_defconfig | 2 +-
>> > arch/m68k/configs/multi_defconfig | 2 +-
>> > arch/m68k/mac/config.c | 2 +-
>> > arch/m68k/mac/misc.c | 48 +--
>> > drivers/macintosh/Kconfig | 13 +-
>> > drivers/macintosh/Makefile | 1 -
>> > drivers/macintosh/adb.c | 2 +-
>> > drivers/macintosh/via-pmu68k.c | 846 --------------------------------------
>> > include/uapi/linux/pmu.h | 2 +-
>> > 9 files changed, 14 insertions(+), 904 deletions(-)
>> > delete mode 100644 drivers/macintosh/via-pmu68k.c
>>
>> Geert are you OK with this and the other one that touches arch/m68k ?
>
> Sure, feel free to take them through the PPC tree.
> Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Thanks will do.
cheers
^ permalink raw reply
* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Michael S. Tsirkin @ 2018-07-30 13:26 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Will Deacon, Anshuman Khandual, virtualization, linux-kernel,
linuxppc-dev, aik, robh, joe, elfring, david, jasowang, benh, mpe,
linuxram, haren, paulus, srikar, robin.murphy,
jean-philippe.brucker, marc.zyngier
In-Reply-To: <20180730111802.GA9830@infradead.org>
On Mon, Jul 30, 2018 at 04:18:02AM -0700, Christoph Hellwig wrote:
> On Mon, Jul 30, 2018 at 01:28:03PM +0300, Michael S. Tsirkin wrote:
> > Let me reply to the "crappy" part first:
> > So virtio devices can run on another CPU or on a PCI bus. Configuration
> > can happen over mupltiple transports. There is a discovery protocol to
> > figure out where it is. It has some warts but any real system has warts.
> >
> > So IMHO virtio running on another CPU isn't "legacy virtual crappy
> > virtio". virtio devices that actually sit on a PCI bus aren't "sane"
> > simply because the DMA is more convoluted on some architectures.
>
> All of what you said would be true if virtio didn't claim to be
> a PCI device.
There's nothing virtio claims to be. It's a PV device that uses PCI for
its configuration. Configuration is enumerated on the virtual PCI bus.
That part of the interface is emulated PCI. Data path is through a
PV device enumerated on the virtio bus.
> Once it claims to be a PCI device and we also see
> real hardware written to the interface I stand to all what I said
> above.
Real hardware would reuse parts of the interface but by necessity it
needs to behave slightly differently on some platforms. However for
some platforms (such as x86) a PV virtio driver will by luck work with a
PCI device backend without changes. As these platforms and drivers are
widely deployed, some people will deploy hardware like that. Should be
a non issue as by definition it's transparent to guests.
> > With this out of my system:
> > I agree these approaches are hacky. I think it is generally better to
> > have virtio feature negotiation tell you whether device runs on a CPU or
> > not rather than rely on platform specific ways for this. To this end
> > there was a recent proposal to rename VIRTIO_F_IO_BARRIER to
> > VIRTIO_F_REAL_DEVICE. It got stuck since "real" sounds vague to people,
> > e.g. what if it's a VF - is that real or not? But I can see something
> > like e.g. VIRTIO_F_PLATFORM_DMA gaining support.
> >
> > We would then rename virtio_has_iommu_quirk to virtio_has_dma_quirk
> > and test VIRTIO_F_PLATFORM_DMA in addition to the IOMMU thing.
>
> I don't really care about the exact naming, and indeed a device that
> sets the flag doesn't have to be a 'real' device - it just has to act
> like one. I explained all the issues that this means (at least relating
> to DMA) in one of the previous threads.
I believe you refer to this:
https://lkml.org/lkml/2018/6/7/15
that was a very helpful list outlining the problems we need to solve,
thanks a lot for that!
> The important bit is that we can specify exact behavior for both
> devices that sets the "I'm real!" flag and that ones that don't exactly
> in the spec.
I would very much like that, yes.
> And that very much excludes arch-specific (or
> Xen-specific) overrides.
We already committed to a xen specific hack but generally I prefer
devices that describe how they work instead of platforms magically
guessing, yes.
However the question people raise is that DMA API is already full of
arch-specific tricks the likes of which are outlined in your post linked
above. How is this one much worse?
--
MST
^ permalink raw reply
* [PATCH 2/2] powerpc: Use of_machine_compatible_match()
From: Michael Ellerman @ 2018-07-30 13:15 UTC (permalink / raw)
To: devicetree, robh+dt, frowand.list; +Cc: linuxppc-dev
In-Reply-To: <20180730131516.18406-1-mpe@ellerman.id.au>
Use of_machine_compatible_match() in platform code rather than open
coding.
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
arch/powerpc/platforms/40x/ppc40x_simple.c | 2 +-
arch/powerpc/platforms/512x/mpc512x_generic.c | 2 +-
arch/powerpc/platforms/52xx/lite5200.c | 2 +-
arch/powerpc/platforms/52xx/media5200.c | 2 +-
arch/powerpc/platforms/52xx/mpc5200_simple.c | 2 +-
arch/powerpc/platforms/83xx/mpc830x_rdb.c | 2 +-
arch/powerpc/platforms/83xx/mpc831x_rdb.c | 2 +-
arch/powerpc/platforms/83xx/mpc837x_rdb.c | 2 +-
arch/powerpc/platforms/85xx/corenet_generic.c | 2 +-
arch/powerpc/platforms/85xx/tqm85xx.c | 2 +-
10 files changed, 10 insertions(+), 10 deletions(-)
Tested the corenet_generic.c change on a p5020ds, the rest are untested owing to
lack of hardware.
diff --git a/arch/powerpc/platforms/40x/ppc40x_simple.c b/arch/powerpc/platforms/40x/ppc40x_simple.c
index 2a050007bbae..f401a8add141 100644
--- a/arch/powerpc/platforms/40x/ppc40x_simple.c
+++ b/arch/powerpc/platforms/40x/ppc40x_simple.c
@@ -63,7 +63,7 @@ static const char * const board[] __initconst = {
static int __init ppc40x_probe(void)
{
- if (of_device_compatible_match(of_root, board)) {
+ if (of_machine_compatible_match(board)) {
pci_set_flags(PCI_REASSIGN_ALL_RSRC);
return 1;
}
diff --git a/arch/powerpc/platforms/512x/mpc512x_generic.c b/arch/powerpc/platforms/512x/mpc512x_generic.c
index bf884d3075e4..952ea53d4829 100644
--- a/arch/powerpc/platforms/512x/mpc512x_generic.c
+++ b/arch/powerpc/platforms/512x/mpc512x_generic.c
@@ -38,7 +38,7 @@ static const char * const board[] __initconst = {
*/
static int __init mpc512x_generic_probe(void)
{
- if (!of_device_compatible_match(of_root, board))
+ if (!of_machine_compatible_match(board))
return 0;
mpc512x_init_early();
diff --git a/arch/powerpc/platforms/52xx/lite5200.c b/arch/powerpc/platforms/52xx/lite5200.c
index c94c385cc919..5f11e906e02c 100644
--- a/arch/powerpc/platforms/52xx/lite5200.c
+++ b/arch/powerpc/platforms/52xx/lite5200.c
@@ -183,7 +183,7 @@ static const char * const board[] __initconst = {
*/
static int __init lite5200_probe(void)
{
- return of_device_compatible_match(of_root, board);
+ return of_machine_compatible_match(board);
}
define_machine(lite5200) {
diff --git a/arch/powerpc/platforms/52xx/media5200.c b/arch/powerpc/platforms/52xx/media5200.c
index 1fcab233d2f2..8641bb55c8e8 100644
--- a/arch/powerpc/platforms/52xx/media5200.c
+++ b/arch/powerpc/platforms/52xx/media5200.c
@@ -242,7 +242,7 @@ static const char * const board[] __initconst = {
*/
static int __init media5200_probe(void)
{
- return of_device_compatible_match(of_root, board);
+ return of_machine_compatible_match(board);
}
define_machine(media5200_platform) {
diff --git a/arch/powerpc/platforms/52xx/mpc5200_simple.c b/arch/powerpc/platforms/52xx/mpc5200_simple.c
index a80c6278d515..7b8b3be0f159 100644
--- a/arch/powerpc/platforms/52xx/mpc5200_simple.c
+++ b/arch/powerpc/platforms/52xx/mpc5200_simple.c
@@ -70,7 +70,7 @@ static const char *board[] __initdata = {
*/
static int __init mpc5200_simple_probe(void)
{
- return of_device_compatible_match(of_root, board);
+ return of_machine_compatible_match(board);
}
define_machine(mpc5200_simple_platform) {
diff --git a/arch/powerpc/platforms/83xx/mpc830x_rdb.c b/arch/powerpc/platforms/83xx/mpc830x_rdb.c
index 272c41c387b9..7c0cd27e71e7 100644
--- a/arch/powerpc/platforms/83xx/mpc830x_rdb.c
+++ b/arch/powerpc/platforms/83xx/mpc830x_rdb.c
@@ -43,7 +43,7 @@ static const char *board[] __initdata = {
*/
static int __init mpc830x_rdb_probe(void)
{
- return of_device_compatible_match(of_root, board);
+ return of_machine_compatible_match(board);
}
machine_device_initcall(mpc830x_rdb, mpc83xx_declare_of_platform_devices);
diff --git a/arch/powerpc/platforms/83xx/mpc831x_rdb.c b/arch/powerpc/platforms/83xx/mpc831x_rdb.c
index fd80fd570e67..3718db1c74e4 100644
--- a/arch/powerpc/platforms/83xx/mpc831x_rdb.c
+++ b/arch/powerpc/platforms/83xx/mpc831x_rdb.c
@@ -43,7 +43,7 @@ static const char *board[] __initdata = {
*/
static int __init mpc831x_rdb_probe(void)
{
- return of_device_compatible_match(of_root, board);
+ return of_machine_compatible_match(board);
}
machine_device_initcall(mpc831x_rdb, mpc83xx_declare_of_platform_devices);
diff --git a/arch/powerpc/platforms/83xx/mpc837x_rdb.c b/arch/powerpc/platforms/83xx/mpc837x_rdb.c
index 0c55fa6af2d5..08c8434d895c 100644
--- a/arch/powerpc/platforms/83xx/mpc837x_rdb.c
+++ b/arch/powerpc/platforms/83xx/mpc837x_rdb.c
@@ -70,7 +70,7 @@ static const char * const board[] __initconst = {
*/
static int __init mpc837x_rdb_probe(void)
{
- return of_device_compatible_match(of_root, board);
+ return of_machine_compatible_match(board);
}
define_machine(mpc837x_rdb) {
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index ac191a7a1337..8a2a32b75c66 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -174,7 +174,7 @@ static int __init corenet_generic_probe(void)
extern struct smp_ops_t smp_85xx_ops;
#endif
- if (of_device_compatible_match(of_root, boards))
+ if (of_machine_compatible_match(boards))
return 1;
/* Check if we're running under the Freescale hypervisor */
diff --git a/arch/powerpc/platforms/85xx/tqm85xx.c b/arch/powerpc/platforms/85xx/tqm85xx.c
index 9fc20a37835e..2172e93a7c67 100644
--- a/arch/powerpc/platforms/85xx/tqm85xx.c
+++ b/arch/powerpc/platforms/85xx/tqm85xx.c
@@ -122,7 +122,7 @@ static const char * const board[] __initconst = {
*/
static int __init tqm85xx_probe(void)
{
- return of_device_compatible_match(of_root, board);
+ return of_machine_compatible_match(board);
}
define_machine(tqm85xx) {
--
2.14.1
^ permalink raw reply related
* [PATCH 1/2] of: Add of_machine_compatible_match()
From: Michael Ellerman @ 2018-07-30 13:15 UTC (permalink / raw)
To: devicetree, robh+dt, frowand.list; +Cc: linuxppc-dev
We have of_machine_is_compatible() to check if a machine is compatible
with a single compatible string. However some code is able to support
multiple compatible boards, and so wants to check for one of many
compatible strings.
So add of_machine_compatible_match() which takes a NULL terminated
array of compatible strings to check against the root node's
compatible property.
Compared to an open coded match this is slightly more self
documenting, and also avoids the caller needing to juggle the root
node either directly or via of_find_node_by_path().
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
drivers/of/base.c | 21 +++++++++++++++++++++
include/linux/of.h | 6 ++++++
2 files changed, 27 insertions(+)
diff --git a/drivers/of/base.c b/drivers/of/base.c
index 848f549164cd..603716ba8513 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -505,6 +505,27 @@ int of_device_compatible_match(struct device_node *device,
return score;
}
+/**
+ * of_machine_compatible_match - Test root of device tree against a compatible array
+ * @compats: NULL terminated array of compatible strings to look for in root node's compatible property.
+ *
+ * Returns true if the root node has any of the given compatible values in its
+ * compatible property.
+ */
+bool of_machine_compatible_match(const char *const *compats)
+{
+ struct device_node *root;
+ int rc = 0;
+
+ root = of_node_get(of_root);
+ if (root) {
+ rc = of_device_compatible_match(root, compats);
+ of_node_put(root);
+ }
+
+ return rc != 0;
+}
+
/**
* of_machine_is_compatible - Test root of device tree for a given compatible value
* @compat: compatible string to look for in root node's compatible property.
diff --git a/include/linux/of.h b/include/linux/of.h
index 4d25e4f952d9..05e3e23a3a57 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -389,6 +389,7 @@ extern int of_alias_get_id(struct device_node *np, const char *stem);
extern int of_alias_get_highest_id(const char *stem);
extern int of_machine_is_compatible(const char *compat);
+extern bool of_machine_compatible_match(const char *const *compats);
extern int of_add_property(struct device_node *np, struct property *prop);
extern int of_remove_property(struct device_node *np, struct property *prop);
@@ -877,6 +878,11 @@ static inline int of_machine_is_compatible(const char *compat)
return 0;
}
+static inline bool of_machine_compatible_match(const char *const *compats)
+{
+ return false;
+}
+
static inline bool of_console_check(const struct device_node *dn, const char *name, int index)
{
return false;
--
2.14.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox