LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH 5/5] powerpc: use the generic dma_ops_bypass mode
From: Christoph Hellwig @ 2020-08-31  6:40 UTC (permalink / raw)
  To: Cédric Le Goater
  Cc: linuxppc-dev, Daniel Borkmann, Alexey Kardashevskiy,
	Björn Töpel, Joerg Roedel, Jesper Dangaard Brouer,
	linux-kernel, iommu, Oliver O'Halloran, Greg Kroah-Hartman,
	aacraid, Robin Murphy, Christoph Hellwig, Lu Baolu
In-Reply-To: <505bcc1d-01a7-9655-88e1-ebddd0b94d56@kaod.org>

On Sun, Aug 30, 2020 at 11:04:21AM +0200, Cédric Le Goater wrote:
> Hello,
> 
> On 7/8/20 5:24 PM, Christoph Hellwig wrote:
> > Use the DMA API bypass mechanism for direct window mappings.  This uses
> > common code and speed up the direct mapping case by avoiding indirect
> > calls just when not using dma ops at all.  It also fixes a problem where
> > the sync_* methods were using the bypass check for DMA allocations, but
> > those are part of the streaming ops.
> > 
> > Note that this patch loses the DMA_ATTR_WEAK_ORDERING override, which
> > has never been well defined, as is only used by a few drivers, which
> > IIRC never showed up in the typical Cell blade setups that are affected
> > by the ordering workaround.
> > 
> > Fixes: efd176a04bef ("powerpc/pseries/dma: Allow SWIOTLB")
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  arch/powerpc/Kconfig              |  1 +
> >  arch/powerpc/include/asm/device.h |  5 --
> >  arch/powerpc/kernel/dma-iommu.c   | 90 ++++---------------------------
> >  3 files changed, 10 insertions(+), 86 deletions(-)
> 
> I am seeing corruptions on a couple of POWER9 systems (boston) when
> stressed with IO. stress-ng gives some results but I have first seen
> it when compiling the kernel in a guest and this is still the best way
> to raise the issue.
> 
> These systems have of a SAS Adaptec controller :
> 
>   0003:01:00.0 Serial Attached SCSI controller: Adaptec Series 8 12G SAS/PCIe 3 (rev 01)
> 
> When the failure occurs, the POWERPC EEH interrupt fires and dumps
> lowlevel PHB4 registers among which :
> 					  
>   [ 2179.251069490,3] PHB#0003[0:3]:           phbErrorStatus = 0000028000000000
>   [ 2179.251117476,3] PHB#0003[0:3]:      phbFirstErrorStatus = 0000020000000000
> 
> The bits raised identify a PPC 'TCE' error, which means it is related
> to DMAs. See below for more details.
> 
> 
> Reverting this patch "fixes" the issue but it is probably else where,
> in some other layers or in the aacraid driver. How should I proceed 
> to get more information ?

The aacraid DMA masks look like a mess.  Can you try the hack
below and see it it helps?

diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c
index 769af4ca9ca97e..79c6b744dbb66c 100644
--- a/drivers/scsi/aacraid/aachba.c
+++ b/drivers/scsi/aacraid/aachba.c
@@ -2228,18 +2228,6 @@ int aac_get_adapter_info(struct aac_dev* dev)
 		expose_physicals = 0;
 	}
 
-	if (dev->dac_support) {
-		if (!pci_set_dma_mask(dev->pdev, DMA_BIT_MASK(64))) {
-			if (!dev->in_reset)
-				dev_info(&dev->pdev->dev, "64 Bit DAC enabled\n");
-		} else if (!pci_set_dma_mask(dev->pdev, DMA_BIT_MASK(32))) {
-			dev_info(&dev->pdev->dev, "DMA mask set failed, 64 Bit DAC disabled\n");
-			dev->dac_support = 0;
-		} else {
-			dev_info(&dev->pdev->dev, "No suitable DMA available\n");
-			rcode = -ENOMEM;
-		}
-	}
 	/*
 	 * Deal with configuring for the individualized limits of each packet
 	 * interface.
diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c
index adbdc3b7c7a706..dbb23b351a4e7d 100644
--- a/drivers/scsi/aacraid/commsup.c
+++ b/drivers/scsi/aacraid/commsup.c
@@ -1479,7 +1479,6 @@ static int _aac_reset_adapter(struct aac_dev *aac, int forced, u8 reset_type)
 	struct Scsi_Host *host = aac->scsi_host_ptr;
 	int jafo = 0;
 	int bled;
-	u64 dmamask;
 	int num_of_fibs = 0;
 
 	/*
@@ -1558,22 +1557,7 @@ static int _aac_reset_adapter(struct aac_dev *aac, int forced, u8 reset_type)
 	kfree(aac->fsa_dev);
 	aac->fsa_dev = NULL;
 
-	dmamask = DMA_BIT_MASK(32);
 	quirks = aac_get_driver_ident(index)->quirks;
-	if (quirks & AAC_QUIRK_31BIT)
-		retval = pci_set_dma_mask(aac->pdev, dmamask);
-	else if (!(quirks & AAC_QUIRK_SRC))
-		retval = pci_set_dma_mask(aac->pdev, dmamask);
-	else
-		retval = pci_set_consistent_dma_mask(aac->pdev, dmamask);
-
-	if (quirks & AAC_QUIRK_31BIT && !retval) {
-		dmamask = DMA_BIT_MASK(31);
-		retval = pci_set_consistent_dma_mask(aac->pdev, dmamask);
-	}
-
-	if (retval)
-		goto out;
 
 	if ((retval = (*(aac_get_driver_ident(index)->init))(aac)))
 		goto out;
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 8588da0a065551..d897a9d59e24a1 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -1634,8 +1634,6 @@ static int aac_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	struct list_head *insert = &aac_devices;
 	int error;
 	int unique_id = 0;
-	u64 dmamask;
-	int mask_bits = 0;
 	extern int aac_sync_mode;
 
 	/*
@@ -1658,33 +1656,6 @@ static int aac_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (error)
 		goto out;
 
-	if (!(aac_drivers[index].quirks & AAC_QUIRK_SRC)) {
-		error = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
-		if (error) {
-			dev_err(&pdev->dev, "PCI 32 BIT dma mask set failed");
-			goto out_disable_pdev;
-		}
-	}
-
-	/*
-	 * If the quirk31 bit is set, the adapter needs adapter
-	 * to driver communication memory to be allocated below 2gig
-	 */
-	if (aac_drivers[index].quirks & AAC_QUIRK_31BIT) {
-		dmamask = DMA_BIT_MASK(31);
-		mask_bits = 31;
-	} else {
-		dmamask = DMA_BIT_MASK(32);
-		mask_bits = 32;
-	}
-
-	error = pci_set_consistent_dma_mask(pdev, dmamask);
-	if (error) {
-		dev_err(&pdev->dev, "PCI %d B consistent dma mask set failed\n"
-				, mask_bits);
-		goto out_disable_pdev;
-	}
-
 	pci_set_master(pdev);
 
 	shost = scsi_host_alloc(&aac_driver_template, sizeof(struct aac_dev));

^ permalink raw reply related

* [PATCH] powerpc: Fix random segfault when freeing hugetlb range
From: Christophe Leroy @ 2020-08-31  7:58 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

The following random segfault is observed from time to time with
map_hugetlb selftest:

root@localhost:~# ./map_hugetlb 1 19
524288 kB hugepages
Mapping 1 Mbytes
Segmentation fault

[   31.219972] map_hugetlb[365]: segfault (11) at 117 nip 77974f8c lr 779a6834 code 1 in ld-2.23.so[77966000+21000]
[   31.220192] map_hugetlb[365]: code: 9421ffc0 480318d1 93410028 90010044 9361002c 93810030 93a10034 93c10038
[   31.220307] map_hugetlb[365]: code: 93e1003c 93210024 8123007c 81430038 <80e90004> 814a0004 7f443a14 813a0004
[   31.221911] BUG: Bad rss-counter state mm:(ptrval) type:MM_FILEPAGES val:33
[   31.229362] BUG: Bad rss-counter state mm:(ptrval) type:MM_ANONPAGES val:5

This fault is due to hugetlb_free_pgd_range() freeing page tables
that are also used by regular pages.

As explain in the comment at the beginning of
hugetlb_free_pgd_range(), the verification done in free_pgd_range()
on floor and ceiling is not done here, which means
hugetlb_free_pte_range() can free outside the expected range.

As the verification cannot be done in hugetlb_free_pgd_range(), it
must be done in hugetlb_free_pte_range().

Fixes: b250c8c08c79 ("powerpc/8xx: Manage 512k huge pages as standard pages.")
Cc: stable@vger.kernel.org
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/hugetlbpage.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 26292544630f..e7ae2a2c4545 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -330,10 +330,24 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
 				 get_hugepd_cache_index(pdshift - shift));
 }
 
-static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr)
+static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+				   unsigned long addr, unsigned long end,
+				   unsigned long floor, unsigned long ceiling)
 {
+	unsigned long start = addr;
 	pgtable_t token = pmd_pgtable(*pmd);
 
+	start &= PMD_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PMD_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
+
 	pmd_clear(pmd);
 	pte_free_tlb(tlb, token, addr);
 	mm_dec_nr_ptes(tlb->mm);
@@ -363,7 +377,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 			 */
 			WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx));
 
-			hugetlb_free_pte_range(tlb, pmd, addr);
+			hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling);
 
 			continue;
 		}
-- 
2.25.0


^ permalink raw reply related

* [PATCH] selftests/vm: Fix display of page size in map_hugetlb
From: Christophe Leroy @ 2020-08-31  8:23 UTC (permalink / raw)
  To: Shuah Khan, Andrew Morton; +Cc: linuxppc-dev, linux-kernel, linux-kselftest

The displayed size is in bytes while the text says it is in kB.

Shift it by 10 to really display kBytes.

Fixes: fa7b9a805c79 ("tools/selftest/vm: allow choosing mem size and page size in map_hugetlb")
Cc: stable@vger.kernel.org
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 tools/testing/selftests/vm/map_hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c
index 6af951900aa3..312889edb84a 100644
--- a/tools/testing/selftests/vm/map_hugetlb.c
+++ b/tools/testing/selftests/vm/map_hugetlb.c
@@ -83,7 +83,7 @@ int main(int argc, char **argv)
 	}
 
 	if (shift)
-		printf("%u kB hugepages\n", 1 << shift);
+		printf("%u kB hugepages\n", 1 << (shift - 10));
 	else
 		printf("Default size hugepages\n");
 	printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
-- 
2.25.0


^ permalink raw reply related

* [PATCH 2/2] powerpc/8xx: Support 16k hugepages with 4k pages
From: Christophe Leroy @ 2020-08-31  8:30 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <f6ea2483c2c389567b007945948f704d18cfaeea.1598862623.git.christophe.leroy@csgroup.eu>

The 8xx has 4 page sizes: 4k, 16k, 512k and 8M

4k and 16k can be selected at build time as standard page sizes,
and 512k and 8M are hugepages.

When 4k standard pages are selected, 16k pages are not available.

Allow 16k pages as hugepages when 4k pages are used.

To allow that, implement arch_make_huge_pte() which receives
the necessary arguments to allow setting the PTE in accordance
with the page size:
- 512 k pages must have _PAGE_HUGE and _PAGE_SPS. They are set
by pte_mkhuge(). arch_make_huge_pte() does nothing.
- 16 k pages must have only _PAGE_SPS. arch_make_huge_pte() clears
_PAGE_HUGE.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 14 ++++++++++++++
 arch/powerpc/include/asm/nohash/32/pgtable.h     |  2 ++
 arch/powerpc/mm/hugetlbpage.c                    |  2 +-
 arch/powerpc/mm/nohash/tlb.c                     |  4 ----
 arch/powerpc/mm/ptdump/8xx.c                     |  5 +++++
 include/uapi/asm-generic/hugetlb_encode.h        |  1 +
 include/uapi/linux/mman.h                        |  1 +
 7 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index e752a5807a59..39be9aea86db 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -65,4 +65,18 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 	pte_update(mm, addr, ptep, clr, set, 1);
 }
 
+#ifdef CONFIG_PPC_4K_PAGES
+static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+				       struct page *page, int writable)
+{
+	size_t size = huge_page_size(hstate_vma(vma));
+
+	if (size == SZ_16K)
+		return __pte(pte_val(entry) & ~_PAGE_HUGE);
+	else
+		return entry;
+}
+#define arch_make_huge_pte arch_make_huge_pte
+#endif
+
 #endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 80bbc21b87f0..ee2243ba96cf 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -235,6 +235,8 @@ static int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge)
 		return PAGE_SIZE / SZ_4K;
 	else if (hugepd_ok(*((hugepd_t *)pmd)))
 		return 1;
+	else if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !(val & _PAGE_HUGE))
+		return SZ_16K / SZ_4K;
 	else
 		return SZ_512K / SZ_4K;
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index e7ae2a2c4545..36c3800769fb 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -180,7 +180,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 	if (!hpdp)
 		return NULL;
 
-	if (IS_ENABLED(CONFIG_PPC_8xx) && sz == SZ_512K)
+	if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT)
 		return pte_alloc_map(mm, (pmd_t *)hpdp, addr);
 
 	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index 14514585db98..5872f69141d5 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -83,16 +83,12 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 };
 #elif defined(CONFIG_PPC_8xx)
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
-	/* we only manage 4k and 16k pages as normal pages */
-#ifdef CONFIG_PPC_4K_PAGES
 	[MMU_PAGE_4K] = {
 		.shift	= 12,
 	},
-#else
 	[MMU_PAGE_16K] = {
 		.shift	= 14,
 	},
-#endif
 	[MMU_PAGE_512K] = {
 		.shift	= 19,
 	},
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index 8a797dcbf475..86da2a669680 100644
--- a/arch/powerpc/mm/ptdump/8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -11,8 +11,13 @@
 
 static const struct flag_info flag_array[] = {
 	{
+#ifdef CONFIG_PPC_16K_PAGES
 		.mask	= _PAGE_HUGE,
 		.val	= _PAGE_HUGE,
+#else
+		.mask	= _PAGE_SPS,
+		.val	= _PAGE_SPS,
+#endif
 		.set	= "huge",
 		.clear	= "    ",
 	}, {
diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h
index b0f8e87235bd..4f3d5aaa11f5 100644
--- a/include/uapi/asm-generic/hugetlb_encode.h
+++ b/include/uapi/asm-generic/hugetlb_encode.h
@@ -20,6 +20,7 @@
 #define HUGETLB_FLAG_ENCODE_SHIFT	26
 #define HUGETLB_FLAG_ENCODE_MASK	0x3f
 
+#define HUGETLB_FLAG_ENCODE_16KB	(14 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_64KB	(16 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_512KB	(19 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1MB		(20 << HUGETLB_FLAG_ENCODE_SHIFT)
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index 923cc162609c..f55bc680b5b0 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -27,6 +27,7 @@
 #define MAP_HUGE_SHIFT	HUGETLB_FLAG_ENCODE_SHIFT
 #define MAP_HUGE_MASK	HUGETLB_FLAG_ENCODE_MASK
 
+#define MAP_HUGE_16KB	HUGETLB_FLAG_ENCODE_16KB
 #define MAP_HUGE_64KB	HUGETLB_FLAG_ENCODE_64KB
 #define MAP_HUGE_512KB	HUGETLB_FLAG_ENCODE_512KB
 #define MAP_HUGE_1MB	HUGETLB_FLAG_ENCODE_1MB
-- 
2.25.0


^ permalink raw reply related

* [PATCH 1/2] powerpc/8xx: Refactor calculation of number of entries per PTE in page tables
From: Christophe Leroy @ 2020-08-31  8:30 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

On 8xx, the number of entries occupied by a PTE in the page tables
depends on the size of the page. At the time being, this calculation
is done in two places: in pte_update() and in set_huge_pte_at()

Refactor this calculation into a helper called
number_of_cells_per_pte(). For the time being, the val param is
unused. It will be used by following patch.

Instead of opencoding is_hugepd(), use hugepd_ok() with a forward
declaration.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/nohash/32/pgtable.h | 18 ++++++++++++------
 arch/powerpc/mm/pgtable.c                    |  6 ++++--
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index b9e134d0f03a..80bbc21b87f0 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -227,6 +227,17 @@ static inline void pmd_clear(pmd_t *pmdp)
  */
 #ifdef CONFIG_PPC_8xx
 static pmd_t *pmd_off(struct mm_struct *mm, unsigned long addr);
+static int hugepd_ok(hugepd_t hpd);
+
+static int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge)
+{
+	if (!huge)
+		return PAGE_SIZE / SZ_4K;
+	else if (hugepd_ok(*((hugepd_t *)pmd)))
+		return 1;
+	else
+		return SZ_512K / SZ_4K;
+}
 
 static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p,
 				     unsigned long clr, unsigned long set, int huge)
@@ -237,12 +248,7 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p
 	int num, i;
 	pmd_t *pmd = pmd_off(mm, addr);
 
-	if (!huge)
-		num = PAGE_SIZE / SZ_4K;
-	else if ((pmd_val(*pmd) & _PMD_PAGE_MASK) != _PMD_PAGE_8M)
-		num = SZ_512K / SZ_4K;
-	else
-		num = 1;
+	num = number_of_cells_per_pte(pmd, new, huge);
 
 	for (i = 0; i < num; i++, entry++, new += SZ_4K)
 		*entry = new;
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9c0547d77af3..2dcad640b869 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -266,8 +266,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_
 	pmd_t *pmd = pmd_off(mm, addr);
 	pte_basic_t val;
 	pte_basic_t *entry = &ptep->pte;
-	int num = is_hugepd(*((hugepd_t *)pmd)) ? 1 : SZ_512K / SZ_4K;
-	int i;
+	int num, i;
 
 	/*
 	 * Make sure hardware valid bit is not set. We don't do
@@ -280,6 +279,9 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_
 	pte = set_pte_filter(pte);
 
 	val = pte_val(pte);
+
+	num = number_of_cells_per_pte(pmd, val, 1);
+
 	for (i = 0; i < num; i++, entry++, val += SZ_4K)
 		*entry = val;
 }
-- 
2.25.0


^ permalink raw reply related

* Re: KVM on POWER8 host lock up since 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C")
From: Michal Suchánek @ 2020-08-31  9:07 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: ro, linuxppc-dev
In-Reply-To: <1598835313.5688ngko4f.astroid@bobo.none>

On Mon, Aug 31, 2020 at 11:14:18AM +1000, Nicholas Piggin wrote:
> Excerpts from Michal Suchánek's message of August 31, 2020 6:11 am:
> > Hello,
> > 
> > on POWER8 KVM hosts lock up since commit 10d91611f426 ("powerpc/64s:
> > Reimplement book3s idle code in C").
> > 
> > The symptom is host locking up completely after some hours of KVM
> > workload with messages like
> > 
> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 71
> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 71
> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
> > 
> > printed before the host locks up.
> > 
> > The machines run sandboxed builds which is a mixed workload resulting in
> > IO/single core/mutiple core load over time and there are periods of no
> > activity and no VMS runnig as well. The VMs are shortlived so VM
> > setup/terdown is somewhat excercised as well.
> > 
> > POWER9 with the new guest entry fast path does not seem to be affected.
> > 
> > Reverted the patch and the followup idle fixes on top of 5.2.14 and
> > re-applied commit a3f3072db6ca ("powerpc/powernv/idle: Restore IAMR
> > after idle") which gives same idle code as 5.1.16 and the kernel seems
> > stable.
> > 
> > Config is attached.
> > 
> > I cannot easily revert this commit, especially if I want to use the same
> > kernel on POWER8 and POWER9 - many of the POWER9 fixes are applicable
> > only to the new idle code.
> > 
> > Any idea what can be the problem?
> 
> So hwthread_state is never getting back to to HWTHREAD_IN_IDLE on
> those threads. I wonder what they are doing. POWER8 doesn't have a good
> NMI IPI and I don't know if it supports pdbg dumping registers from the
> BMC unfortunately. Do the messages always come in pairs of CPUs?
> 
> I'm not sure where to start with reproducing, I'll have to try. How many
> vCPUs in the guests? Do you have several guests running at once?

The guests are spawned on demand - there are like 20-30 'slots'
configured where a VM may be running or it may be idle with no VM
spawned when there are no jobs available.

Thanks

Michal

^ permalink raw reply

* Re: KVM on POWER8 host lock up since 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C")
From: Michal Suchánek @ 2020-08-31  9:15 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: ro, linuxppc-dev
In-Reply-To: <1598835313.5688ngko4f.astroid@bobo.none>

On Mon, Aug 31, 2020 at 11:14:18AM +1000, Nicholas Piggin wrote:
> Excerpts from Michal Suchánek's message of August 31, 2020 6:11 am:
> > Hello,
> > 
> > on POWER8 KVM hosts lock up since commit 10d91611f426 ("powerpc/64s:
> > Reimplement book3s idle code in C").
> > 
> > The symptom is host locking up completely after some hours of KVM
> > workload with messages like
> > 
> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 71
> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 71
> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
> > 
> > printed before the host locks up.
> > 
> > The machines run sandboxed builds which is a mixed workload resulting in
> > IO/single core/mutiple core load over time and there are periods of no
> > activity and no VMS runnig as well. The VMs are shortlived so VM
> > setup/terdown is somewhat excercised as well.
> > 
> > POWER9 with the new guest entry fast path does not seem to be affected.
> > 
> > Reverted the patch and the followup idle fixes on top of 5.2.14 and
> > re-applied commit a3f3072db6ca ("powerpc/powernv/idle: Restore IAMR
> > after idle") which gives same idle code as 5.1.16 and the kernel seems
> > stable.
> > 
> > Config is attached.
> > 
> > I cannot easily revert this commit, especially if I want to use the same
> > kernel on POWER8 and POWER9 - many of the POWER9 fixes are applicable
> > only to the new idle code.
> > 
> > Any idea what can be the problem?
> 
> So hwthread_state is never getting back to to HWTHREAD_IN_IDLE on
> those threads. I wonder what they are doing. POWER8 doesn't have a good
> NMI IPI and I don't know if it supports pdbg dumping registers from the
> BMC unfortunately.
It may be possible to set up fadump with a later kernel version that
supports it on powernv and dump the whole kernel.

Thanks

Michal

^ permalink raw reply

* Re: [PATCH 5/5] powerpc: use the generic dma_ops_bypass mode
From: Cédric Le Goater @ 2020-08-30  9:04 UTC (permalink / raw)
  To: Christoph Hellwig, iommu, Alexey Kardashevskiy
  Cc: linuxppc-dev, Daniel Borkmann, Björn Töpel,
	Joerg Roedel, Jesper Dangaard Brouer, linux-kernel,
	Oliver O'Halloran, Greg Kroah-Hartman, aacraid, Robin Murphy,
	Lu Baolu
In-Reply-To: <20200708152449.316476-6-hch@lst.de>

Hello,

On 7/8/20 5:24 PM, Christoph Hellwig wrote:
> Use the DMA API bypass mechanism for direct window mappings.  This uses
> common code and speed up the direct mapping case by avoiding indirect
> calls just when not using dma ops at all.  It also fixes a problem where
> the sync_* methods were using the bypass check for DMA allocations, but
> those are part of the streaming ops.
> 
> Note that this patch loses the DMA_ATTR_WEAK_ORDERING override, which
> has never been well defined, as is only used by a few drivers, which
> IIRC never showed up in the typical Cell blade setups that are affected
> by the ordering workaround.
> 
> Fixes: efd176a04bef ("powerpc/pseries/dma: Allow SWIOTLB")
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  arch/powerpc/Kconfig              |  1 +
>  arch/powerpc/include/asm/device.h |  5 --
>  arch/powerpc/kernel/dma-iommu.c   | 90 ++++---------------------------
>  3 files changed, 10 insertions(+), 86 deletions(-)

I am seeing corruptions on a couple of POWER9 systems (boston) when
stressed with IO. stress-ng gives some results but I have first seen
it when compiling the kernel in a guest and this is still the best way
to raise the issue.

These systems have of a SAS Adaptec controller :

  0003:01:00.0 Serial Attached SCSI controller: Adaptec Series 8 12G SAS/PCIe 3 (rev 01)

When the failure occurs, the POWERPC EEH interrupt fires and dumps
lowlevel PHB4 registers among which :
					  
  [ 2179.251069490,3] PHB#0003[0:3]:           phbErrorStatus = 0000028000000000
  [ 2179.251117476,3] PHB#0003[0:3]:      phbFirstErrorStatus = 0000020000000000

The bits raised identify a PPC 'TCE' error, which means it is related
to DMAs. See below for more details.


Reverting this patch "fixes" the issue but it is probably else where,
in some other layers or in the aacraid driver. How should I proceed 
to get more information ?

Thanks,

C.


[ 2054.970339] EEH: Frozen PE#1fd on PHB#3 detected
[ 2054.970375] EEH: PE location: UOPWR.BOS0019-Node0-Onboard SAS, PHB location: N/A
[ 2179.249415973,3] PHB#0003[0:3]:                  brdgCtl = 00000002
[ 2179.249515795,3] PHB#0003[0:3]:             deviceStatus = 00060040
[ 2179.249596452,3] PHB#0003[0:3]:               slotStatus = 00402000
[ 2179.249633728,3] PHB#0003[0:3]:               linkStatus = a0830008
[ 2179.249674807,3] PHB#0003[0:3]:             devCmdStatus = 00100107
[ 2179.249725974,3] PHB#0003[0:3]:             devSecStatus = 00100107
[ 2179.249773550,3] PHB#0003[0:3]:          rootErrorStatus = 00000000
[ 2179.249809823,3] PHB#0003[0:3]:          corrErrorStatus = 00000000
[ 2179.249850439,3] PHB#0003[0:3]:        uncorrErrorStatus = 00000000
[ 2179.249887411,3] PHB#0003[0:3]:                   devctl = 00000040
[ 2179.249928677,3] PHB#0003[0:3]:                  devStat = 00000006
[ 2179.249967150,3] PHB#0003[0:3]:                  tlpHdr1 = 00000000
[ 2179.250054987,3] PHB#0003[0:3]:                  tlpHdr2 = 00000000
[ 2179.250146600,3] PHB#0003[0:3]:                  tlpHdr3 = 00000000
[ 2179.250262780,3] PHB#0003[0:3]:                  tlpHdr4 = 00000000
[ 2179.250343101,3] PHB#0003[0:3]:                 sourceId = 00000000
[ 2179.250514264,3] PHB#0003[0:3]:                     nFir = 0000000000000000
[ 2179.250717971,3] PHB#0003[0:3]:                 nFirMask = 0030001c00000000
[ 2179.250791474,3] PHB#0003[0:3]:                  nFirWOF = 0000000000000000
[ 2179.250842054,3] PHB#0003[0:3]:                 phbPlssr = 0000001c00000000
[ 2179.250886003,3] PHB#0003[0:3]:                   phbCsr = 0000001c00000000
[ 2179.250929859,3] PHB#0003[0:3]:                   lemFir = 0000000100000080
[ 2179.250973720,3] PHB#0003[0:3]:             lemErrorMask = 0000000000000000
[ 2179.251018622,3] PHB#0003[0:3]:                   lemWOF = 0000000000000080
[ 2179.251069490,3] PHB#0003[0:3]:           phbErrorStatus = 0000028000000000
[ 2179.251117476,3] PHB#0003[0:3]:      phbFirstErrorStatus = 0000020000000000
[ 2179.251162218,3] PHB#0003[0:3]:             phbErrorLog0 = 2148000098000240
[ 2179.251206251,3] PHB#0003[0:3]:             phbErrorLog1 = a008400000000000
[ 2179.251253096,3] PHB#0003[0:3]:        phbTxeErrorStatus = 0000000000000000
[ 2179.265087656,3] PHB#0003[0:3]:   phbTxeFirstErrorStatus = 0000000000000000
[ 2179.265142247,3] PHB#0003[0:3]:          phbTxeErrorLog0 = 0000000000000000
[ 2179.265189734,3] PHB#0003[0:3]:          phbTxeErrorLog1 = 0000000000000000
[ 2179.266335296,3] PHB#0003[0:3]:     phbRxeArbErrorStatus = 0000000000000000
[ 2179.266380366,3] PHB#0003[0:3]: phbRxeArbFrstErrorStatus = 0000000000000000
[ 2179.266426523,3] PHB#0003[0:3]:       phbRxeArbErrorLog0 = 0000000000000000
[ 2179.267537283,3] PHB#0003[0:3]:       phbRxeArbErrorLog1 = 0000000000000000
[ 2179.267581708,3] PHB#0003[0:3]:     phbRxeMrgErrorStatus = 0000000000000000
[ 2179.267628324,3] PHB#0003[0:3]: phbRxeMrgFrstErrorStatus = 0000000000000000
[ 2179.268748771,3] PHB#0003[0:3]:       phbRxeMrgErrorLog0 = 0000000000000000
[ 2179.268794753,3] PHB#0003[0:3]:       phbRxeMrgErrorLog1 = 0000000000000000
[ 2179.268841144,3] PHB#0003[0:3]:     phbRxeTceErrorStatus = 6000000000000000
[ 2179.269945557,3] PHB#0003[0:3]: phbRxeTceFrstErrorStatus = 2000000000000000
[ 2179.269997896,3] PHB#0003[0:3]:       phbRxeTceErrorLog0 = c0000000000001fd
[ 2179.270094740,3] PHB#0003[0:3]:       phbRxeTceErrorLog1 = 0000000000000000
[ 2179.270144030,3] PHB#0003[0:3]:        phbPblErrorStatus = 0000000000020000
[ 2179.281523166,3] PHB#0003[0:3]:   phbPblFirstErrorStatus = 0000000000020000
[ 2179.281575378,3] PHB#0003[0:3]:          phbPblErrorLog0 = 0000000000000000
[ 2179.281627897,3] PHB#0003[0:3]:          phbPblErrorLog1 = 0000000000000000
[ 2179.282710177,3] PHB#0003[0:3]:      phbPcieDlpErrorLog1 = 0000000000000000
[ 2179.282761495,3] PHB#0003[0:3]:      phbPcieDlpErrorLog2 = 0000000000000000
[ 2179.282813999,3] PHB#0003[0:3]:    phbPcieDlpErrorStatus = 0000000000000000
[ 2179.283926192,3] PHB#0003[0:3]:       phbRegbErrorStatus = 0000004000000000
[ 2179.283978457,3] PHB#0003[0:3]:  phbRegbFirstErrorStatus = 0000004000000000
[ 2179.284033525,3] PHB#0003[0:3]:         phbRegbErrorLog0 = 8800005800000000
[ 2179.285123005,3] PHB#0003[0:3]:         phbRegbErrorLog1 = 0000000007011000
[ 2179.285178505,3] PHB#0003[0:3]:                PEST[1fd] = 8300b03800000000 8000000000000000
[ 2055.043233] EEH: Recovering PHB#3-PE#1fd
[ 2055.043274] EEH: PE location: UOPWR.BOS0019-Node0-Onboard SAS, PHB location: N/A
[ 2055.043423] aacraid 0003:01:00.0: aacraid: PCI error detected 2
[ 2055.099306] blk_update_request: I/O error, dev sda, sector 510949048 op 0x1:(WRITE) flags 0xc800 phys_seg 4 prio class 0

^ permalink raw reply

* Re: [PATCH 5/5] powerpc: use the generic dma_ops_bypass mode
From: Cédric Le Goater @ 2020-08-31  7:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linuxppc-dev, Daniel Borkmann, Alexey Kardashevskiy,
	Björn Töpel, Joerg Roedel, Jesper Dangaard Brouer,
	linux-kernel, iommu, Oliver O'Halloran, Greg Kroah-Hartman,
	aacraid, Robin Murphy, Lu Baolu
In-Reply-To: <20200831064038.GB27617@lst.de>

On 8/31/20 8:40 AM, Christoph Hellwig wrote:
> On Sun, Aug 30, 2020 at 11:04:21AM +0200, Cédric Le Goater wrote:
>> Hello,
>>
>> On 7/8/20 5:24 PM, Christoph Hellwig wrote:
>>> Use the DMA API bypass mechanism for direct window mappings.  This uses
>>> common code and speed up the direct mapping case by avoiding indirect
>>> calls just when not using dma ops at all.  It also fixes a problem where
>>> the sync_* methods were using the bypass check for DMA allocations, but
>>> those are part of the streaming ops.
>>>
>>> Note that this patch loses the DMA_ATTR_WEAK_ORDERING override, which
>>> has never been well defined, as is only used by a few drivers, which
>>> IIRC never showed up in the typical Cell blade setups that are affected
>>> by the ordering workaround.
>>>
>>> Fixes: efd176a04bef ("powerpc/pseries/dma: Allow SWIOTLB")
>>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>>> ---
>>>  arch/powerpc/Kconfig              |  1 +
>>>  arch/powerpc/include/asm/device.h |  5 --
>>>  arch/powerpc/kernel/dma-iommu.c   | 90 ++++---------------------------
>>>  3 files changed, 10 insertions(+), 86 deletions(-)
>>
>> I am seeing corruptions on a couple of POWER9 systems (boston) when
>> stressed with IO. stress-ng gives some results but I have first seen
>> it when compiling the kernel in a guest and this is still the best way
>> to raise the issue.
>>
>> These systems have of a SAS Adaptec controller :
>>
>>   0003:01:00.0 Serial Attached SCSI controller: Adaptec Series 8 12G SAS/PCIe 3 (rev 01)
>>
>> When the failure occurs, the POWERPC EEH interrupt fires and dumps
>> lowlevel PHB4 registers among which :
>> 					  
>>   [ 2179.251069490,3] PHB#0003[0:3]:           phbErrorStatus = 0000028000000000
>>   [ 2179.251117476,3] PHB#0003[0:3]:      phbFirstErrorStatus = 0000020000000000
>>
>> The bits raised identify a PPC 'TCE' error, which means it is related
>> to DMAs. See below for more details.
>>
>>
>> Reverting this patch "fixes" the issue but it is probably else where,
>> in some other layers or in the aacraid driver. How should I proceed 
>> to get more information ?
> 
> The aacraid DMA masks look like a mess.  Can you try the hack
> below and see it it helps?

No effect. The system crashes the same. But Alexey spotted some issue 
with swiotlb.

C. 

^ permalink raw reply

* Re: KVM on POWER8 host lock up since 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C")
From: Ruediger Oertel @ 2020-08-31  8:48 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev, Michal Suchánek
In-Reply-To: <1598835313.5688ngko4f.astroid@bobo.none>


[-- Attachment #1.1: Type: text/plain, Size: 3377 bytes --]

Am 31.08.20 um 03:14 schrieb Nicholas Piggin:
> Excerpts from Michal Suchánek's message of August 31, 2020 6:11 am:
>> Hello,
>>
>> on POWER8 KVM hosts lock up since commit 10d91611f426 ("powerpc/64s:
>> Reimplement book3s idle code in C").
>>
>> The symptom is host locking up completely after some hours of KVM
>> workload with messages like
>>
>> 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
>> 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 71
>> 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
>> 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 71
>> 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
>>
>> printed before the host locks up.
>>
>> The machines run sandboxed builds which is a mixed workload resulting in
>> IO/single core/mutiple core load over time and there are periods of no
>> activity and no VMS runnig as well. The VMs are shortlived so VM
>> setup/terdown is somewhat excercised as well.
>>
>> POWER9 with the new guest entry fast path does not seem to be affected.
>>
>> Reverted the patch and the followup idle fixes on top of 5.2.14 and
>> re-applied commit a3f3072db6ca ("powerpc/powernv/idle: Restore IAMR
>> after idle") which gives same idle code as 5.1.16 and the kernel seems
>> stable.
>>
>> Config is attached.
>>
>> I cannot easily revert this commit, especially if I want to use the same
>> kernel on POWER8 and POWER9 - many of the POWER9 fixes are applicable
>> only to the new idle code.
>>
>> Any idea what can be the problem?
> 
> So hwthread_state is never getting back to to HWTHREAD_IN_IDLE on
> those threads. I wonder what they are doing. POWER8 doesn't have a good
> NMI IPI and I don't know if it supports pdbg dumping registers from the
> BMC unfortunately. Do the messages always come in pairs of CPUs?
> 
> I'm not sure where to start with reproducing, I'll have to try. How many
> vCPUs in the guests? Do you have several guests running at once?

Hello all,

some details on the setup...
these machines are buildservice workers, (build.opensuse.org) and all they
do is spawn new VMs, run a package building job inside (rpmbuild, debbuild,...)

The machines are running in OPAL/PowerNV mode, with "ppc64_cpu --smt=off".
The number of VMs varies across the machines:
obs-power8-01: 18 instances, "-smp 16,threads=8"
obs-power8-02: 20 instances, "-smp 8,threads=8"
obs-power8-03: 30 instances, "-smp 8,threads=8"
obs-power8-04: 20 instances, "-smp 8,threads=8"
obs-power8-05: 36 instances, "-smp 4,threads=2" (this one with "ppc64_cpu --subcores-per-core=4"

but anyway the stalls can be seen on all of them, sometimes after 4 hours
sometimes just after about a day. The 01 with more cpu overcommit seems
a little faster reproducing the problem, but that's more gut feeling than
anything backed by real numbers.


-- 
with kind regards (mit freundlichem Grinsen),
  Ruediger Oertel (ro@suse.com,ro@suse.de,bugfinder@t-online.de)
--------Do-Not-Accept-Binary-Blobs.----Ever.----From-Anyone.------------
Key fingerprint   =   17DC 6553 86A7 384B 53C5  CA5C 3CE4 F2E7 23F2 B417
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg,
  Germany, (HRB 36809, AG Nürnberg), Geschäftsführer: Felix Imendörffer


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply

* Re: KVM on POWER8 host lock up since 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C")
From: Ruediger Oertel @ 2020-08-31  9:03 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev, Michal Suchánek
In-Reply-To: <1598835313.5688ngko4f.astroid@bobo.none>


[-- Attachment #1.1: Type: text/plain, Size: 3794 bytes --]

Am 31.08.20 um 03:14 schrieb Nicholas Piggin:

> So hwthread_state is never getting back to to HWTHREAD_IN_IDLE on
> those threads. I wonder what they are doing. POWER8 doesn't have a good
> NMI IPI and I don't know if it supports pdbg dumping registers from the
> BMC unfortunately.


> Do the messages always come in pairs of CPUs?

well,
- one problem is that at some point the machine just locks up completely,
  so I can not tell if there were lines not printed any more and in some
  cases all I get is a single line
- looking at the stats in generally it's either one cpu printed several
  times or a pair ("not strictly") alternatingly

2020-07-30T03:16:16+00:00 obs-power8-03 kernel: [51284.029821] KVM: couldn't grab cpu 92
2020-07-30T03:16:16+00:00 obs-power8-03 kernel: [51284.058630] KVM: couldn't grab cpu 92
2020-07-30T03:16:16+00:00 obs-power8-03 kernel: [51284.108268] KVM: couldn't grab cpu 92
2020-07-30T03:16:16+00:00 obs-power8-03 kernel: [51284.210206] KVM: couldn't grab cpu 92
2020-07-30T03:16:16+00:00 obs-power8-03 kernel: [51284.323465] KVM: couldn't grab cpu 92
2020-07-30T03:16:16+00:00 obs-power8-03 kernel: [51284.334420] KVM: couldn't grab cpu 92
2020-07-30T03:16:16+00:00 obs-power8-03 kernel: [51284.345470] KVM: couldn't grab cpu 92
2020-07-30T03:16:16+00:00 obs-power8-03 kernel: [51284.395185] KVM: couldn't grab cpu 92
2020-07-30T03:16:16+00:00 obs-power8-03 kernel: [51284.517182] KVM: couldn't grab cpu 92
2020-07-30T03:16:17+00:00 obs-power8-03 kernel: [51284.600716] KVM: couldn't grab cpu 92
2020-07-30T03:16:18+00:00 obs-power8-03 kernel: [51286.201589] KVM: couldn't grab cpu 92
2020-07-30T03:16:19+00:00 obs-power8-03 kernel: [51286.627273] KVM: couldn't grab cpu 92

2020-07-30T16:44:16+00:00 obs-power8-04 kernel: [30099.726288] KVM: couldn't grab cpu 61
2020-07-30T16:44:16+00:00 obs-power8-04 kernel: [30099.736843] KVM: couldn't grab cpu 125
2020-07-30T16:44:17+00:00 obs-power8-04 kernel: [30099.747429] KVM: couldn't grab cpu 125
2020-07-30T16:44:17+00:00 obs-power8-04 kernel: [30099.877138] KVM: couldn't grab cpu 61
2020-07-30T16:44:17+00:00 obs-power8-04 kernel: [30099.916422] KVM: couldn't grab cpu 125
2020-07-30T16:44:17+00:00 obs-power8-04 kernel: [30099.931755] KVM: couldn't grab cpu 61
2020-07-30T16:44:17+00:00 obs-power8-04 kernel: [30100.029003] KVM: couldn't grab cpu 61
2020-07-30T16:44:17+00:00 obs-power8-04 kernel: [30100.334895] KVM: couldn't grab cpu 125
2020-07-30T16:44:17+00:00 obs-power8-04 kernel: [30100.392713] KVM: couldn't grab cpu 61
2020-07-30T16:44:17+00:00 obs-power8-04 kernel: [30100.569011] KVM: couldn't grab cpu 125
2020-07-30T16:44:17+00:00 obs-power8-04 kernel: [30100.617048] KVM: couldn't grab cpu 125
2020-07-30T16:44:17+00:00 obs-power8-04 kernel: [30100.628107] KVM: couldn't grab cpu 125
2020-07-30T16:44:18+00:00 obs-power8-04 kernel: [30100.809046] KVM: couldn't grab cpu 125
2020-07-30T16:44:18+00:00 obs-power8-04 kernel: [30101.001097] KVM: couldn't grab cpu 61
2020-07-30T16:44:19+00:00 obs-power8-04 kernel: [30102.109007] KVM: couldn't grab cpu 125
2020-07-30T16:44:19+00:00 obs-power8-04 kernel: [30102.254470] KVM: couldn't grab cpu 61



> 
> I'm not sure where to start with reproducing, I'll have to try. How many
> vCPUs in the guests? Do you have several guests running at once?
> 
> Thanks,
> Nick
> 


-- 
with kind regards (mit freundlichem Grinsen),
  Ruediger Oertel (ro@suse.com,ro@suse.de,bugfinder@t-online.de)
--------Do-Not-Accept-Binary-Blobs.----Ever.----From-Anyone.------------
Key fingerprint   =   17DC 6553 86A7 384B 53C5  CA5C 3CE4 F2E7 23F2 B417
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg,
  Germany, (HRB 36809, AG Nürnberg), Geschäftsführer: Felix Imendörffer


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply

* Re: KVM on POWER8 host lock up since 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C")
From: Michael Ellerman @ 2020-08-31 10:50 UTC (permalink / raw)
  To: Michal Suchánek, Nicholas Piggin; +Cc: linuxppc-dev, ro, Hari Bathini
In-Reply-To: <20200831091523.GC29521@kitsune.suse.cz>

Michal Suchánek <msuchanek@suse.de> writes:
> On Mon, Aug 31, 2020 at 11:14:18AM +1000, Nicholas Piggin wrote:
>> Excerpts from Michal Suchánek's message of August 31, 2020 6:11 am:
>> > Hello,
>> > 
>> > on POWER8 KVM hosts lock up since commit 10d91611f426 ("powerpc/64s:
>> > Reimplement book3s idle code in C").
>> > 
>> > The symptom is host locking up completely after some hours of KVM
>> > workload with messages like
>> > 
>> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
>> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 71
>> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
>> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 71
>> > 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
>> > 
>> > printed before the host locks up.
>> > 
>> > The machines run sandboxed builds which is a mixed workload resulting in
>> > IO/single core/mutiple core load over time and there are periods of no
>> > activity and no VMS runnig as well. The VMs are shortlived so VM
>> > setup/terdown is somewhat excercised as well.
>> > 
>> > POWER9 with the new guest entry fast path does not seem to be affected.
>> > 
>> > Reverted the patch and the followup idle fixes on top of 5.2.14 and
>> > re-applied commit a3f3072db6ca ("powerpc/powernv/idle: Restore IAMR
>> > after idle") which gives same idle code as 5.1.16 and the kernel seems
>> > stable.
>> > 
>> > Config is attached.
>> > 
>> > I cannot easily revert this commit, especially if I want to use the same
>> > kernel on POWER8 and POWER9 - many of the POWER9 fixes are applicable
>> > only to the new idle code.
>> > 
>> > Any idea what can be the problem?
>> 
>> So hwthread_state is never getting back to to HWTHREAD_IN_IDLE on
>> those threads. I wonder what they are doing. POWER8 doesn't have a good
>> NMI IPI and I don't know if it supports pdbg dumping registers from the
>> BMC unfortunately.
>
> It may be possible to set up fadump with a later kernel version that
> supports it on powernv and dump the whole kernel.

Your firmware won't support it AFAIK.

You could try kdump, but if we have CPUs stuck in KVM then there's a
good chance it won't work :/

cheers

^ permalink raw reply

* Re: fsl_espi errors on v5.7.15
From: Heiner Kallweit @ 2020-08-31 12:33 UTC (permalink / raw)
  To: Chris Packham, Nicholas Piggin, benh@kernel.crashing.org,
	broonie@kernel.org, mpe@ellerman.id.au, paulus@samba.org
  Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org,
	linux-spi@vger.kernel.org
In-Reply-To: <6a9eb498-2982-05de-52f9-da5f6a626e49@alliedtelesis.co.nz>

On 30.08.2020 23:59, Chris Packham wrote:
> 
> On 31/08/20 9:41 am, Heiner Kallweit wrote:
>> On 30.08.2020 23:00, Chris Packham wrote:
>>> On 31/08/20 12:30 am, Nicholas Piggin wrote:
>>>> Excerpts from Chris Packham's message of August 28, 2020 8:07 am:
>>> <snip>
>>>
>>>>>>>>> I've also now seen the RX FIFO not empty error on the T2080RDB
>>>>>>>>>
>>>>>>>>> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
>>>>>>>>> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
>>>>>>>>> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
>>>>>>>>> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
>>>>>>>>> fsl_espi ffe110000.spi: Transfer done but rx/tx fifo's aren't empty!
>>>>>>>>> fsl_espi ffe110000.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32
>>>>>>>>>
>>>>>>>>> With my current workaround of emptying the RX FIFO. It seems
>>>>>>>>> survivable. Interestingly it only ever seems to be 1 extra byte in the
>>>>>>>>> RX FIFO and it seems to be after either a READ_SR or a READ_FSR.
>>>>>>>>>
>>>>>>>>> fsl_espi ffe110000.spi: tx 70
>>>>>>>>> fsl_espi ffe110000.spi: rx 03
>>>>>>>>> fsl_espi ffe110000.spi: Extra RX 00
>>>>>>>>> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
>>>>>>>>> fsl_espi ffe110000.spi: Transfer done but rx/tx fifo's aren't empty!
>>>>>>>>> fsl_espi ffe110000.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32
>>>>>>>>> fsl_espi ffe110000.spi: tx 05
>>>>>>>>> fsl_espi ffe110000.spi: rx 00
>>>>>>>>> fsl_espi ffe110000.spi: Extra RX 03
>>>>>>>>> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
>>>>>>>>> fsl_espi ffe110000.spi: Transfer done but rx/tx fifo's aren't empty!
>>>>>>>>> fsl_espi ffe110000.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32
>>>>>>>>> fsl_espi ffe110000.spi: tx 05
>>>>>>>>> fsl_espi ffe110000.spi: rx 00
>>>>>>>>> fsl_espi ffe110000.spi: Extra RX 03
>>>>>>>>>
>>>>>>>>>    From all the Micron SPI-NOR datasheets I've got access to it is
>>>>>>>>> possible to continually read the SR/FSR. But I've no idea why it
>>>>>>>>> happens some times and not others.
>>>>>>>> So I think I've got a reproduction and I think I've bisected the problem
>>>>>>>> to commit 3282a3da25bd ("powerpc/64: Implement soft interrupt replay in
>>>>>>>> C"). My day is just finishing now so I haven't applied too much scrutiny
>>>>>>>> to this result. Given the various rabbit holes I've been down on this
>>>>>>>> issue already I'd take this information with a good degree of skepticism.
>>>>>>>>
>>>>>>> OK, so an easy test should be to re-test with a 5.4 kernel.
>>>>>>> It doesn't have yet the change you're referring to, and the fsl-espi driver
>>>>>>> is basically the same as in 5.7 (just two small changes in 5.7).
>>>>>> There's 6cc0c16d82f88 and maybe also other interrupt related patches
>>>>>> around this time that could affect book E, so it's good if that exact
>>>>>> patch is confirmed.
>>>>> My confirmation is basically that I can induce the issue in a 5.4 kernel
>>>>> by cherry-picking 3282a3da25bd. I'm also able to "fix" the issue in
>>>>> 5.9-rc2 by reverting that one commit.
>>>>>
>>>>> I both cases it's not exactly a clean cherry-pick/revert so I also
>>>>> confirmed the bisection result by building at 3282a3da25bd (which sees
>>>>> the issue) and the commit just before (which does not).
>>>> Thanks for testing, that confirms it well.
>>>>
>>>> [snip patch]
>>>>
>>>>> I still saw the issue with this change applied. PPC_IRQ_SOFT_MASK_DEBUG
>>>>> didn't report anything (either with or without the change above).
>>>> Okay, it was a bit of a shot in the dark. I still can't see what
>>>> else has changed.
>>>>
>>>> What would cause this, a lost interrupt? A spurious interrupt? Or
>>>> higher interrupt latency?
>>>>
>>>> I don't think the patch should cause significantly worse latency,
>>>> (it's supposed to be a bit better if anything because it doesn't set
>>>> up the full interrupt frame). But it's possible.
>>> My working theory is that the SPI_DON indication is all about the TX
>>> direction an now that the interrupts are faster we're hitting an error
>>> because there is still RX activity going on. Heiner disagrees with my
>>> interpretation of the SPI_DON indication and the fact that it doesn't
>>> happen every time does throw doubt on it.
>>>
>> It's right that the eSPI spec can be interpreted that SPI_DON refers to
>> TX only. However this wouldn't really make sense, because also for RX
>> we program the frame length, and therefore want to be notified once the
>> full frame was received. Also practical experience shows that SPI_DON
>> is set also after RX-only transfers.
>> Typical SPI NOR use case is that you write read command + start address,
>> followed by a longer read. If the TX-only interpretation would be right,
>> we'd always end up with SPI_DON not being set.
>>
>>> I can't really explain the extra RX byte in the fifo. We know how many
>>> bytes to expect and we pull that many from the fifo so it's not as if
>>> we're missing an interrupt causing us to skip the last byte. I've been
>>> looking for some kind of off-by-one calculation but again if it were
>>> something like that it'd happen all the time.
>>>
>> Maybe it helps to know what value this extra byte in the FIFO has. Is it:
>> - a duplicate of the last read byte
>> - or the next byte (at <end address> + 1)
>> - or a fixed value, e.g. always 0x00 or 0xff
> 
> The values were up thread a bit but I'll repeat them here
> 
> fsl_espi ffe110000.spi: tx 70
> fsl_espi ffe110000.spi: rx 03
> fsl_espi ffe110000.spi: Extra RX 00
> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
> fsl_espi ffe110000.spi: Transfer done but rx/tx fifo's aren't empty!
> fsl_espi ffe110000.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32
> fsl_espi ffe110000.spi: tx 05
> fsl_espi ffe110000.spi: rx 00
> fsl_espi ffe110000.spi: Extra RX 03
> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
> fsl_espi ffe110000.spi: Transfer done but rx/tx fifo's aren't empty!
> fsl_espi ffe110000.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32
> fsl_espi ffe110000.spi: tx 05
> fsl_espi ffe110000.spi: rx 00
> fsl_espi ffe110000.spi: Extra RX 03
> 
> 
> The rx 00 Extra RX 03 is a bit concerning. I've only ever seen them with
> either a READ_SR or a READ_FSR. Never a data read.
> 
Just remembered something about SPIE_DON:
Transfers are always full duplex, therefore in case of a read the chip
sends dummy zero's. Having said that in case of a read SPIE_DON means
that the last dummy zero was shifted out.

READ_SR and READ_FSR are the shortest transfers, 1 byte out and 1 byte in.
So the issue may have a dependency on the length of the transfer.
However I see no good explanation so far. You can try adding a delay of
a few miroseconds between the following to commands in fsl_espi_bufs().

	fsl_espi_write_reg(espi, ESPI_SPIM, mask);

	/* Prevent filling the fifo from getting interrupted */
	spin_lock_irq(&espi->lock);

Maybe enabling interrupts and seeing the SPIE_DON interrupt are too close.

^ permalink raw reply

* Re: KVM on POWER8 host lock up since 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C")
From: Michael Ellerman @ 2020-08-31 12:58 UTC (permalink / raw)
  To: Ruediger Oertel, Nicholas Piggin, linuxppc-dev,
	Michal Suchánek
In-Reply-To: <50693d7e-6c8b-4908-d003-25b99a867850@suse.de>

Ruediger Oertel <ro@suse.de> writes:
> Am 31.08.20 um 03:14 schrieb Nicholas Piggin:
>> Excerpts from Michal Suchánek's message of August 31, 2020 6:11 am:
>>> Hello,
>>>
>>> on POWER8 KVM hosts lock up since commit 10d91611f426 ("powerpc/64s:
>>> Reimplement book3s idle code in C").
>>>
>>> The symptom is host locking up completely after some hours of KVM
>>> workload with messages like
>>>
>>> 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
>>> 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 71
>>> 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
>>> 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 71
>>> 2020-08-30T10:51:31+00:00 obs-power8-01 kernel: KVM: couldn't grab cpu 47
>>>
>>> printed before the host locks up.
>>>
>>> The machines run sandboxed builds which is a mixed workload resulting in
>>> IO/single core/mutiple core load over time and there are periods of no
>>> activity and no VMS runnig as well. The VMs are shortlived so VM
>>> setup/terdown is somewhat excercised as well.
>>>
>>> POWER9 with the new guest entry fast path does not seem to be affected.
>>>
>>> Reverted the patch and the followup idle fixes on top of 5.2.14 and
>>> re-applied commit a3f3072db6ca ("powerpc/powernv/idle: Restore IAMR
>>> after idle") which gives same idle code as 5.1.16 and the kernel seems
>>> stable.
>>>
>>> Config is attached.
>>>
>>> I cannot easily revert this commit, especially if I want to use the same
>>> kernel on POWER8 and POWER9 - many of the POWER9 fixes are applicable
>>> only to the new idle code.
>>>
>>> Any idea what can be the problem?
>> 
>> So hwthread_state is never getting back to to HWTHREAD_IN_IDLE on
>> those threads. I wonder what they are doing. POWER8 doesn't have a good
>> NMI IPI and I don't know if it supports pdbg dumping registers from the
>> BMC unfortunately. Do the messages always come in pairs of CPUs?
>> 
>> I'm not sure where to start with reproducing, I'll have to try. How many
>> vCPUs in the guests? Do you have several guests running at once?
>
> Hello all,
>
> some details on the setup...
> these machines are buildservice workers, (build.opensuse.org) and all they
> do is spawn new VMs, run a package building job inside (rpmbuild, debbuild,...)
>
> The machines are running in OPAL/PowerNV mode, with "ppc64_cpu --smt=off".
> The number of VMs varies across the machines:
> obs-power8-01: 18 instances, "-smp 16,threads=8"
> obs-power8-02: 20 instances, "-smp 8,threads=8"
> obs-power8-03: 30 instances, "-smp 8,threads=8"
> obs-power8-04: 20 instances, "-smp 8,threads=8"

Can you send us the output of:

# grep . /sys/module/kvm_hv/parameters/*

cheers


^ permalink raw reply

* Re: KVM on POWER8 host lock up since 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C")
From: Ruediger Oertel @ 2020-08-31 13:13 UTC (permalink / raw)
  To: Michael Ellerman, Nicholas Piggin, linuxppc-dev,
	Michal Suchánek
In-Reply-To: <87v9gz0y5y.fsf@mpe.ellerman.id.au>


[-- Attachment #1.1: Type: text/plain, Size: 1312 bytes --]

Am 31.08.20 um 14:58 schrieb Michael Ellerman:
[...]
>> The machines are running in OPAL/PowerNV mode, with "ppc64_cpu --smt=off".
>> The number of VMs varies across the machines:
>> obs-power8-01: 18 instances, "-smp 16,threads=8"
>> obs-power8-02: 20 instances, "-smp 8,threads=8"
>> obs-power8-03: 30 instances, "-smp 8,threads=8"
>> obs-power8-04: 20 instances, "-smp 8,threads=8"
> 
> Can you send us the output of:
> 
> # grep . /sys/module/kvm_hv/parameters/*

of course, the current values are:

/sys/module/kvm_hv/parameters/dynamic_mt_modes:6
/sys/module/kvm_hv/parameters/h_ipi_redirect:1
/sys/module/kvm_hv/parameters/indep_threads_mode:Y
/sys/module/kvm_hv/parameters/kvm_irq_bypass:1
/sys/module/kvm_hv/parameters/nested:Y
/sys/module/kvm_hv/parameters/one_vm_per_core:N
/sys/module/kvm_hv/parameters/target_smt_mode:0

(actually identical on all 5 above)

-- 
with kind regards (mit freundlichem Grinsen),
  Ruediger Oertel (ro@suse.com,ro@suse.de,bugfinder@t-online.de)
--------Do-Not-Accept-Binary-Blobs.----Ever.----From-Anyone.------------
Key fingerprint   =   17DC 6553 86A7 384B 53C5  CA5C 3CE4 F2E7 23F2 B417
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg,
  Germany, (HRB 36809, AG Nürnberg), Geschäftsführer: Felix Imendörffer


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply

* [Bug 205183] PPC64: Signal delivery fails with SIGSEGV if between about 1KB and 4KB bytes of stack remain
From: bugzilla-daemon @ 2020-08-31 13:16 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-205183-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=205183

Michael Ellerman (michael@ellerman.id.au) changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|RESOLVED                    |CLOSED

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* RE: [PATCH] fsldma: fsl_ioread64*() do not need lower_32_bits()
From: Leo Li @ 2020-08-31 14:25 UTC (permalink / raw)
  To: Linus Torvalds, Guenter Roeck
  Cc: Herbert Xu, Joerg Roedel, Linux Kernel Mailing List, Zhang Wei,
	Vinod Koul, dma, Andrew Morton, linuxppc-dev, Dan Williams,
	Luc Van Oostenryck
In-Reply-To: <CAHk-=wjDEiWF_DsCVFPFqNa+JCS5SkOygbqeq8_=ZNOrFt7-rg@mail.gmail.com>



> -----Original Message-----
> From: Linus Torvalds <torvalds@linux-foundation.org>
> Sent: Saturday, August 29, 2020 4:20 PM
> To: Guenter Roeck <linux@roeck-us.net>
> Cc: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>; Herbert Xu
> <herbert@gondor.apana.org.au>; Andrew Morton <akpm@linux-
> foundation.org>; Joerg Roedel <joerg.roedel@amd.com>; Leo Li
> <leoyang.li@nxp.com>; Zhang Wei <zw@zh-kernel.org>; Dan Williams
> <dan.j.williams@intel.com>; Vinod Koul <vkoul@kernel.org>; linuxppc-dev
> <linuxppc-dev@lists.ozlabs.org>; dma <dmaengine@vger.kernel.org>; Linux
> Kernel Mailing List <linux-kernel@vger.kernel.org>
> Subject: Re: [PATCH] fsldma: fsl_ioread64*() do not need lower_32_bits()
> 
> On Sat, Aug 29, 2020 at 1:40 PM Guenter Roeck <linux@roeck-us.net> wrote:
> >
> > Except for
> >
> > CHECK: spaces preferred around that '+' (ctx:VxV)
> > #29: FILE: drivers/dma/fsldma.h:223:
> > +       u32 val_lo = in_be32((u32 __iomem *)addr+1);
> 
> Added spaces.
> 
> > I don't see anything wrong with it either, so
> >
> > Reviewed-by: Guenter Roeck <linux@roeck-us.net>
> >
> > Since I didn't see the real problem with the original code, I'd take
> > that with a grain of salt, though.
> 
> Well, honestly, the old code was so confused that just making it build is
> clearly already an improvement even if everything else were to be wrong.
> 
> So I committed my "fix". If it turns out there's more wrong in there and
> somebody tests it, we can fix it again. But now it hopefully compiles, at least.
> 
> My bet is that if that driver ever worked on ppc32, it will continue to work
> whatever we do to that function.
> 
> I _think_ the old code happened to - completely by mistake - get the value
> right for the case of "little endian access, with dma_addr_t being 32-bit".
> Because then it would still read the upper bits wrong, but the cast to
> dma_addr_t would then throw those bits away. And the lower bits would be
> right.
> 
> But for big-endian accesses or for ARCH_DMA_ADDR_T_64BIT it really looks
> like it always returned a completely incorrect value.
> 
> And again - the driver may have worked even with that completely incorrect
> value, since the use of it seems to be very incidental.
> 
> In either case ("it didn't work before" or "it worked because the value
> doesn't really matter"), I don't think I could possibly have made things worse.
> 
> Famous last words.

Thanks for the patch.  

Acked-by: Li Yang <leoyang.li@nxp.com>

We are having periodical auto regression tests covering ppc32 platforms.  But looks like it missed this issue.  I will ask the test team to investigate on why the test cases are not sufficient.

Regards,
Leo

^ permalink raw reply

* [PATCH AUTOSEL 5.8 42/42] fsldma: fix very broken 32-bit ppc ioread64 functionality
From: Sasha Levin @ 2020-08-31 15:29 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, linuxppc-dev, Linus Torvalds, dmaengine,
	Guenter Roeck
In-Reply-To: <20200831152934.1023912-1-sashal@kernel.org>

From: Linus Torvalds <torvalds@linux-foundation.org>

[ Upstream commit 0a4c56c80f90797e9b9f8426c6aae4c0cf1c9785 ]

Commit ef91bb196b0d ("kernel.h: Silence sparse warning in
lower_32_bits") caused new warnings to show in the fsldma driver, but
that commit was not to blame: it only exposed some very incorrect code
that tried to take the low 32 bits of an address.

That made no sense for multiple reasons, the most notable one being that
that code was intentionally limited to only 32-bit ppc builds, so "only
low 32 bits of an address" was completely nonsensical.  There were no
high bits to mask off to begin with.

But even more importantly fropm a correctness standpoint, turning the
address into an integer then caused the subsequent address arithmetic to
be completely wrong too, and the "+1" actually incremented the address
by one, rather than by four.

Which again was incorrect, since the code was reading two 32-bit values
and trying to make a 64-bit end result of it all.  Surprisingly, the
iowrite64() did not suffer from the same odd and incorrect model.

This code has never worked, but it's questionable whether anybody cared:
of the two users that actually read the 64-bit value (by way of some C
preprocessor hackery and eventually the 'get_cdar()' inline function),
one of them explicitly ignored the value, and the other one might just
happen to work despite the incorrect value being read.

This patch at least makes it not fail the build any more, and makes the
logic superficially sane.  Whether it makes any difference to the code
_working_ or not shall remain a mystery.

Compile-tested-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/dma/fsldma.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
index 56f18ae992332..308bed0a560ac 100644
--- a/drivers/dma/fsldma.h
+++ b/drivers/dma/fsldma.h
@@ -205,10 +205,10 @@ struct fsldma_chan {
 #else
 static u64 fsl_ioread64(const u64 __iomem *addr)
 {
-	u32 fsl_addr = lower_32_bits(addr);
-	u64 fsl_addr_hi = (u64)in_le32((u32 *)(fsl_addr + 1)) << 32;
+	u32 val_lo = in_le32((u32 __iomem *)addr);
+	u32 val_hi = in_le32((u32 __iomem *)addr + 1);
 
-	return fsl_addr_hi | in_le32((u32 *)fsl_addr);
+	return ((u64)val_hi << 32) + val_lo;
 }
 
 static void fsl_iowrite64(u64 val, u64 __iomem *addr)
@@ -219,10 +219,10 @@ static void fsl_iowrite64(u64 val, u64 __iomem *addr)
 
 static u64 fsl_ioread64be(const u64 __iomem *addr)
 {
-	u32 fsl_addr = lower_32_bits(addr);
-	u64 fsl_addr_hi = (u64)in_be32((u32 *)fsl_addr) << 32;
+	u32 val_hi = in_be32((u32 __iomem *)addr);
+	u32 val_lo = in_be32((u32 __iomem *)addr + 1);
 
-	return fsl_addr_hi | in_be32((u32 *)(fsl_addr + 1));
+	return ((u64)val_hi << 32) + val_lo;
 }
 
 static void fsl_iowrite64be(u64 val, u64 __iomem *addr)
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.4 23/23] fsldma: fix very broken 32-bit ppc ioread64 functionality
From: Sasha Levin @ 2020-08-31 15:30 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, linuxppc-dev, Linus Torvalds, dmaengine,
	Guenter Roeck
In-Reply-To: <20200831153039.1024302-1-sashal@kernel.org>

From: Linus Torvalds <torvalds@linux-foundation.org>

[ Upstream commit 0a4c56c80f90797e9b9f8426c6aae4c0cf1c9785 ]

Commit ef91bb196b0d ("kernel.h: Silence sparse warning in
lower_32_bits") caused new warnings to show in the fsldma driver, but
that commit was not to blame: it only exposed some very incorrect code
that tried to take the low 32 bits of an address.

That made no sense for multiple reasons, the most notable one being that
that code was intentionally limited to only 32-bit ppc builds, so "only
low 32 bits of an address" was completely nonsensical.  There were no
high bits to mask off to begin with.

But even more importantly fropm a correctness standpoint, turning the
address into an integer then caused the subsequent address arithmetic to
be completely wrong too, and the "+1" actually incremented the address
by one, rather than by four.

Which again was incorrect, since the code was reading two 32-bit values
and trying to make a 64-bit end result of it all.  Surprisingly, the
iowrite64() did not suffer from the same odd and incorrect model.

This code has never worked, but it's questionable whether anybody cared:
of the two users that actually read the 64-bit value (by way of some C
preprocessor hackery and eventually the 'get_cdar()' inline function),
one of them explicitly ignored the value, and the other one might just
happen to work despite the incorrect value being read.

This patch at least makes it not fail the build any more, and makes the
logic superficially sane.  Whether it makes any difference to the code
_working_ or not shall remain a mystery.

Compile-tested-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/dma/fsldma.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
index 56f18ae992332..308bed0a560ac 100644
--- a/drivers/dma/fsldma.h
+++ b/drivers/dma/fsldma.h
@@ -205,10 +205,10 @@ struct fsldma_chan {
 #else
 static u64 fsl_ioread64(const u64 __iomem *addr)
 {
-	u32 fsl_addr = lower_32_bits(addr);
-	u64 fsl_addr_hi = (u64)in_le32((u32 *)(fsl_addr + 1)) << 32;
+	u32 val_lo = in_le32((u32 __iomem *)addr);
+	u32 val_hi = in_le32((u32 __iomem *)addr + 1);
 
-	return fsl_addr_hi | in_le32((u32 *)fsl_addr);
+	return ((u64)val_hi << 32) + val_lo;
 }
 
 static void fsl_iowrite64(u64 val, u64 __iomem *addr)
@@ -219,10 +219,10 @@ static void fsl_iowrite64(u64 val, u64 __iomem *addr)
 
 static u64 fsl_ioread64be(const u64 __iomem *addr)
 {
-	u32 fsl_addr = lower_32_bits(addr);
-	u64 fsl_addr_hi = (u64)in_be32((u32 *)fsl_addr) << 32;
+	u32 val_hi = in_be32((u32 __iomem *)addr);
+	u32 val_lo = in_be32((u32 __iomem *)addr + 1);
 
-	return fsl_addr_hi | in_be32((u32 *)(fsl_addr + 1));
+	return ((u64)val_hi << 32) + val_lo;
 }
 
 static void fsl_iowrite64be(u64 val, u64 __iomem *addr)
-- 
2.25.1


^ permalink raw reply related

* [PATCH] scsi: ibmvfc: interface updates for future FPIN and MQ support
From: Tyrel Datwyler @ 2020-08-31 17:18 UTC (permalink / raw)
  To: james.bottomley
  Cc: Tyrel Datwyler, martin.petersen, linux-scsi, linux-kernel, brking,
	linuxppc-dev

VIOS partitions with SLI-4 enabled Emulex adapters will be capable of
driving IO in parallel through mulitple work queues or channels, and
with new hyperviosr firmware that supports multiple interrupt sources
an ibmvfc NPIV single initiator can be modified to exploit end to end
channelization in a PowerVM environment.

VIOS hosts will also be able to expose fabric perfromance impact
notifications (FPIN) via a new asynchronous event to ibmvfc clients that
advertise support via IBMVFC_CAN_HANDLE_FPIN in their capabilities flag
during NPIV_LOGIN.

This patch introduces three new Managment Datagrams (MADs) for
channelization support negotiation as well as the FPIN asynchronous
event and FPIN status flags. Follow up work is required to plumb the
ibmvfc client driver to use these new interfaces.

Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.h | 66 +++++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index 907889f1fa9d..801681b63daa 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -124,6 +124,9 @@ enum ibmvfc_mad_types {
 	IBMVFC_PASSTHRU		= 0x0200,
 	IBMVFC_TMF_MAD		= 0x0100,
 	IBMVFC_NPIV_LOGOUT	= 0x0800,
+	IBMVFC_CHANNEL_ENQUIRY	= 0x1000,
+	IBMVFC_CHANNEL_SETUP	= 0x2000,
+	IBMVFC_CONNECTION_INFO	= 0x4000,
 };
 
 struct ibmvfc_mad_common {
@@ -162,6 +165,8 @@ struct ibmvfc_npiv_login {
 	__be32 max_cmds;
 	__be64 capabilities;
 #define IBMVFC_CAN_MIGRATE		0x01
+#define IBMVFC_CAN_USE_CHANNELS		0x02
+#define IBMVFC_CAN_HANDLE_FPIN		0x04
 	__be64 node_name;
 	struct srp_direct_buf async;
 	u8 partition_name[IBMVFC_MAX_NAME];
@@ -204,6 +209,7 @@ struct ibmvfc_npiv_login_resp {
 	__be64 capabilities;
 #define IBMVFC_CAN_FLUSH_ON_HALT	0x08
 #define IBMVFC_CAN_SUPPRESS_ABTS	0x10
+#define IBMVFC_CAN_SUPPORT_CHANNELS	0x20
 	__be32 max_cmds;
 	__be32 scsi_id_sz;
 	__be64 max_dma_len;
@@ -482,6 +488,52 @@ struct ibmvfc_passthru_mad {
 	struct ibmvfc_passthru_fc_iu fc_iu;
 }__attribute__((packed, aligned (8)));
 
+struct ibmvfc_channel_enquiry {
+	struct ibmvfc_mad_common common;
+	__be32 flags;
+#define IBMVFC_NO_CHANNELS_TO_CRQ_SUPPORT	0x01
+#define IBMVFC_SUPPORT_VARIABLE_SUBQ_MSG	0x02
+#define IBMVFC_NO_N_TO_M_CHANNELS_SUPPORT	0x04
+	__be32 num_scsi_subq_channels;
+	__be32 num_nvmeof_subq_channels;
+	__be32 num_scsi_vas_channels;
+	__be32 num nvmeof_vas_channels;
+}__attribute__((packed, aligned (8)));
+
+struct ibmvfc_channel_setup_mad {
+	struct ibmvfc_mad_common common;
+	struct srp_direct_buf buffer;
+}__attrribute__((packed, aligned (8)));
+
+#define IBMVFC_MAX_CHANNELS	502
+
+struct ibmvfc_channel_setup {
+	__be32 flags;
+#define IBMVFC_CANCEL_CHANNELS		0x01
+#define IBMVFC_USE_BUFFER		0x02
+#define IBMVFC_CHANNELS_CANCELED	0x04
+	__be32 reserved;
+	__be32 num_scsi_subq_channels;
+	__be32 num_nvmeof_subq_channels;
+	__be32 num_scsi_vas_channels;
+	__be32 num_nvmeof_vas_channels;
+	struct srp_direct_buf buffer;
+	__be64 reserved2[5];
+	__be64 channel_handles[IBMVFC_MAX_CHANNELS];
+}__attribute__((packed, aligned (8)));
+
+struct ibmvfc_connection_info {
+	struct ibmvfc_madd_common common;
+	__be64 information_bits;
+#define IBMVFC_NO_FC_IO_CHANNEL		0x01
+#define IBMVFC_NO_PHYP_VAS		0x02
+#define IBMVFC_NO_PHYP_SUBQ		0x04
+#define IBMVFC_PHYP_DEPRECATED_SUBQ	0x08
+#define IBMVFC_PHYP_PRESERVED_SUBQ	0x10
+#define IBMVFC_PHYP_FULL_SUBQ		0x20
+	__be64 reserved[16];
+}__attribute__((packed, aligned (8)));
+
 struct ibmvfc_trace_start_entry {
 	u32 xfer_len;
 }__attribute__((packed));
@@ -532,6 +584,7 @@ enum ibmvfc_async_event {
 	IBMVFC_AE_HALT			= 0x0400,
 	IBMVFC_AE_RESUME			= 0x0800,
 	IBMVFC_AE_ADAPTER_FAILED	= 0x1000,
+	IBMVFC_AE_FPIN			= 0x2000,
 };
 
 struct ibmvfc_async_desc {
@@ -560,10 +613,18 @@ enum ibmvfc_ae_link_state {
 	IBMVFC_AE_LS_LINK_DEAD		= 0x08,
 };
 
+enum ibmvfc_ae_fpin_status {
+	IBMVFC_AE_FPIN_LINK_CONGESTED	= 0x1,
+	IBMVFC_AE_FPIN_PORT_CONGESTED	= 0x2,
+	IBMVFC_AE_FPIN_PORT_CLEARED	= 0x3,
+	IBMVFC_AE_FPIN_PORT_DEGRADED	= 0x4,
+};
+
 struct ibmvfc_async_crq {
 	volatile u8 valid;
 	u8 link_state;
-	u8 pad[2];
+	u8 fpin_status
+	u8 pad;
 	__be32 pad2;
 	volatile __be64 event;
 	volatile __be64 scsi_id;
@@ -590,6 +651,9 @@ union ibmvfc_iu {
 	struct ibmvfc_tmf tmf;
 	struct ibmvfc_cmd cmd;
 	struct ibmvfc_passthru_mad passthru;
+	struct ibmvfc_channel_enquiry channel_enquiry;
+	struct ibmvfc_channel_setup_mad channel_setup;
+	struct ibmvfc_connection_info connection_info;
 }__attribute__((packed, aligned (8)));
 
 enum ibmvfc_target_action {
-- 
2.27.0


^ permalink raw reply related

* [PATCH] arch: vdso: add vdso linker script to 'targets' instead of extra-y
From: Masahiro Yamada @ 2020-08-31 18:22 UTC (permalink / raw)
  To: linux-kbuild
  Cc: linux-s390, Vasily Gorbik, Nick Hu, Heiko Carstens, linuxppc-dev,
	linux-kernel, Christian Borntraeger, Paul Mackerras, Greentime Hu,
	Catalin Marinas, Vincent Chen, Will Deacon, Masahiro Yamada,
	linux-arm-kernel

The vdso linker script is preprocessed on demand.
Adding it to 'targets' is enough to include the .cmd file.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---

 arch/arm64/kernel/vdso/Makefile     | 2 +-
 arch/arm64/kernel/vdso32/Makefile   | 2 +-
 arch/nds32/kernel/vdso/Makefile     | 2 +-
 arch/powerpc/kernel/vdso32/Makefile | 2 +-
 arch/powerpc/kernel/vdso64/Makefile | 2 +-
 arch/s390/kernel/vdso64/Makefile    | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 45d5cfe46429..7cd8aafbe96e 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -54,7 +54,7 @@ endif
 GCOV_PROFILE := n
 
 obj-y += vdso.o
-extra-y += vdso.lds
+targets += vdso.lds
 CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
 
 # Force dependency (incbin is bad)
diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile
index d6adb4677c25..572475b7b7ed 100644
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -155,7 +155,7 @@ asm-obj-vdso := $(addprefix $(obj)/, $(asm-obj-vdso))
 obj-vdso := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso)
 
 obj-y += vdso.o
-extra-y += vdso.lds
+targets += vdso.lds
 CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
 
 # Force dependency (vdso.s includes vdso.so through incbin)
diff --git a/arch/nds32/kernel/vdso/Makefile b/arch/nds32/kernel/vdso/Makefile
index 7c3c1ccb196e..55df25ef0057 100644
--- a/arch/nds32/kernel/vdso/Makefile
+++ b/arch/nds32/kernel/vdso/Makefile
@@ -20,7 +20,7 @@ GCOV_PROFILE := n
 
 
 obj-y += vdso.o
-extra-y += vdso.lds
+targets += vdso.lds
 CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
 
 # Force dependency
diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
index 87ab1152d5ce..fd5072a4c73c 100644
--- a/arch/powerpc/kernel/vdso32/Makefile
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -29,7 +29,7 @@ ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
 asflags-y := -D__VDSO32__ -s
 
 obj-y += vdso32_wrapper.o
-extra-y += vdso32.lds
+targets += vdso32.lds
 CPPFLAGS_vdso32.lds += -P -C -Upowerpc
 
 # Force dependency (incbin is bad)
diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile
index 38c317f25141..c737b3ea3207 100644
--- a/arch/powerpc/kernel/vdso64/Makefile
+++ b/arch/powerpc/kernel/vdso64/Makefile
@@ -17,7 +17,7 @@ ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
 asflags-y := -D__VDSO64__ -s
 
 obj-y += vdso64_wrapper.o
-extra-y += vdso64.lds
+targets += vdso64.lds
 CPPFLAGS_vdso64.lds += -P -C -U$(ARCH)
 
 # Force dependency (incbin is bad)
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index 4a66a1cb919b..d0d406cfffa9 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -25,7 +25,7 @@ $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64)
 $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64)
 
 obj-y += vdso64_wrapper.o
-extra-y += vdso64.lds
+targets += vdso64.lds
 CPPFLAGS_vdso64.lds += -P -C -U$(ARCH)
 
 # Disable gcov profiling, ubsan and kasan for VDSO code
-- 
2.25.1


^ permalink raw reply related

* [RFC PATCH 0/2] DMA pagecache
From: Leonardo Bras @ 2020-08-31 18:48 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Leonardo Bras, Joel Stanley, Christophe Leroy, Nayna Jain,
	Eric Richter, Nicholas Piggin, Hari Bathini, Masahiro Yamada,
	Brian King, Murilo Fossa Vicentini, David Dai, Matthew Wilcox
  Cc: linuxppc-dev, linux-kernel

This RFC improves the performance of indirect mapping on all tested DMA
usages, based on a mlx5 device, ranging from 64k packages to 1-byte
packages, from 1 thread to 64 threads.

In all workloads tested, the performance of indirect mapping gets very
near to direct mapping case.

The whole thing is designed to have as much perfomance as possible, so
the impact of the pagecache is not too big.

As I am not very experienced in XArrays usage, nor in lockless
algorithms, I would specially appreaciate feedback on possible
failures on it's usage, missing barriers, and so on.

Also, this size for the FIFO is just for testing purposes.
It's also very possible that it will not be a good idea in platforms
other than pseries, (i have not tested them).
I can plan I bypass for those cases without much work.

Thank you!

Leonardo Bras (2):
  dma-direction: Add DMA_DIR_COMPAT() macro to test direction
    compability
  powerpc/kernel/iommu: Introduce IOMMU DMA pagecache

 arch/powerpc/include/asm/iommu-cache.h |  31 ++++
 arch/powerpc/include/asm/iommu.h       |   4 +
 arch/powerpc/kernel/Makefile           |   2 +-
 arch/powerpc/kernel/iommu-cache.c      | 247 +++++++++++++++++++++++++
 arch/powerpc/kernel/iommu.c            |  15 +-
 include/linux/dma-direction.h          |   3 +
 6 files changed, 296 insertions(+), 6 deletions(-)
 create mode 100644 arch/powerpc/include/asm/iommu-cache.h
 create mode 100644 arch/powerpc/kernel/iommu-cache.c

-- 
2.25.4


^ permalink raw reply

* [RFC PATCH 1/2] dma-direction: Add DMA_DIR_COMPAT() macro to test direction compability
From: Leonardo Bras @ 2020-08-31 18:48 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Leonardo Bras, Joel Stanley, Christophe Leroy, Nayna Jain,
	Eric Richter, Nicholas Piggin, Hari Bathini, Masahiro Yamada,
	Brian King, Murilo Fossa Vicentini, David Dai, Matthew Wilcox
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200831184859.110660-1-leobras.c@gmail.com>

Given a existing mapping with 'current' direction, and a 'wanted'
direction for using that mapping, check if 'wanted' is satisfied by
'current'.

current			accepts
DMA_BIDIRECTIONAL	DMA_BIDIRECTIONAL, DMA_TO_DEVICE,
			DMA_FROM_DEVICE, DMA_NONE
DMA_TO_DEVICE		DMA_TO_DEVICE, DMA_NONE
DMA_FROM_DEVICE		DMA_FROM_DEVICE, DMA_NONE
DMA_NONE		DMA_NONE

This macro is useful for checking if a DMA mapping can be reused.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 include/linux/dma-direction.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/dma-direction.h b/include/linux/dma-direction.h
index 9c96e30e6a0b..caf3943a21f4 100644
--- a/include/linux/dma-direction.h
+++ b/include/linux/dma-direction.h
@@ -9,4 +9,7 @@ enum dma_data_direction {
 	DMA_NONE = 3,
 };
 
+/* Checks if wanted direction is satisfied by current mapping direction*/
+#define DMA_DIR_COMPAT(current, wanted)	(((current) & ~(wanted)) == 0)
+
 #endif
-- 
2.25.4


^ permalink raw reply related

* [RFC PATCH 2/2] powerpc/kernel/iommu: Introduce IOMMU DMA pagecache
From: Leonardo Bras @ 2020-08-31 18:48 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Leonardo Bras, Joel Stanley, Christophe Leroy, Nayna Jain,
	Eric Richter, Nicholas Piggin, Hari Bathini, Masahiro Yamada,
	Brian King, Murilo Fossa Vicentini, David Dai, Matthew Wilcox
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200831184859.110660-1-leobras.c@gmail.com>

In pseries, mapping a DMA page for a cpu memory range requires a
H_PUT_TCE* hypercall, and unmapping requires a H_STUFF_TCE hypercall.
When doing a lot of I/O, a thread can spend a lot of time doing such
hcalls, specially when a DMA mapping don't get reused.

The purpose of this change is to introduce a mechanism similar to
a pagecache, but for reusing DMA mappings, improving performance and
avoiding multiple DMA mappings of the same cpupage.

This works based in a few current behaviors:
- Page reuse:
    It's common for userspace process to reuse the same page for several
    allocations This is probably caused by page buffering behavior that
    come from libc, so getting pages from the kernel is less expensive.

- A lot of small allocations are used:
    Some workloads do a lot of allocations that do not fill a whole page
    causing multiple DMA mappings for a single page.

How it work:
- When a DMA mapping is required for given allocation, it first searches
  the DMA pagecache for a matching mapping. When found, increment refcount,
  when not found, a new mapping is created.
- Every time a new mapping is created, it's added to the DMA pagecache,
  with refcount = 1;
- When the mapping is not needed anymore (iommu_free()), it looks in the
  DMA pagecache for the entry, and the decrements it's refcount.
- When there are more than IOMMU_MAP_LIST_MAX entries in the
   DMA pagecache, it removes the older ones.

What is inside:
- 1 XArray which indexes the DMA page addresses, used for removing
  mappings and decreasing refcounts.
- 1 XArray which indexes CPU page addresses, for finding matching mappings.
  - As there can be multiple mappings (directions) for the same cpupage,
   this one keeps a llist for looking into the entries.
- Every entry points (page) in the XArray points to the mapping struct.

- The mapping struct have:
  - DMA & CPU page addresses, size (pages) , direction and refcount.
  - 1 llist used for multiple mappings at the same cpupage
  - 1 llist used for the FIFO (removing old unused entries)

- The cache struct, added to iommu_table, have:
  - Both XArrays,
  - 2 llist for entry/removal point on FIFO,
  - 1 Atomic Cachesize, to manage the FIFO.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/include/asm/iommu-cache.h |  31 ++++
 arch/powerpc/include/asm/iommu.h       |   4 +
 arch/powerpc/kernel/Makefile           |   2 +-
 arch/powerpc/kernel/iommu-cache.c      | 247 +++++++++++++++++++++++++
 arch/powerpc/kernel/iommu.c            |  15 +-
 5 files changed, 293 insertions(+), 6 deletions(-)
 create mode 100644 arch/powerpc/include/asm/iommu-cache.h
 create mode 100644 arch/powerpc/kernel/iommu-cache.c

diff --git a/arch/powerpc/include/asm/iommu-cache.h b/arch/powerpc/include/asm/iommu-cache.h
new file mode 100644
index 000000000000..ad298a4cd9c9
--- /dev/null
+++ b/arch/powerpc/include/asm/iommu-cache.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IOMMU_CACHE_H
+#define _IOMMU_CACHE_H
+#ifdef __KERNEL__
+
+#include <linux/llist.h>
+#include <linux/xarray.h>
+#include <linux/atomic.h>
+
+struct dmacache {
+	struct llist_head fifo_add;
+	struct llist_head fifo_del;
+	struct xarray cpupages;
+	struct xarray dmapages;
+	atomic64_t cachesize;
+};
+
+#include <asm/iommu.h>
+
+void iommu_cache_init(struct iommu_table *tbl);
+void iommu_dmacache_add(struct iommu_table *tbl, void *page, unsigned int npages, dma_addr_t addr,
+			enum dma_data_direction direction);
+dma_addr_t iommu_dmacache_use(struct iommu_table *tbl, void *page, unsigned int npages,
+			      enum dma_data_direction direction);
+void iommu_dmacache_free(struct iommu_table *tbl, dma_addr_t dma_handle, unsigned int npages);
+
+#define IOMMU_MAP_LIST_MAX	8192
+#define IOMMU_MAP_LIST_THRES	128
+
+#endif /* __KERNEL__ */
+#endif /* _IOMMU_CACHE_H */
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 2913e5c8b1f8..51a2f5503f8e 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -18,6 +18,7 @@
 #include <asm/types.h>
 #include <asm/pci-bridge.h>
 #include <asm/asm-const.h>
+#include <asm/iommu-cache.h>
 
 #define IOMMU_PAGE_SHIFT_4K      12
 #define IOMMU_PAGE_SIZE_4K       (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K)
@@ -114,6 +115,7 @@ struct iommu_table {
 	int it_nid;
 	unsigned long it_reserved_start; /* Start of not-DMA-able (MMIO) area */
 	unsigned long it_reserved_end;
+	struct dmacache cache;
 };
 
 #define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
@@ -317,6 +319,8 @@ extern void iommu_release_ownership(struct iommu_table *tbl);
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
 extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
 
+void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages);
+
 #ifdef CONFIG_PPC_CELL_NATIVE
 extern bool iommu_fixed_is_weak;
 #else
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index cbf41fb4ee89..62f6c90076d9 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -104,7 +104,7 @@ extra-y				+= vmlinux.lds
 obj-$(CONFIG_RELOCATABLE)	+= reloc_$(BITS).o
 
 obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o early_32.o
-obj-$(CONFIG_PPC64)		+= dma-iommu.o iommu.o
+obj-$(CONFIG_PPC64)		+= dma-iommu.o iommu.o iommu-cache.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_BOOTX_TEXT)	+= btext.o
 obj-$(CONFIG_SMP)		+= smp.o
diff --git a/arch/powerpc/kernel/iommu-cache.c b/arch/powerpc/kernel/iommu-cache.c
new file mode 100644
index 000000000000..3a5b2725ab51
--- /dev/null
+++ b/arch/powerpc/kernel/iommu-cache.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <asm/iommu-cache.h>
+
+struct dma_mapping {
+	struct llist_node mapping;
+	struct llist_node fifo;
+	unsigned long dmapage;
+	unsigned long cpupage;
+	unsigned long size;
+	refcount_t count;
+	enum dma_data_direction direction;
+
+};
+
+struct cpupage_entry {
+	struct llist_node node;
+	struct dma_mapping *data;
+};
+
+/**
+ * iommu_dmacache_use() - Looks for a DMA mapping in cache
+ * @tbl: Device's iommu_table.
+ * @page: Address for which a DMA mapping is desired.
+ * @npages: Page count needed from that address
+ * @direction: DMA direction needed for the mapping
+ *
+ * Looks into the DMA cache for a page/range that is already mapped with given direction.
+ *
+ * Return: DMA mapping for range/direction present in cache
+ *	   DMA_MAPPING_ERROR if not found.
+ */
+dma_addr_t iommu_dmacache_use(struct iommu_table *tbl, void *page, unsigned int npages,
+			      enum dma_data_direction direction)
+{
+	struct cpupage_entry *e;
+	struct dma_mapping *d;
+	const unsigned long start = (unsigned long)page >> tbl->it_page_shift;
+	const unsigned long end = start + npages;
+
+	e = xa_load(&tbl->cache.cpupages, start);
+	if (!e)
+		return DMA_MAPPING_ERROR;
+
+	llist_for_each_entry(e, &e->node, node) {
+		d = e->data;
+		if (start < d->cpupage || end > (d->cpupage + d->size) ||
+		    !DMA_DIR_COMPAT(d->direction, direction))
+			continue;
+
+		refcount_inc(&d->count);
+		return (d->dmapage + start - d->cpupage) << tbl->it_page_shift;
+	}
+
+	return DMA_MAPPING_ERROR;
+}
+
+/**
+ * iommu_dmacache_entry_remove() - Remove a dma mapping from cpupage & dmapage XArrays
+ * @cache: Device's dmacache.
+ * @d: dma_mapping to be removed
+ */
+static void iommu_dmacache_entry_remove(struct dmacache *cache, struct dma_mapping *d)
+{
+	struct cpupage_entry *e, *tmp;
+	dma_addr_t dp = d->dmapage;
+	dma_addr_t end = dp + d->size;
+	unsigned long cp = d->cpupage;
+
+	for (; dp < end; dp++, cp++) {
+		e = xa_erase(&cache->cpupages, cp);
+		if (e && e->node.next) {
+			tmp = llist_entry(e->node.next, struct cpupage_entry, node);
+			xa_store(&cache->cpupages, cp, tmp, GFP_KERNEL);
+		}
+		xa_erase(&cache->dmapages, dp);
+		kfree(e);
+	}
+}
+
+/**
+ * iommu_dmacache_clean() - Clean count mappings from dmacache fifo
+ * @tbl: Device's iommu_table.
+ * @count: number of entries to be removed.
+ */
+static void iommu_dmacache_clean(struct iommu_table *tbl, const long count)
+{
+	struct dma_mapping *d, *tmp;
+	struct llist_node *n;
+	struct dmacache *cache = &tbl->cache;
+	unsigned long removed = 0;
+
+	n = llist_del_all(&cache->fifo_del);
+
+	if (!n)
+		return;
+
+	llist_for_each_entry_safe(d, tmp, n, fifo) {
+		switch (refcount_read(&d->count)) {
+		case 0:
+			/* Fully remove entry */
+			iommu_dmacache_entry_remove(cache, d);
+			__iommu_free(tbl, d->dmapage << tbl->it_page_shift, d->size);
+			kfree(d);
+			removed++;
+			break;
+		case 1:
+			/* Remove entry but don't undo mapping */
+			iommu_dmacache_entry_remove(cache, d);
+			kfree(d);
+			removed++;
+			break;
+		default:
+			/* In use. Re-add it to list. */
+			n = xchg(&tbl->cache.fifo_add.first, &d->fifo);
+			if (!n)
+				n->next = &d->fifo;
+
+			break;
+		}
+
+		if (removed >= count)
+			break;
+	}
+
+	atomic64_sub(removed, &tbl->cache.cachesize);
+
+	xchg(&tbl->cache.fifo_del.first, &tmp->fifo);
+}
+
+/**
+ * iommu_dmacache_free() - Decrement a mapping usage from dmacache and clean when full
+ * @tbl: Device's iommu_table.
+ * @dma_handle: DMA address from the mapping.
+ * @npages: Page count from that address
+ *
+ * Decrements a refcount for a mapping in this dma_handle + npages, and remove
+ * some unused dma mappings from dmacache fifo.
+ */
+void iommu_dmacache_free(struct iommu_table *tbl, dma_addr_t dma_handle,	unsigned int npages)
+{
+	struct dma_mapping *d;
+	long exceeding;
+
+	d = xa_load(&tbl->cache.dmapages, dma_handle >> tbl->it_page_shift);
+	if (!d) {
+		/* Not in list, just free */
+		__iommu_free(tbl, dma_handle, npages);
+		return;
+	}
+
+	refcount_dec(&d->count);
+
+	exceeding = atomic64_read(&tbl->cache.cachesize) - IOMMU_MAP_LIST_MAX;
+
+	if (exceeding > 0)
+		iommu_dmacache_clean(tbl, exceeding + IOMMU_MAP_LIST_THRES);
+}
+
+/**
+ * iommu_dmacache_add() - Create and add a new dma mapping into cache.
+ * @tbl: Device's iommu_table.
+ * @page: Address for which a DMA mapping was created.
+ * @npages: Page count mapped from that address
+ * @addr: DMA address created for that mapping
+ * @direction: DMA direction for the mapping created
+ *
+ * Create a dma_mapping and add it to dmapages and cpupages XArray, then add it to fifo.
+ * On both cpupages and dmapages, an entry will be created for each page in the mapping.
+ * On cpupages, as there may exist many mappings for a single cpupage, each entry has a llist
+ * that starts at the last mapped entry.
+ *
+ */
+void iommu_dmacache_add(struct iommu_table *tbl, void *page, unsigned int npages, dma_addr_t addr,
+			enum dma_data_direction direction)
+{
+	struct dma_mapping *d, *tmp;
+	struct cpupage_entry *e, *old;
+	struct llist_node *n;
+	unsigned long p = (unsigned long)page;
+	unsigned int i;
+
+	d = kmalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return;
+
+	d->cpupage = (unsigned long)p >> tbl->it_page_shift;
+	d->dmapage = (unsigned long)addr >> tbl->it_page_shift;
+	d->size = npages;
+	d->direction = direction;
+	d->fifo.next = NULL;
+	refcount_set(&d->count, 1);
+
+	p = d->cpupage;
+	addr = d->dmapage;
+
+	for (i = 0; i < npages ; i++) {
+		/* Only one mapping may exist for a DMA address*/
+		tmp = xa_store(&tbl->cache.dmapages, addr++, d, GFP_KERNEL);
+		if (xa_is_err(tmp))
+			break;
+
+		/* Multiple mappings may exist for a page, get them in a list*/
+		e = kmalloc(sizeof(*e), GFP_KERNEL);
+		if (!d)
+			break;
+
+		e->data = d;
+		old = xa_store(&tbl->cache.cpupages, p++, e, GFP_KERNEL);
+		e->node.next = &old->node;
+
+		if (xa_is_err(old)) {
+			kfree(e);
+			break;
+		}
+	}
+
+	n = xchg(&tbl->cache.fifo_add.first, &d->fifo);
+	if (!n)
+		n->next = &d->fifo;
+
+	atomic64_inc(&tbl->cache.cachesize);
+}
+
+void iommu_cache_init(struct iommu_table *tbl)
+{
+	struct dma_mapping *d;
+
+	init_llist_head(&tbl->cache.fifo_add);
+	init_llist_head(&tbl->cache.fifo_del);
+
+	/* First entry for linking both llist_heads */
+	d = kmalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		panic("%s: Can't allocate %ld bytes\n", __func__, sizeof(*d));
+
+	d->cpupage = -1UL;
+	d->dmapage = -1UL;
+	refcount_set(&d->count, 1);
+	llist_add(&d->fifo, &tbl->cache.fifo_add);
+	llist_add(&d->fifo, &tbl->cache.fifo_del);
+
+	xa_init(&tbl->cache.cpupages);
+	xa_init(&tbl->cache.dmapages);
+
+	atomic64_set(&tbl->cache.cachesize, 0);
+}
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index c5d5d36ab65e..60703af172ad 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -40,8 +40,6 @@
 
 static int novmerge;
 
-static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
-
 static int __init setup_iommu(char *str)
 {
 	if (!strcmp(str, "novmerge"))
@@ -308,6 +306,10 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	dma_addr_t ret = DMA_MAPPING_ERROR;
 	int build_fail;
 
+	ret = iommu_dmacache_use(tbl, page, npages, direction);
+	if (ret != DMA_MAPPING_ERROR)
+		return ret;
+
 	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
 
 	if (unlikely(entry == DMA_MAPPING_ERROR))
@@ -338,6 +340,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	/* Make sure updates are seen by hardware */
 	mb();
 
+	iommu_dmacache_add(tbl, page, npages, ret, direction);
+
 	return ret;
 }
 
@@ -388,8 +392,7 @@ static struct iommu_pool *get_pool(struct iommu_table *tbl,
 	return p;
 }
 
-static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
-			 unsigned int npages)
+void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages)
 {
 	unsigned long entry, free_entry;
 	unsigned long flags;
@@ -413,7 +416,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 		unsigned int npages)
 {
-	__iommu_free(tbl, dma_addr, npages);
+	iommu_dmacache_free(tbl, dma_addr, npages);
 
 	/* Make sure TLB cache is flushed if the HW needs it. We do
 	 * not do an mb() here on purpose, it is not needed on any of
@@ -719,6 +722,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
 
 	iommu_table_clear(tbl);
 
+	iommu_cache_init(tbl);
+
 	if (!welcomed) {
 		printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
 		       novmerge ? "disabled" : "enabled");
-- 
2.25.4


^ permalink raw reply related

* Re: [PATCH net-next] net/wan/fsl_ucc_hdlc: Add MODULE_DESCRIPTION
From: David Miller @ 2020-08-31 19:42 UTC (permalink / raw)
  To: yuehaibing; +Cc: kuba, netdev, linuxppc-dev, linux-kernel, qiang.zhao
In-Reply-To: <20200829115823.10140-1-yuehaibing@huawei.com>

From: YueHaibing <yuehaibing@huawei.com>
Date: Sat, 29 Aug 2020 19:58:23 +0800

> Add missing MODULE_DESCRIPTION.
> 
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>

Applied.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox