LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH kernel v2 2/2] powerpc/dma: Fallback to dma_ops when persistent memory present
From: Alexey Kardashevskiy @ 2020-10-27 10:18 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alexey Kardashevskiy, iommu, Christoph Hellwig, linux-kernel
In-Reply-To: <20201027101841.96056-1-aik@ozlabs.ru>

So far we have been using huge DMA windows to map all the RAM available.
The RAM is normally mapped to the VM address space contiguously, and
there is always a reasonable upper limit for possible future hot plugged
RAM which makes it easy to map all RAM via IOMMU.

Now there is persistent memory ("ibm,pmemory" in the FDT) which (unlike
normal RAM) can map anywhere in the VM space beyond the maximum RAM size
and since it can be used for DMA, it requires extending the huge window
up to MAX_PHYSMEM_BITS which requires hypervisor support for:
1. huge TCE tables;
2. multilevel TCE tables;
3. huge IOMMU pages.

Certain hypervisors cannot do either so the only option left is
restricting the huge DMA window to include only RAM and fallback to
the default DMA window for persistent memory.

This checks if the system has persistent memory. If it does not,
the DMA bypass mode is selected, i.e.
* dev->bus_dma_limit = 0
* dev->dma_ops_bypass = true <- this avoid calling dma_ops for mapping.

If there is such memory, this creates identity mapping only for RAM and
sets the dev->bus_dma_limit to let the generic code decide whether to
call into the direct DMA or the indirect DMA ops.

This should not change the existing behaviour when no persistent memory
as dev->dma_ops_bypass is expected to be set.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kernel/dma-iommu.c        | 12 +++++--
 arch/powerpc/platforms/pseries/iommu.c | 44 ++++++++++++++++++++------
 arch/powerpc/Kconfig                   |  1 +
 3 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index a1c744194018..d123b7205f76 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -90,8 +90,16 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
 	struct iommu_table *tbl = get_iommu_table_base(dev);
 
 	if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
-		dev->dma_ops_bypass = true;
-		dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
+		/*
+		 * dma_iommu_bypass_supported() sets dma_max when there is
+		 * 1:1 mapping but it is somehow limited.
+		 * ibm,pmemory is one example.
+		 */
+		dev->dma_ops_bypass = dev->bus_dma_limit == 0;
+		if (!dev->dma_ops_bypass)
+			dev_warn(dev, "iommu: 64-bit OK but using default ops\n");
+		else
+			dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
 		return 1;
 	}
 
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index e4198700ed1a..91112e748491 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -839,7 +839,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
 			np, ret);
 }
 
-static u64 find_existing_ddw(struct device_node *pdn)
+static u64 find_existing_ddw(struct device_node *pdn, int *window_shift)
 {
 	struct direct_window *window;
 	const struct dynamic_dma_window_prop *direct64;
@@ -851,6 +851,7 @@ static u64 find_existing_ddw(struct device_node *pdn)
 		if (window->device == pdn) {
 			direct64 = window->prop;
 			dma_addr = be64_to_cpu(direct64->dma_base);
+			*window_shift = be32_to_cpu(direct64->window_shift);
 			break;
 		}
 	}
@@ -1111,11 +1112,13 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
  */
 static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
-	int len, ret;
+	int len = 0, ret;
+	bool pmem_present = of_find_node_by_type(NULL, "ibm,pmemory") != NULL;
+	int max_ram_len = order_base_2(ddw_memory_hotplug_max());
 	struct ddw_query_response query;
 	struct ddw_create_response create;
 	int page_shift;
-	u64 dma_addr, max_addr;
+	u64 dma_addr;
 	struct device_node *dn;
 	u32 ddw_avail[DDW_APPLICABLE_SIZE];
 	struct direct_window *window;
@@ -1126,7 +1129,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 
 	mutex_lock(&direct_window_init_mutex);
 
-	dma_addr = find_existing_ddw(pdn);
+	dma_addr = find_existing_ddw(pdn, &len);
 	if (dma_addr != 0)
 		goto out_unlock;
 
@@ -1212,14 +1215,26 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	}
 	/* verify the window * number of ptes will map the partition */
 	/* check largest block * page size > max memory hotplug addr */
-	max_addr = ddw_memory_hotplug_max();
-	if (query.largest_available_block < (max_addr >> page_shift)) {
-		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
-			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
-			  1ULL << page_shift);
+	/*
+	 * The "ibm,pmemory" can appear anywhere in the address space.
+	 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
+	 * for the upper limit and fallback to max RAM otherwise but this
+	 * disables device::dma_ops_bypass.
+	 */
+	len = max_ram_len;
+	if (pmem_present) {
+		if (query.largest_available_block >=
+		    (1ULL << (MAX_PHYSMEM_BITS - page_shift)))
+			len = MAX_PHYSMEM_BITS - page_shift;
+		else
+			dev_info(&dev->dev, "Skipping ibm,pmemory");
+	}
+
+	if (query.largest_available_block < (1ULL << (len - page_shift))) {
+		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu %llu-sized pages\n",
+			1ULL << len, query.largest_available_block, 1ULL << page_shift);
 		goto out_failed;
 	}
-	len = order_base_2(max_addr);
 	win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
 	if (!win64) {
 		dev_info(&dev->dev,
@@ -1299,6 +1314,15 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 
 out_unlock:
 	mutex_unlock(&direct_window_init_mutex);
+
+	/*
+	 * If we have persistent memory and the window size is only as big
+	 * as RAM, then we failed to create a window to cover persistent
+	 * memory and need to set the DMA limit.
+	 */
+	if (pmem_present && dma_addr && (len == max_ram_len))
+		dev->dev.bus_dma_limit = dma_addr + (1ULL << len);
+
 	return dma_addr;
 }
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e9f13fe08492..5a8881bf140e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -159,6 +159,7 @@ config PPC
 	select DCACHE_WORD_ACCESS		if PPC64 && CPU_LITTLE_ENDIAN
 	select DMA_OPS				if PPC64
 	select DMA_OPS_BYPASS			if PPC64
+	select DMA_OPS_BYPASS_BUS_LIMIT		if PPC64 && PPC_PSERIES
 	select DYNAMIC_FTRACE			if FUNCTION_TRACER
 	select EDAC_ATOMIC_SCRUB
 	select EDAC_SUPPORT
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH 02/10] fs: don't allow splice read/write without explicit ops
From: Christoph Hellwig @ 2020-10-27  9:54 UTC (permalink / raw)
  To: David Howells
  Cc: linux-arch, linuxppc-dev, Kees Cook, x86, linux-kernel, Al Viro,
	linux-fsdevel, Linus Torvalds, Christoph Hellwig
In-Reply-To: <3155818.1603792294@warthog.procyon.org.uk>

On Tue, Oct 27, 2020 at 09:51:34AM +0000, David Howells wrote:
> David Howells <dhowells@redhat.com> wrote:
> 
> > > default_file_splice_write is the last piece of generic code that uses
> > > set_fs to make the uaccess routines operate on kernel pointers.  It
> > > implements a "fallback loop" for splicing from files that do not actually
> > > provide a proper splice_read method.  The usual file systems and other
> > > high bandwith instances all provide a ->splice_read, so this just removes
> > > support for various device drivers and procfs/debugfs files.  If splice
> > > support for any of those turns out to be important it can be added back
> > > by switching them to the iter ops and using generic_file_splice_read.
> > 
> > Hmmm...  this causes the copy_file_range() syscall to fail with EINVAL in some
> > places where before it used to work.
> > 
> > For my part, it causes the generic/112 xfstest to fail with afs, but there may
> > be other places.
> > 
> > Is this a regression we need to fix in the VFS core?  Or is it something we
> > need to fix in xfstests and assume userspace will fallback to doing it itself?
> 
> That said, for afs at least, the fix seems to be just this:

And that is the correct fix, I was about to send it to you.

We can't have a "generic" splice using ->read/->write without set_fs,
in addition to the iter_file_splice_write based version being a lot
more efficient than what you had before.

^ permalink raw reply

* Re: [PATCH 02/10] fs: don't allow splice read/write without explicit ops
From: David Howells @ 2020-10-27  9:51 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-arch, linuxppc-dev, Kees Cook, x86, linux-kernel, dhowells,
	Al Viro, linux-fsdevel, Linus Torvalds
In-Reply-To: <3088368.1603790984@warthog.procyon.org.uk>

David Howells <dhowells@redhat.com> wrote:

> > default_file_splice_write is the last piece of generic code that uses
> > set_fs to make the uaccess routines operate on kernel pointers.  It
> > implements a "fallback loop" for splicing from files that do not actually
> > provide a proper splice_read method.  The usual file systems and other
> > high bandwith instances all provide a ->splice_read, so this just removes
> > support for various device drivers and procfs/debugfs files.  If splice
> > support for any of those turns out to be important it can be added back
> > by switching them to the iter ops and using generic_file_splice_read.
> 
> Hmmm...  this causes the copy_file_range() syscall to fail with EINVAL in some
> places where before it used to work.
> 
> For my part, it causes the generic/112 xfstest to fail with afs, but there may
> be other places.
> 
> Is this a regression we need to fix in the VFS core?  Or is it something we
> need to fix in xfstests and assume userspace will fallback to doing it itself?

That said, for afs at least, the fix seems to be just this:

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 395075d7fe02..2bc6adfe351a 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -33,6 +33,7 @@ const struct file_operations afs_file_operations = {
 	.write_iter	= afs_file_write,
 	.mmap		= afs_file_mmap,
 	.splice_read	= generic_file_splice_read,
+	.splice_write	= iter_file_splice_write,
 	.fsync		= afs_fsync,
 	.lock		= afs_lock,
 	.flock		= afs_flock,

David


^ permalink raw reply related

* Re: [PATCH 0/4] arch, mm: improve robustness of direct map manipulation
From: Mike Rapoport @ 2020-10-27  9:47 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: peterz@infradead.org, catalin.marinas@arm.com,
	dave.hansen@linux.intel.com, linux-mm@kvack.org, paulus@samba.org,
	pavel@ucw.cz, hpa@zytor.com, sparclinux@vger.kernel.org,
	cl@linux.com, will@kernel.org, linux-riscv@lists.infradead.org,
	linux-s390@vger.kernel.org, x86@kernel.org, rppt@linux.ibm.com,
	borntraeger@de.ibm.com, mingo@redhat.com, rientjes@google.com,
	Brown, Len, aou@eecs.berkeley.edu, gor@linux.ibm.com,
	linux-pm@vger.kernel.org, hca@linux.ibm.com, bp@alien8.de,
	luto@kernel.org, paul.walmsley@sifive.com, kirill@shutemov.name,
	tglx@linutronix.de, iamjoonsoo.kim@lge.com,
	linux-arm-kernel@lists.infradead.org, rjw@rjwysocki.net,
	linux-kernel@vger.kernel.org, penberg@kernel.org,
	palmer@dabbelt.com, akpm@linux-foundation.org, Edgecombe, Rick P,
	linuxppc-dev@lists.ozlabs.org, davem@davemloft.net
In-Reply-To: <e5fc62b6-f644-4ed5-de5b-ffd8337861e4@redhat.com>

On Tue, Oct 27, 2020 at 09:46:35AM +0100, David Hildenbrand wrote:
> On 27.10.20 09:38, Mike Rapoport wrote:
> > On Mon, Oct 26, 2020 at 06:05:30PM +0000, Edgecombe, Rick P wrote:
> > > On Mon, 2020-10-26 at 11:05 +0200, Mike Rapoport wrote:
> > > > On Mon, Oct 26, 2020 at 01:13:52AM +0000, Edgecombe, Rick P wrote:
> > > > > On Sun, 2020-10-25 at 12:15 +0200, Mike Rapoport wrote:
> > > > > > Indeed, for architectures that define
> > > > > > CONFIG_ARCH_HAS_SET_DIRECT_MAP
> > > > > > it is
> > > > > > possible that __kernel_map_pages() would fail, but since this
> > > > > > function is
> > > > > > void, the failure will go unnoticed.
> > > > > 
> > > > > Could you elaborate on how this could happen? Do you mean during
> > > > > runtime today or if something new was introduced?
> > > > 
> > > > A failure in__kernel_map_pages() may happen today. For instance, on
> > > > x86
> > > > if the kernel is built with DEBUG_PAGEALLOC.
> > > > 
> > > >          __kernel_map_pages(page, 1, 0);
> > > > 
> > > > will need to split, say, 2M page and during the split an allocation
> > > > of
> > > > page table could fail.
> > > 
> > > On x86 at least, DEBUG_PAGEALLOC expects to never have to break a page
> > > on the direct map and even disables locking in cpa because it assumes
> > > this. If this is happening somehow anyway then we should probably fix
> > > that. Even if it's a debug feature, it will not be as useful if it is
> > > causing its own crashes.
> > > 
> > > I'm still wondering if there is something I'm missing here. It seems
> > > like you are saying there is a bug in some arch's, so let's add a WARN
> > > in cross-arch code to log it as it crashes. A warn and making things
> > > clearer seem like good ideas, but if there is a bug we should fix it.
> > > The code around the callers still functionally assume re-mapping can't
> > > fail.
> > 
> > Oh, I've meant x86 kernel *without* DEBUG_PAGEALLOC, and indeed the call
> > that unmaps pages back in safe_copy_page will just reset a 4K page to
> > NP because whatever made it NP at the first place already did the split.
> > 
> > Still, on arm64 with DEBUG_PAGEALLOC=n there is a possibility of a race
> > between map/unmap dance in __vunmap() and safe_copy_page() that may
> > cause access to unmapped memory:
> > 
> > __vunmap()
> >      vm_remove_mappings()
> >          set_direct_map_invalid()
> > 					safe_copy_page()	
> > 					    __kernel_map_pages()
> > 					    	return
> > 					    do_copy_page() -> fault
> > 					   	
> > This is a theoretical bug, but it is still not nice :) 							
> > 
> > > > Currently, the only user of __kernel_map_pages() outside
> > > > DEBUG_PAGEALLOC
> > > > is hibernation, but I think it would be safer to entirely prevent
> > > > usage
> > > > of __kernel_map_pages() when DEBUG_PAGEALLOC=n.
> > > 
> > > I totally agree it's error prone FWIW. On x86, my mental model of how
> > > it is supposed to work is: If a page is 4k and NP it cannot fail to be
> > > remapped. set_direct_map_invalid_noflush() should result in 4k NP
> > > pages, and DEBUG_PAGEALLOC should result in all 4k pages on the direct
> > > map. Are you seeing this violated or do I have wrong assumptions?
> > 
> > You are right, there is a set of assumptions about the remapping of the
> > direct map pages that make it all work, at least on x86.
> > But this is very subtle and it's not easy to wrap one's head around
> > this.
> > 
> > That's why putting __kernel_map_pages() out of "common" use and
> > keep it only for DEBUG_PAGEALLOC would make things clearer.
> > 
> > > Beyond whatever you are seeing, for the latter case of new things
> > > getting introduced to an interface with hidden dependencies... Another
> > > edge case could be a new caller to set_memory_np() could result in
> > > large NP pages. None of the callers today should cause this AFAICT, but
> > > it's not great to rely on the callers to know these details.
> > A caller of set_memory_*() or set_direct_map_*() should expect a failure
> > and be ready for that. So adding a WARN to safe_copy_page() is the first
> > step in that direction :)
> > 
> 
> I am probably missing something important, but why are we saving/restoring
> the content of pages that were explicitly removed from the identity mapping
> such that nobody will access them?
> 
> Pages that are not allocated should contain garbage or be zero
> (init_on_free). That should be easy to handle without ever reading the page
> content.

I'm not familiar with hibernation to say anything smart here, but the
help text of DEBUG_PAGEALLOC in Kconfig says:

	... this option cannot be enabled in combination with
	hibernation as that would result in incorrect warnings of memory
	corruption after a resume because free pages are not saved to
	the suspend image.

Probably you are right and free pages need to be handled differently,
but it does not seem the case now.

> The other user seems to be vm_remove_mappings(), where we only *temporarily*
> remove the mapping - while hibernating, that code shouldn't be active
> anymore I guess - or we could protect it from happening.

Hmm, I _think_ vm_remove_mappings() shouldn't be active while
hibernating, but I'm not 100% sure.

> As I expressed in another mail, secretmem pages should rather not be saved
> when hibernating - hibernation should be rather be disabled.

Agree.

> What am I missing?

I think I miscommunicated the purpose of this set, which was to hide
__kernel_map_pages() under DEBUG_PAGEALLOC and make hibernation use
set_direct_map_*() explictly without major rework of free pages handling
during hibernation.

Does it help?

> -- 
> Thanks,
> 
> David / dhildenb
> 

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH 02/10] fs: don't allow splice read/write without explicit ops
From: David Howells @ 2020-10-27  9:29 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-arch, linuxppc-dev, Kees Cook, x86, linux-kernel, dhowells,
	Al Viro, linux-fsdevel, Linus Torvalds
In-Reply-To: <20200827150030.282762-3-hch@lst.de>

Christoph Hellwig <hch@lst.de> wrote:

> default_file_splice_write is the last piece of generic code that uses
> set_fs to make the uaccess routines operate on kernel pointers.  It
> implements a "fallback loop" for splicing from files that do not actually
> provide a proper splice_read method.  The usual file systems and other
> high bandwith instances all provide a ->splice_read, so this just removes
> support for various device drivers and procfs/debugfs files.  If splice
> support for any of those turns out to be important it can be added back
> by switching them to the iter ops and using generic_file_splice_read.

Hmmm...  this causes the copy_file_range() syscall to fail with EINVAL in some
places where before it used to work.

For my part, it causes the generic/112 xfstest to fail with afs, but there may
be other places.

Is this a regression we need to fix in the VFS core?  Or is it something we
need to fix in xfstests and assume userspace will fallback to doing it itself?

David


^ permalink raw reply

* [RFC PATCH kernel 2/2] powerpc/pci: Remove LSI mappings on device teardown
From: Alexey Kardashevskiy @ 2020-10-27  9:06 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Rob Herring, Alexey Kardashevskiy, Marc Zyngier, linux-kernel,
	Qian Cai, Cédric Le Goater, Frederic Barrat,
	Oliver O'Halloran, Thomas Gleixner, Michal Suchánek
In-Reply-To: <20201027090655.14118-1-aik@ozlabs.ru>

From: Oliver O'Halloran <oohall@gmail.com>

When a passthrough IO adapter is removed from a pseries machine using hash
MMU and the XIVE interrupt mode, the POWER hypervisor expects the guest OS
to clear all page table entries related to the adapter. If some are still
present, the RTAS call which isolates the PCI slot returns error 9001
"valid outstanding translations" and the removal of the IO adapter fails.
This is because when the PHBs are scanned, Linux maps automatically the
INTx interrupts in the Linux interrupt number space but these are never
removed.

This problem can be fixed by adding the corresponding unmap operation when
the device is removed. There's no pcibios_* hook for the remove case, but
the same effect can be achieved using a bus notifier.

Cc: Cédric Le Goater <clg@kaod.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kernel/pci-common.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index be108616a721..95f4e173368a 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -404,6 +404,27 @@ static int pci_read_irq_line(struct pci_dev *pci_dev)
 	return 0;
 }

+static int ppc_pci_unmap_irq_line(struct notifier_block *nb,
+			       unsigned long action, void *data)
+{
+	struct pci_dev *pdev = to_pci_dev(data);
+
+	if (action == BUS_NOTIFY_DEL_DEVICE)
+		irq_dispose_mapping(pdev->irq);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ppc_pci_unmap_irq_notifier = {
+	.notifier_call = ppc_pci_unmap_irq_line,
+};
+
+static int ppc_pci_register_irq_notifier(void)
+{
+	return bus_register_notifier(&pci_bus_type, &ppc_pci_unmap_irq_notifier);
+}
+arch_initcall(ppc_pci_register_irq_notifier);
+
 /*
  * Platform support for /proc/bus/pci/X/Y mmap()s.
  *  -- paulus.
-- 
2.17.1

^ permalink raw reply related

* [RFC PATCH kernel 1/2] irq: Add reference counting to IRQ mappings
From: Alexey Kardashevskiy @ 2020-10-27  9:06 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Rob Herring, Alexey Kardashevskiy, Marc Zyngier, linux-kernel,
	Qian Cai, Cédric Le Goater, Frederic Barrat,
	Oliver O'Halloran, Thomas Gleixner, Michal Suchánek
In-Reply-To: <20201027090655.14118-1-aik@ozlabs.ru>

PCI devices share 4 legacy INTx interrupts from the same PCI host bridge.
Device drivers map/unmap hardware interrupts via irq_create_mapping()/
irq_dispose_mapping(). The problem with that these interrupts are
shared and when performing hot unplug, we need to unmap the interrupt
only when the last device is released.

This reuses already existing irq_desc::kobj for this purpose.
The refcounter is naturally 1 when the descriptor is allocated already;
this adds kobject_get() in places where already existing mapped virq
is returned.

This reorganizes irq_dispose_mapping() to release the kobj and let
the release callback do the cleanup.

If some driver or platform does its own reference counting, this expects
those parties to call irq_find_mapping() and call irq_dispose_mapping()
for every irq_create_fwspec_mapping()/irq_create_mapping().

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 kernel/irq/irqdesc.c   | 35 +++++++++++++++++++++++------------
 kernel/irq/irqdomain.c | 27 +++++++++++++--------------
 2 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1a7723604399..dae096238500 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -419,20 +419,39 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
 	return NULL;
 }
 
+static void delayed_free_desc(struct rcu_head *rhp);
 static void irq_kobj_release(struct kobject *kobj)
 {
 	struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+	struct irq_domain *domain;
+	unsigned int virq = desc->irq_data.irq;
 
-	free_masks(desc);
-	free_percpu(desc->kstat_irqs);
-	kfree(desc);
+	domain = desc->irq_data.domain;
+	if (domain) {
+		if (irq_domain_is_hierarchy(domain)) {
+			irq_domain_free_irqs(virq, 1);
+		} else {
+			irq_domain_disassociate(domain, virq);
+			irq_free_desc(virq);
+		}
+	}
+
+	/*
+	 * We free the descriptor, masks and stat fields via RCU. That
+	 * allows demultiplex interrupts to do rcu based management of
+	 * the child interrupts.
+	 * This also allows us to use rcu in kstat_irqs_usr().
+	 */
+	call_rcu(&desc->rcu, delayed_free_desc);
 }
 
 static void delayed_free_desc(struct rcu_head *rhp)
 {
 	struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
 
-	kobject_put(&desc->kobj);
+	free_masks(desc);
+	free_percpu(desc->kstat_irqs);
+	kfree(desc);
 }
 
 static void free_desc(unsigned int irq)
@@ -453,14 +472,6 @@ static void free_desc(unsigned int irq)
 	 */
 	irq_sysfs_del(desc);
 	delete_irq_desc(irq);
-
-	/*
-	 * We free the descriptor, masks and stat fields via RCU. That
-	 * allows demultiplex interrupts to do rcu based management of
-	 * the child interrupts.
-	 * This also allows us to use rcu in kstat_irqs_usr().
-	 */
-	call_rcu(&desc->rcu, delayed_free_desc);
 }
 
 static int alloc_descs(unsigned int start, unsigned int cnt, int node,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index cf8b374b892d..02733ddc321f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -638,6 +638,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
 {
 	struct device_node *of_node;
 	int virq;
+	struct irq_desc *desc;
 
 	pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
 
@@ -655,7 +656,9 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
 	/* Check if mapping already exists */
 	virq = irq_find_mapping(domain, hwirq);
 	if (virq) {
+		desc = irq_to_desc(virq);
 		pr_debug("-> existing mapping on virq %d\n", virq);
+		kobject_get(&desc->kobj);
 		return virq;
 	}
 
@@ -751,6 +754,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
 	irq_hw_number_t hwirq;
 	unsigned int type = IRQ_TYPE_NONE;
 	int virq;
+	struct irq_desc *desc;
 
 	if (fwspec->fwnode) {
 		domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
@@ -787,8 +791,11 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
 		 * current trigger type then we are done so return the
 		 * interrupt number.
 		 */
-		if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
+		if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) {
+			desc = irq_to_desc(virq);
+			kobject_get(&desc->kobj);
 			return virq;
+		}
 
 		/*
 		 * If the trigger type has not been set yet, then set
@@ -800,6 +807,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
 				return 0;
 
 			irqd_set_trigger_type(irq_data, type);
+			desc = irq_to_desc(virq);
+			kobject_get(&desc->kobj);
 			return virq;
 		}
 
@@ -852,22 +861,12 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping);
  */
 void irq_dispose_mapping(unsigned int virq)
 {
-	struct irq_data *irq_data = irq_get_irq_data(virq);
-	struct irq_domain *domain;
+	struct irq_desc *desc = irq_to_desc(virq);
 
-	if (!virq || !irq_data)
+	if (!virq || !desc)
 		return;
 
-	domain = irq_data->domain;
-	if (WARN_ON(domain == NULL))
-		return;
-
-	if (irq_domain_is_hierarchy(domain)) {
-		irq_domain_free_irqs(virq, 1);
-	} else {
-		irq_domain_disassociate(domain, virq);
-		irq_free_desc(virq);
-	}
+	kobject_put(&desc->kobj);
 }
 EXPORT_SYMBOL_GPL(irq_dispose_mapping);
 
-- 
2.17.1


^ permalink raw reply related

* [RFC PATCH kernel 0/2] irq: Add reference counting to IRQ mappings
From: Alexey Kardashevskiy @ 2020-10-27  9:06 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Rob Herring, Alexey Kardashevskiy, Marc Zyngier, linux-kernel,
	Qian Cai, Cédric Le Goater, Frederic Barrat,
	Oliver O'Halloran, Thomas Gleixner, Michal Suchánek

This is an attempt to fix a bug with PCI hot unplug with
a bunch of PCIe bridges and devices sharing INTx.

This did not hit us before as even if we did not
call irq_domain_ops::unmap, the platform (PowerVM) would not
produce an error but with POWER9's XIVE interrupt controller
there is an error if unmap is not called at all (2/2 fixes that)
or an error if we unmapped an interrupt which is still in use
by another device (1/2 fixes that).

One way of fixing that is doing reference counting in
the POWERPC code but since there is a kobj in irq_desc
already, I thought I'll give it a try first.

This is based on sha1
4525c8781ec0 Linus Torvalds "scsi: qla2xxx: remove incorrect sparse #ifdef".

Please comment. Thanks.

Alexey Kardashevskiy (1):
  irq: Add reference counting to IRQ mappings

Oliver O'Halloran (1):
  powerpc/pci: Remove LSI mappings on device teardown

 arch/powerpc/kernel/pci-common.c | 21 +++++++++++++++++++
 kernel/irq/irqdesc.c             | 35 +++++++++++++++++++++-----------
 kernel/irq/irqdomain.c           | 27 ++++++++++++------------
 3 files changed, 57 insertions(+), 26 deletions(-)

-- 
2.17.1

^ permalink raw reply

* Re: [PATCH 2/4] PM: hibernate: improve robustness of mapping pages in the direct map
From: Mike Rapoport @ 2020-10-27  8:49 UTC (permalink / raw)
  To: Edgecombe, Rick P
  Cc: david@redhat.com, peterz@infradead.org, catalin.marinas@arm.com,
	dave.hansen@linux.intel.com, linux-mm@kvack.org, paulus@samba.org,
	pavel@ucw.cz, hpa@zytor.com, sparclinux@vger.kernel.org,
	cl@linux.com, will@kernel.org, linux-riscv@lists.infradead.org,
	linux-s390@vger.kernel.org, x86@kernel.org, rppt@linux.ibm.com,
	borntraeger@de.ibm.com, mingo@redhat.com, rientjes@google.com,
	Brown, Len, aou@eecs.berkeley.edu, gor@linux.ibm.com,
	linux-pm@vger.kernel.org, hca@linux.ibm.com, bp@alien8.de,
	luto@kernel.org, paul.walmsley@sifive.com, kirill@shutemov.name,
	tglx@linutronix.de, iamjoonsoo.kim@lge.com,
	linux-arm-kernel@lists.infradead.org, rjw@rjwysocki.net,
	linux-kernel@vger.kernel.org, penberg@kernel.org,
	palmer@dabbelt.com, akpm@linux-foundation.org,
	linuxppc-dev@lists.ozlabs.org, davem@davemloft.net
In-Reply-To: <a28d8248057e7dc01716764da9edfd666722ff62.camel@intel.com>

On Mon, Oct 26, 2020 at 06:57:32PM +0000, Edgecombe, Rick P wrote:
> On Mon, 2020-10-26 at 11:15 +0200, Mike Rapoport wrote:
> > On Mon, Oct 26, 2020 at 12:38:32AM +0000, Edgecombe, Rick P wrote:
> > > On Sun, 2020-10-25 at 12:15 +0200, Mike Rapoport wrote:
> > > > From: Mike Rapoport <rppt@linux.ibm.com>
> > > > 
> > > > When DEBUG_PAGEALLOC or ARCH_HAS_SET_DIRECT_MAP is enabled a page
> > > > may
> > > > be
> > > > not present in the direct map and has to be explicitly mapped
> > > > before
> > > > it
> > > > could be copied.
> > > > 
> > > > On arm64 it is possible that a page would be removed from the
> > > > direct
> > > > map
> > > > using set_direct_map_invalid_noflush() but __kernel_map_pages()
> > > > will
> > > > refuse
> > > > to map this page back if DEBUG_PAGEALLOC is disabled.
> > > 
> > > It looks to me that arm64 __kernel_map_pages() will still attempt
> > > to
> > > map it if rodata_full is true, how does this happen?
> > 
> > Unless I misread the code, arm64 requires both rodata_full and
> > debug_pagealloc_enabled() to be true for __kernel_map_pages() to do
> > anything.
> > But rodata_full condition applies to set_direct_map_*_noflush() as
> > well,
> > so with !rodata_full the linear map won't be ever changed.
> 
> Hmm, looks to me that __kernel_map_pages() will only skip it if both
> debug pagealloc and rodata_full are false.
> 
> But now I'm wondering if maybe we could simplify things by just moving
> the hibernate unmapped page logic off of the direct map. On x86,
> text_poke() used to use this reserved fixmap pte thing that it could
> rely on to remap memory with. If hibernate had some separate pte for
> remapping like that, then we could not have any direct map restrictions
> caused by it/kernel_map_pages(), and it wouldn't have to worry about
> relying on anything else.

Well, there is map_kernel_range() that can be used by hibernation as
there is no requirement for particular virtual address, but that would
be quite costly if done for every page.

Maybe we can do somthing like

	if (kernel_page_present(s_page)) {
		do_copy_page(dst, page_address(s_page));
	} else {
		map_kernel_range_noflush(page_address(page), PAGE_SIZE,
					 PROT_READ, &page);
		do_copy_page(dst, page_address(s_page));
		unmap_kernel_range_noflush(page_address(page), PAGE_SIZE);
	}

But it seems that a prerequisite for changing the way a page is mapped
in safe_copy_page() would be to teach hibernation that a mapping here
may fail.

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH 0/4] arch, mm: improve robustness of direct map manipulation
From: David Hildenbrand @ 2020-10-27  8:46 UTC (permalink / raw)
  To: Mike Rapoport, Edgecombe, Rick P
  Cc: peterz@infradead.org, catalin.marinas@arm.com,
	dave.hansen@linux.intel.com, linux-mm@kvack.org, paulus@samba.org,
	pavel@ucw.cz, hpa@zytor.com, sparclinux@vger.kernel.org,
	cl@linux.com, will@kernel.org, linux-riscv@lists.infradead.org,
	linux-s390@vger.kernel.org, x86@kernel.org, rppt@linux.ibm.com,
	borntraeger@de.ibm.com, mingo@redhat.com, rientjes@google.com,
	Brown, Len, aou@eecs.berkeley.edu, gor@linux.ibm.com,
	linux-pm@vger.kernel.org, hca@linux.ibm.com, bp@alien8.de,
	luto@kernel.org, paul.walmsley@sifive.com, kirill@shutemov.name,
	tglx@linutronix.de, iamjoonsoo.kim@lge.com,
	linux-arm-kernel@lists.infradead.org, rjw@rjwysocki.net,
	linux-kernel@vger.kernel.org, penberg@kernel.org,
	palmer@dabbelt.com, akpm@linux-foundation.org,
	linuxppc-dev@lists.ozlabs.org, davem@davemloft.net
In-Reply-To: <20201027083816.GG1154158@kernel.org>

On 27.10.20 09:38, Mike Rapoport wrote:
> On Mon, Oct 26, 2020 at 06:05:30PM +0000, Edgecombe, Rick P wrote:
>> On Mon, 2020-10-26 at 11:05 +0200, Mike Rapoport wrote:
>>> On Mon, Oct 26, 2020 at 01:13:52AM +0000, Edgecombe, Rick P wrote:
>>>> On Sun, 2020-10-25 at 12:15 +0200, Mike Rapoport wrote:
>>>>> Indeed, for architectures that define
>>>>> CONFIG_ARCH_HAS_SET_DIRECT_MAP
>>>>> it is
>>>>> possible that __kernel_map_pages() would fail, but since this
>>>>> function is
>>>>> void, the failure will go unnoticed.
>>>>
>>>> Could you elaborate on how this could happen? Do you mean during
>>>> runtime today or if something new was introduced?
>>>
>>> A failure in__kernel_map_pages() may happen today. For instance, on
>>> x86
>>> if the kernel is built with DEBUG_PAGEALLOC.
>>>
>>>          __kernel_map_pages(page, 1, 0);
>>>
>>> will need to split, say, 2M page and during the split an allocation
>>> of
>>> page table could fail.
>>
>> On x86 at least, DEBUG_PAGEALLOC expects to never have to break a page
>> on the direct map and even disables locking in cpa because it assumes
>> this. If this is happening somehow anyway then we should probably fix
>> that. Even if it's a debug feature, it will not be as useful if it is
>> causing its own crashes.
>>
>> I'm still wondering if there is something I'm missing here. It seems
>> like you are saying there is a bug in some arch's, so let's add a WARN
>> in cross-arch code to log it as it crashes. A warn and making things
>> clearer seem like good ideas, but if there is a bug we should fix it.
>> The code around the callers still functionally assume re-mapping can't
>> fail.
> 
> Oh, I've meant x86 kernel *without* DEBUG_PAGEALLOC, and indeed the call
> that unmaps pages back in safe_copy_page will just reset a 4K page to
> NP because whatever made it NP at the first place already did the split.
> 
> Still, on arm64 with DEBUG_PAGEALLOC=n there is a possibility of a race
> between map/unmap dance in __vunmap() and safe_copy_page() that may
> cause access to unmapped memory:
> 
> __vunmap()
>      vm_remove_mappings()
>          set_direct_map_invalid()
> 					safe_copy_page()	
> 					    __kernel_map_pages()
> 					    	return
> 					    do_copy_page() -> fault
> 					   	
> This is a theoretical bug, but it is still not nice :) 							
> 
>>> Currently, the only user of __kernel_map_pages() outside
>>> DEBUG_PAGEALLOC
>>> is hibernation, but I think it would be safer to entirely prevent
>>> usage
>>> of __kernel_map_pages() when DEBUG_PAGEALLOC=n.
>>
>> I totally agree it's error prone FWIW. On x86, my mental model of how
>> it is supposed to work is: If a page is 4k and NP it cannot fail to be
>> remapped. set_direct_map_invalid_noflush() should result in 4k NP
>> pages, and DEBUG_PAGEALLOC should result in all 4k pages on the direct
>> map. Are you seeing this violated or do I have wrong assumptions?
> 
> You are right, there is a set of assumptions about the remapping of the
> direct map pages that make it all work, at least on x86.
> But this is very subtle and it's not easy to wrap one's head around
> this.
> 
> That's why putting __kernel_map_pages() out of "common" use and
> keep it only for DEBUG_PAGEALLOC would make things clearer.
> 
>> Beyond whatever you are seeing, for the latter case of new things
>> getting introduced to an interface with hidden dependencies... Another
>> edge case could be a new caller to set_memory_np() could result in
>> large NP pages. None of the callers today should cause this AFAICT, but
>> it's not great to rely on the callers to know these details.
>   
> A caller of set_memory_*() or set_direct_map_*() should expect a failure
> and be ready for that. So adding a WARN to safe_copy_page() is the first
> step in that direction :)
> 

I am probably missing something important, but why are we 
saving/restoring the content of pages that were explicitly removed from 
the identity mapping such that nobody will access them?

Pages that are not allocated should contain garbage or be zero 
(init_on_free). That should be easy to handle without ever reading the 
page content.

The other user seems to be vm_remove_mappings(), where we only 
*temporarily* remove the mapping - while hibernating, that code 
shouldn't be active anymore I guess - or we could protect it from happening.

As I expressed in another mail, secretmem pages should rather not be 
saved when hibernating - hibernation should be rather be disabled.

What am I missing?

-- 
Thanks,

David / dhildenb


^ permalink raw reply

* Re: [PATCH 0/4] arch, mm: improve robustness of direct map manipulation
From: Mike Rapoport @ 2020-10-27  8:38 UTC (permalink / raw)
  To: Edgecombe, Rick P
  Cc: david@redhat.com, peterz@infradead.org, catalin.marinas@arm.com,
	dave.hansen@linux.intel.com, linux-mm@kvack.org, paulus@samba.org,
	pavel@ucw.cz, hpa@zytor.com, sparclinux@vger.kernel.org,
	cl@linux.com, will@kernel.org, linux-riscv@lists.infradead.org,
	linux-s390@vger.kernel.org, x86@kernel.org, rppt@linux.ibm.com,
	borntraeger@de.ibm.com, mingo@redhat.com, rientjes@google.com,
	Brown, Len, aou@eecs.berkeley.edu, gor@linux.ibm.com,
	linux-pm@vger.kernel.org, hca@linux.ibm.com, bp@alien8.de,
	luto@kernel.org, paul.walmsley@sifive.com, kirill@shutemov.name,
	tglx@linutronix.de, iamjoonsoo.kim@lge.com,
	linux-arm-kernel@lists.infradead.org, rjw@rjwysocki.net,
	linux-kernel@vger.kernel.org, penberg@kernel.org,
	palmer@dabbelt.com, akpm@linux-foundation.org,
	linuxppc-dev@lists.ozlabs.org, davem@davemloft.net
In-Reply-To: <a0212b073b3b2f62c3dbf1bf398f03fa402997be.camel@intel.com>

On Mon, Oct 26, 2020 at 06:05:30PM +0000, Edgecombe, Rick P wrote:
> On Mon, 2020-10-26 at 11:05 +0200, Mike Rapoport wrote:
> > On Mon, Oct 26, 2020 at 01:13:52AM +0000, Edgecombe, Rick P wrote:
> > > On Sun, 2020-10-25 at 12:15 +0200, Mike Rapoport wrote:
> > > > Indeed, for architectures that define
> > > > CONFIG_ARCH_HAS_SET_DIRECT_MAP
> > > > it is
> > > > possible that __kernel_map_pages() would fail, but since this
> > > > function is
> > > > void, the failure will go unnoticed.
> > > 
> > > Could you elaborate on how this could happen? Do you mean during
> > > runtime today or if something new was introduced?
> > 
> > A failure in__kernel_map_pages() may happen today. For instance, on
> > x86
> > if the kernel is built with DEBUG_PAGEALLOC.
> > 
> >         __kernel_map_pages(page, 1, 0);
> > 
> > will need to split, say, 2M page and during the split an allocation
> > of
> > page table could fail.
> 
> On x86 at least, DEBUG_PAGEALLOC expects to never have to break a page
> on the direct map and even disables locking in cpa because it assumes
> this. If this is happening somehow anyway then we should probably fix
> that. Even if it's a debug feature, it will not be as useful if it is
> causing its own crashes.
> 
> I'm still wondering if there is something I'm missing here. It seems
> like you are saying there is a bug in some arch's, so let's add a WARN
> in cross-arch code to log it as it crashes. A warn and making things
> clearer seem like good ideas, but if there is a bug we should fix it.
> The code around the callers still functionally assume re-mapping can't
> fail.

Oh, I've meant x86 kernel *without* DEBUG_PAGEALLOC, and indeed the call
that unmaps pages back in safe_copy_page will just reset a 4K page to
NP because whatever made it NP at the first place already did the split.

Still, on arm64 with DEBUG_PAGEALLOC=n there is a possibility of a race
between map/unmap dance in __vunmap() and safe_copy_page() that may
cause access to unmapped memory:

__vunmap()
    vm_remove_mappings()
        set_direct_map_invalid()
					safe_copy_page()	
					    __kernel_map_pages()
					    	return
					    do_copy_page() -> fault
					   	
This is a theoretical bug, but it is still not nice :) 							

> > Currently, the only user of __kernel_map_pages() outside
> > DEBUG_PAGEALLOC
> > is hibernation, but I think it would be safer to entirely prevent
> > usage
> > of __kernel_map_pages() when DEBUG_PAGEALLOC=n.
> 
> I totally agree it's error prone FWIW. On x86, my mental model of how
> it is supposed to work is: If a page is 4k and NP it cannot fail to be
> remapped. set_direct_map_invalid_noflush() should result in 4k NP
> pages, and DEBUG_PAGEALLOC should result in all 4k pages on the direct
> map. Are you seeing this violated or do I have wrong assumptions?

You are right, there is a set of assumptions about the remapping of the
direct map pages that make it all work, at least on x86.
But this is very subtle and it's not easy to wrap one's head around
this.

That's why putting __kernel_map_pages() out of "common" use and
keep it only for DEBUG_PAGEALLOC would make things clearer.

> Beyond whatever you are seeing, for the latter case of new things
> getting introduced to an interface with hidden dependencies... Another
> edge case could be a new caller to set_memory_np() could result in
> large NP pages. None of the callers today should cause this AFAICT, but
> it's not great to rely on the callers to know these details.
 
A caller of set_memory_*() or set_direct_map_*() should expect a failure
and be ready for that. So adding a WARN to safe_copy_page() is the first
step in that direction :)

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH 5/5] powerpc/perf: use regs->nip when siar is zero
From: Madhavan Srinivasan @ 2020-10-27  2:31 UTC (permalink / raw)
  To: Michael Ellerman, Christophe Leroy; +Cc: atrajeev, linuxppc-dev
In-Reply-To: <87r1prxd9e.fsf@mpe.ellerman.id.au>


On 10/22/20 6:55 AM, Michael Ellerman wrote:
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>> Le 21/10/2020 à 10:53, Madhavan Srinivasan a écrit :
>>> In power10 DD1, there is an issue where the
>>> Sampled Instruction Address Register (SIAR)
>>> not latching to the sampled address during
>>> random sampling. This results in value of 0s
>>> in the SIAR. Patch adds a check to use regs->nip
>>> when SIAR is zero.
>> Why not use regs->nip at all time in that case, and not read SPRN_SIAR at all ?
> Yeah that's a reasonable question.
>
> I can't really find anywhere in the ISA that explains it.
>
> I believe the main (or only?) reason is that interrupts might be
> disabled when the PMU samples the instruction. So in that case the SIAR
> will point at an instruction somewhere in interrupts-off code, whereas
> the NIP will point to the location where we re-enabled interrupts and
> took the PMU interrupt.

sorry for the delayed response, was out.

thats correct and also we see SIAR zeroing only for some of
the events. We still want to use the SIAR address for
the sample creation and use nip only if it  zero.

Maddy


>
> cheers

^ permalink raw reply

* Re: [PATCH] ibmvfc: add new fields for version 2 of several MADs
From: Martin K. Petersen @ 2020-10-27  1:56 UTC (permalink / raw)
  To: Tyrel Datwyler
  Cc: martin.petersen, linux-scsi, linux-kernel, james.bottomley,
	brking, linuxppc-dev
In-Reply-To: <20201026013649.10147-1-tyreld@linux.ibm.com>


Tyrel,

> Introduce a targetWWPN field to several MADs. Its possible that a scsi
> ID of a target can change due to some fabric changes. The WWPN of the
> scsi target provides a better way to identify the target. Also, add
> flags for receiving MAD versioning information and advertising client
> support for targetWWPN with the VIOS. This latter capability flag will
> be required for future clients capable of requesting multiple hardware
> queues from the host adapter.

Applied to 5.11/scsi-staging, thanks!

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 2/4] PM: hibernate: improve robustness of mapping pages in the direct map
From: Edgecombe, Rick P @ 2020-10-27  1:10 UTC (permalink / raw)
  To: rppt@kernel.org
  Cc: david@redhat.com, peterz@infradead.org, catalin.marinas@arm.com,
	dave.hansen@linux.intel.com, linux-mm@kvack.org, paulus@samba.org,
	pavel@ucw.cz, hpa@zytor.com, sparclinux@vger.kernel.org,
	cl@linux.com, will@kernel.org, linux-riscv@lists.infradead.org,
	linux-s390@vger.kernel.org, x86@kernel.org, rppt@linux.ibm.com,
	borntraeger@de.ibm.com, mingo@redhat.com, rientjes@google.com,
	Brown, Len, aou@eecs.berkeley.edu, gor@linux.ibm.com,
	linux-pm@vger.kernel.org, hca@linux.ibm.com, bp@alien8.de,
	luto@kernel.org, paul.walmsley@sifive.com, kirill@shutemov.name,
	tglx@linutronix.de, iamjoonsoo.kim@lge.com,
	linux-arm-kernel@lists.infradead.org, rjw@rjwysocki.net,
	linux-kernel@vger.kernel.org, penberg@kernel.org,
	palmer@dabbelt.com, akpm@linux-foundation.org,
	linuxppc-dev@lists.ozlabs.org, davem@davemloft.net
In-Reply-To: <20201026091554.GB1154158@kernel.org>

On Mon, 2020-10-26 at 11:15 +0200, Mike Rapoport wrote:
> The intention of this series is to disallow usage of
> __kernel_map_pages() when DEBUG_PAGEALLOC=n. I'll update this patch
> to
> better handle possible errors, but I still want to keep WARN in the
> caller.

Sorry, I missed this snippet at the bottom, that your intention was to
actually handle errors somehow in the hibernate code. Please ignore my
other comment that assumed you wanted to just add a warn.

^ permalink raw reply

* [PATCH AUTOSEL 4.4 21/25] KVM: PPC: Book3S HV: Do not allocate HPT for a nested guest
From: Sasha Levin @ 2020-10-27  0:11 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, Fabiano Rosas, Greg Kurz, kvm-ppc,
	Satheesh Rajendran, linuxppc-dev, David Gibson
In-Reply-To: <20201027001123.1027642-1-sashal@kernel.org>

From: Fabiano Rosas <farosas@linux.ibm.com>

[ Upstream commit 05e6295dc7de859c9d56334805485c4d20bebf25 ]

The current nested KVM code does not support HPT guests. This is
informed/enforced in some ways:

- Hosts < P9 will not be able to enable the nested HV feature;

- The nested hypervisor MMU capabilities will not contain
  KVM_CAP_PPC_MMU_HASH_V3;

- QEMU reflects the MMU capabilities in the
  'ibm,arch-vec-5-platform-support' device-tree property;

- The nested guest, at 'prom_parse_mmu_model' ignores the
  'disable_radix' kernel command line option if HPT is not supported;

- The KVM_PPC_CONFIGURE_V3_MMU ioctl will fail if trying to use HPT.

There is, however, still a way to start a HPT guest by using
max-compat-cpu=power8 at the QEMU machine options. This leads to the
guest being set to use hash after QEMU calls the KVM_PPC_ALLOCATE_HTAB
ioctl.

With the guest set to hash, the nested hypervisor goes through the
entry path that has no knowledge of nesting (kvmppc_run_vcpu) and
crashes when it tries to execute an hypervisor-privileged (mtspr
HDEC) instruction at __kvmppc_vcore_entry:

root@L1:~ $ qemu-system-ppc64 -machine pseries,max-cpu-compat=power8 ...

<snip>
[  538.543303] CPU: 83 PID: 25185 Comm: CPU 0/KVM Not tainted 5.9.0-rc4 #1
[  538.543355] NIP:  c00800000753f388 LR: c00800000753f368 CTR: c0000000001e5ec0
[  538.543417] REGS: c0000013e91e33b0 TRAP: 0700   Not tainted  (5.9.0-rc4)
[  538.543470] MSR:  8000000002843033 <SF,VEC,VSX,FP,ME,IR,DR,RI,LE>  CR: 22422882  XER: 20040000
[  538.543546] CFAR: c00800000753f4b0 IRQMASK: 3
               GPR00: c0080000075397a0 c0000013e91e3640 c00800000755e600 0000000080000000
               GPR04: 0000000000000000 c0000013eab19800 c000001394de0000 00000043a054db72
               GPR08: 00000000003b1652 0000000000000000 0000000000000000 c0080000075502e0
               GPR12: c0000000001e5ec0 c0000007ffa74200 c0000013eab19800 0000000000000008
               GPR16: 0000000000000000 c00000139676c6c0 c000000001d23948 c0000013e91e38b8
               GPR20: 0000000000000053 0000000000000000 0000000000000001 0000000000000000
               GPR24: 0000000000000001 0000000000000001 0000000000000000 0000000000000001
               GPR28: 0000000000000001 0000000000000053 c0000013eab19800 0000000000000001
[  538.544067] NIP [c00800000753f388] __kvmppc_vcore_entry+0x90/0x104 [kvm_hv]
[  538.544121] LR [c00800000753f368] __kvmppc_vcore_entry+0x70/0x104 [kvm_hv]
[  538.544173] Call Trace:
[  538.544196] [c0000013e91e3640] [c0000013e91e3680] 0xc0000013e91e3680 (unreliable)
[  538.544260] [c0000013e91e3820] [c0080000075397a0] kvmppc_run_core+0xbc8/0x19d0 [kvm_hv]
[  538.544325] [c0000013e91e39e0] [c00800000753d99c] kvmppc_vcpu_run_hv+0x404/0xc00 [kvm_hv]
[  538.544394] [c0000013e91e3ad0] [c0080000072da4fc] kvmppc_vcpu_run+0x34/0x48 [kvm]
[  538.544472] [c0000013e91e3af0] [c0080000072d61b8] kvm_arch_vcpu_ioctl_run+0x310/0x420 [kvm]
[  538.544539] [c0000013e91e3b80] [c0080000072c7450] kvm_vcpu_ioctl+0x298/0x778 [kvm]
[  538.544605] [c0000013e91e3ce0] [c0000000004b8c2c] sys_ioctl+0x1dc/0xc90
[  538.544662] [c0000013e91e3dc0] [c00000000002f9a4] system_call_exception+0xe4/0x1c0
[  538.544726] [c0000013e91e3e20] [c00000000000d140] system_call_common+0xf0/0x27c
[  538.544787] Instruction dump:
[  538.544821] f86d1098 60000000 60000000 48000099 e8ad0fe8 e8c500a0 e9264140 75290002
[  538.544886] 7d1602a6 7cec42a6 40820008 7d0807b4 <7d164ba6> 7d083a14 f90d10a0 480104fd
[  538.544953] ---[ end trace 74423e2b948c2e0c ]---

This patch makes the KVM_PPC_ALLOCATE_HTAB ioctl fail when running in
the nested hypervisor, causing QEMU to abort.

Reported-by: Satheesh Rajendran <sathnaga@linux.vnet.ibm.com>
Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/kvm/book3s_hv.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 54c6ba87a25ad..b005ce9dc8f04 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3157,6 +3157,12 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
 	case KVM_PPC_ALLOCATE_HTAB: {
 		u32 htab_order;
 
+		/* If we're a nested hypervisor, we currently only support radix */
+		if (kvmhv_on_pseries()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+
 		r = -EFAULT;
 		if (get_user(htab_order, (u32 __user *)argp))
 			break;
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 4.4 01/25] powerpc/powernv/smp: Fix spurious DBG() warning
From: Sasha Levin @ 2020-10-27  0:10 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, linuxppc-dev, Oliver O'Halloran, Joel Stanley

From: Oliver O'Halloran <oohall@gmail.com>

[ Upstream commit f6bac19cf65c5be21d14a0c9684c8f560f2096dd ]

When building with W=1 we get the following warning:

 arch/powerpc/platforms/powernv/smp.c: In function ‘pnv_smp_cpu_kill_self’:
 arch/powerpc/platforms/powernv/smp.c:276:16: error: suggest braces around
 	empty body in an ‘if’ statement [-Werror=empty-body]
   276 |      cpu, srr1);
       |                ^
 cc1: all warnings being treated as errors

The full context is this block:

 if (srr1 && !generic_check_cpu_restart(cpu))
 	DBG("CPU%d Unexpected exit while offline srr1=%lx!\n",
 			cpu, srr1);

When building with DEBUG undefined DBG() expands to nothing and GCC emits
the warning due to the lack of braces around an empty statement.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200804005410.146094-2-oohall@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/platforms/powernv/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index ad7b1a3dbed09..c605c78a80896 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -41,7 +41,7 @@
 #include <asm/udbg.h>
 #define DBG(fmt...) udbg_printf(fmt)
 #else
-#define DBG(fmt...)
+#define DBG(fmt...) do { } while (0)
 #endif
 
 static void pnv_smp_setup_cpu(int cpu)
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 4.9 01/30] powerpc/powernv/smp: Fix spurious DBG() warning
From: Sasha Levin @ 2020-10-27  0:10 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, linuxppc-dev, Oliver O'Halloran, Joel Stanley

From: Oliver O'Halloran <oohall@gmail.com>

[ Upstream commit f6bac19cf65c5be21d14a0c9684c8f560f2096dd ]

When building with W=1 we get the following warning:

 arch/powerpc/platforms/powernv/smp.c: In function ‘pnv_smp_cpu_kill_self’:
 arch/powerpc/platforms/powernv/smp.c:276:16: error: suggest braces around
 	empty body in an ‘if’ statement [-Werror=empty-body]
   276 |      cpu, srr1);
       |                ^
 cc1: all warnings being treated as errors

The full context is this block:

 if (srr1 && !generic_check_cpu_restart(cpu))
 	DBG("CPU%d Unexpected exit while offline srr1=%lx!\n",
 			cpu, srr1);

When building with DEBUG undefined DBG() expands to nothing and GCC emits
the warning due to the lack of braces around an empty statement.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200804005410.146094-2-oohall@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/platforms/powernv/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index eec0e8d0454d1..7e0f5fa0452b3 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -41,7 +41,7 @@
 #include <asm/udbg.h>
 #define DBG(fmt...) udbg_printf(fmt)
 #else
-#define DBG(fmt...)
+#define DBG(fmt...) do { } while (0)
 #endif
 
 static void pnv_smp_setup_cpu(int cpu)
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 4.14 02/46] powerpc: select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
From: Sasha Levin @ 2020-10-27  0:09 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: Sasha Levin, linuxppc-dev, Nicholas Piggin
In-Reply-To: <20201027000946.1026923-1-sashal@kernel.org>

From: Nicholas Piggin <npiggin@gmail.com>

[ Upstream commit 66acd46080bd9e5ad2be4b0eb1d498d5145d058e ]

powerpc uses IPIs in some situations to switch a kernel thread away
from a lazy tlb mm, which is subject to the TLB flushing race
described in the changelog introducing ARCH_WANT_IRQS_OFF_ACTIVATE_MM.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200914045219.3736466-3-npiggin@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/Kconfig                   | 1 +
 arch/powerpc/include/asm/mmu_context.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 679e1e3c16953..7cc91d7f893cf 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -154,6 +154,7 @@ config PPC
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
 	select ARCH_WANT_IPC_PARSE_VERSION
+	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	select ARCH_WEAK_RELEASE_ACQUIRE
 	select BINFMT_ELF
 	select BUILDTIME_EXTABLE_SORT
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 6f67ff5a52672..5f9ad4f4b9c0f 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -101,7 +101,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
  */
 static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
-	switch_mm(prev, next, current);
+	switch_mm_irqs_off(prev, next, current);
 }
 
 /* We don't currently use enter_lazy_tlb() for anything */
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 4.14 01/46] powerpc/powernv/smp: Fix spurious DBG() warning
From: Sasha Levin @ 2020-10-27  0:09 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, linuxppc-dev, Oliver O'Halloran, Joel Stanley

From: Oliver O'Halloran <oohall@gmail.com>

[ Upstream commit f6bac19cf65c5be21d14a0c9684c8f560f2096dd ]

When building with W=1 we get the following warning:

 arch/powerpc/platforms/powernv/smp.c: In function ‘pnv_smp_cpu_kill_self’:
 arch/powerpc/platforms/powernv/smp.c:276:16: error: suggest braces around
 	empty body in an ‘if’ statement [-Werror=empty-body]
   276 |      cpu, srr1);
       |                ^
 cc1: all warnings being treated as errors

The full context is this block:

 if (srr1 && !generic_check_cpu_restart(cpu))
 	DBG("CPU%d Unexpected exit while offline srr1=%lx!\n",
 			cpu, srr1);

When building with DEBUG undefined DBG() expands to nothing and GCC emits
the warning due to the lack of braces around an empty statement.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200804005410.146094-2-oohall@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/platforms/powernv/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index c17f81e433f7d..11d8fde770c38 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -44,7 +44,7 @@
 #include <asm/udbg.h>
 #define DBG(fmt...) udbg_printf(fmt)
 #else
-#define DBG(fmt...)
+#define DBG(fmt...) do { } while (0)
 #endif
 
 static void pnv_smp_setup_cpu(int cpu)
-- 
2.25.1


^ permalink raw reply related

* Re: [REGRESSION] mm: process_vm_readv testcase no longer works after compat_prcoess_vm_readv removed
From: Jens Axboe @ 2020-10-27  0:09 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, Christoph Hellwig, linux-arch, linux-s390, linux-scsi,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	linux-security-module, Robert O'Callahan, Linus Torvalds,
	Arnd Bergmann, linux-block, io-uring, moderated list:ARM PORT,
	linux-parisc, netdev, open list, Kyle Huey,
	open list:FILESYSTEMS (VFS and infrastructure), Andrew Morton,
	linuxppc-dev
In-Reply-To: <20201027000521.GD3576660@ZenIV.linux.org.uk>

On 10/26/20 6:05 PM, Al Viro wrote:
> On Mon, Oct 26, 2020 at 05:56:11PM -0600, Jens Axboe wrote:
>> On 10/26/20 4:55 PM, Kyle Huey wrote:
>>> A test program from the rr[0] test suite, vm_readv_writev[1], no
>>> longer works on 5.10-rc1 when compiled as a 32 bit binary and executed
>>> on a 64 bit kernel. The first process_vm_readv call (on line 35) now
>>> fails with EFAULT. I have bisected this to
>>> c3973b401ef2b0b8005f8074a10e96e3ea093823.
>>>
>>> It should be fairly straightforward to extract the test case from our
>>> repository into a standalone program.
>>
>> Can you check with this applied?
>>
>> diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
>> index fd12da80b6f2..05676722d9cd 100644
>> --- a/mm/process_vm_access.c
>> +++ b/mm/process_vm_access.c
>> @@ -273,7 +273,8 @@ static ssize_t process_vm_rw(pid_t pid,
>>  		return rc;
>>  	if (!iov_iter_count(&iter))
>>  		goto free_iov_l;
>> -	iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r, false);
>> +	iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r,
>> +				in_compat_syscall());
> 
> _ouch_
> 
> There's a bug, all right, but I'm not sure that this is all there is
> to it. For now it's probably the right fix, but...  Consider the fun
> trying to use that from 32bit process to access the memory of 64bit
> one.  IOW, we might want to add an explicit flag for "force 64bit
> addresses/sizes in rvec".

Ouch yes good point, nice catch.

-- 
Jens Axboe


^ permalink raw reply

* Re: [REGRESSION] mm: process_vm_readv testcase no longer works after compat_prcoess_vm_readv removed
From: Al Viro @ 2020-10-27  0:05 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, Christoph Hellwig, linux-arch, linux-s390, linux-scsi,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	linux-security-module, Robert O'Callahan, Linus Torvalds,
	Arnd Bergmann, linux-block, io-uring, moderated list:ARM PORT,
	linux-parisc, netdev, open list, Kyle Huey,
	open list:FILESYSTEMS (VFS and infrastructure), Andrew Morton,
	linuxppc-dev
In-Reply-To: <70d5569e-4ad6-988a-e047-5d12d298684c@kernel.dk>

On Mon, Oct 26, 2020 at 05:56:11PM -0600, Jens Axboe wrote:
> On 10/26/20 4:55 PM, Kyle Huey wrote:
> > A test program from the rr[0] test suite, vm_readv_writev[1], no
> > longer works on 5.10-rc1 when compiled as a 32 bit binary and executed
> > on a 64 bit kernel. The first process_vm_readv call (on line 35) now
> > fails with EFAULT. I have bisected this to
> > c3973b401ef2b0b8005f8074a10e96e3ea093823.
> > 
> > It should be fairly straightforward to extract the test case from our
> > repository into a standalone program.
> 
> Can you check with this applied?
> 
> diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
> index fd12da80b6f2..05676722d9cd 100644
> --- a/mm/process_vm_access.c
> +++ b/mm/process_vm_access.c
> @@ -273,7 +273,8 @@ static ssize_t process_vm_rw(pid_t pid,
>  		return rc;
>  	if (!iov_iter_count(&iter))
>  		goto free_iov_l;
> -	iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r, false);
> +	iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r,
> +				in_compat_syscall());

_ouch_

There's a bug, all right, but I'm not sure that this is all there is to it.
For now it's probably the right fix, but...  Consider the fun trying to
use that from 32bit process to access the memory of 64bit one.  IOW, we
might want to add an explicit flag for "force 64bit addresses/sizes
in rvec".

^ permalink raw reply

* Re: [REGRESSION] mm: process_vm_readv testcase no longer works after compat_prcoess_vm_readv removed
From: Jens Axboe @ 2020-10-26 23:56 UTC (permalink / raw)
  To: Kyle Huey, open list, Christoph Hellwig
  Cc: linux-aio, David Howells, linux-mm, keyrings, sparclinux,
	Robert O'Callahan, linux-arch, linux-s390, linux-scsi,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT), Linus Torvalds,
	Arnd Bergmann, linux-block, Alexander Viro, io-uring,
	moderated list:ARM PORT, linux-parisc, netdev, linux-mips,
	linux-security-module,
	open list:FILESYSTEMS (VFS and infrastructure), Andrew Morton,
	linuxppc-dev
In-Reply-To: <CAP045Aqrsb=CXHDHx4nS-pgg+MUDj14r-kN8_Jcbn-NAUziVag@mail.gmail.com>

On 10/26/20 4:55 PM, Kyle Huey wrote:
> A test program from the rr[0] test suite, vm_readv_writev[1], no
> longer works on 5.10-rc1 when compiled as a 32 bit binary and executed
> on a 64 bit kernel. The first process_vm_readv call (on line 35) now
> fails with EFAULT. I have bisected this to
> c3973b401ef2b0b8005f8074a10e96e3ea093823.
> 
> It should be fairly straightforward to extract the test case from our
> repository into a standalone program.

Can you check with this applied?

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index fd12da80b6f2..05676722d9cd 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -273,7 +273,8 @@ static ssize_t process_vm_rw(pid_t pid,
 		return rc;
 	if (!iov_iter_count(&iter))
 		goto free_iov_l;
-	iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r, false);
+	iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r,
+				in_compat_syscall());
 	if (IS_ERR(iov_r)) {
 		rc = PTR_ERR(iov_r);
 		goto free_iov_l;

-- 
Jens Axboe


^ permalink raw reply related

* [PATCH AUTOSEL 4.19 02/60] powerpc: select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
From: Sasha Levin @ 2020-10-27  0:03 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: Sasha Levin, linuxppc-dev, Nicholas Piggin
In-Reply-To: <20201027000415.1026364-1-sashal@kernel.org>

From: Nicholas Piggin <npiggin@gmail.com>

[ Upstream commit 66acd46080bd9e5ad2be4b0eb1d498d5145d058e ]

powerpc uses IPIs in some situations to switch a kernel thread away
from a lazy tlb mm, which is subject to the TLB flushing race
described in the changelog introducing ARCH_WANT_IRQS_OFF_ACTIVATE_MM.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200914045219.3736466-3-npiggin@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/Kconfig                   | 1 +
 arch/powerpc/include/asm/mmu_context.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f38d153d25861..0bc53f0e37c0f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -152,6 +152,7 @@ config PPC
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
 	select ARCH_WANT_IPC_PARSE_VERSION
+	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	select ARCH_WEAK_RELEASE_ACQUIRE
 	select BINFMT_ELF
 	select BUILDTIME_EXTABLE_SORT
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index ae953958c0f33..d93bdcaa4a469 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -204,7 +204,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
  */
 static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
-	switch_mm(prev, next, current);
+	switch_mm_irqs_off(prev, next, current);
 }
 
 /* We don't currently use enter_lazy_tlb() for anything */
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 4.19 01/60] powerpc/powernv/smp: Fix spurious DBG() warning
From: Sasha Levin @ 2020-10-27  0:03 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, linuxppc-dev, Oliver O'Halloran, Joel Stanley

From: Oliver O'Halloran <oohall@gmail.com>

[ Upstream commit f6bac19cf65c5be21d14a0c9684c8f560f2096dd ]

When building with W=1 we get the following warning:

 arch/powerpc/platforms/powernv/smp.c: In function ‘pnv_smp_cpu_kill_self’:
 arch/powerpc/platforms/powernv/smp.c:276:16: error: suggest braces around
 	empty body in an ‘if’ statement [-Werror=empty-body]
   276 |      cpu, srr1);
       |                ^
 cc1: all warnings being treated as errors

The full context is this block:

 if (srr1 && !generic_check_cpu_restart(cpu))
 	DBG("CPU%d Unexpected exit while offline srr1=%lx!\n",
 			cpu, srr1);

When building with DEBUG undefined DBG() expands to nothing and GCC emits
the warning due to the lack of braces around an empty statement.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200804005410.146094-2-oohall@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/platforms/powernv/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 8d49ba370c504..889c3dbec6fb9 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -47,7 +47,7 @@
 #include <asm/udbg.h>
 #define DBG(fmt...) udbg_printf(fmt)
 #else
-#define DBG(fmt...)
+#define DBG(fmt...) do { } while (0)
 #endif
 
 static void pnv_smp_setup_cpu(int cpu)
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.4 67/80] KVM: PPC: Book3S HV: Do not allocate HPT for a nested guest
From: Sasha Levin @ 2020-10-26 23:55 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, Fabiano Rosas, Greg Kurz, kvm-ppc,
	Satheesh Rajendran, linuxppc-dev, David Gibson
In-Reply-To: <20201026235516.1025100-1-sashal@kernel.org>

From: Fabiano Rosas <farosas@linux.ibm.com>

[ Upstream commit 05e6295dc7de859c9d56334805485c4d20bebf25 ]

The current nested KVM code does not support HPT guests. This is
informed/enforced in some ways:

- Hosts < P9 will not be able to enable the nested HV feature;

- The nested hypervisor MMU capabilities will not contain
  KVM_CAP_PPC_MMU_HASH_V3;

- QEMU reflects the MMU capabilities in the
  'ibm,arch-vec-5-platform-support' device-tree property;

- The nested guest, at 'prom_parse_mmu_model' ignores the
  'disable_radix' kernel command line option if HPT is not supported;

- The KVM_PPC_CONFIGURE_V3_MMU ioctl will fail if trying to use HPT.

There is, however, still a way to start a HPT guest by using
max-compat-cpu=power8 at the QEMU machine options. This leads to the
guest being set to use hash after QEMU calls the KVM_PPC_ALLOCATE_HTAB
ioctl.

With the guest set to hash, the nested hypervisor goes through the
entry path that has no knowledge of nesting (kvmppc_run_vcpu) and
crashes when it tries to execute an hypervisor-privileged (mtspr
HDEC) instruction at __kvmppc_vcore_entry:

root@L1:~ $ qemu-system-ppc64 -machine pseries,max-cpu-compat=power8 ...

<snip>
[  538.543303] CPU: 83 PID: 25185 Comm: CPU 0/KVM Not tainted 5.9.0-rc4 #1
[  538.543355] NIP:  c00800000753f388 LR: c00800000753f368 CTR: c0000000001e5ec0
[  538.543417] REGS: c0000013e91e33b0 TRAP: 0700   Not tainted  (5.9.0-rc4)
[  538.543470] MSR:  8000000002843033 <SF,VEC,VSX,FP,ME,IR,DR,RI,LE>  CR: 22422882  XER: 20040000
[  538.543546] CFAR: c00800000753f4b0 IRQMASK: 3
               GPR00: c0080000075397a0 c0000013e91e3640 c00800000755e600 0000000080000000
               GPR04: 0000000000000000 c0000013eab19800 c000001394de0000 00000043a054db72
               GPR08: 00000000003b1652 0000000000000000 0000000000000000 c0080000075502e0
               GPR12: c0000000001e5ec0 c0000007ffa74200 c0000013eab19800 0000000000000008
               GPR16: 0000000000000000 c00000139676c6c0 c000000001d23948 c0000013e91e38b8
               GPR20: 0000000000000053 0000000000000000 0000000000000001 0000000000000000
               GPR24: 0000000000000001 0000000000000001 0000000000000000 0000000000000001
               GPR28: 0000000000000001 0000000000000053 c0000013eab19800 0000000000000001
[  538.544067] NIP [c00800000753f388] __kvmppc_vcore_entry+0x90/0x104 [kvm_hv]
[  538.544121] LR [c00800000753f368] __kvmppc_vcore_entry+0x70/0x104 [kvm_hv]
[  538.544173] Call Trace:
[  538.544196] [c0000013e91e3640] [c0000013e91e3680] 0xc0000013e91e3680 (unreliable)
[  538.544260] [c0000013e91e3820] [c0080000075397a0] kvmppc_run_core+0xbc8/0x19d0 [kvm_hv]
[  538.544325] [c0000013e91e39e0] [c00800000753d99c] kvmppc_vcpu_run_hv+0x404/0xc00 [kvm_hv]
[  538.544394] [c0000013e91e3ad0] [c0080000072da4fc] kvmppc_vcpu_run+0x34/0x48 [kvm]
[  538.544472] [c0000013e91e3af0] [c0080000072d61b8] kvm_arch_vcpu_ioctl_run+0x310/0x420 [kvm]
[  538.544539] [c0000013e91e3b80] [c0080000072c7450] kvm_vcpu_ioctl+0x298/0x778 [kvm]
[  538.544605] [c0000013e91e3ce0] [c0000000004b8c2c] sys_ioctl+0x1dc/0xc90
[  538.544662] [c0000013e91e3dc0] [c00000000002f9a4] system_call_exception+0xe4/0x1c0
[  538.544726] [c0000013e91e3e20] [c00000000000d140] system_call_common+0xf0/0x27c
[  538.544787] Instruction dump:
[  538.544821] f86d1098 60000000 60000000 48000099 e8ad0fe8 e8c500a0 e9264140 75290002
[  538.544886] 7d1602a6 7cec42a6 40820008 7d0807b4 <7d164ba6> 7d083a14 f90d10a0 480104fd
[  538.544953] ---[ end trace 74423e2b948c2e0c ]---

This patch makes the KVM_PPC_ALLOCATE_HTAB ioctl fail when running in
the nested hypervisor, causing QEMU to abort.

Reported-by: Satheesh Rajendran <sathnaga@linux.vnet.ibm.com>
Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/kvm/book3s_hv.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e2183fed947d4..dd9b19b1f459a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5191,6 +5191,12 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
 	case KVM_PPC_ALLOCATE_HTAB: {
 		u32 htab_order;
 
+		/* If we're a nested hypervisor, we currently only support radix */
+		if (kvmhv_on_pseries()) {
+			r = -EOPNOTSUPP;
+			break;
+		}
+
 		r = -EFAULT;
 		if (get_user(htab_order, (u32 __user *)argp))
 			break;
-- 
2.25.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox