LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 03/14] swiotlb: move orig addr and size validation into swiotlb_bounce
From: Christoph Hellwig @ 2021-03-01  7:44 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: iommu, xen-devel, Claire Chang, linuxppc-dev, Dongli Zhang
In-Reply-To: <20210301074436.919889-1-hch@lst.de>

Move the code to find and validate the original buffer address and size
from the callers into swiotlb_bounce.  This means a tiny bit of extra
work in the swiotlb_map path, but avoids code duplication and leads to
a better code structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 59 +++++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 36 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 03aa614565e417..a9063092f6f566 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -460,12 +460,25 @@ void __init swiotlb_exit(void)
 /*
  * Bounce: copy the swiotlb buffer from or back to the original dma location
  */
-static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
-			   size_t size, enum dma_data_direction dir)
+static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
+		enum dma_data_direction dir)
 {
+	int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
+	size_t alloc_size = io_tlb_alloc_size[index];
+	phys_addr_t orig_addr = io_tlb_orig_addr[index];
 	unsigned long pfn = PFN_DOWN(orig_addr);
 	unsigned char *vaddr = phys_to_virt(tlb_addr);
 
+	if (orig_addr == INVALID_PHYS_ADDR)
+		return;
+
+	if (size > alloc_size) {
+		dev_WARN_ONCE(dev, 1,
+			"Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n",
+			alloc_size, size);
+		size = alloc_size;
+	}
+
 	if (PageHighMem(pfn_to_page(pfn))) {
 		/* The buffer does not have a mapping.  Map it in and copy */
 		unsigned int offset = orig_addr & ~PAGE_MASK;
@@ -644,21 +657,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	tlb_addr = slot_addr(io_tlb_start, index) + offset;
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
 	    (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
-		swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
+		swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
 	return tlb_addr;
 }
 
-static void validate_sync_size_and_truncate(struct device *hwdev, size_t alloc_size, size_t *size)
-{
-	if (*size > alloc_size) {
-		/* Warn and truncate mapping_size */
-		dev_WARN_ONCE(hwdev, 1,
-			"Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n",
-			alloc_size, *size);
-		*size = alloc_size;
-	}
-}
-
 /*
  * tlb_addr is the physical address of the bounce buffer to unmap.
  */
@@ -669,19 +671,15 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 	unsigned long flags;
 	unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
 	int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT;
-	phys_addr_t orig_addr = io_tlb_orig_addr[index];
-	size_t alloc_size = io_tlb_alloc_size[index];
-	int i, count, nslots = nr_slots(alloc_size + offset);
-
-	validate_sync_size_and_truncate(hwdev, alloc_size, &mapping_size);
+	int nslots = nr_slots(io_tlb_alloc_size[index] + offset);
+	int count, i;
 
 	/*
 	 * First, sync the memory before unmapping the entry
 	 */
-	if (orig_addr != INVALID_PHYS_ADDR &&
-	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
-	    ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
-		swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+	    (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+		swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
 
 	/*
 	 * Return the buffer to the free list by setting the corresponding
@@ -721,27 +719,16 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
 			     size_t size, enum dma_data_direction dir,
 			     enum dma_sync_target target)
 {
-	int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
-	size_t alloc_size = io_tlb_alloc_size[index];
-	phys_addr_t orig_addr = io_tlb_orig_addr[index];
-
-	if (orig_addr == INVALID_PHYS_ADDR)
-		return;
-
-	validate_sync_size_and_truncate(hwdev, alloc_size, &size);
-
 	switch (target) {
 	case SYNC_FOR_CPU:
 		if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
-			swiotlb_bounce(orig_addr, tlb_addr,
-				       size, DMA_FROM_DEVICE);
+			swiotlb_bounce(hwdev, tlb_addr, size, DMA_FROM_DEVICE);
 		else
 			BUG_ON(dir != DMA_TO_DEVICE);
 		break;
 	case SYNC_FOR_DEVICE:
 		if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
-			swiotlb_bounce(orig_addr, tlb_addr,
-				       size, DMA_TO_DEVICE);
+			swiotlb_bounce(hwdev, tlb_addr, size, DMA_TO_DEVICE);
 		else
 			BUG_ON(dir != DMA_FROM_DEVICE);
 		break;
-- 
2.29.2


^ permalink raw reply related

* [PATCH 02/14] swiotlb: remove the alloc_size parameter to swiotlb_tbl_unmap_single
From: Christoph Hellwig @ 2021-03-01  7:44 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: iommu, xen-devel, Claire Chang, linuxppc-dev, Dongli Zhang
In-Reply-To: <20210301074436.919889-1-hch@lst.de>

Now that swiotlb remembers the allocation size there is no need to pass
it back to swiotlb_tbl_unmap_single.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/iommu/dma-iommu.c | 11 +++-------
 drivers/xen/swiotlb-xen.c |  4 ++--
 include/linux/swiotlb.h   |  1 -
 kernel/dma/direct.h       |  2 +-
 kernel/dma/swiotlb.c      | 45 ++++++++++++++++++++-------------------
 5 files changed, 29 insertions(+), 34 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 9ab6ee22c11088..da2bd8f0885e6e 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -493,8 +493,6 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr,
 		unsigned long attrs)
 {
 	struct iommu_domain *domain = iommu_get_dma_domain(dev);
-	struct iommu_dma_cookie *cookie = domain->iova_cookie;
-	struct iova_domain *iovad = &cookie->iovad;
 	phys_addr_t phys;
 
 	phys = iommu_iova_to_phys(domain, dma_addr);
@@ -504,8 +502,7 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr,
 	__iommu_dma_unmap(dev, dma_addr, size);
 
 	if (unlikely(is_swiotlb_buffer(phys)))
-		swiotlb_tbl_unmap_single(dev, phys, size,
-				iova_align(iovad, size), dir, attrs);
+		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
 
 static bool dev_is_untrusted(struct device *dev)
@@ -580,10 +577,8 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys,
 	}
 
 	iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask);
-	if ((iova == DMA_MAPPING_ERROR) && is_swiotlb_buffer(phys))
-		swiotlb_tbl_unmap_single(dev, phys, org_size,
-				aligned_size, dir, attrs);
-
+	if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys))
+		swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs);
 	return iova;
 }
 
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 2b385c1b4a99cb..d47f1b311caac0 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -406,7 +406,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
 	 * Ensure that the address returned is DMA'ble
 	 */
 	if (unlikely(!dma_capable(dev, dev_addr, size, true))) {
-		swiotlb_tbl_unmap_single(dev, map, size, size, dir,
+		swiotlb_tbl_unmap_single(dev, map, size, dir,
 				attrs | DMA_ATTR_SKIP_CPU_SYNC);
 		return DMA_MAPPING_ERROR;
 	}
@@ -445,7 +445,7 @@ static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
 
 	/* NOTE: We use dev_addr here, not paddr! */
 	if (is_xen_swiotlb_buffer(hwdev, dev_addr))
-		swiotlb_tbl_unmap_single(hwdev, paddr, size, size, dir, attrs);
+		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
 }
 
 static void
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 5857a937c63722..59f421d041ed9e 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -57,7 +57,6 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
 extern void swiotlb_tbl_unmap_single(struct device *hwdev,
 				     phys_addr_t tlb_addr,
 				     size_t mapping_size,
-				     size_t alloc_size,
 				     enum dma_data_direction dir,
 				     unsigned long attrs);
 
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index b9861557873768..e1bf721591c0cf 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -114,6 +114,6 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
 
 	if (unlikely(is_swiotlb_buffer(phys)))
-		swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
 #endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index c10e855a03bc16..03aa614565e417 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -102,7 +102,7 @@ static phys_addr_t *io_tlb_orig_addr;
 /*
  * The mapped buffer's size should be validated during a sync operation.
  */
-static size_t *io_tlb_orig_size;
+static size_t *io_tlb_alloc_size;
 
 /*
  * Protect the above data structures in the map and unmap calls
@@ -253,15 +253,15 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 		      __func__, alloc_size, PAGE_SIZE);
 
 	alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t));
-	io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE);
-	if (!io_tlb_orig_size)
+	io_tlb_alloc_size = memblock_alloc(alloc_size, PAGE_SIZE);
+	if (!io_tlb_alloc_size)
 		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
 		      __func__, alloc_size, PAGE_SIZE);
 
 	for (i = 0; i < io_tlb_nslabs; i++) {
 		io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
 		io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
-		io_tlb_orig_size[i] = 0;
+		io_tlb_alloc_size[i] = 0;
 	}
 	io_tlb_index = 0;
 	no_iotlb_memory = false;
@@ -393,18 +393,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	if (!io_tlb_orig_addr)
 		goto cleanup4;
 
-	io_tlb_orig_size = (size_t *)
+	io_tlb_alloc_size = (size_t *)
 		__get_free_pages(GFP_KERNEL,
 				 get_order(io_tlb_nslabs *
 					   sizeof(size_t)));
-	if (!io_tlb_orig_size)
+	if (!io_tlb_alloc_size)
 		goto cleanup5;
 
 
 	for (i = 0; i < io_tlb_nslabs; i++) {
 		io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
 		io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
-		io_tlb_orig_size[i] = 0;
+		io_tlb_alloc_size[i] = 0;
 	}
 	io_tlb_index = 0;
 	no_iotlb_memory = false;
@@ -436,7 +436,7 @@ void __init swiotlb_exit(void)
 		return;
 
 	if (late_alloc) {
-		free_pages((unsigned long)io_tlb_orig_size,
+		free_pages((unsigned long)io_tlb_alloc_size,
 			   get_order(io_tlb_nslabs * sizeof(size_t)));
 		free_pages((unsigned long)io_tlb_orig_addr,
 			   get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
@@ -447,7 +447,7 @@ void __init swiotlb_exit(void)
 	} else {
 		memblock_free_late(__pa(io_tlb_orig_addr),
 				   PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
-		memblock_free_late(__pa(io_tlb_orig_size),
+		memblock_free_late(__pa(io_tlb_alloc_size),
 				   PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)));
 		memblock_free_late(__pa(io_tlb_list),
 				   PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
@@ -639,7 +639,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	 */
 	for (i = 0; i < nr_slots(alloc_size + offset); i++) {
 		io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i);
-		io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT);
+		io_tlb_alloc_size[index+i] = alloc_size - (i << IO_TLB_SHIFT);
 	}
 	tlb_addr = slot_addr(io_tlb_start, index) + offset;
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
@@ -648,14 +648,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	return tlb_addr;
 }
 
-static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size)
+static void validate_sync_size_and_truncate(struct device *hwdev, size_t alloc_size, size_t *size)
 {
-	if (*size > orig_size) {
+	if (*size > alloc_size) {
 		/* Warn and truncate mapping_size */
 		dev_WARN_ONCE(hwdev, 1,
 			"Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n",
-			orig_size, *size);
-		*size = orig_size;
+			alloc_size, *size);
+		*size = alloc_size;
 	}
 }
 
@@ -663,16 +663,17 @@ static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_si
  * tlb_addr is the physical address of the bounce buffer to unmap.
  */
 void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
-			      size_t mapping_size, size_t alloc_size,
-			      enum dma_data_direction dir, unsigned long attrs)
+			      size_t mapping_size, enum dma_data_direction dir,
+			      unsigned long attrs)
 {
 	unsigned long flags;
 	unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
-	int i, count, nslots = nr_slots(alloc_size + offset);
 	int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT;
 	phys_addr_t orig_addr = io_tlb_orig_addr[index];
+	size_t alloc_size = io_tlb_alloc_size[index];
+	int i, count, nslots = nr_slots(alloc_size + offset);
 
-	validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size);
+	validate_sync_size_and_truncate(hwdev, alloc_size, &mapping_size);
 
 	/*
 	 * First, sync the memory before unmapping the entry
@@ -701,7 +702,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 	for (i = index + nslots - 1; i >= index; i--) {
 		io_tlb_list[i] = ++count;
 		io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
-		io_tlb_orig_size[i] = 0;
+		io_tlb_alloc_size[i] = 0;
 	}
 
 	/*
@@ -721,13 +722,13 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
 			     enum dma_sync_target target)
 {
 	int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
-	size_t orig_size = io_tlb_orig_size[index];
+	size_t alloc_size = io_tlb_alloc_size[index];
 	phys_addr_t orig_addr = io_tlb_orig_addr[index];
 
 	if (orig_addr == INVALID_PHYS_ADDR)
 		return;
 
-	validate_sync_size_and_truncate(hwdev, orig_size, &size);
+	validate_sync_size_and_truncate(hwdev, alloc_size, &size);
 
 	switch (target) {
 	case SYNC_FOR_CPU:
@@ -770,7 +771,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
 	/* Ensure that the address returned is DMA'ble */
 	dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
 	if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
-		swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir,
+		swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
 			attrs | DMA_ATTR_SKIP_CPU_SYNC);
 		dev_WARN_ONCE(dev, 1,
 			"swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
-- 
2.29.2


^ permalink raw reply related

* swiotlb cleanups v2
From: Christoph Hellwig @ 2021-03-01  7:44 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: iommu, xen-devel, Claire Chang, linuxppc-dev, Dongli Zhang

Hi Konrad,

this series contains a bunch of swiotlb cleanups, mostly to reduce the
amount of internals exposed to code outside of swiotlb.c, which should
help to prepare for supporting multiple different bounce buffer pools.

Changes since v1:
 - rebased to v5.12-rc1
 - a few more cleanups
 - merge and forward port the patch from Claire to move all the global
   variables into a struct to prepare for multiple instances

^ permalink raw reply

* [PATCH 01/14] powerpc/svm: stop using io_tlb_start
From: Christoph Hellwig @ 2021-03-01  7:44 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: iommu, xen-devel, Claire Chang, linuxppc-dev, Dongli Zhang
In-Reply-To: <20210301074436.919889-1-hch@lst.de>

Use the local variable that is passed to swiotlb_init_with_tbl for
freeing the memory in the failure case to isolate the code a little
better from swiotlb internals.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/powerpc/platforms/pseries/svm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
index 7b739cc7a8a93e..1d829e257996fb 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -55,9 +55,9 @@ void __init svm_swiotlb_init(void)
 	if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false))
 		return;
 
-	if (io_tlb_start)
-		memblock_free_early(io_tlb_start,
-				    PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+
+	memblock_free_early(__pa(vstart),
+			    PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
 	panic("SVM: Cannot allocate SWIOTLB buffer");
 }
 
-- 
2.29.2


^ permalink raw reply related

* Re: [PATCH] mm: Generalize HUGETLB_PAGE_SIZE_VARIABLE
From: Anshuman Khandual @ 2021-03-01  7:43 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-ia64, linux-kernel, linux-mm, Paul Mackerras, Andrew Morton,
	linuxppc-dev
In-Reply-To: <20210301062358.GA25761@lst.de>



On 3/1/21 11:53 AM, Christoph Hellwig wrote:
> On Mon, Mar 01, 2021 at 11:20:53AM +0530, Anshuman Khandual wrote:
>> HUGETLB_PAGE_SIZE_VARIABLE need not be defined for each individual
>> platform subscribing it. Instead just make it generic.
>>
>> Cc: Michael Ellerman <mpe@ellerman.id.au>
>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>> Cc: Paul Mackerras <paulus@samba.org>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Christoph Hellwig <hch@lst.de>
>> Cc: linux-ia64@vger.kernel.org
>> Cc: linuxppc-dev@lists.ozlabs.org
>> Cc: linux-mm@kvack.org
>> Cc: linux-kernel@vger.kernel.org
>> Suggested-by: Christoph Hellwig <hch@lst.de>
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>> This change was originally suggested in an earilier discussion. This
>> applies on v5.12-rc1 and has been build tested on all applicable
>> platforms i.e ia64 and powerpc.
>>
>> https://patchwork.kernel.org/project/linux-mm/patch/1613024531-19040-3-git-send-email-anshuman.khandual@arm.com/
>>
>>  arch/ia64/Kconfig    | 6 +-----
>>  arch/powerpc/Kconfig | 6 +-----
>>  mm/Kconfig           | 8 ++++++++
>>  3 files changed, 10 insertions(+), 10 deletions(-)
>>
>> diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
>> index 2ad7a8d29fcc..6b3e3f6c29ae 100644
>> --- a/arch/ia64/Kconfig
>> +++ b/arch/ia64/Kconfig
>> @@ -32,6 +32,7 @@ config IA64
>>  	select TTY
>>  	select HAVE_ARCH_TRACEHOOK
>>  	select HAVE_VIRT_CPU_ACCOUNTING
>> +	select HUGETLB_PAGE_SIZE_VARIABLE
> 
> doesn't this need a 'if HUGETLB_PAGE'

While making HUGETLB_PAGE_SIZE_VARIABLE a generic option, I also made it
dependent on HUGETLB_PAGE. Shouldn't that gate HUGETLB_PAGE_SIZE_VARIABLE
when HUGETLB_PAGE is not available, irrespective of the select statement on
the platforms?

> 
> or did you verify that HUGETLB_PAGE_SIZE_VARIABLE checks are always
> nested inside of HUGETLB_PAGE ones?
>

^ permalink raw reply

* [PATCH kernel v2] powerpc/iommu: Annotate nested lock for lockdep
From: Alexey Kardashevskiy @ 2021-03-01  6:36 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alexey Kardashevskiy, Frederic Barrat, kvm-ppc

The IOMMU table is divided into pools for concurrent mappings and each
pool has a separate spinlock. When taking the ownership of an IOMMU group
to pass through a device to a VM, we lock these spinlocks which triggers
a false positive warning in lockdep (below).

This fixes it by annotating the large pool's spinlock as a nest lock,
which stops lockdep from complaining about locking nested locks when
the nest lock is already held.

===
WARNING: possible recursive locking detected
5.11.0-le_syzkaller_a+fstn1 #100 Not tainted
--------------------------------------------
qemu-system-ppc/4129 is trying to acquire lock:
c0000000119bddb0 (&(p->lock)/1){....}-{2:2}, at: iommu_take_ownership+0xac/0x1e0

but task is already holding lock:
c0000000119bdd30 (&(p->lock)/1){....}-{2:2}, at: iommu_take_ownership+0xac/0x1e0

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(&(p->lock)/1);
  lock(&(p->lock)/1);
===

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v2:
* fixed iommu_release_ownership() as well

---
 arch/powerpc/kernel/iommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index c1a5c366a664..d0df3e5ff5e0 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1089,7 +1089,7 @@ int iommu_take_ownership(struct iommu_table *tbl)
 
 	spin_lock_irqsave(&tbl->large_pool.lock, flags);
 	for (i = 0; i < tbl->nr_pools; i++)
-		spin_lock(&tbl->pools[i].lock);
+		spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
 
 	iommu_table_release_pages(tbl);
 
@@ -1117,7 +1117,7 @@ void iommu_release_ownership(struct iommu_table *tbl)
 
 	spin_lock_irqsave(&tbl->large_pool.lock, flags);
 	for (i = 0; i < tbl->nr_pools; i++)
-		spin_lock(&tbl->pools[i].lock);
+		spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
 
 	memset(tbl->it_map, 0, sz);
 
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH] mm: Generalize HUGETLB_PAGE_SIZE_VARIABLE
From: Christoph Hellwig @ 2021-03-01  6:23 UTC (permalink / raw)
  To: Anshuman Khandual
  Cc: linux-ia64, linux-kernel, linux-mm, Paul Mackerras, Andrew Morton,
	linuxppc-dev, Christoph Hellwig
In-Reply-To: <1614577853-7452-1-git-send-email-anshuman.khandual@arm.com>

On Mon, Mar 01, 2021 at 11:20:53AM +0530, Anshuman Khandual wrote:
> HUGETLB_PAGE_SIZE_VARIABLE need not be defined for each individual
> platform subscribing it. Instead just make it generic.
> 
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: linux-ia64@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-mm@kvack.org
> Cc: linux-kernel@vger.kernel.org
> Suggested-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
> This change was originally suggested in an earilier discussion. This
> applies on v5.12-rc1 and has been build tested on all applicable
> platforms i.e ia64 and powerpc.
> 
> https://patchwork.kernel.org/project/linux-mm/patch/1613024531-19040-3-git-send-email-anshuman.khandual@arm.com/
> 
>  arch/ia64/Kconfig    | 6 +-----
>  arch/powerpc/Kconfig | 6 +-----
>  mm/Kconfig           | 8 ++++++++
>  3 files changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
> index 2ad7a8d29fcc..6b3e3f6c29ae 100644
> --- a/arch/ia64/Kconfig
> +++ b/arch/ia64/Kconfig
> @@ -32,6 +32,7 @@ config IA64
>  	select TTY
>  	select HAVE_ARCH_TRACEHOOK
>  	select HAVE_VIRT_CPU_ACCOUNTING
> +	select HUGETLB_PAGE_SIZE_VARIABLE

doesn't this need a 'if HUGETLB_PAGE'

or did you verify that HUGETLB_PAGE_SIZE_VARIABLE checks are always
nested inside of HUGETLB_PAGE ones?

^ permalink raw reply

* [PATCH] mm: Generalize HUGETLB_PAGE_SIZE_VARIABLE
From: Anshuman Khandual @ 2021-03-01  5:50 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-ia64, Anshuman Khandual, linux-kernel, Paul Mackerras,
	Andrew Morton, linuxppc-dev, Christoph Hellwig

HUGETLB_PAGE_SIZE_VARIABLE need not be defined by each individual
platform that subscribes to it. Instead, just make it generic.

Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: linux-ia64@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
This change was originally suggested in an earlier discussion. This
applies on v5.12-rc1 and has been build tested on all applicable
platforms, i.e. ia64 and powerpc.

https://patchwork.kernel.org/project/linux-mm/patch/1613024531-19040-3-git-send-email-anshuman.khandual@arm.com/

 arch/ia64/Kconfig    | 6 +-----
 arch/powerpc/Kconfig | 6 +-----
 mm/Kconfig           | 8 ++++++++
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 2ad7a8d29fcc..6b3e3f6c29ae 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -32,6 +32,7 @@ config IA64
 	select TTY
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_VIRT_CPU_ACCOUNTING
+	select HUGETLB_PAGE_SIZE_VARIABLE
 	select VIRT_TO_BUS
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
@@ -82,11 +83,6 @@ config STACKTRACE_SUPPORT
 config GENERIC_LOCKBREAK
 	def_bool n
 
-config HUGETLB_PAGE_SIZE_VARIABLE
-	bool
-	depends on HUGETLB_PAGE
-	default y
-
 config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3778ad17f56a..b8565bed284f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -232,6 +232,7 @@ config PPC
 	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
+	select HUGETLB_PAGE_SIZE_VARIABLE	if PPC_BOOK3S_64
 	select MMU_GATHER_RCU_TABLE_FREE
 	select MMU_GATHER_PAGE_SIZE
 	select HAVE_REGS_AND_STACK_ACCESS_API
@@ -416,11 +417,6 @@ config HIGHMEM
 
 source "kernel/Kconfig.hz"
 
-config HUGETLB_PAGE_SIZE_VARIABLE
-	bool
-	depends on HUGETLB_PAGE && PPC_BOOK3S_64
-	default y
-
 config MATH_EMULATION
 	bool "Math emulation"
 	depends on 4xx || PPC_8xx || PPC_MPC832x || BOOKE
diff --git a/mm/Kconfig b/mm/Kconfig
index 24c045b24b95..e604a87862a4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -274,6 +274,14 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
 config ARCH_ENABLE_THP_MIGRATION
 	bool
 
+config HUGETLB_PAGE_SIZE_VARIABLE
+	def_bool n
+	depends on HUGETLB_PAGE
+	help
+	  When there are multiple HugeTLB sizes available on a platform
+	  and pageblock_order could then be a dynamic value instead of
+	  standard HUGETLB_PAGE_ORDER.
+
 config CONTIG_ALLOC
 	def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
 
-- 
2.20.1


^ permalink raw reply related

* [PATCH V2] mm/memtest: Add ARCH_USE_MEMTEST
From: Anshuman Khandual @ 2021-03-01  4:32 UTC (permalink / raw)
  To: linux-mm
  Cc: Chris Zankel, Thomas Bogendoerfer, Anshuman Khandual,
	linux-xtensa, linuxppc-dev, linux-kernel, Russell King,
	linux-mips, Max Filippov, Ingo Molnar, Paul Mackerras,
	Catalin Marinas, Thomas Gleixner, Will Deacon, linux-arm-kernel

early_memtest() does not get called from all architectures. Hence enabling
CONFIG_MEMTEST and providing a valid memtest=[1..N] kernel command line
option might not trigger the memory pattern tests as would be expected in
normal circumstances. This situation is misleading.

The change here prevents the above mentioned problem by introducing a
new config option ARCH_USE_MEMTEST that should be selected by platforms
that call early_memtest(), in order to enable the config CONFIG_MEMTEST.
Conversely, CONFIG_MEMTEST cannot be enabled on platforms where it would
not be tested anyway.

Cc: Russell King <linux@armlinux.org.uk>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-mips@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-xtensa@linux-xtensa.org
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
This patch applies on v5.12-rc1 and has been tested on arm64 platform.
But it has been just build tested on all other platforms.

Changes in V2:

- Added ARCH_USE_MEMTEST in the sorted alphabetical order on platforms

Changes in V1:

https://patchwork.kernel.org/project/linux-mm/patch/1612498242-31579-1-git-send-email-anshuman.khandual@arm.com/

 arch/arm/Kconfig     | 1 +
 arch/arm64/Kconfig   | 1 +
 arch/mips/Kconfig    | 1 +
 arch/powerpc/Kconfig | 1 +
 arch/x86/Kconfig     | 1 +
 arch/xtensa/Kconfig  | 1 +
 lib/Kconfig.debug    | 9 ++++++++-
 7 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 853aab5ab327..9ab047d4cd0a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -33,6 +33,7 @@ config ARM
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
+	select ARCH_USE_MEMTEST
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select ARCH_WANT_LD_ORPHAN_WARN
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1f212b47a48a..d4fe5118e9c8 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -67,6 +67,7 @@ config ARM64
 	select ARCH_KEEP_MEMBLOCK
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_USE_GNU_PROPERTY
+	select ARCH_USE_MEMTEST
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_USE_SYM_ANNOTATIONS
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index d89efba3d8a4..93a4f502f962 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -14,6 +14,7 @@ config MIPS
 	select ARCH_SUPPORTS_UPROBES
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF if 64BIT
+	select ARCH_USE_MEMTEST
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 386ae12d8523..3778ad17f56a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -149,6 +149,7 @@ config PPC
 	select ARCH_SUPPORTS_DEBUG_PAGEALLOC	if PPC32 || PPC_BOOK3S_64
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
+	select ARCH_USE_MEMTEST
 	select ARCH_USE_QUEUED_RWLOCKS		if PPC_QUEUED_SPINLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS	if PPC_QUEUED_SPINLOCKS
 	select ARCH_WANT_IPC_PARSE_VERSION
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2792879d398e..2cb76fd5258e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -100,6 +100,7 @@ config X86
 	select ARCH_SUPPORTS_LTO_CLANG		if X86_64
 	select ARCH_SUPPORTS_LTO_CLANG_THIN	if X86_64
 	select ARCH_USE_BUILTIN_BSWAP
+	select ARCH_USE_MEMTEST
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_USE_SYM_ANNOTATIONS
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index a99dc39f6964..ca51896c53df 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -7,6 +7,7 @@ config XTENSA
 	select ARCH_HAS_SYNC_DMA_FOR_CPU if MMU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE if MMU
 	select ARCH_HAS_DMA_SET_UNCACHED if MMU
+	select ARCH_USE_MEMTEST
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_FRAME_POINTERS
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a2d04c00cda2..2c296535a4b3 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2521,11 +2521,18 @@ config TEST_FPU
 
 endif # RUNTIME_TESTING_MENU
 
+config ARCH_USE_MEMTEST
+	bool
+	help
+	  An architecture should select this when it uses early_memtest()
+	  during boot process.
+
 config MEMTEST
 	bool "Memtest"
+	depends on ARCH_USE_MEMTEST
 	help
 	  This option adds a kernel parameter 'memtest', which allows memtest
-	  to be set.
+	  to be set and executed.
 	        memtest=0, mean disabled; -- default
 	        memtest=1, mean do 1 test pattern;
 	        ...
-- 
2.20.1


^ permalink raw reply related

* [Report] Some compile warning on ppc dts
From: chenjun (AM) @ 2021-03-01  2:16 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	Rob Herring, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, devicetree@vger.kernel.org
  Cc: Xiangrui (Euler)

Hi

After running the following commands:
make distclean
make allmodconfig ARCH=powerpc CROSS_COMPILE=powerpc64-linux-gnu-
make oldconfig ARCH=powerpc CROSS_COMPILE=powerpc64-linux-gnu-
make -j64 ARCH=powerpc CROSS_COMPILE=powerpc64-linux-gnu-

I get some warnings:
arch/powerpc/boot/dts/mpc5200b.dtsi:267.20-280.4: Warning (pci_bridge): 
/pci@f0000d00: missing ranges for PCI bridge (or not a bridge)
arch/powerpc/boot/dts/o2dnt2.dtb: Warning (pci_device_bus_num): Failed 
prerequisite 'pci_bridge'
arch/powerpc/boot/dts/mpc5200b.dtsi:182.18-186.5: Warning 
(spi_bus_bridge): /soc5200@f0000000/psc@2000: node name for SPI buses
should be 'spi'
   also defined at arch/powerpc/boot/dts/o2d.dtsi:32.12-43.5
arch/powerpc/boot/dts/o2dnt2.dtb: Warning (spi_bus_reg): Failed 
prerequisite 'spi_bus_bridge'
...

For the problem about "node name for SPI buses should be 'spi'":
Renaming psc@2000 to spi@2000 in arch/powerpc/boot/dts/o2d.dtsi
fixes it.
diff --git a/arch/powerpc/boot/dts/o2d.dtsi b/arch/powerpc/boot/dts/o2d.dtsi
index 6661955a2be4..cd3dc70cd72e 100644
--- a/arch/powerpc/boot/dts/o2d.dtsi
+++ b/arch/powerpc/boot/dts/o2d.dtsi
@@ -29,7 +29,7 @@ rtc@800 {
  >------>------->-------status = "disabled";
  >------>-------};
-
->------>-------psc@2000 {>----->-------// PSC1
+>------>-------spi@2000 {>----->-------// PSC1
  >------>------->-------compatible = 
"fsl,mpc5200b-psc-spi","fsl,mpc5200-psc-spi";
  >------>------->-------#address-cells = <1>;
  >------>------->-------#size-cells = <0>;
---

For the problem about "missing ranges for PCI bridge (or not a bridge)":
Ranges should be added in arch/powerpc/boot/dts/mpc5200b.dtsi.
 >-------pci: pci@f0000d00 {
 >------->-------#interrupt-cells = <1>;
 >------->-------#size-cells = <2>;
 >------->-------#address-cells = <3>;
 >------->-------device_type = "pci";
 >------->-------compatible = "fsl,mpc5200b-pci","fsl,mpc5200-pci";
 >------->-------reg = <0xf0000d00 0x100>;
 >------->-------// interrupt-map-mask = need to add
 >------->-------// interrupt-map = need to add
 >------->-------clock-frequency = <0>; // From boot loader
 >------->-------interrupts = <2 8 0 2 9 0 2 10 0>;
 >------->-------bus-range = <0 0>;
 >------->-------// ranges = need to add
 >-------};
I think the ranges should be added by someone who knows the mpc5200 better.

Can anyone fix this?

-- 
Regards
Chen Jun

^ permalink raw reply related

* Re: [PATCH V2] powerpc/perf: Fix handling of privilege level checks in perf interrupt context
From: Athira Rajeev @ 2021-03-01  2:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Madhavan Srinivasan, omosnace, acme, jolsa, linuxppc-dev,
	kan.liang
In-Reply-To: <YDjA0giNnkfHeYM5@hirez.programming.kicks-ass.net>



> On 26-Feb-2021, at 3:05 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Thu, Feb 25, 2021 at 05:10:39AM -0500, Athira Rajeev wrote:
>> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
>> index 4b4319d8..c8be44c 100644
>> --- a/arch/powerpc/perf/core-book3s.c
>> +++ b/arch/powerpc/perf/core-book3s.c
>> @@ -222,7 +222,7 @@ static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs *
>> 	if (!(mmcra & MMCRA_SAMPLE_ENABLE) || sdar_valid)
>> 		*addrp = mfspr(SPRN_SDAR);
>> 
>> -	if (is_kernel_addr(mfspr(SPRN_SDAR)) && perf_allow_kernel(&event->attr) != 0)
>> +	if (is_kernel_addr(mfspr(SPRN_SDAR)) && event->attr.exclude_kernel)
>> 		*addrp = 0;
>> }
>> 
>> @@ -507,7 +507,7 @@ static void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *
>> 			 * addresses, hence include a check before filtering code
>> 			 */
>> 			if (!(ppmu->flags & PPMU_ARCH_31) &&
>> -				is_kernel_addr(addr) && perf_allow_kernel(&event->attr) != 0)
>> +			    is_kernel_addr(addr) && event->attr.exclude_kernel)
>> 				continue;
>> 
>> 			/* Branches are read most recent first (ie. mfbhrb 0 is
> 
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>


Thanks Peter for reviewing the patch.

Athira.

^ permalink raw reply

* Re: [PATCH v3 28/32] powerpc/64s: interrupt implement exit logic in C
From: Nicholas Piggin @ 2021-03-01  0:47 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev; +Cc: Michal Suchanek
In-Reply-To: <bc4ec2c0-28e5-4004-d1eb-54058a699af4@csgroup.eu>

Excerpts from Christophe Leroy's message of February 27, 2021 8:07 pm:
> 
> 
> Le 25/02/2020 à 18:35, Nicholas Piggin a écrit :
>> Implement the bulk of interrupt return logic in C. The asm return code
>> must handle a few cases: restoring full GPRs, and emulating stack store.
>> 
>> The stack store emulation is significantly simplified, rather than creating
>> a new return frame and switching to that before performing the store, it
>> uses the PACA to keep a scratch register around to perform the store.
>> 
>> The asm return code is moved into 64e for now. The new logic has made
>> allowance for 64e, but I don't have a full environment that works well
>> to test it, and even booting in emulated qemu is not great for stress
>> testing. 64e shouldn't be too far off working with this, given a bit
>> more testing and auditing of the logic.
>> 
>> This is slightly faster on a POWER9 (page fault speed increases about
>> 1.1%), probably due to reduced mtmsrd.
> 
> 
> This series, and especially this patch, has added an awful number of BUG_ON() traps.
> 
> We have an issue open at https://github.com/linuxppc/issues/issues/88 since 2017 for reducing the 
> number of BUG_ON()s
> 
> And the kernel Documentation is explicit on the willingness to deprecate BUG_ON(), see 
> https://www.kernel.org/doc/html/latest/process/deprecated.html?highlight=bug_on :
> 
> BUG() and BUG_ON()
> Use WARN() and WARN_ON() instead, and handle the “impossible” error condition as gracefully as 
> possible. While the BUG()-family of APIs were originally designed to act as an “impossible 
> situation” assert and to kill a kernel thread “safely”, they turn out to just be too risky. (e.g. 
> “In what order do locks need to be released? Have various states been restored?”) Very commonly, 
> using BUG() will destabilize a system or entirely break it, which makes it impossible to debug or 
> even get viable crash reports. Linus has very strong feelings about this.
> 
> So ... can we do something cleaner with all the BUG_ON()s recently added ?

Yeah you're right. Some of it is probably overkill due to paranoia when 
developing the series.

Now we have a bit more confidence we could probably look at cutting down 
on these.

I do get a bit concerned about detecting a problem in some code like 
this and attempting to just continue, it usually means the system is 
going to crash pretty badly anyway (and the WARN_ON trap interrupt is
probably going to finish you off anyway). So I think removing the more
obvious checks entirely (maybe with a PPC DEBUG config option) is the
right way to go.

Thanks,
Nick

^ permalink raw reply

* [PATCH] powerpc: Force inlining of mmu_has_feature to fix build failure
From: Christophe Leroy @ 2021-02-27 16:30 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

The test robot has managed to generate a random config leading
to the following build failure:

  LD      .tmp_vmlinux.kallsyms1
powerpc64-linux-ld: arch/powerpc/mm/pgtable.o: in function `ptep_set_access_flags':
pgtable.c:(.text.ptep_set_access_flags+0xf0): undefined reference to `hash__flush_tlb_page'
powerpc64-linux-ld: arch/powerpc/mm/book3s32/mmu.o: in function `MMU_init_hw_patch':
mmu.c:(.init.text+0x452): undefined reference to `patch__hash_page_A0'
powerpc64-linux-ld: mmu.c:(.init.text+0x45e): undefined reference to `patch__hash_page_A0'
powerpc64-linux-ld: mmu.c:(.init.text+0x46a): undefined reference to `patch__hash_page_A1'
powerpc64-linux-ld: mmu.c:(.init.text+0x476): undefined reference to `patch__hash_page_A1'
powerpc64-linux-ld: mmu.c:(.init.text+0x482): undefined reference to `patch__hash_page_A2'
powerpc64-linux-ld: mmu.c:(.init.text+0x48e): undefined reference to `patch__hash_page_A2'
powerpc64-linux-ld: mmu.c:(.init.text+0x49e): undefined reference to `patch__hash_page_B'
powerpc64-linux-ld: mmu.c:(.init.text+0x4aa): undefined reference to `patch__hash_page_B'
powerpc64-linux-ld: mmu.c:(.init.text+0x4b6): undefined reference to `patch__hash_page_C'
powerpc64-linux-ld: mmu.c:(.init.text+0x4c2): undefined reference to `patch__hash_page_C'
powerpc64-linux-ld: mmu.c:(.init.text+0x4ce): undefined reference to `patch__flush_hash_A0'
powerpc64-linux-ld: mmu.c:(.init.text+0x4da): undefined reference to `patch__flush_hash_A0'
powerpc64-linux-ld: mmu.c:(.init.text+0x4e6): undefined reference to `patch__flush_hash_A1'
powerpc64-linux-ld: mmu.c:(.init.text+0x4f2): undefined reference to `patch__flush_hash_A1'
powerpc64-linux-ld: mmu.c:(.init.text+0x4fe): undefined reference to `patch__flush_hash_A2'
powerpc64-linux-ld: mmu.c:(.init.text+0x50a): undefined reference to `patch__flush_hash_A2'
powerpc64-linux-ld: mmu.c:(.init.text+0x522): undefined reference to `patch__flush_hash_B'
powerpc64-linux-ld: mmu.c:(.init.text+0x532): undefined reference to `patch__flush_hash_B'
powerpc64-linux-ld: arch/powerpc/mm/book3s32/mmu.o: in function `update_mmu_cache':
mmu.c:(.text.update_mmu_cache+0xa0): undefined reference to `add_hash_page'
powerpc64-linux-ld: mm/memory.o: in function `zap_pte_range':
memory.c:(.text.zap_pte_range+0x160): undefined reference to `flush_hash_pages'
powerpc64-linux-ld: mm/memory.o: in function `handle_pte_fault':
memory.c:(.text.handle_pte_fault+0x180): undefined reference to `hash__flush_tlb_page'

This is due to mmu_has_feature() not being inlined. See extract of build of
mmu.c with -Winline:

In file included from ./include/linux/mm_types.h:19,
                 from ./include/linux/mmzone.h:21,
                 from ./include/linux/gfp.h:6,
                 from ./include/linux/mm.h:10,
                 from arch/powerpc/mm/book3s32/mmu.c:21:
./arch/powerpc/include/asm/mmu.h: In function 'find_free_bat':
./arch/powerpc/include/asm/mmu.h:231:20: warning: inlining failed in call to 'early_mmu_has_feature': call is unlikely and code size would grow [-Winline]
  231 | static inline bool early_mmu_has_feature(unsigned long feature)
      |                    ^~~~~~~~~~~~~~~~~~~~~
./arch/powerpc/include/asm/mmu.h:291:9: note: called from here
  291 |  return early_mmu_has_feature(feature);
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The code relies on constant folding of MMU_FTRS_POSSIBLE at buildtime
and elimination of non possible parts of code at compile time.
For this to work, mmu_has_feature() and early_mmu_has_feature()
must be inlined.

Reported-by: kernel test robot <lkp@intel.com>
Fixes: 259149cf7c3c ("powerpc/32s: Only build hash code when CONFIG_PPC_BOOK3S_604 is selected")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/mmu.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 80b27f5d9648..607168b1aef4 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -228,7 +228,7 @@ enum {
 #define MMU_FTRS_ALWAYS		0
 #endif
 
-static inline bool early_mmu_has_feature(unsigned long feature)
+static __always_inline bool early_mmu_has_feature(unsigned long feature)
 {
 	if (MMU_FTRS_ALWAYS & feature)
 		return true;
@@ -286,7 +286,7 @@ static inline void mmu_feature_keys_init(void)
 
 }
 
-static inline bool mmu_has_feature(unsigned long feature)
+static __always_inline bool mmu_has_feature(unsigned long feature)
 {
 	return early_mmu_has_feature(feature);
 }
-- 
2.25.0


^ permalink raw reply related

* Re: Latest Git kernel doesn't compile because of the LINUX_VERSION_CODE issue
From: Masahiro Yamada @ 2021-02-27 15:42 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Sasha Levin, Darren Stevens, R.T.Dickinson, Christian Zigotzky,
	linuxppc-dev
In-Reply-To: <80696a7a-44fb-9ce0-76a4-42006ebe2928@csgroup.eu>

On Sat, Feb 27, 2021 at 4:34 PM Christophe Leroy
<christophe.leroy@csgroup.eu> wrote:
>
>
> Le 27/02/2021 à 08:13, Christian Zigotzky a écrit :
> > Hello Christophe,
> >
> > Thanks a lot for compiling the latest git kernel.
> >
> > I have solved the compiling issue through setting up a value for the SUBLEVEL variable in
> > "a/Makefile". Before it wasn't necessary to set up a value for the SUBLEVEL variable.
>
> I see, so it is a regression introduced by commit 9b82f13e7ef3 ("kbuild: clamp SUBLEVEL to 255").
>
> In the past there had already been such a regression at some point, which had then been fixed by
> commit cacd54ef49b7 ("kbuild: Fix KERNELVERSION for empty SUBLEVEL or PATCHLEVEL")
>
> Sasha, can you fix it ?



Thanks for catching this.

I submitted a patch:
https://lore.kernel.org/patchwork/patch/1385119/






> Thanks
> Christophe
>
>
> >
> > Cheers,
> > Christian
> >
> > On 26 February 21 at 5:10 pm, Christophe Leroy wrote:
> >>
> >>
> >> Le 26/02/2021 à 13:34, Christian Zigotzky a écrit :
> >>> Hello,
> >>>
> >>> I tried to compile the latest Git kernel today. Unfortunately it doesn't compile.
> >>
> >> I have no such problem with latest git kernel.
> >>
> >> Christophe
> >>
> >>>
> >>> Error messages:
> >>>
> >>>    CC      arch/powerpc/kernel/udbg_16550.o
> >>> In file included from ./include/linux/stackprotector.h:10:0,
> >>>                   from arch/powerpc/kernel/smp.c:35:
> >>> ./arch/powerpc/include/asm/stackprotector.h: In function ‘boot_init_stack_canary’:
> >>> ./arch/powerpc/include/asm/stackprotector.h:29:30: error: expected expression before ‘;’ token
> >>>    canary ^= LINUX_VERSION_CODE;
> >>>                                ^
> >>> scripts/Makefile.build:271: recipe for target 'arch/powerpc/kernel/smp.o' failed
> >>> make[2]: *** [arch/powerpc/kernel/smp.o] Error 1
> >>>
> >>> ----
> >>>
> >>> drivers/media/cec/core/cec-api.c: In function ‘cec_adap_g_caps’:
> >>> drivers/media/cec/core/cec-api.c:85:35: error: expected expression before ‘;’ token
> >>>    caps.version = LINUX_VERSION_CODE;
> >>>
> >>> ----
> >>>
> >>> I have found the bad commit. It's "Merge tag 'kbuild-v5.12' of
> >>> git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild" [1]
> >>>
> >>> The changes in the Makefile (a/Makefile) are responsible for the compiling errors. [2]
> >>>
> >>> I was able to revert this bad commit. After that it compiled without any problems.
> >>>
> >>> Could you please compile the latest Git kernel and confirm this issue?
> >>>
> >>> Thanks,
> >>> Christian
> >>>
> >>> [1]
> >>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6fbd6cf85a3be127454a1ad58525a3adcf8612ab
> >>>
> >>> [2]
> >>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/diff/Makefile?id=6fbd6cf85a3be127454a1ad58525a3adcf8612ab
> >>>



-- 
Best Regards
Masahiro Yamada

^ permalink raw reply

* Re: [PATCH] powerpc/bug: Remove specific powerpc BUG_ON()
From: Christophe Leroy @ 2021-02-27 10:31 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: linux-kernel, npiggin, Paul Mackerras, linuxppc-dev
In-Reply-To: <20210211143059.GE28121@gate.crashing.org>



Le 11/02/2021 à 15:30, Segher Boessenkool a écrit :
> On Thu, Feb 11, 2021 at 03:09:43PM +0100, Christophe Leroy wrote:
>> Le 11/02/2021 à 12:49, Segher Boessenkool a écrit :
>>> On Thu, Feb 11, 2021 at 07:41:52AM +0000, Christophe Leroy wrote:
>>>> powerpc BUG_ON() is based on using twnei or tdnei instruction,
>>>> which obliges gcc to format the condition into a 0 or 1 value
>>>> in a register.
>>>
>>> Huh?  Why is that?
>>>
>>> Will it work better if this used __builtin_trap?  Or does the kernel only
>>> detect very specific forms of trap instructions?
>>
>> We already tried __builtin_trap() 1.5 years ago, see
>> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20510ce03cc9463f1c9e743c1d93b939de501b53.1566219503.git.christophe.leroy@c-s.fr/
>>
>> The main problems encountered are:
>> - It is only possible to use it for BUG_ON, not for WARN_ON because GCC
>> considers it as noreturn. Is there any workaround ?
> 
> A trap is noreturn by definition:
> 
>   -- Built-in Function: void __builtin_trap (void)
>       This function causes the program to exit abnormally.
> 
>> - The kernel (With CONFIG_DEBUG_BUGVERBOSE) needs to be able to identify
>> the source file and line corresponding to the trap. How can that be done
>> with __builtin_trap() ?
> 
> The DWARF debug info should be sufficient.  Perhaps you can post-process
> some way?
> 
> You can create a trap that falls through yourself (by having a trap-on
> condition with a condition that is always true, but make the compiler
> not see that).  This isn't efficient though.
> 
> Could you file a feature request (in bugzilla)?  It is probably useful
> for generic code as well, but we could implement this for powerpc only
> if needed.
> 

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99299

Christophe

^ permalink raw reply

* Re: [PATCH v3 28/32] powerpc/64s: interrupt implement exit logic in C
From: Christophe Leroy @ 2021-02-27 10:07 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev; +Cc: Michal Suchanek
In-Reply-To: <20200225173541.1549955-29-npiggin@gmail.com>



Le 25/02/2020 à 18:35, Nicholas Piggin a écrit :
> Implement the bulk of interrupt return logic in C. The asm return code
> must handle a few cases: restoring full GPRs, and emulating stack store.
> 
> The stack store emulation is significantly simplified, rather than creating
> a new return frame and switching to that before performing the store, it
> uses the PACA to keep a scratch register around to perform the store.
> 
> The asm return code is moved into 64e for now. The new logic has made
> allowance for 64e, but I don't have a full environment that works well
> to test it, and even booting in emulated qemu is not great for stress
> testing. 64e shouldn't be too far off working with this, given a bit
> more testing and auditing of the logic.
> 
> This is slightly faster on a POWER9 (page fault speed increases about
> 1.1%), probably due to reduced mtmsrd.


This series, and especially this patch, has added an awful number of BUG_ON() traps.

We have an issue open at https://github.com/linuxppc/issues/issues/88 since 2017 for reducing the 
number of BUG_ON()s

And the kernel Documentation is explicit on the willingness to deprecate BUG_ON(), see 
https://www.kernel.org/doc/html/latest/process/deprecated.html?highlight=bug_on :

BUG() and BUG_ON()
Use WARN() and WARN_ON() instead, and handle the “impossible” error condition as gracefully as 
possible. While the BUG()-family of APIs were originally designed to act as an “impossible 
situation” assert and to kill a kernel thread “safely”, they turn out to just be too risky. (e.g. 
“In what order do locks need to be released? Have various states been restored?”) Very commonly, 
using BUG() will destabilize a system or entirely break it, which makes it impossible to debug or 
even get viable crash reports. Linus has very strong feelings about this.

So ... can we do something cleaner with all the BUG_ON()s recently added ?

Christophe

> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> Signed-off-by: Michal Suchanek <msuchanek@suse.de>
> ---
> v2,rebase (from Michal):
> - Move the FP restore functions to restore_math. They are not used
>    anywhere else and when restore_math is not built gcc warns about them
>    being unused (ms)
> - Add asm/context_tracking.h include to exceptions-64e.S for SCHEDULE_USER
>    definition
> 
> v3:
> - Fix return from interrupt replay problem by replaying interrupts rather
>    than enabling irqs. This ends up being cleaner and __check_irq_replay
>    goes away completely for 64s. Should bring 64e up to speed and kill a lot
>    of cruft after it's proven on 64s.
> - Don't use _GLOBAL if it's not called from C
> - Simplify stack store emulation code further, add a bit more commenting.
> - Some missing no probe annotations
> 
>   .../powerpc/include/asm/book3s/64/kup-radix.h |  10 +
>   arch/powerpc/include/asm/hw_irq.h             |   1 +
>   arch/powerpc/include/asm/switch_to.h          |   6 +
>   arch/powerpc/kernel/entry_64.S                | 486 +++++-------------
>   arch/powerpc/kernel/exceptions-64e.S          | 255 ++++++++-
>   arch/powerpc/kernel/exceptions-64s.S          | 119 ++---
>   arch/powerpc/kernel/irq.c                     |  36 +-
>   arch/powerpc/kernel/process.c                 |  89 ++--
>   arch/powerpc/kernel/syscall_64.c              | 164 +++++-
>   arch/powerpc/kernel/vector.S                  |   2 +-
>   10 files changed, 642 insertions(+), 526 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h
> index 71081d90f999..3bcef989a35d 100644
> --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h
> @@ -60,6 +60,12 @@
>   #include <asm/mmu.h>
>   #include <asm/ptrace.h>
>   
> +static inline void kuap_restore_amr(struct pt_regs *regs)
> +{
> +	if (mmu_has_feature(MMU_FTR_RADIX_KUAP))
> +		mtspr(SPRN_AMR, regs->kuap);
> +}
> +
>   static inline void kuap_check_amr(void)
>   {
>   	if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_RADIX_KUAP))
> @@ -136,6 +142,10 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
>   		    "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read");
>   }
>   #else /* CONFIG_PPC_KUAP */
> +static inline void kuap_restore_amr(struct pt_regs *regs)
> +{
> +}
> +
>   static inline void kuap_check_amr(void)
>   {
>   }
> diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
> index 0e9a9598f91f..e0e71777961f 100644
> --- a/arch/powerpc/include/asm/hw_irq.h
> +++ b/arch/powerpc/include/asm/hw_irq.h
> @@ -52,6 +52,7 @@
>   #ifndef __ASSEMBLY__
>   
>   extern void replay_system_reset(void);
> +extern void replay_soft_interrupts(void);
>   
>   extern void timer_interrupt(struct pt_regs *);
>   extern void timer_broadcast_interrupt(void);
> diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
> index 476008bc3d08..b867b58b1093 100644
> --- a/arch/powerpc/include/asm/switch_to.h
> +++ b/arch/powerpc/include/asm/switch_to.h
> @@ -23,7 +23,13 @@ extern void switch_booke_debug_regs(struct debug_reg *new_debug);
>   
>   extern int emulate_altivec(struct pt_regs *);
>   
> +#ifdef CONFIG_PPC_BOOK3S_64
>   void restore_math(struct pt_regs *regs);
> +#else
> +static inline void restore_math(struct pt_regs *regs)
> +{
> +}
> +#endif
>   
>   void restore_tm_state(struct pt_regs *regs);
>   
> diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
> index 0e2c56573a41..e13eac968dfc 100644
> --- a/arch/powerpc/kernel/entry_64.S
> +++ b/arch/powerpc/kernel/entry_64.S
> @@ -16,6 +16,7 @@
>   
>   #include <linux/errno.h>
>   #include <linux/err.h>
> +#include <asm/cache.h>
>   #include <asm/unistd.h>
>   #include <asm/processor.h>
>   #include <asm/page.h>
> @@ -221,6 +222,7 @@ _GLOBAL(ret_from_kernel_thread)
>   	li	r3,0
>   	b	.Lsyscall_exit
>   
> +#ifdef CONFIG_PPC_BOOK3E
>   /* Save non-volatile GPRs, if not already saved. */
>   _GLOBAL(save_nvgprs)
>   	ld	r11,_TRAP(r1)
> @@ -231,6 +233,7 @@ _GLOBAL(save_nvgprs)
>   	std	r0,_TRAP(r1)
>   	blr
>   _ASM_NOKPROBE_SYMBOL(save_nvgprs);
> +#endif
>   
>   #ifdef CONFIG_PPC_BOOK3S_64
>   
> @@ -294,7 +297,7 @@ flush_count_cache:
>    * state of one is saved on its kernel stack.  Then the state
>    * of the other is restored from its kernel stack.  The memory
>    * management hardware is updated to the second process's state.
> - * Finally, we can return to the second process, via ret_from_except.
> + * Finally, we can return to the second process, via interrupt_return.
>    * On entry, r3 points to the THREAD for the current task, r4
>    * points to the THREAD for the new task.
>    *
> @@ -446,408 +449,151 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
>   	addi	r1,r1,SWITCH_FRAME_SIZE
>   	blr
>   
> -	.align	7
> -_GLOBAL(ret_from_except)
> -	ld	r11,_TRAP(r1)
> -	andi.	r0,r11,1
> -	bne	ret_from_except_lite
> -	REST_NVGPRS(r1)
> -
> -_GLOBAL(ret_from_except_lite)
> +#ifdef CONFIG_PPC_BOOK3S
>   	/*
> -	 * Disable interrupts so that current_thread_info()->flags
> -	 * can't change between when we test it and when we return
> -	 * from the interrupt.
> -	 */
> -#ifdef CONFIG_PPC_BOOK3E
> -	wrteei	0
> -#else
> -	li	r10,MSR_RI
> -	mtmsrd	r10,1		  /* Update machine state */
> -#endif /* CONFIG_PPC_BOOK3E */
> +	 * If MSR EE/RI was never enabled, IRQs not reconciled, NVGPRs not
> +	 * touched, AMR not set, no exit work created, then this can be used.
> +	 */
> +	.balign IFETCH_ALIGN_BYTES
> +	.globl fast_interrupt_return
> +fast_interrupt_return:
> +_ASM_NOKPROBE_SYMBOL(fast_interrupt_return)
> +	ld	r4,_MSR(r1)
> +	andi.	r0,r4,MSR_PR
> +	bne	.Lfast_user_interrupt_return
> +	andi.	r0,r4,MSR_RI
> +	bne+	.Lfast_kernel_interrupt_return
> +	addi	r3,r1,STACK_FRAME_OVERHEAD
> +	bl	unrecoverable_exception
> +	b	. /* should not get here */
>   
> -	ld	r9, PACA_THREAD_INFO(r13)
> -	ld	r3,_MSR(r1)
> -#ifdef CONFIG_PPC_BOOK3E
> -	ld	r10,PACACURRENT(r13)
> -#endif /* CONFIG_PPC_BOOK3E */
> -	ld	r4,TI_FLAGS(r9)
> -	andi.	r3,r3,MSR_PR
> -	beq	resume_kernel
> -#ifdef CONFIG_PPC_BOOK3E
> -	lwz	r3,(THREAD+THREAD_DBCR0)(r10)
> -#endif /* CONFIG_PPC_BOOK3E */
> +	.balign IFETCH_ALIGN_BYTES
> +	.globl interrupt_return
> +interrupt_return:
> +_ASM_NOKPROBE_SYMBOL(interrupt_return)
> +	REST_NVGPRS(r1)
>   
> -	/* Check current_thread_info()->flags */
> -	andi.	r0,r4,_TIF_USER_WORK_MASK
> -	bne	1f
> -#ifdef CONFIG_PPC_BOOK3E
> -	/*
> -	 * Check to see if the dbcr0 register is set up to debug.
> -	 * Use the internal debug mode bit to do this.
> -	 */
> -	andis.	r0,r3,DBCR0_IDM@h
> -	beq	restore
> -	mfmsr	r0
> -	rlwinm	r0,r0,0,~MSR_DE	/* Clear MSR.DE */
> -	mtmsr	r0
> -	mtspr	SPRN_DBCR0,r3
> -	li	r10, -1
> -	mtspr	SPRN_DBSR,r10
> -	b	restore
> -#else
> -	addi	r3,r1,STACK_FRAME_OVERHEAD
> -	bl	restore_math
> -	b	restore
> -#endif
> -1:	andi.	r0,r4,_TIF_NEED_RESCHED
> -	beq	2f
> -	bl	restore_interrupts
> -	SCHEDULE_USER
> -	b	ret_from_except_lite
> -2:
> -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> -	andi.	r0,r4,_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM
> -	bne	3f		/* only restore TM if nothing else to do */
> +	.balign IFETCH_ALIGN_BYTES
> +	.globl interrupt_return_lite
> +interrupt_return_lite:
> +_ASM_NOKPROBE_SYMBOL(interrupt_return_lite)
> +	ld	r4,_MSR(r1)
> +	andi.	r0,r4,MSR_PR
> +	beq	.Lkernel_interrupt_return
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
> -	bl	restore_tm_state
> -	b	restore
> -3:
> -#endif
> -	bl	save_nvgprs
> -	/*
> -	 * Use a non volatile GPR to save and restore our thread_info flags
> -	 * across the call to restore_interrupts.
> -	 */
> -	mr	r30,r4
> -	bl	restore_interrupts
> -	mr	r4,r30
> -	addi	r3,r1,STACK_FRAME_OVERHEAD
> -	bl	do_notify_resume
> -	b	ret_from_except
> -
> -resume_kernel:
> -	/* check current_thread_info, _TIF_EMULATE_STACK_STORE */
> -	andis.	r8,r4,_TIF_EMULATE_STACK_STORE@h
> -	beq+	1f
> +	bl	interrupt_exit_user_prepare
> +	cmpdi	r3,0
> +	bne-	.Lrestore_nvgprs
>   
> -	addi	r8,r1,INT_FRAME_SIZE	/* Get the kprobed function entry */
> +.Lfast_user_interrupt_return:
> +	ld	r11,_NIP(r1)
> +	ld	r12,_MSR(r1)
> +BEGIN_FTR_SECTION
> +	ld	r10,_PPR(r1)
> +	mtspr	SPRN_PPR,r10
> +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
> +	mtspr	SPRN_SRR0,r11
> +	mtspr	SPRN_SRR1,r12
>   
> -	ld	r3,GPR1(r1)
> -	subi	r3,r3,INT_FRAME_SIZE	/* dst: Allocate a trampoline exception frame */
> -	mr	r4,r1			/* src:  current exception frame */
> -	mr	r1,r3			/* Reroute the trampoline frame to r1 */
> +BEGIN_FTR_SECTION
> +	stdcx.	r0,0,r1		/* to clear the reservation */
> +FTR_SECTION_ELSE
> +	ldarx	r0,0,r1
> +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
>   
> -	/* Copy from the original to the trampoline. */
> -	li	r5,INT_FRAME_SIZE/8	/* size: INT_FRAME_SIZE */
> -	li	r6,0			/* start offset: 0 */
> -	mtctr	r5
> -2:	ldx	r0,r6,r4
> -	stdx	r0,r6,r3
> -	addi	r6,r6,8
> -	bdnz	2b
> -
> -	/* Do real store operation to complete stdu */
> -	ld	r5,GPR1(r1)
> -	std	r8,0(r5)
> -
> -	/* Clear _TIF_EMULATE_STACK_STORE flag */
> -	lis	r11,_TIF_EMULATE_STACK_STORE@h
> -	addi	r5,r9,TI_FLAGS
> -0:	ldarx	r4,0,r5
> -	andc	r4,r4,r11
> -	stdcx.	r4,0,r5
> -	bne-	0b
> -1:
> -
> -#ifdef CONFIG_PREEMPTION
> -	/* Check if we need to preempt */
> -	andi.	r0,r4,_TIF_NEED_RESCHED
> -	beq+	restore
> -	/* Check that preempt_count() == 0 and interrupts are enabled */
> -	lwz	r8,TI_PREEMPT(r9)
> -	cmpwi	cr0,r8,0
> -	bne	restore
> -	ld	r0,SOFTE(r1)
> -	andi.	r0,r0,IRQS_DISABLED
> -	bne	restore
> +	ld	r3,_CCR(r1)
> +	ld	r4,_LINK(r1)
> +	ld	r5,_CTR(r1)
> +	ld	r6,_XER(r1)
> +	li	r0,0
>   
> -	/*
> -	 * Here we are preempting the current task. We want to make
> -	 * sure we are soft-disabled first and reconcile irq state.
> -	 */
> -	RECONCILE_IRQ_STATE(r3,r4)
> -	bl	preempt_schedule_irq
> +	REST_4GPRS(7, r1)
> +	REST_2GPRS(11, r1)
> +	REST_GPR(13, r1)
>   
> -	/*
> -	 * arch_local_irq_restore() from preempt_schedule_irq above may
> -	 * enable hard interrupt but we really should disable interrupts
> -	 * when we return from the interrupt, and so that we don't get
> -	 * interrupted after loading SRR0/1.
> -	 */
> -#ifdef CONFIG_PPC_BOOK3E
> -	wrteei	0
> -#else
> -	li	r10,MSR_RI
> -	mtmsrd	r10,1		  /* Update machine state */
> -#endif /* CONFIG_PPC_BOOK3E */
> -#endif /* CONFIG_PREEMPTION */
> +	mtcr	r3
> +	mtlr	r4
> +	mtctr	r5
> +	mtspr	SPRN_XER,r6
>   
> -	.globl	fast_exc_return_irq
> -fast_exc_return_irq:
> -restore:
> -	/*
> -	 * This is the main kernel exit path. First we check if we
> -	 * are about to re-enable interrupts
> -	 */
> -	ld	r5,SOFTE(r1)
> -	lbz	r6,PACAIRQSOFTMASK(r13)
> -	andi.	r5,r5,IRQS_DISABLED
> -	bne	.Lrestore_irq_off
> +	REST_4GPRS(2, r1)
> +	REST_GPR(6, r1)
> +	REST_GPR(0, r1)
> +	REST_GPR(1, r1)
> +	RFI_TO_USER
> +	b	.	/* prevent speculative execution */
>   
> -	/* We are enabling, were we already enabled ? Yes, just return */
> -	andi.	r6,r6,IRQS_DISABLED
> -	beq	cr0,.Ldo_restore
> +.Lrestore_nvgprs:
> +	REST_NVGPRS(r1)
> +	b	.Lfast_user_interrupt_return
>   
> -	/*
> -	 * We are about to soft-enable interrupts (we are hard disabled
> -	 * at this point). We check if there's anything that needs to
> -	 * be replayed first.
> -	 */
> -	lbz	r0,PACAIRQHAPPENED(r13)
> -	cmpwi	cr0,r0,0
> -	bne-	.Lrestore_check_irq_replay
> +	.balign IFETCH_ALIGN_BYTES
> +.Lkernel_interrupt_return:
> +	addi	r3,r1,STACK_FRAME_OVERHEAD
> +	bl	interrupt_exit_kernel_prepare
> +	cmpdi	cr1,r3,0
>   
> -	/*
> -	 * Get here when nothing happened while soft-disabled, just
> -	 * soft-enable and move-on. We will hard-enable as a side
> -	 * effect of rfi
> -	 */
> -.Lrestore_no_replay:
> -	TRACE_ENABLE_INTS
> -	li	r0,IRQS_ENABLED
> -	stb	r0,PACAIRQSOFTMASK(r13);
> +.Lfast_kernel_interrupt_return:
> +	ld	r11,_NIP(r1)
> +	ld	r12,_MSR(r1)
> +	mtspr	SPRN_SRR0,r11
> +	mtspr	SPRN_SRR1,r12
>   
> -	/*
> -	 * Final return path. BookE is handled in a different file
> -	 */
> -.Ldo_restore:
> -#ifdef CONFIG_PPC_BOOK3E
> -	b	exception_return_book3e
> -#else
> -	/*
> -	 * Clear the reservation. If we know the CPU tracks the address of
> -	 * the reservation then we can potentially save some cycles and use
> -	 * a larx. On POWER6 and POWER7 this is significantly faster.
> -	 */
>   BEGIN_FTR_SECTION
>   	stdcx.	r0,0,r1		/* to clear the reservation */
>   FTR_SECTION_ELSE
> -	ldarx	r4,0,r1
> +	ldarx	r0,0,r1
>   ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
>   
> -	/*
> -	 * Some code path such as load_up_fpu or altivec return directly
> -	 * here. They run entirely hard disabled and do not alter the
> -	 * interrupt state. They also don't use lwarx/stwcx. and thus
> -	 * are known not to leave dangling reservations.
> -	 */
> -	.globl	fast_exception_return
> -fast_exception_return:
> -	ld	r3,_MSR(r1)
> +	ld	r3,_LINK(r1)
>   	ld	r4,_CTR(r1)
> -	ld	r0,_LINK(r1)
> -	mtctr	r4
> -	mtlr	r0
> -	ld	r4,_XER(r1)
> -	mtspr	SPRN_XER,r4
> -
> -	kuap_check_amr r5, r6
> -
> -	REST_8GPRS(5, r1)
> -
> -	andi.	r0,r3,MSR_RI
> -	beq-	.Lunrecov_restore
> -
> -	/*
> -	 * Clear RI before restoring r13.  If we are returning to
> -	 * userspace and we take an exception after restoring r13,
> -	 * we end up corrupting the userspace r13 value.
> -	 */
> -	li	r4,0
> -	mtmsrd	r4,1
> -
> -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> -	/* TM debug */
> -	std	r3, PACATMSCRATCH(r13) /* Stash returned-to MSR */
> -#endif
> -	/*
> -	 * r13 is our per cpu area, only restore it if we are returning to
> -	 * userspace the value stored in the stack frame may belong to
> -	 * another CPU.
> -	 */
> -	andi.	r0,r3,MSR_PR
> -	beq	1f
> -BEGIN_FTR_SECTION
> -	/* Restore PPR */
> -	ld	r2,_PPR(r1)
> -	mtspr	SPRN_PPR,r2
> -END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
> -	ACCOUNT_CPU_USER_EXIT(r13, r2, r4)
> -	REST_GPR(13, r1)
> -
> -	/*
> -	 * We don't need to restore AMR on the way back to userspace for KUAP.
> -	 * The value of AMR only matters while we're in the kernel.
> -	 */
> -	mtspr	SPRN_SRR1,r3
> -
> -	ld	r2,_CCR(r1)
> -	mtcrf	0xFF,r2
> -	ld	r2,_NIP(r1)
> -	mtspr	SPRN_SRR0,r2
> -
> -	ld	r0,GPR0(r1)
> -	ld	r2,GPR2(r1)
> -	ld	r3,GPR3(r1)
> -	ld	r4,GPR4(r1)
> -	ld	r1,GPR1(r1)
> -	RFI_TO_USER
> -	b	.	/* prevent speculative execution */
> +	ld	r5,_XER(r1)
> +	ld	r6,_CCR(r1)
> +	li	r0,0
>   
> -1:	mtspr	SPRN_SRR1,r3
> +	REST_4GPRS(7, r1)
> +	REST_2GPRS(11, r1)
>   
> -	ld	r2,_CCR(r1)
> -	mtcrf	0xFF,r2
> -	ld	r2,_NIP(r1)
> -	mtspr	SPRN_SRR0,r2
> +	mtlr	r3
> +	mtctr	r4
> +	mtspr	SPRN_XER,r5
>   
>   	/*
>   	 * Leaving a stale exception_marker on the stack can confuse
>   	 * the reliable stack unwinder later on. Clear it.
>   	 */
> -	li	r2,0
> -	std	r2,STACK_FRAME_OVERHEAD-16(r1)
> +	std	r0,STACK_FRAME_OVERHEAD-16(r1)
>   
> -	ld	r0,GPR0(r1)
> -	ld	r2,GPR2(r1)
> -	ld	r3,GPR3(r1)
> +	REST_4GPRS(2, r1)
>   
> -	kuap_restore_amr r4
> -
> -	ld	r4,GPR4(r1)
> -	ld	r1,GPR1(r1)
> +	bne-	cr1,1f /* emulate stack store */
> +	mtcr	r6
> +	REST_GPR(6, r1)
> +	REST_GPR(0, r1)
> +	REST_GPR(1, r1)
>   	RFI_TO_KERNEL
>   	b	.	/* prevent speculative execution */
>   
> -#endif /* CONFIG_PPC_BOOK3E */
> -
> -	/*
> -	 * We are returning to a context with interrupts soft disabled.
> -	 *
> -	 * However, we may also about to hard enable, so we need to
> -	 * make sure that in this case, we also clear PACA_IRQ_HARD_DIS
> -	 * or that bit can get out of sync and bad things will happen
> -	 */
> -.Lrestore_irq_off:
> -	ld	r3,_MSR(r1)
> -	lbz	r7,PACAIRQHAPPENED(r13)
> -	andi.	r0,r3,MSR_EE
> -	beq	1f
> -	rlwinm	r7,r7,0,~PACA_IRQ_HARD_DIS
> -	stb	r7,PACAIRQHAPPENED(r13)
> -1:
> -#if defined(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && defined(CONFIG_BUG)
> -	/* The interrupt should not have soft enabled. */
> -	lbz	r7,PACAIRQSOFTMASK(r13)
> -1:	tdeqi	r7,IRQS_ENABLED
> -	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING
> -#endif
> -	b	.Ldo_restore
> -
> -	/*
> -	 * Something did happen, check if a re-emit is needed
> -	 * (this also clears paca->irq_happened)
> -	 */
> -.Lrestore_check_irq_replay:
> -	/* XXX: We could implement a fast path here where we check
> -	 * for irq_happened being just 0x01, in which case we can
> -	 * clear it and return. That means that we would potentially
> -	 * miss a decrementer having wrapped all the way around.
> -	 *
> -	 * Still, this might be useful for things like hash_page
> -	 */
> -	bl	__check_irq_replay
> -	cmpwi	cr0,r3,0
> -	beq	.Lrestore_no_replay
> -
> -	/*
> -	 * We need to re-emit an interrupt. We do so by re-using our
> -	 * existing exception frame. We first change the trap value,
> -	 * but we need to ensure we preserve the low nibble of it
> -	 */
> -	ld	r4,_TRAP(r1)
> -	clrldi	r4,r4,60
> -	or	r4,r4,r3
> -	std	r4,_TRAP(r1)
> -
> -	/*
> -	 * PACA_IRQ_HARD_DIS won't always be set here, so set it now
> -	 * to reconcile the IRQ state. Tracing is already accounted for.
> -	 */
> -	lbz	r4,PACAIRQHAPPENED(r13)
> -	ori	r4,r4,PACA_IRQ_HARD_DIS
> -	stb	r4,PACAIRQHAPPENED(r13)
> -
> -	/*
> -	 * Then find the right handler and call it. Interrupts are
> -	 * still soft-disabled and we keep them that way.
> -	*/
> -	cmpwi	cr0,r3,0x500
> -	bne	1f
> -	addi	r3,r1,STACK_FRAME_OVERHEAD;
> - 	bl	do_IRQ
> -	b	ret_from_except
> -1:	cmpwi	cr0,r3,0xf00
> -	bne	1f
> -	addi	r3,r1,STACK_FRAME_OVERHEAD;
> -	bl	performance_monitor_exception
> -	b	ret_from_except
> -1:	cmpwi	cr0,r3,0xe60
> -	bne	1f
> -	addi	r3,r1,STACK_FRAME_OVERHEAD;
> -	bl	handle_hmi_exception
> -	b	ret_from_except
> -1:	cmpwi	cr0,r3,0x900
> -	bne	1f
> -	addi	r3,r1,STACK_FRAME_OVERHEAD;
> -	bl	timer_interrupt
> -	b	ret_from_except
> -#ifdef CONFIG_PPC_DOORBELL
> -1:
> -#ifdef CONFIG_PPC_BOOK3E
> -	cmpwi	cr0,r3,0x280
> -#else
> -	cmpwi	cr0,r3,0xa00
> -#endif /* CONFIG_PPC_BOOK3E */
> -	bne	1f
> -	addi	r3,r1,STACK_FRAME_OVERHEAD;
> -	bl	doorbell_exception
> -#endif /* CONFIG_PPC_DOORBELL */
> -1:	b	ret_from_except /* What else to do here ? */
> -
> -.Lunrecov_restore:
> -	addi	r3,r1,STACK_FRAME_OVERHEAD
> -	bl	unrecoverable_exception
> -	b	.Lunrecov_restore
> -
> -_ASM_NOKPROBE_SYMBOL(ret_from_except);
> -_ASM_NOKPROBE_SYMBOL(ret_from_except_lite);
> -_ASM_NOKPROBE_SYMBOL(resume_kernel);
> -_ASM_NOKPROBE_SYMBOL(fast_exc_return_irq);
> -_ASM_NOKPROBE_SYMBOL(restore);
> -_ASM_NOKPROBE_SYMBOL(fast_exception_return);
> +1:	/*
> +	 * Emulate stack store with update. New r1 value was already calculated
> +	 * and updated in our interrupt regs by emulate_loadstore, but we can't
> +	 * store the previous value of r1 to the stack before re-loading our
> +	 * registers from it, otherwise they could be clobbered.  Use
> +	 * PACA_EXGEN as temporary storage to hold the store data, as
> +	 * interrupts are disabled here so it won't be clobbered.
> +	 */
> +	mtcr	r6
> +	std	r9,PACA_EXGEN+0(r13)
> +	addi	r9,r1,INT_FRAME_SIZE /* get original r1 */
> +	REST_GPR(6, r1)
> +	REST_GPR(0, r1)
> +	REST_GPR(1, r1)
> +	std	r9,0(r1) /* perform store component of stdu */
> +	ld	r9,PACA_EXGEN+0(r13)
>   
> +	RFI_TO_KERNEL
> +	b	.	/* prevent speculative execution */
> +#endif /* CONFIG_PPC_BOOK3S */
>   
>   #ifdef CONFIG_PPC_RTAS
>   /*
> diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
> index 4efac5490216..d9ed79415100 100644
> --- a/arch/powerpc/kernel/exceptions-64e.S
> +++ b/arch/powerpc/kernel/exceptions-64e.S
> @@ -24,6 +24,7 @@
>   #include <asm/kvm_asm.h>
>   #include <asm/kvm_booke_hv_asm.h>
>   #include <asm/feature-fixups.h>
> +#include <asm/context_tracking.h>
>   
>   /* XXX This will ultimately add space for a special exception save
>    *     structure used to save things like SRR0/SRR1, SPRGs, MAS, etc...
> @@ -1041,17 +1042,161 @@ alignment_more:
>   	bl	alignment_exception
>   	b	ret_from_except
>   
> -/*
> - * We branch here from entry_64.S for the last stage of the exception
> - * return code path. MSR:EE is expected to be off at that point
> - */
> -_GLOBAL(exception_return_book3e)
> -	b	1f
> +	.align	7
> +_GLOBAL(ret_from_except)
> +	ld	r11,_TRAP(r1)
> +	andi.	r0,r11,1
> +	bne	ret_from_except_lite
> +	REST_NVGPRS(r1)
> +
> +_GLOBAL(ret_from_except_lite)
> +	/*
> +	 * Disable interrupts so that current_thread_info()->flags
> +	 * can't change between when we test it and when we return
> +	 * from the interrupt.
> +	 */
> +	wrteei	0
> +
> +	ld	r9, PACA_THREAD_INFO(r13)
> +	ld	r3,_MSR(r1)
> +	ld	r10,PACACURRENT(r13)
> +	ld	r4,TI_FLAGS(r9)
> +	andi.	r3,r3,MSR_PR
> +	beq	resume_kernel
> +	lwz	r3,(THREAD+THREAD_DBCR0)(r10)
> +
> +	/* Check current_thread_info()->flags */
> +	andi.	r0,r4,_TIF_USER_WORK_MASK
> +	bne	1f
> +	/*
> +	 * Check to see if the dbcr0 register is set up to debug.
> +	 * Use the internal debug mode bit to do this.
> +	 */
> +	andis.	r0,r3,DBCR0_IDM@h
> +	beq	restore
> +	mfmsr	r0
> +	rlwinm	r0,r0,0,~MSR_DE	/* Clear MSR.DE */
> +	mtmsr	r0
> +	mtspr	SPRN_DBCR0,r3
> +	li	r10, -1
> +	mtspr	SPRN_DBSR,r10
> +	b	restore
> +1:	andi.	r0,r4,_TIF_NEED_RESCHED
> +	beq	2f
> +	bl	restore_interrupts
> +	SCHEDULE_USER
> +	b	ret_from_except_lite
> +2:
> +	bl	save_nvgprs
> +	/*
> +	 * Use a non volatile GPR to save and restore our thread_info flags
> +	 * across the call to restore_interrupts.
> +	 */
> +	mr	r30,r4
> +	bl	restore_interrupts
> +	mr	r4,r30
> +	addi	r3,r1,STACK_FRAME_OVERHEAD
> +	bl	do_notify_resume
> +	b	ret_from_except
> +
> +resume_kernel:
> +	/* check current_thread_info, _TIF_EMULATE_STACK_STORE */
> +	andis.	r8,r4,_TIF_EMULATE_STACK_STORE@h
> +	beq+	1f
> +
> +	addi	r8,r1,INT_FRAME_SIZE	/* Get the kprobed function entry */
> +
> +	ld	r3,GPR1(r1)
> +	subi	r3,r3,INT_FRAME_SIZE	/* dst: Allocate a trampoline exception frame */
> +	mr	r4,r1			/* src:  current exception frame */
> +	mr	r1,r3			/* Reroute the trampoline frame to r1 */
> +
> +	/* Copy from the original to the trampoline. */
> +	li	r5,INT_FRAME_SIZE/8	/* size: INT_FRAME_SIZE */
> +	li	r6,0			/* start offset: 0 */
> +	mtctr	r5
> +2:	ldx	r0,r6,r4
> +	stdx	r0,r6,r3
> +	addi	r6,r6,8
> +	bdnz	2b
> +
> +	/* Do real store operation to complete stdu */
> +	ld	r5,GPR1(r1)
> +	std	r8,0(r5)
> +
> +	/* Clear _TIF_EMULATE_STACK_STORE flag */
> +	lis	r11,_TIF_EMULATE_STACK_STORE@h
> +	addi	r5,r9,TI_FLAGS
> +0:	ldarx	r4,0,r5
> +	andc	r4,r4,r11
> +	stdcx.	r4,0,r5
> +	bne-	0b
> +1:
> +
> +#ifdef CONFIG_PREEMPT
> +	/* Check if we need to preempt */
> +	andi.	r0,r4,_TIF_NEED_RESCHED
> +	beq+	restore
> +	/* Check that preempt_count() == 0 and interrupts are enabled */
> +	lwz	r8,TI_PREEMPT(r9)
> +	cmpwi	cr0,r8,0
> +	bne	restore
> +	ld	r0,SOFTE(r1)
> +	andi.	r0,r0,IRQS_DISABLED
> +	bne	restore
> +
> +	/*
> +	 * Here we are preempting the current task. We want to make
> +	 * sure we are soft-disabled first and reconcile irq state.
> +	 */
> +	RECONCILE_IRQ_STATE(r3,r4)
> +	bl	preempt_schedule_irq
> +
> +	/*
> +	 * arch_local_irq_restore() from preempt_schedule_irq above may
> +	 * enable hard interrupts, but we really should disable interrupts
> +	 * when we return from the interrupt, and so that we don't get
> +	 * interrupted after loading SRR0/1.
> +	 */
> +	wrteei	0
> +#endif /* CONFIG_PREEMPT */
> +
> +restore:
> +	/*
> +	 * This is the main kernel exit path. First we check if we
> +	 * are about to re-enable interrupts
> +	 */
> +	ld	r5,SOFTE(r1)
> +	lbz	r6,PACAIRQSOFTMASK(r13)
> +	andi.	r5,r5,IRQS_DISABLED
> +	bne	.Lrestore_irq_off
> +
> +	/* We are enabling, were we already enabled ? Yes, just return */
> +	andi.	r6,r6,IRQS_DISABLED
> +	beq	cr0,fast_exception_return
> +
> +	/*
> +	 * We are about to soft-enable interrupts (we are hard disabled
> +	 * at this point). We check if there's anything that needs to
> +	 * be replayed first.
> +	 */
> +	lbz	r0,PACAIRQHAPPENED(r13)
> +	cmpwi	cr0,r0,0
> +	bne-	.Lrestore_check_irq_replay
> +
> +	/*
> +	 * Get here when nothing happened while soft-disabled, just
> +	 * soft-enable and move-on. We will hard-enable as a side
> +	 * effect of rfi
> +	 */
> +.Lrestore_no_replay:
> +	TRACE_ENABLE_INTS
> +	li	r0,IRQS_ENABLED
> +	stb	r0,PACAIRQSOFTMASK(r13);
>   
>   /* This is the return from load_up_fpu fast path which could do with
>    * less GPR restores in fact, but for now we have a single return path
>    */
> -	.globl fast_exception_return
>   fast_exception_return:
>   	wrteei	0
>   1:	mr	r0,r13
> @@ -1092,6 +1237,102 @@ fast_exception_return:
>   	mfspr	r13,SPRN_SPRG_GEN_SCRATCH
>   	rfi
>   
> +	/*
> +	 * We are returning to a context with interrupts soft disabled.
> +	 *
> +	 * However, we may also be about to hard enable, so we need to
> +	 * make sure that in this case, we also clear PACA_IRQ_HARD_DIS
> +	 * or that bit can get out of sync and bad things will happen
> +	 */
> +.Lrestore_irq_off:
> +	ld	r3,_MSR(r1)
> +	lbz	r7,PACAIRQHAPPENED(r13)
> +	andi.	r0,r3,MSR_EE
> +	beq	1f
> +	rlwinm	r7,r7,0,~PACA_IRQ_HARD_DIS
> +	stb	r7,PACAIRQHAPPENED(r13)
> +1:
> +#if defined(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && defined(CONFIG_BUG)
> +	/* The interrupt should not have soft enabled. */
> +	lbz	r7,PACAIRQSOFTMASK(r13)
> +1:	tdeqi	r7,IRQS_ENABLED
> +	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING
> +#endif
> +	b	fast_exception_return
> +
> +	/*
> +	 * Something did happen, check if a re-emit is needed
> +	 * (this also clears paca->irq_happened)
> +	 */
> +.Lrestore_check_irq_replay:
> +	/* XXX: We could implement a fast path here where we check
> +	 * for irq_happened being just 0x01, in which case we can
> +	 * clear it and return. That means that we would potentially
> +	 * miss a decrementer having wrapped all the way around.
> +	 *
> +	 * Still, this might be useful for things like hash_page
> +	 */
> +	bl	__check_irq_replay
> +	cmpwi	cr0,r3,0
> +	beq	.Lrestore_no_replay
> +
> +	/*
> +	 * We need to re-emit an interrupt. We do so by re-using our
> +	 * existing exception frame. We first change the trap value,
> +	 * but we need to ensure we preserve the low nibble of it
> +	 */
> +	ld	r4,_TRAP(r1)
> +	clrldi	r4,r4,60
> +	or	r4,r4,r3
> +	std	r4,_TRAP(r1)
> +
> +	/*
> +	 * PACA_IRQ_HARD_DIS won't always be set here, so set it now
> +	 * to reconcile the IRQ state. Tracing is already accounted for.
> +	 */
> +	lbz	r4,PACAIRQHAPPENED(r13)
> +	ori	r4,r4,PACA_IRQ_HARD_DIS
> +	stb	r4,PACAIRQHAPPENED(r13)
> +
> +	/*
> +	 * Then find the right handler and call it. Interrupts are
> +	 * still soft-disabled and we keep them that way.
> +	*/
> +	cmpwi	cr0,r3,0x500
> +	bne	1f
> +	addi	r3,r1,STACK_FRAME_OVERHEAD;
> +	bl	do_IRQ
> +	b	ret_from_except
> +1:	cmpwi	cr0,r3,0xf00
> +	bne	1f
> +	addi	r3,r1,STACK_FRAME_OVERHEAD;
> +	bl	performance_monitor_exception
> +	b	ret_from_except
> +1:	cmpwi	cr0,r3,0xe60
> +	bne	1f
> +	addi	r3,r1,STACK_FRAME_OVERHEAD;
> +	bl	handle_hmi_exception
> +	b	ret_from_except
> +1:	cmpwi	cr0,r3,0x900
> +	bne	1f
> +	addi	r3,r1,STACK_FRAME_OVERHEAD;
> +	bl	timer_interrupt
> +	b	ret_from_except
> +#ifdef CONFIG_PPC_DOORBELL
> +1:
> +	cmpwi	cr0,r3,0x280
> +	bne	1f
> +	addi	r3,r1,STACK_FRAME_OVERHEAD;
> +	bl	doorbell_exception
> +#endif /* CONFIG_PPC_DOORBELL */
> +1:	b	ret_from_except /* What else to do here ? */
> +
> +_ASM_NOKPROBE_SYMBOL(ret_from_except);
> +_ASM_NOKPROBE_SYMBOL(ret_from_except_lite);
> +_ASM_NOKPROBE_SYMBOL(resume_kernel);
> +_ASM_NOKPROBE_SYMBOL(restore);
> +_ASM_NOKPROBE_SYMBOL(fast_exception_return);
> +
>   /*
>    * Trampolines used when spotting a bad kernel stack pointer in
>    * the exception entry code.
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index bad8cd9e7dba..d635fd4e40ea 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -575,6 +575,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
>   	std	r10,GPR12(r1)
>   	std	r11,GPR13(r1)
>   
> +	SAVE_NVGPRS(r1)
> +
>   	.if IDAR
>   	.if IISIDE
>   	ld	r10,_NIP(r1)
> @@ -611,7 +613,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
>   	mfspr	r11,SPRN_XER		/* save XER in stackframe	*/
>   	std	r10,SOFTE(r1)
>   	std	r11,_XER(r1)
> -	li	r9,(IVEC)+1
> +	li	r9,IVEC
>   	std	r9,_TRAP(r1)		/* set trap number		*/
>   	li	r10,0
>   	ld	r11,exception_marker@toc(r2)
> @@ -918,7 +920,6 @@ EXC_COMMON_BEGIN(system_reset_common)
>   	ld	r1,PACA_NMI_EMERG_SP(r13)
>   	subi	r1,r1,INT_FRAME_SIZE
>   	__GEN_COMMON_BODY system_reset
> -	bl	save_nvgprs
>   	/*
>   	 * Set IRQS_ALL_DISABLED unconditionally so irqs_disabled() does
>   	 * the right thing. We do not want to reconcile because that goes
> @@ -1099,7 +1100,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
>   	li	r10,MSR_RI
>   	mtmsrd	r10,1
>   
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	machine_check_early
>   	std	r3,RESULT(r1)	/* Save result */
> @@ -1192,10 +1192,9 @@ EXC_COMMON_BEGIN(machine_check_common)
>   	/* Enable MSR_RI when finished with PACA_EXMC */
>   	li	r10,MSR_RI
>   	mtmsrd 	r10,1
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	machine_check_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM machine_check
>   
> @@ -1362,20 +1361,19 @@ BEGIN_MMU_FTR_SECTION
>   	bl	do_slb_fault
>   	cmpdi	r3,0
>   	bne-	1f
> -	b	fast_exception_return
> +	b	fast_interrupt_return
>   1:	/* Error case */
>   MMU_FTR_SECTION_ELSE
>   	/* Radix case, access is outside page table range */
>   	li	r3,-EFAULT
>   ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
>   	std	r3,RESULT(r1)
> -	bl	save_nvgprs
>   	RECONCILE_IRQ_STATE(r10, r11)
>   	ld	r4,_DAR(r1)
>   	ld	r5,RESULT(r1)
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	do_bad_slb_fault
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM data_access_slb
>   
> @@ -1455,20 +1453,19 @@ BEGIN_MMU_FTR_SECTION
>   	bl	do_slb_fault
>   	cmpdi	r3,0
>   	bne-	1f
> -	b	fast_exception_return
> +	b	fast_interrupt_return
>   1:	/* Error case */
>   MMU_FTR_SECTION_ELSE
>   	/* Radix case, access is outside page table range */
>   	li	r3,-EFAULT
>   ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
>   	std	r3,RESULT(r1)
> -	bl	save_nvgprs
>   	RECONCILE_IRQ_STATE(r10, r11)
>   	ld	r4,_DAR(r1)
>   	ld	r5,RESULT(r1)
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	do_bad_slb_fault
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM instruction_access_slb
>   
> @@ -1516,7 +1513,7 @@ EXC_COMMON_BEGIN(hardware_interrupt_common)
>   	RUNLATCH_ON
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	do_IRQ
> -	b	ret_from_except_lite
> +	b	interrupt_return_lite
>   
>   	GEN_KVM hardware_interrupt
>   
> @@ -1542,10 +1539,9 @@ EXC_VIRT_BEGIN(alignment, 0x4600, 0x100)
>   EXC_VIRT_END(alignment, 0x4600, 0x100)
>   EXC_COMMON_BEGIN(alignment_common)
>   	GEN_COMMON alignment
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	alignment_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM alignment
>   
> @@ -1606,10 +1602,9 @@ EXC_COMMON_BEGIN(program_check_common)
>   	__ISTACK(program_check)=1
>   	__GEN_COMMON_BODY program_check
>   3:
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	program_check_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM program_check
>   
> @@ -1640,7 +1635,6 @@ EXC_VIRT_END(fp_unavailable, 0x4800, 0x100)
>   EXC_COMMON_BEGIN(fp_unavailable_common)
>   	GEN_COMMON fp_unavailable
>   	bne	1f			/* if from user, just load it up */
> -	bl	save_nvgprs
>   	RECONCILE_IRQ_STATE(r10, r11)
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	kernel_fp_unavailable_exception
> @@ -1657,14 +1651,13 @@ BEGIN_FTR_SECTION
>   END_FTR_SECTION_IFSET(CPU_FTR_TM)
>   #endif
>   	bl	load_up_fpu
> -	b	fast_exception_return
> +	b	fast_interrupt_return
>   #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>   2:	/* User process was in a transaction */
> -	bl	save_nvgprs
>   	RECONCILE_IRQ_STATE(r10, r11)
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	fp_unavailable_tm
> -	b	ret_from_except
> +	b	interrupt_return
>   #endif
>   
>   	GEN_KVM fp_unavailable
> @@ -1707,7 +1700,7 @@ EXC_COMMON_BEGIN(decrementer_common)
>   	RUNLATCH_ON
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	timer_interrupt
> -	b	ret_from_except_lite
> +	b	interrupt_return_lite
>   
>   	GEN_KVM decrementer
>   
> @@ -1798,7 +1791,7 @@ EXC_COMMON_BEGIN(doorbell_super_common)
>   #else
>   	bl	unknown_exception
>   #endif
> -	b	ret_from_except_lite
> +	b	interrupt_return_lite
>   
>   	GEN_KVM doorbell_super
>   
> @@ -1970,10 +1963,9 @@ EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100)
>   EXC_VIRT_END(single_step, 0x4d00, 0x100)
>   EXC_COMMON_BEGIN(single_step_common)
>   	GEN_COMMON single_step
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	single_step_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM single_step
>   
> @@ -2008,7 +2000,6 @@ EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20)
>   EXC_VIRT_END(h_data_storage, 0x4e00, 0x20)
>   EXC_COMMON_BEGIN(h_data_storage_common)
>   	GEN_COMMON h_data_storage
> -	bl      save_nvgprs
>   	addi    r3,r1,STACK_FRAME_OVERHEAD
>   BEGIN_MMU_FTR_SECTION
>   	ld	r4,_DAR(r1)
> @@ -2017,7 +2008,7 @@ BEGIN_MMU_FTR_SECTION
>   MMU_FTR_SECTION_ELSE
>   	bl      unknown_exception
>   ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX)
> -	b       ret_from_except
> +	b       interrupt_return
>   
>   	GEN_KVM h_data_storage
>   
> @@ -2042,10 +2033,9 @@ EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20)
>   EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20)
>   EXC_COMMON_BEGIN(h_instr_storage_common)
>   	GEN_COMMON h_instr_storage
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	unknown_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM h_instr_storage
>   
> @@ -2068,10 +2058,9 @@ EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20)
>   EXC_VIRT_END(emulation_assist, 0x4e40, 0x20)
>   EXC_COMMON_BEGIN(emulation_assist_common)
>   	GEN_COMMON emulation_assist
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	emulation_assist_interrupt
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM emulation_assist
>   
> @@ -2151,10 +2140,9 @@ EXC_COMMON_BEGIN(hmi_exception_common)
>   	GEN_COMMON hmi_exception
>   	FINISH_NAP
>   	RUNLATCH_ON
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	handle_hmi_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM hmi_exception
>   
> @@ -2188,7 +2176,7 @@ EXC_COMMON_BEGIN(h_doorbell_common)
>   #else
>   	bl	unknown_exception
>   #endif
> -	b	ret_from_except_lite
> +	b	interrupt_return_lite
>   
>   	GEN_KVM h_doorbell
>   
> @@ -2218,7 +2206,7 @@ EXC_COMMON_BEGIN(h_virt_irq_common)
>   	RUNLATCH_ON
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	do_IRQ
> -	b	ret_from_except_lite
> +	b	interrupt_return_lite
>   
>   	GEN_KVM h_virt_irq
>   
> @@ -2265,7 +2253,7 @@ EXC_COMMON_BEGIN(performance_monitor_common)
>   	RUNLATCH_ON
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	performance_monitor_exception
> -	b	ret_from_except_lite
> +	b	interrupt_return_lite
>   
>   	GEN_KVM performance_monitor
>   
> @@ -2305,23 +2293,21 @@ BEGIN_FTR_SECTION
>     END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69)
>   #endif
>   	bl	load_up_altivec
> -	b	fast_exception_return
> +	b	fast_interrupt_return
>   #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>   2:	/* User process was in a transaction */
> -	bl	save_nvgprs
>   	RECONCILE_IRQ_STATE(r10, r11)
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	altivec_unavailable_tm
> -	b	ret_from_except
> +	b	interrupt_return
>   #endif
>   1:
>   END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
>   #endif
> -	bl	save_nvgprs
>   	RECONCILE_IRQ_STATE(r10, r11)
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	altivec_unavailable_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM altivec_unavailable
>   
> @@ -2363,20 +2349,18 @@ BEGIN_FTR_SECTION
>   	b	load_up_vsx
>   #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>   2:	/* User process was in a transaction */
> -	bl	save_nvgprs
>   	RECONCILE_IRQ_STATE(r10, r11)
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	vsx_unavailable_tm
> -	b	ret_from_except
> +	b	interrupt_return
>   #endif
>   1:
>   END_FTR_SECTION_IFSET(CPU_FTR_VSX)
>   #endif
> -	bl	save_nvgprs
>   	RECONCILE_IRQ_STATE(r10, r11)
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	vsx_unavailable_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM vsx_unavailable
>   
> @@ -2403,10 +2387,9 @@ EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20)
>   EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20)
>   EXC_COMMON_BEGIN(facility_unavailable_common)
>   	GEN_COMMON facility_unavailable
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	facility_unavailable_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM facility_unavailable
>   
> @@ -2433,10 +2416,9 @@ EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20)
>   EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20)
>   EXC_COMMON_BEGIN(h_facility_unavailable_common)
>   	GEN_COMMON h_facility_unavailable
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	facility_unavailable_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM h_facility_unavailable
>   
> @@ -2467,10 +2449,9 @@ EXC_REAL_END(cbe_system_error, 0x1200, 0x100)
>   EXC_VIRT_NONE(0x5200, 0x100)
>   EXC_COMMON_BEGIN(cbe_system_error_common)
>   	GEN_COMMON cbe_system_error
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	cbe_system_error_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM cbe_system_error
>   
> @@ -2496,10 +2477,9 @@ EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100)
>   EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100)
>   EXC_COMMON_BEGIN(instruction_breakpoint_common)
>   	GEN_COMMON instruction_breakpoint
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	instruction_breakpoint_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM instruction_breakpoint
>   
> @@ -2619,10 +2599,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
>   
>   EXC_COMMON_BEGIN(denorm_exception_common)
>   	GEN_COMMON denorm_exception
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	unknown_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM denorm_exception
>   
> @@ -2641,10 +2620,9 @@ EXC_REAL_END(cbe_maintenance, 0x1600, 0x100)
>   EXC_VIRT_NONE(0x5600, 0x100)
>   EXC_COMMON_BEGIN(cbe_maintenance_common)
>   	GEN_COMMON cbe_maintenance
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	cbe_maintenance_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM cbe_maintenance
>   
> @@ -2669,14 +2647,13 @@ EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100)
>   EXC_VIRT_END(altivec_assist, 0x5700, 0x100)
>   EXC_COMMON_BEGIN(altivec_assist_common)
>   	GEN_COMMON altivec_assist
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   #ifdef CONFIG_ALTIVEC
>   	bl	altivec_assist_exception
>   #else
>   	bl	unknown_exception
>   #endif
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM altivec_assist
>   
> @@ -2695,10 +2672,9 @@ EXC_REAL_END(cbe_thermal, 0x1800, 0x100)
>   EXC_VIRT_NONE(0x5800, 0x100)
>   EXC_COMMON_BEGIN(cbe_thermal_common)
>   	GEN_COMMON cbe_thermal
> -	bl	save_nvgprs
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	cbe_thermal_exception
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   	GEN_KVM cbe_thermal
>   
> @@ -2731,7 +2707,6 @@ EXC_COMMON_BEGIN(soft_nmi_common)
>   	ld	r1,PACAEMERGSP(r13)
>   	subi	r1,r1,INT_FRAME_SIZE
>   	__GEN_COMMON_BODY soft_nmi
> -	bl	save_nvgprs
>   
>   	/*
>   	 * Set IRQS_ALL_DISABLED and save PACAIRQHAPPENED (see
> @@ -3063,7 +3038,7 @@ do_hash_page:
>           cmpdi	r3,0			/* see if __hash_page succeeded */
>   
>   	/* Success */
> -	beq	fast_exc_return_irq	/* Return from exception on success */
> +	beq	interrupt_return_lite	/* Return from exception on success */
>   
>   	/* Error */
>   	blt-	13f
> @@ -3080,17 +3055,15 @@ handle_page_fault:
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	do_page_fault
>   	cmpdi	r3,0
> -	beq+	ret_from_except_lite
> -	bl	save_nvgprs
> +	beq+	interrupt_return_lite
>   	mr	r5,r3
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	ld	r4,_DAR(r1)
>   	bl	bad_page_fault
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   /* We have a data breakpoint exception - handle it */
>   handle_dabr_fault:
> -	bl	save_nvgprs
>   	ld      r4,_DAR(r1)
>   	ld      r5,_DSISR(r1)
>   	addi    r3,r1,STACK_FRAME_OVERHEAD
> @@ -3098,21 +3071,20 @@ handle_dabr_fault:
>   	/*
>   	 * do_break() may have changed the NV GPRS while handling a breakpoint.
>   	 * If so, we need to restore them with their updated values. Don't use
> -	 * ret_from_except_lite here.
> +	 * interrupt_return_lite here.
>   	 */
> -	b       ret_from_except
> +	b       interrupt_return
>   
>   
>   #ifdef CONFIG_PPC_BOOK3S_64
>   /* We have a page fault that hash_page could handle but HV refused
>    * the PTE insertion
>    */
> -13:	bl	save_nvgprs
> -	mr	r5,r3
> +13:	mr	r5,r3
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	ld	r4,_DAR(r1)
>   	bl	low_hash_fault
> -	b	ret_from_except
> +	b	interrupt_return
>   #endif
>   
>   /*
> @@ -3122,11 +3094,10 @@ handle_dabr_fault:
>    * were soft-disabled.  We want to invoke the exception handler for
>    * the access, or panic if there isn't a handler.
>    */
> -77:	bl	save_nvgprs
> -	addi	r3,r1,STACK_FRAME_OVERHEAD
> +77:	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	li	r5,SIGSEGV
>   	bl	bad_page_fault
> -	b	ret_from_except
> +	b	interrupt_return
>   
>   /*
>    * When doorbell is triggered from system reset wakeup, the message is
> diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
> index afd74eba70aa..6ea27dbcb872 100644
> --- a/arch/powerpc/kernel/irq.c
> +++ b/arch/powerpc/kernel/irq.c
> @@ -110,6 +110,8 @@ static inline notrace int decrementer_check_overflow(void)
>   	return now >= *next_tb;
>   }
>   
> +#ifdef CONFIG_PPC_BOOK3E
> +
>   /* This is called whenever we are re-enabling interrupts
>    * and returns either 0 (nothing to do) or 500/900/280/a00/e80 if
>    * there's an EE, DEC or DBELL to generate.
> @@ -169,41 +171,16 @@ notrace unsigned int __check_irq_replay(void)
>   		}
>   	}
>   
> -	/*
> -	 * Force the delivery of pending soft-disabled interrupts on PS3.
> -	 * Any HV call will have this side effect.
> -	 */
> -	if (firmware_has_feature(FW_FEATURE_PS3_LV1)) {
> -		u64 tmp, tmp2;
> -		lv1_get_version_info(&tmp, &tmp2);
> -	}
> -
> -	/*
> -	 * Check if an hypervisor Maintenance interrupt happened.
> -	 * This is a higher priority interrupt than the others, so
> -	 * replay it first.
> -	 */
> -	if (happened & PACA_IRQ_HMI) {
> -		local_paca->irq_happened &= ~PACA_IRQ_HMI;
> -		return 0xe60;
> -	}
> -
>   	if (happened & PACA_IRQ_DEC) {
>   		local_paca->irq_happened &= ~PACA_IRQ_DEC;
>   		return 0x900;
>   	}
>   
> -	if (happened & PACA_IRQ_PMI) {
> -		local_paca->irq_happened &= ~PACA_IRQ_PMI;
> -		return 0xf00;
> -	}
> -
>   	if (happened & PACA_IRQ_EE) {
>   		local_paca->irq_happened &= ~PACA_IRQ_EE;
>   		return 0x500;
>   	}
>   
> -#ifdef CONFIG_PPC_BOOK3E
>   	/*
>   	 * Check if an EPR external interrupt happened this bit is typically
>   	 * set if we need to handle another "edge" interrupt from within the
> @@ -218,20 +195,15 @@ notrace unsigned int __check_irq_replay(void)
>   		local_paca->irq_happened &= ~PACA_IRQ_DBELL;
>   		return 0x280;
>   	}
> -#else
> -	if (happened & PACA_IRQ_DBELL) {
> -		local_paca->irq_happened &= ~PACA_IRQ_DBELL;
> -		return 0xa00;
> -	}
> -#endif /* CONFIG_PPC_BOOK3E */
>   
>   	/* There should be nothing left ! */
>   	BUG_ON(local_paca->irq_happened != 0);
>   
>   	return 0;
>   }
> +#endif /* CONFIG_PPC_BOOK3E */
>   
> -static void replay_soft_interrupts(void)
> +void replay_soft_interrupts(void)
>   {
>   	/*
>   	 * We use local_paca rather than get_paca() to avoid all
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index fad50db9dcf2..1dea4d280f6f 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -236,23 +236,9 @@ void enable_kernel_fp(void)
>   	}
>   }
>   EXPORT_SYMBOL(enable_kernel_fp);
> -
> -static int restore_fp(struct task_struct *tsk)
> -{
> -	if (tsk->thread.load_fp) {
> -		load_fp_state(&current->thread.fp_state);
> -		current->thread.load_fp++;
> -		return 1;
> -	}
> -	return 0;
> -}
> -#else
> -static int restore_fp(struct task_struct *tsk) { return 0; }
>   #endif /* CONFIG_PPC_FPU */
>   
>   #ifdef CONFIG_ALTIVEC
> -#define loadvec(thr) ((thr).load_vec)
> -
>   static void __giveup_altivec(struct task_struct *tsk)
>   {
>   	unsigned long msr;
> @@ -318,21 +304,6 @@ void flush_altivec_to_thread(struct task_struct *tsk)
>   	}
>   }
>   EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
> -
> -static int restore_altivec(struct task_struct *tsk)
> -{
> -	if (cpu_has_feature(CPU_FTR_ALTIVEC) && (tsk->thread.load_vec)) {
> -		load_vr_state(&tsk->thread.vr_state);
> -		tsk->thread.used_vr = 1;
> -		tsk->thread.load_vec++;
> -
> -		return 1;
> -	}
> -	return 0;
> -}
> -#else
> -#define loadvec(thr) 0
> -static inline int restore_altivec(struct task_struct *tsk) { return 0; }
>   #endif /* CONFIG_ALTIVEC */
>   
>   #ifdef CONFIG_VSX
> @@ -400,18 +371,6 @@ void flush_vsx_to_thread(struct task_struct *tsk)
>   	}
>   }
>   EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
> -
> -static int restore_vsx(struct task_struct *tsk)
> -{
> -	if (cpu_has_feature(CPU_FTR_VSX)) {
> -		tsk->thread.used_vsr = 1;
> -		return 1;
> -	}
> -
> -	return 0;
> -}
> -#else
> -static inline int restore_vsx(struct task_struct *tsk) { return 0; }
>   #endif /* CONFIG_VSX */
>   
>   #ifdef CONFIG_SPE
> @@ -511,6 +470,53 @@ void giveup_all(struct task_struct *tsk)
>   }
>   EXPORT_SYMBOL(giveup_all);
>   
> +#ifdef CONFIG_PPC_BOOK3S_64
> +#ifdef CONFIG_PPC_FPU
> +static int restore_fp(struct task_struct *tsk)
> +{
> +	if (tsk->thread.load_fp) {
> +		load_fp_state(&current->thread.fp_state);
> +		current->thread.load_fp++;
> +		return 1;
> +	}
> +	return 0;
> +}
> +#else
> +static int restore_fp(struct task_struct *tsk) { return 0; }
> +#endif /* CONFIG_PPC_FPU */
> +
> +#ifdef CONFIG_ALTIVEC
> +#define loadvec(thr) ((thr).load_vec)
> +static int restore_altivec(struct task_struct *tsk)
> +{
> +	if (cpu_has_feature(CPU_FTR_ALTIVEC) && (tsk->thread.load_vec)) {
> +		load_vr_state(&tsk->thread.vr_state);
> +		tsk->thread.used_vr = 1;
> +		tsk->thread.load_vec++;
> +
> +		return 1;
> +	}
> +	return 0;
> +}
> +#else
> +#define loadvec(thr) 0
> +static inline int restore_altivec(struct task_struct *tsk) { return 0; }
> +#endif /* CONFIG_ALTIVEC */
> +
> +#ifdef CONFIG_VSX
> +static int restore_vsx(struct task_struct *tsk)
> +{
> +	if (cpu_has_feature(CPU_FTR_VSX)) {
> +		tsk->thread.used_vsr = 1;
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +#else
> +static inline int restore_vsx(struct task_struct *tsk) { return 0; }
> +#endif /* CONFIG_VSX */
> +
>   /*
>    * The exception exit path calls restore_math() with interrupts hard disabled
>    * but the soft irq state not "reconciled". ftrace code that calls
> @@ -551,6 +557,7 @@ void notrace restore_math(struct pt_regs *regs)
>   
>   	regs->msr = msr;
>   }
> +#endif
>   
>   static void save_all(struct task_struct *tsk)
>   {
> diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
> index 20f77cc19df8..08e0bebbd3b6 100644
> --- a/arch/powerpc/kernel/syscall_64.c
> +++ b/arch/powerpc/kernel/syscall_64.c
> @@ -26,7 +26,11 @@ notrace long system_call_exception(long r3, long r4, long r5, long r6, long r7,
>   	unsigned long ti_flags;
>   	syscall_fn f;
>   
> +	if (IS_ENABLED(CONFIG_PPC_BOOK3S))
> +		BUG_ON(!(regs->msr & MSR_RI));
>   	BUG_ON(!(regs->msr & MSR_PR));
> +	BUG_ON(!FULL_REGS(regs));
> +	BUG_ON(regs->softe != IRQS_ENABLED);
>   
>   	if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
>   	    unlikely(regs->msr & MSR_TS_T))
> @@ -195,7 +199,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
>   		trace_hardirqs_off();
>   		local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
>   		local_irq_enable();
> -		/* Took an interrupt which may have more exit work to do. */
> +		/* Took an interrupt, may have more exit work to do. */
>   		goto again;
>   	}
>   	local_paca->irq_happened = 0;
> @@ -211,3 +215,161 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
>   
>   	return ret;
>   }
> +
> +#ifdef CONFIG_PPC_BOOK3S /* BOOK3E not yet using this */
> +notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr)
> +{
> +#ifdef CONFIG_PPC_BOOK3E
> +	struct thread_struct *ts = &current->thread;
> +#endif
> +	unsigned long *ti_flagsp = &current_thread_info()->flags;
> +	unsigned long ti_flags;
> +	unsigned long flags;
> +	unsigned long ret = 0;
> +
> +	if (IS_ENABLED(CONFIG_PPC_BOOK3S))
> +		BUG_ON(!(regs->msr & MSR_RI));
> +	BUG_ON(!(regs->msr & MSR_PR));
> +	BUG_ON(!FULL_REGS(regs));
> +	BUG_ON(regs->softe != IRQS_ENABLED);
> +
> +	local_irq_save(flags);
> +
> +again:
> +	ti_flags = READ_ONCE(*ti_flagsp);
> +	while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
> +		local_irq_enable(); /* returning to user: may enable */
> +		if (ti_flags & _TIF_NEED_RESCHED) {
> +			schedule();
> +		} else {
> +			if (ti_flags & _TIF_SIGPENDING)
> +				ret |= _TIF_RESTOREALL;
> +			do_notify_resume(regs, ti_flags);
> +		}
> +		local_irq_disable();
> +		ti_flags = READ_ONCE(*ti_flagsp);
> +	}
> +
> +	if (IS_ENABLED(CONFIG_PPC_BOOK3S)) {
> +		unsigned long mathflags = 0;
> +
> +		if (IS_ENABLED(CONFIG_PPC_FPU))
> +			mathflags |= MSR_FP;
> +		if (IS_ENABLED(CONFIG_ALTIVEC))
> +			mathflags |= MSR_VEC;
> +
> +		if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
> +						(ti_flags & _TIF_RESTORE_TM))
> +			restore_tm_state(regs);
> +		else if ((regs->msr & mathflags) != mathflags)
> +			restore_math(regs);
> +	}
> +
> +	trace_hardirqs_on();
> +	__hard_EE_RI_disable();
> +	if (unlikely(lazy_irq_pending())) {
> +		__hard_RI_enable();
> +		trace_hardirqs_off();
> +		local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
> +		local_irq_enable();
> +		local_irq_disable();
> +		/* Took an interrupt, may have more exit work to do. */
> +		goto again;
> +	}
> +	local_paca->irq_happened = 0;
> +	irq_soft_mask_set(IRQS_ENABLED);
> +
> +#ifdef CONFIG_PPC_BOOK3E
> +	if (unlikely(ts->debug.dbcr0 & DBCR0_IDM)) {
> +		/*
> +		 * Check to see if the dbcr0 register is set up to debug.
> +		 * Use the internal debug mode bit to do this.
> +		 */
> +		mtmsr(mfmsr() & ~MSR_DE);
> +		mtspr(SPRN_DBCR0, ts->debug.dbcr0);
> +		mtspr(SPRN_DBSR, -1);
> +	}
> +#endif
> +
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> +	local_paca->tm_scratch = regs->msr;
> +#endif
> +
> +	kuap_check_amr();
> +
> +	account_cpu_user_exit();
> +
> +	return ret;
> +}
> +
> +void unrecoverable_exception(struct pt_regs *regs);
> +void preempt_schedule_irq(void);
> +
> +notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr)
> +{
> +	unsigned long *ti_flagsp = &current_thread_info()->flags;
> +	unsigned long flags;
> +
> +	if (IS_ENABLED(CONFIG_PPC_BOOK3S) && unlikely(!(regs->msr & MSR_RI)))
> +		unrecoverable_exception(regs);
> +	BUG_ON(regs->msr & MSR_PR);
> +	BUG_ON(!FULL_REGS(regs));
> +
> +	local_irq_save(flags);
> +
> +	if (regs->softe == IRQS_ENABLED) {
> +		/* Returning to a kernel context with local irqs enabled. */
> +		WARN_ON_ONCE(!(regs->msr & MSR_EE));
> +again:
> +		if (IS_ENABLED(CONFIG_PREEMPT)) {
> +			/* Return to preemptible kernel context */
> +			if (unlikely(*ti_flagsp & _TIF_NEED_RESCHED)) {
> +				if (preempt_count() == 0)
> +					preempt_schedule_irq();
> +			}
> +		}
> +
> +		trace_hardirqs_on();
> +		__hard_EE_RI_disable();
> +		if (unlikely(lazy_irq_pending())) {
> +			__hard_RI_enable();
> +			irq_soft_mask_set(IRQS_ALL_DISABLED);
> +			trace_hardirqs_off();
> +			local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
> +			/*
> +			 * Can't local_irq_enable in case we are in interrupt
> +			 * context. Must replay directly.
> +			 */
> +			replay_soft_interrupts();
> +			irq_soft_mask_set(flags);
> +			/* Took an interrupt, may have more exit work to do. */
> +			goto again;
> +		}
> +		local_paca->irq_happened = 0;
> +		irq_soft_mask_set(IRQS_ENABLED);
> +	} else {
> +		/* Returning to a kernel context with local irqs disabled. */
> +		trace_hardirqs_on();
> +		__hard_EE_RI_disable();
> +		if (regs->msr & MSR_EE)
> +			local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
> +	}
> +
> +
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> +	local_paca->tm_scratch = regs->msr;
> +#endif
> +
> +	/*
> +	 * We don't need to restore AMR on the way back to userspace for KUAP.
> +	 * The value of AMR only matters while we're in the kernel.
> +	 */
> +	kuap_restore_amr(regs);
> +
> +	if (unlikely(*ti_flagsp & _TIF_EMULATE_STACK_STORE)) {
> +		clear_bits(_TIF_EMULATE_STACK_STORE, ti_flagsp);
> +		return 1;
> +	}
> +	return 0;
> +}
> +#endif
> diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
> index 25c14a0981bf..d20c5e79e03c 100644
> --- a/arch/powerpc/kernel/vector.S
> +++ b/arch/powerpc/kernel/vector.S
> @@ -134,7 +134,7 @@ _GLOBAL(load_up_vsx)
>   	/* enable use of VSX after return */
>   	oris	r12,r12,MSR_VSX@h
>   	std	r12,_MSR(r1)
> -	b	fast_exception_return
> +	b	fast_interrupt_return
>   
>   #endif /* CONFIG_VSX */
>   
> 

^ permalink raw reply

* Re: Latest Git kernel doesn't compile because of the LINUX_VERSION_CODE issue
From: Christophe Leroy @ 2021-02-27  7:34 UTC (permalink / raw)
  To: Christian Zigotzky, Sasha Levin, Masahiro Yamada
  Cc: Darren Stevens, linuxppc-dev, R.T.Dickinson
In-Reply-To: <93173e85-3663-8d79-ca4b-f38b505e6e05@xenosoft.de>


Le 27/02/2021 à 08:13, Christian Zigotzky a écrit :
> Hello Christophe,
> 
> Thanks a lot for compiling the latest git kernel.
> 
> I have solved the compiling issue through setting up a value for the SUBLEVEL variable in 
> "a/Makefile". Before it wasn't necessary to set up a value for the SUBLEVEL variable.

I see, so it is a regression introduced by commit 9b82f13e7ef3 ("kbuild: clamp SUBLEVEL to 255").

A similar regression already occurred in the past and was fixed by 
commit cacd54ef49b7 ("kbuild: Fix KERNELVERSION for empty SUBLEVEL or PATCHLEVEL").

Sasha, can you fix it?

Thanks
Christophe


> 
> Cheers,
> Christian
> 
> On 26 February 21 at 5:10 pm, Christophe Leroy wrote:
>>
>>
>> Le 26/02/2021 à 13:34, Christian Zigotzky a écrit :
>>> Hello,
>>>
>>> I tried to compile the latest Git kernel today. Unfortunately it doesn't compile.
>>
>> I have no such problem with latest git kernel.
>>
>> Christophe
>>
>>>
>>> Error messages:
>>>
>>>    CC      arch/powerpc/kernel/udbg_16550.o
>>> In file included from ./include/linux/stackprotector.h:10:0,
>>>                   from arch/powerpc/kernel/smp.c:35:
>>> ./arch/powerpc/include/asm/stackprotector.h: In function ‘boot_init_stack_canary’:
>>> ./arch/powerpc/include/asm/stackprotector.h:29:30: error: expected expression before ‘;’ token
>>>    canary ^= LINUX_VERSION_CODE;
>>>                                ^
>>> scripts/Makefile.build:271: recipe for target 'arch/powerpc/kernel/smp.o' failed
>>> make[2]: *** [arch/powerpc/kernel/smp.o] Error 1
>>>
>>> ----
>>>
>>> drivers/media/cec/core/cec-api.c: In function ‘cec_adap_g_caps’:
>>> drivers/media/cec/core/cec-api.c:85:35: error: expected expression before ‘;’ token
>>>    caps.version = LINUX_VERSION_CODE;
>>>
>>> ----
>>>
>>> I have found the bad commit. It's "Merge tag 'kbuild-v5.12' of 
>>> git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild" [1]
>>>
>>> The changes in the Makefile (a/Makefile) are responsible for the compiling errors. [2]
>>>
>>> I was able to revert this bad commit. After that it compiled without any problems.
>>>
>>> Could you please compile the latest Git kernel and confirm this issue?
>>>
>>> Thanks,
>>> Christian
>>>
>>> [1] 
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6fbd6cf85a3be127454a1ad58525a3adcf8612ab 
>>>
>>> [2] 
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/diff/Makefile?id=6fbd6cf85a3be127454a1ad58525a3adcf8612ab 
>>>

^ permalink raw reply

* Re: Latest Git kernel doesn't compile because of the LINUX_VERSION_CODE issue
From: Christian Zigotzky @ 2021-02-27  7:13 UTC (permalink / raw)
  To: Christophe Leroy, Michael Ellerman, linuxppc-dev
  Cc: Darren Stevens, R.T.Dickinson
In-Reply-To: <e73a866b-f755-f227-e60a-4be05f467221@csgroup.eu>

Hello Christophe,

Thanks a lot for compiling the latest git kernel.

I have solved the compiling issue through setting up a value for the 
SUBLEVEL variable in "a/Makefile". Before it wasn't necessary to set up 
a value for the SUBLEVEL variable.

Cheers,
Christian

On 26 February 21 at 5:10 pm, Christophe Leroy wrote:
>
>
> Le 26/02/2021 à 13:34, Christian Zigotzky a écrit :
>> Hello,
>>
>> I tried to compile the latest Git kernel today. Unfortunately it 
>> doesn't compile.
>
> I have no such problem with latest git kernel.
>
> Christophe
>
>>
>> Error messages:
>>
>>    CC      arch/powerpc/kernel/udbg_16550.o
>> In file included from ./include/linux/stackprotector.h:10:0,
>>                   from arch/powerpc/kernel/smp.c:35:
>> ./arch/powerpc/include/asm/stackprotector.h: In function 
>> ‘boot_init_stack_canary’:
>> ./arch/powerpc/include/asm/stackprotector.h:29:30: error: expected 
>> expression before ‘;’ token
>>    canary ^= LINUX_VERSION_CODE;
>>                                ^
>> scripts/Makefile.build:271: recipe for target 
>> 'arch/powerpc/kernel/smp.o' failed
>> make[2]: *** [arch/powerpc/kernel/smp.o] Error 1
>>
>> ----
>>
>> drivers/media/cec/core/cec-api.c: In function ‘cec_adap_g_caps’:
>> drivers/media/cec/core/cec-api.c:85:35: error: expected expression 
>> before ‘;’ token
>>    caps.version = LINUX_VERSION_CODE;
>>
>> ----
>>
>> I have found the bad commit. It's "Merge tag 'kbuild-v5.12' of 
>> git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild" 
>> [1]
>>
>> The changes in the Makefile (a/Makefile) are responsible for the 
>> compiling errors. [2]
>>
>> I was able to revert this bad commit. After that it compiled without 
>> any problems.
>>
>> Could you please compile the latest Git kernel and confirm this issue?
>>
>> Thanks,
>> Christian
>>
>> [1] 
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6fbd6cf85a3be127454a1ad58525a3adcf8612ab 
>>
>> [2] 
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/diff/Makefile?id=6fbd6cf85a3be127454a1ad58525a3adcf8612ab 
>>


^ permalink raw reply

* [PATCH v7 10/10] powerpc/signal: Use __get_user() to copy sigset_t
From: Christopher M. Riedl @ 2021-02-27  1:12 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <20210227011259.11992-1-cmr@codefail.de>

Usually sigset_t is exactly 8B which is a "trivial" size and does not
warrant using __copy_from_user(). Use __get_user() directly in
anticipation of future work to remove the trivial size optimizations
from __copy_from_user().

The ppc32 implementation of get_sigset_t() previously called
copy_from_user() which, unlike __copy_from_user(), calls access_ok().
Replacing this w/ __get_user() (no access_ok()) is fine here since both
callsites in signal_32.c are preceded by an earlier access_ok().

Signed-off-by: Christopher M. Riedl <cmr@codefail.de>
---
 arch/powerpc/kernel/signal.h    | 7 +++++++
 arch/powerpc/kernel/signal_32.c | 2 +-
 arch/powerpc/kernel/signal_64.c | 4 ++--
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index d8dd76b1dc94..1393876f3814 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -19,6 +19,13 @@ extern int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
 extern int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
 			      struct task_struct *tsk);
 
+static inline int __get_user_sigset(sigset_t *dst, const sigset_t __user *src)
+{
+	BUILD_BUG_ON(sizeof(sigset_t) != sizeof(u64));
+
+	return __get_user(dst->sig[0], (u64 __user *)&src->sig[0]);
+}
+
 #ifdef CONFIG_VSX
 extern unsigned long copy_vsx_to_user(void __user *to,
 				      struct task_struct *task);
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 75ee918a120a..c505b444a613 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -144,7 +144,7 @@ static inline int restore_general_regs(struct pt_regs *regs,
 
 static inline int get_sigset_t(sigset_t *set, const sigset_t __user *uset)
 {
-	return copy_from_user(set, uset, sizeof(*uset));
+	return __get_user_sigset(set, uset);
 }
 
 #define to_user_ptr(p)		((unsigned long)(p))
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 00c907022707..debe88055f38 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -708,7 +708,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
 	 * We kill the task with a SIGSEGV in this situation.
 	 */
 
-	if (__copy_from_user(&set, &new_ctx->uc_sigmask, sizeof(set)))
+	if (__get_user_sigset(&set, &new_ctx->uc_sigmask))
 		do_exit(SIGSEGV);
 	set_current_blocked(&set);
 
@@ -747,7 +747,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 	if (!access_ok(uc, sizeof(*uc)))
 		goto badframe;
 
-	if (__copy_from_user(&set, &uc->uc_sigmask, sizeof(set)))
+	if (__get_user_sigset(&set, &uc->uc_sigmask))
 		goto badframe;
 	set_current_blocked(&set);
 
-- 
2.26.1


^ permalink raw reply related

* [PATCH v7 06/10] powerpc/signal64: Replace setup_sigcontext() w/ unsafe_setup_sigcontext()
From: Christopher M. Riedl @ 2021-02-27  1:12 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <20210227011259.11992-1-cmr@codefail.de>

Previously setup_sigcontext() performed a costly KUAP switch on every
uaccess operation. These repeated uaccess switches cause a significant
drop in signal handling performance.

Rewrite setup_sigcontext() to assume that a userspace write access window
is open by replacing all uaccess functions with their 'unsafe' versions.
Modify the callers to first open the uaccess window, call
unsafe_setup_sigcontext(), and then close the window.

Signed-off-by: Christopher M. Riedl <cmr@codefail.de>
---
v7:	* Don't use unsafe_op_wrap() since Christophe indicates this
	  macro may go away in the future.
---
 arch/powerpc/kernel/signal_64.c | 72 ++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index bd8d210c9115..78ae4bb4e590 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -101,9 +101,14 @@ static void prepare_setup_sigcontext(struct task_struct *tsk)
  * Set up the sigcontext for the signal frame.
  */
 
-static long setup_sigcontext(struct sigcontext __user *sc,
-		struct task_struct *tsk, int signr, sigset_t *set,
-		unsigned long handler, int ctx_has_vsx_region)
+#define unsafe_setup_sigcontext(sc, tsk, signr, set, handler, ctx_has_vsx_region, label)\
+do {											\
+	if (__unsafe_setup_sigcontext(sc, tsk, signr, set, handler, ctx_has_vsx_region))\
+		goto label;								\
+} while (0)
+static long notrace __unsafe_setup_sigcontext(struct sigcontext __user *sc,
+					struct task_struct *tsk, int signr, sigset_t *set,
+					unsigned long handler, int ctx_has_vsx_region)
 {
 	/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
 	 * process never used altivec yet (MSR_VEC is zero in pt_regs of
@@ -118,20 +123,19 @@ static long setup_sigcontext(struct sigcontext __user *sc,
 #endif
 	struct pt_regs *regs = tsk->thread.regs;
 	unsigned long msr = regs->msr;
-	long err = 0;
 	/* Force usr to alway see softe as 1 (interrupts enabled) */
 	unsigned long softe = 0x1;
 
 	BUG_ON(tsk != current);
 
 #ifdef CONFIG_ALTIVEC
-	err |= __put_user(v_regs, &sc->v_regs);
+	unsafe_put_user(v_regs, &sc->v_regs, efault_out);
 
 	/* save altivec registers */
 	if (tsk->thread.used_vr) {
 		/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
-		err |= __copy_to_user(v_regs, &tsk->thread.vr_state,
-				      33 * sizeof(vector128));
+		unsafe_copy_to_user(v_regs, &tsk->thread.vr_state,
+				    33 * sizeof(vector128), efault_out);
 		/* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg)
 		 * contains valid data.
 		 */
@@ -140,12 +144,12 @@ static long setup_sigcontext(struct sigcontext __user *sc,
 	/* We always copy to/from vrsave, it's 0 if we don't have or don't
 	 * use altivec.
 	 */
-	err |= __put_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33]);
+	unsafe_put_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33], efault_out);
 #else /* CONFIG_ALTIVEC */
-	err |= __put_user(0, &sc->v_regs);
+	unsafe_put_user(0, &sc->v_regs, efault_out);
 #endif /* CONFIG_ALTIVEC */
 	/* copy fpr regs and fpscr */
-	err |= copy_fpr_to_user(&sc->fp_regs, tsk);
+	unsafe_copy_fpr_to_user(&sc->fp_regs, tsk, efault_out);
 
 	/*
 	 * Clear the MSR VSX bit to indicate there is no valid state attached
@@ -160,24 +164,27 @@ static long setup_sigcontext(struct sigcontext __user *sc,
 	 */
 	if (tsk->thread.used_vsr && ctx_has_vsx_region) {
 		v_regs += ELF_NVRREG;
-		err |= copy_vsx_to_user(v_regs, tsk);
+		unsafe_copy_vsx_to_user(v_regs, tsk, efault_out);
 		/* set MSR_VSX in the MSR value in the frame to
 		 * indicate that sc->vs_reg) contains valid data.
 		 */
 		msr |= MSR_VSX;
 	}
 #endif /* CONFIG_VSX */
-	err |= __put_user(&sc->gp_regs, &sc->regs);
+	unsafe_put_user(&sc->gp_regs, &sc->regs, efault_out);
 	WARN_ON(!FULL_REGS(regs));
-	err |= __copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE);
-	err |= __put_user(msr, &sc->gp_regs[PT_MSR]);
-	err |= __put_user(softe, &sc->gp_regs[PT_SOFTE]);
-	err |= __put_user(signr, &sc->signal);
-	err |= __put_user(handler, &sc->handler);
+	unsafe_copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE, efault_out);
+	unsafe_put_user(msr, &sc->gp_regs[PT_MSR], efault_out);
+	unsafe_put_user(softe, &sc->gp_regs[PT_SOFTE], efault_out);
+	unsafe_put_user(signr, &sc->signal, efault_out);
+	unsafe_put_user(handler, &sc->handler, efault_out);
 	if (set != NULL)
-		err |=  __put_user(set->sig[0], &sc->oldmask);
+		unsafe_put_user(set->sig[0], &sc->oldmask, efault_out);
 
-	return err;
+	return 0;
+
+efault_out:
+	return -EFAULT;
 }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -670,12 +677,15 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
 
 	if (old_ctx != NULL) {
 		prepare_setup_sigcontext(current);
-		if (!access_ok(old_ctx, ctx_size)
-		    || setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, 0,
-					ctx_has_vsx_region)
-		    || __copy_to_user(&old_ctx->uc_sigmask,
-				      &current->blocked, sizeof(sigset_t)))
+		if (!user_write_access_begin(old_ctx, ctx_size))
 			return -EFAULT;
+
+		unsafe_setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL,
+					0, ctx_has_vsx_region, efault_out);
+		unsafe_copy_to_user(&old_ctx->uc_sigmask, &current->blocked,
+				    sizeof(sigset_t), efault_out);
+
+		user_write_access_end();
 	}
 	if (new_ctx == NULL)
 		return 0;
@@ -704,6 +714,10 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
 	/* This returns like rt_sigreturn */
 	set_thread_flag(TIF_RESTOREALL);
 	return 0;
+
+efault_out:
+	user_write_access_end();
+	return -EFAULT;
 }
 
 
@@ -854,9 +868,13 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 	} else {
 		err |= __put_user(0, &frame->uc.uc_link);
 		prepare_setup_sigcontext(tsk);
-		err |= setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig,
-					NULL, (unsigned long)ksig->ka.sa.sa_handler,
-					1);
+		if (!user_write_access_begin(&frame->uc.uc_mcontext,
+					     sizeof(frame->uc.uc_mcontext)))
+			return -EFAULT;
+		err |= __unsafe_setup_sigcontext(&frame->uc.uc_mcontext, tsk,
+						ksig->sig, NULL,
+						(unsigned long)ksig->ka.sa.sa_handler, 1);
+		user_write_access_end();
 	}
 	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
 	if (err)
-- 
2.26.1


^ permalink raw reply related

* [PATCH v7 07/10] powerpc/signal64: Replace restore_sigcontext() w/ unsafe_restore_sigcontext()
From: Christopher M. Riedl @ 2021-02-27  1:12 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <20210227011259.11992-1-cmr@codefail.de>

Previously restore_sigcontext() performed a costly KUAP switch on every
uaccess operation. These repeated uaccess switches cause a significant
drop in signal handling performance.

Rewrite restore_sigcontext() to assume that a userspace read access
window is open by replacing all uaccess functions with their 'unsafe'
versions. Modify the callers to first open the uaccess window, call
unsafe_restore_sigcontext(), and then close the window.

Signed-off-by: Christopher M. Riedl <cmr@codefail.de>
---
v7:	* Don't use unsafe_op_wrap() since Christophe indicates this
	  macro may go away in the future.
---
 arch/powerpc/kernel/signal_64.c | 68 ++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 78ae4bb4e590..23a44ec3ac01 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -327,14 +327,16 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 /*
  * Restore the sigcontext from the signal frame.
  */
-
-static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig,
-			      struct sigcontext __user *sc)
+#define unsafe_restore_sigcontext(tsk, set, sig, sc, label) do {	\
+	if (__unsafe_restore_sigcontext(tsk, set, sig, sc))		\
+		goto label;						\
+} while (0)
+static long notrace __unsafe_restore_sigcontext(struct task_struct *tsk, sigset_t *set,
+						int sig, struct sigcontext __user *sc)
 {
 #ifdef CONFIG_ALTIVEC
 	elf_vrreg_t __user *v_regs;
 #endif
-	unsigned long err = 0;
 	unsigned long save_r13 = 0;
 	unsigned long msr;
 	struct pt_regs *regs = tsk->thread.regs;
@@ -349,27 +351,27 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig,
 		save_r13 = regs->gpr[13];
 
 	/* copy the GPRs */
-	err |= __copy_from_user(regs->gpr, sc->gp_regs, sizeof(regs->gpr));
-	err |= __get_user(regs->nip, &sc->gp_regs[PT_NIP]);
+	unsafe_copy_from_user(regs->gpr, sc->gp_regs, sizeof(regs->gpr), efault_out);
+	unsafe_get_user(regs->nip, &sc->gp_regs[PT_NIP], efault_out);
 	/* get MSR separately, transfer the LE bit if doing signal return */
-	err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
+	unsafe_get_user(msr, &sc->gp_regs[PT_MSR], efault_out);
 	if (sig)
 		regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
-	err |= __get_user(regs->orig_gpr3, &sc->gp_regs[PT_ORIG_R3]);
-	err |= __get_user(regs->ctr, &sc->gp_regs[PT_CTR]);
-	err |= __get_user(regs->link, &sc->gp_regs[PT_LNK]);
-	err |= __get_user(regs->xer, &sc->gp_regs[PT_XER]);
-	err |= __get_user(regs->ccr, &sc->gp_regs[PT_CCR]);
+	unsafe_get_user(regs->orig_gpr3, &sc->gp_regs[PT_ORIG_R3], efault_out);
+	unsafe_get_user(regs->ctr, &sc->gp_regs[PT_CTR], efault_out);
+	unsafe_get_user(regs->link, &sc->gp_regs[PT_LNK], efault_out);
+	unsafe_get_user(regs->xer, &sc->gp_regs[PT_XER], efault_out);
+	unsafe_get_user(regs->ccr, &sc->gp_regs[PT_CCR], efault_out);
 	/* Don't allow userspace to set SOFTE */
 	set_trap_norestart(regs);
-	err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]);
-	err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]);
-	err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]);
+	unsafe_get_user(regs->dar, &sc->gp_regs[PT_DAR], efault_out);
+	unsafe_get_user(regs->dsisr, &sc->gp_regs[PT_DSISR], efault_out);
+	unsafe_get_user(regs->result, &sc->gp_regs[PT_RESULT], efault_out);
 
 	if (!sig)
 		regs->gpr[13] = save_r13;
 	if (set != NULL)
-		err |=  __get_user(set->sig[0], &sc->oldmask);
+		unsafe_get_user(set->sig[0], &sc->oldmask, efault_out);
 
 	/*
 	 * Force reload of FP/VEC.
@@ -379,29 +381,27 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig,
 	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX);
 
 #ifdef CONFIG_ALTIVEC
-	err |= __get_user(v_regs, &sc->v_regs);
-	if (err)
-		return err;
+	unsafe_get_user(v_regs, &sc->v_regs, efault_out);
 	if (v_regs && !access_ok(v_regs, 34 * sizeof(vector128)))
 		return -EFAULT;
 	/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
 	if (v_regs != NULL && (msr & MSR_VEC) != 0) {
-		err |= __copy_from_user(&tsk->thread.vr_state, v_regs,
-					33 * sizeof(vector128));
+		unsafe_copy_from_user(&tsk->thread.vr_state, v_regs,
+				      33 * sizeof(vector128), efault_out);
 		tsk->thread.used_vr = true;
 	} else if (tsk->thread.used_vr) {
 		memset(&tsk->thread.vr_state, 0, 33 * sizeof(vector128));
 	}
 	/* Always get VRSAVE back */
 	if (v_regs != NULL)
-		err |= __get_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33]);
+		unsafe_get_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33], efault_out);
 	else
 		tsk->thread.vrsave = 0;
 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
 		mtspr(SPRN_VRSAVE, tsk->thread.vrsave);
 #endif /* CONFIG_ALTIVEC */
 	/* restore floating point */
-	err |= copy_fpr_from_user(tsk, &sc->fp_regs);
+	unsafe_copy_fpr_from_user(tsk, &sc->fp_regs, efault_out);
 #ifdef CONFIG_VSX
 	/*
 	 * Get additional VSX data. Update v_regs to point after the
@@ -410,14 +410,17 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig,
 	 */
 	v_regs += ELF_NVRREG;
 	if ((msr & MSR_VSX) != 0) {
-		err |= copy_vsx_from_user(tsk, v_regs);
+		unsafe_copy_vsx_from_user(tsk, v_regs, efault_out);
 		tsk->thread.used_vsr = true;
 	} else {
 		for (i = 0; i < 32 ; i++)
 			tsk->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
 	}
 #endif
-	return err;
+	return 0;
+
+efault_out:
+	return -EFAULT;
 }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -708,8 +711,14 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
 	if (__copy_from_user(&set, &new_ctx->uc_sigmask, sizeof(set)))
 		do_exit(SIGSEGV);
 	set_current_blocked(&set);
-	if (restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext))
+
+	if (!user_read_access_begin(new_ctx, ctx_size))
+		return -EFAULT;
+	if (__unsafe_restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext)) {
+		user_read_access_end();
 		do_exit(SIGSEGV);
+	}
+	user_read_access_end();
 
 	/* This returns like rt_sigreturn */
 	set_thread_flag(TIF_RESTOREALL);
@@ -812,8 +821,13 @@ SYSCALL_DEFINE0(rt_sigreturn)
 		 * causing a TM bad thing.
 		 */
 		current->thread.regs->msr &= ~MSR_TS_MASK;
-		if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
+		if (!user_read_access_begin(&uc->uc_mcontext, sizeof(uc->uc_mcontext)))
+			return -EFAULT;
+		if (__unsafe_restore_sigcontext(current, NULL, 1, &uc->uc_mcontext)) {
+			user_read_access_end();
 			goto badframe;
+		}
+		user_read_access_end();
 	}
 
 	if (restore_altstack(&uc->uc_stack))
-- 
2.26.1


^ permalink raw reply related

* [PATCH v7 08/10] powerpc/signal64: Rewrite handle_rt_signal64() to minimise uaccess switches
From: Christopher M. Riedl @ 2021-02-27  1:12 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Daniel Axtens
In-Reply-To: <20210227011259.11992-1-cmr@codefail.de>

From: Daniel Axtens <dja@axtens.net>

Add uaccess blocks and use the 'unsafe' versions of functions doing user
access where possible to reduce the number of times uaccess has to be
opened/closed.

There is no 'unsafe' version of copy_siginfo_to_user, so move it
slightly to allow for a "longer" uaccess block.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Co-developed-by: Christopher M. Riedl <cmr@codefail.de>
Signed-off-by: Christopher M. Riedl <cmr@codefail.de>
---
 arch/powerpc/kernel/signal_64.c | 56 ++++++++++++++++++++-------------
 1 file changed, 35 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 23a44ec3ac01..788854734b9a 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -854,45 +854,52 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 	unsigned long msr = regs->msr;
 
 	frame = get_sigframe(ksig, tsk, sizeof(*frame), 0);
-	if (!access_ok(frame, sizeof(*frame)))
-		goto badframe;
 
-	err |= __put_user(&frame->info, &frame->pinfo);
-	err |= __put_user(&frame->uc, &frame->puc);
-	err |= copy_siginfo_to_user(&frame->info, &ksig->info);
-	if (err)
+	/* This only applies when calling unsafe_setup_sigcontext() and must be
+	 * called before opening the uaccess window.
+	 */
+	if (!MSR_TM_ACTIVE(msr))
+		prepare_setup_sigcontext(tsk);
+
+	if (!user_write_access_begin(frame, sizeof(*frame)))
 		goto badframe;
 
+	unsafe_put_user(&frame->info, &frame->pinfo, badframe_block);
+	unsafe_put_user(&frame->uc, &frame->puc, badframe_block);
+
 	/* Create the ucontext.  */
-	err |= __put_user(0, &frame->uc.uc_flags);
-	err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]);
+	unsafe_put_user(0, &frame->uc.uc_flags, badframe_block);
+	unsafe_save_altstack(&frame->uc.uc_stack, regs->gpr[1], badframe_block);
 
 	if (MSR_TM_ACTIVE(msr)) {
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 		/* The ucontext_t passed to userland points to the second
 		 * ucontext_t (for transactional state) with its uc_link ptr.
 		 */
-		err |= __put_user(&frame->uc_transact, &frame->uc.uc_link);
+		unsafe_put_user(&frame->uc_transact, &frame->uc.uc_link, badframe_block);
+
+		user_write_access_end();
+
 		err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext,
 					    &frame->uc_transact.uc_mcontext,
 					    tsk, ksig->sig, NULL,
 					    (unsigned long)ksig->ka.sa.sa_handler,
 					    msr);
+
+		if (!user_write_access_begin(&frame->uc.uc_sigmask,
+					     sizeof(frame->uc.uc_sigmask)))
+			goto badframe;
+
 #endif
 	} else {
-		err |= __put_user(0, &frame->uc.uc_link);
-		prepare_setup_sigcontext(tsk);
-		if (!user_write_access_begin(&frame->uc.uc_mcontext,
-					     sizeof(frame->uc.uc_mcontext)))
-			return -EFAULT;
-		err |= __unsafe_setup_sigcontext(&frame->uc.uc_mcontext, tsk,
-						ksig->sig, NULL,
-						(unsigned long)ksig->ka.sa.sa_handler, 1);
-		user_write_access_end();
+		unsafe_put_user(0, &frame->uc.uc_link, badframe_block);
+		unsafe_setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig,
+					NULL, (unsigned long)ksig->ka.sa.sa_handler,
+					1, badframe_block);
 	}
-	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-	if (err)
-		goto badframe;
+
+	unsafe_copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set), badframe_block);
+	user_write_access_end();
 
 	/* Make sure signal handler doesn't get spurious FP exceptions */
 	tsk->thread.fp_state.fpscr = 0;
@@ -907,6 +914,11 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 		regs->nip = (unsigned long) &frame->tramp[0];
 	}
 
+
+	/* Save the siginfo outside of the unsafe block. */
+	if (copy_siginfo_to_user(&frame->info, &ksig->info))
+		goto badframe;
+
 	/* Allocate a dummy caller frame for the signal handler. */
 	newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
 	err |= put_user(regs->gpr[1], (unsigned long __user *)newsp);
@@ -946,6 +958,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 
 	return 0;
 
+badframe_block:
+	user_write_access_end();
 badframe:
 	signal_fault(current, regs, "handle_rt_signal64", frame);
 
-- 
2.26.1


^ permalink raw reply related

* [PATCH v7 00/10] Improve signal performance on PPC64 with KUAP
From: Christopher M. Riedl @ 2021-02-27  1:12 UTC (permalink / raw)
  To: linuxppc-dev

As reported by Anton, there is a large penalty to signal handling
performance on radix systems using KUAP. The signal handling code
performs many user access operations, each of which needs to switch the
KUAP permissions bit to open and then close user access. This involves a
costly 'mtspr' operation [0].

There is existing work done on x86 and by Christophe Leroy for PPC32 to
instead open up user access in "blocks" using user_*_access_{begin,end}.
We can do the same in PPC64 to bring performance back up on KUAP-enabled
radix and now also hash MMU systems [1].

Hash MMU KUAP support along with uaccess flush has landed in linuxppc/next
since the last revision. This series also provides a large benefit on hash
with KUAP. However, in the hash implementation of KUAP the user AMR is
always restored during system_call_exception() which cannot be avoided.
Fewer user access switches naturally also result in less uaccess flushing.

The first two patches add some needed 'unsafe' versions of copy-from
functions. While these do not make use of asm-goto they still allow for
avoiding the repeated uaccess switches.

The third patch moves functions called by setup_sigcontext() into a new
prepare_setup_sigcontext() to simplify converting setup_sigcontext()
into an 'unsafe' version which assumes an open uaccess window later.

The fourth and fifth patches clean up some of the Transactional Memory
ifdef stuff to simplify using uaccess blocks later.

The next two patches rewrite some of the signal64 helper functions to
be 'unsafe'. Finally, the last three patches update the main signal
handling functions to make use of the new 'unsafe' helpers and eliminate
some additional uaccess switching.

I used the will-it-scale signal1 benchmark to measure and compare
performance [2]. The below results are from running a minimal
kernel+initramfs QEMU/KVM guest on a POWER9 Blackbird:

	signal1_threads -t1 -s10

	|                              | hash   | radix  |
	| ---------------------------- | ------ | ------ |
	| linuxppc/next                | 117898 | 135884 |
	| linuxppc/next w/o KUAP+KUEP  | 225502 | 227509 |
	| unsafe-signal64              | 195351 | 230922 |

[0]: https://github.com/linuxppc/issues/issues/277
[1]: https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=196278
[2]: https://github.com/antonblanchard/will-it-scale/blob/master/tests/signal1.c

v7:	* Address feedback from Christophe Leroy

v6:	* Rebase on latest linuxppc/next and address feedback comments from
	  Daniel Axtens and friends (also pick up some Reviewed-by tags)
	* Simplify __get_user_sigset(), fix sparse warnings, and use it
	  in ppc32 signal handling
	* Remove ctx_has_vsx_region arg to prepare_setup_sigcontext()
	* Remove local buffer in copy_{fpr,vsx}_from_user()
	* Rework the TM ifdefery-removal and remove one of the ifdef
	  pairs altogether

v5:	* Use sizeof(buf) in copy_{vsx,fpr}_from_user() (Thanks David Laight)
	* Rebase on latest linuxppc/next

v4:	* Fix issues identified by Christophe Leroy (Thanks for review)
	* Use __get_user() directly to copy the 8B sigset_t

v3:	* Rebase on latest linuxppc/next
	* Reword confusing commit messages
	* Add missing comma in macro in signal.h which broke compiles without
	  CONFIG_ALTIVEC
	* Validate hash KUAP signal performance improvements

v2:	* Rebase on latest linuxppc/next + Christophe Leroy's PPC32
	  signal series
	* Simplify/remove TM ifdefery similar to PPC32 series and clean
	  up the uaccess begin/end calls
	* Isolate non-inline functions so they are not called when
	  uaccess window is open

Christopher M. Riedl (8):
  powerpc/uaccess: Add unsafe_copy_from_user()
  powerpc/signal: Add unsafe_copy_{vsx,fpr}_from_user()
  powerpc/signal64: Remove non-inline calls from setup_sigcontext()
  powerpc: Reference parameter in MSR_TM_ACTIVE() macro
  powerpc/signal64: Remove TM ifdefery in middle of if/else block
  powerpc/signal64: Replace setup_sigcontext() w/
    unsafe_setup_sigcontext()
  powerpc/signal64: Replace restore_sigcontext() w/
    unsafe_restore_sigcontext()
  powerpc/signal: Use __get_user() to copy sigset_t

Daniel Axtens (2):
  powerpc/signal64: Rewrite handle_rt_signal64() to minimise uaccess
    switches
  powerpc/signal64: Rewrite rt_sigreturn() to minimise uaccess switches

 arch/powerpc/include/asm/reg.h     |   2 +-
 arch/powerpc/include/asm/uaccess.h |  21 ++
 arch/powerpc/kernel/process.c      |   3 +-
 arch/powerpc/kernel/signal.h       |  33 +++
 arch/powerpc/kernel/signal_32.c    |   2 +-
 arch/powerpc/kernel/signal_64.c    | 316 +++++++++++++++++------------
 6 files changed, 246 insertions(+), 131 deletions(-)

-- 
2.26.1


^ permalink raw reply

* [PATCH v7 05/10] powerpc/signal64: Remove TM ifdefery in middle of if/else block
From: Christopher M. Riedl @ 2021-02-27  1:12 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <20210227011259.11992-1-cmr@codefail.de>

Both rt_sigreturn() and handle_rt_signal64() contain TM-related ifdefs
which break up an if/else block. Provide stubs for the ifdef-guarded TM
functions and remove the need for an ifdef in rt_sigreturn().

Rework the remaining TM ifdef in handle_rt_signal64() similar to
commit f1cf4f93de2f ("powerpc/signal32: Remove ifdefery in middle of if/else").

Unlike in the commit for ppc32, the ifdef can't be removed entirely
since uc_transact in sigframe depends on CONFIG_PPC_TRANSACTIONAL_MEM.

Signed-off-by: Christopher M. Riedl <cmr@codefail.de>
---
 arch/powerpc/kernel/process.c   |   3 +-
 arch/powerpc/kernel/signal_64.c | 102 ++++++++++++++++----------------
 2 files changed, 54 insertions(+), 51 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 924d023dad0a..08c3fbe45921 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1117,9 +1117,10 @@ void restore_tm_state(struct pt_regs *regs)
 	regs->msr |= msr_diff;
 }
 
-#else
+#else /* !CONFIG_PPC_TRANSACTIONAL_MEM */
 #define tm_recheckpoint_new_task(new)
 #define __switch_to_tm(prev, new)
+void tm_reclaim_current(uint8_t cause) {}
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
 static inline void save_sprs(struct thread_struct *t)
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 6ca546192cbf..bd8d210c9115 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -594,6 +594,12 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
 
 	return err;
 }
+#else /* !CONFIG_PPC_TRANSACTIONAL_MEM */
+static long restore_tm_sigcontexts(struct task_struct *tsk, struct sigcontext __user *sc,
+				   struct sigcontext __user *tm_sc)
+{
+	return -EINVAL;
+}
 #endif
 
 /*
@@ -710,9 +716,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 	struct pt_regs *regs = current_pt_regs();
 	struct ucontext __user *uc = (struct ucontext __user *)regs->gpr[1];
 	sigset_t set;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	unsigned long msr;
-#endif
 
 	/* Always make any pending restarted system calls return -EINTR */
 	current->restart_block.fn = do_no_restart_syscall;
@@ -724,48 +728,50 @@ SYSCALL_DEFINE0(rt_sigreturn)
 		goto badframe;
 	set_current_blocked(&set);
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	/*
-	 * If there is a transactional state then throw it away.
-	 * The purpose of a sigreturn is to destroy all traces of the
-	 * signal frame, this includes any transactional state created
-	 * within in. We only check for suspended as we can never be
-	 * active in the kernel, we are active, there is nothing better to
-	 * do than go ahead and Bad Thing later.
-	 * The cause is not important as there will never be a
-	 * recheckpoint so it's not user visible.
-	 */
-	if (MSR_TM_SUSPENDED(mfmsr()))
-		tm_reclaim_current(0);
+	if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM)) {
+		/*
+		 * If there is a transactional state then throw it away.
+		 * The purpose of a sigreturn is to destroy all traces of the
+		 * signal frame, this includes any transactional state created
+		 * within in. We only check for suspended as we can never be
+		 * active in the kernel, we are active, there is nothing better to
+		 * do than go ahead and Bad Thing later.
+		 * The cause is not important as there will never be a
+		 * recheckpoint so it's not user visible.
+		 */
+		if (MSR_TM_SUSPENDED(mfmsr()))
+			tm_reclaim_current(0);
 
-	/*
-	 * Disable MSR[TS] bit also, so, if there is an exception in the
-	 * code below (as a page fault in copy_ckvsx_to_user()), it does
-	 * not recheckpoint this task if there was a context switch inside
-	 * the exception.
-	 *
-	 * A major page fault can indirectly call schedule(). A reschedule
-	 * process in the middle of an exception can have a side effect
-	 * (Changing the CPU MSR[TS] state), since schedule() is called
-	 * with the CPU MSR[TS] disable and returns with MSR[TS]=Suspended
-	 * (switch_to() calls tm_recheckpoint() for the 'new' process). In
-	 * this case, the process continues to be the same in the CPU, but
-	 * the CPU state just changed.
-	 *
-	 * This can cause a TM Bad Thing, since the MSR in the stack will
-	 * have the MSR[TS]=0, and this is what will be used to RFID.
-	 *
-	 * Clearing MSR[TS] state here will avoid a recheckpoint if there
-	 * is any process reschedule in kernel space. The MSR[TS] state
-	 * does not need to be saved also, since it will be replaced with
-	 * the MSR[TS] that came from user context later, at
-	 * restore_tm_sigcontexts.
-	 */
-	regs->msr &= ~MSR_TS_MASK;
+		/*
+		 * Disable MSR[TS] bit also, so, if there is an exception in the
+		 * code below (as a page fault in copy_ckvsx_to_user()), it does
+		 * not recheckpoint this task if there was a context switch inside
+		 * the exception.
+		 *
+		 * A major page fault can indirectly call schedule(). A reschedule
+		 * process in the middle of an exception can have a side effect
+		 * (Changing the CPU MSR[TS] state), since schedule() is called
+		 * with the CPU MSR[TS] disable and returns with MSR[TS]=Suspended
+		 * (switch_to() calls tm_recheckpoint() for the 'new' process). In
+		 * this case, the process continues to be the same in the CPU, but
+		 * the CPU state just changed.
+		 *
+		 * This can cause a TM Bad Thing, since the MSR in the stack will
+		 * have the MSR[TS]=0, and this is what will be used to RFID.
+		 *
+		 * Clearing MSR[TS] state here will avoid a recheckpoint if there
+		 * is any process reschedule in kernel space. The MSR[TS] state
+		 * does not need to be saved also, since it will be replaced with
+		 * the MSR[TS] that came from user context later, at
+		 * restore_tm_sigcontexts.
+		 */
+		regs->msr &= ~MSR_TS_MASK;
 
-	if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR]))
-		goto badframe;
-	if (MSR_TM_ACTIVE(msr)) {
+		if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR]))
+			goto badframe;
+	}
+
+	if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && MSR_TM_ACTIVE(msr)) {
 		/* We recheckpoint on return. */
 		struct ucontext __user *uc_transact;
 
@@ -778,9 +784,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
 		if (restore_tm_sigcontexts(current, &uc->uc_mcontext,
 					   &uc_transact->uc_mcontext))
 			goto badframe;
-	} else
-#endif
-	{
+	} else {
 		/*
 		 * Fall through, for non-TM restore
 		 *
@@ -818,10 +822,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 	unsigned long newsp = 0;
 	long err = 0;
 	struct pt_regs *regs = tsk->thread.regs;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	/* Save the thread's msr before get_tm_stackpointer() changes it */
 	unsigned long msr = regs->msr;
-#endif
 
 	frame = get_sigframe(ksig, tsk, sizeof(*frame), 0);
 	if (!access_ok(frame, sizeof(*frame)))
@@ -836,8 +838,9 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 	/* Create the ucontext.  */
 	err |= __put_user(0, &frame->uc.uc_flags);
 	err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]);
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+
 	if (MSR_TM_ACTIVE(msr)) {
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 		/* The ucontext_t passed to userland points to the second
 		 * ucontext_t (for transactional state) with its uc_link ptr.
 		 */
@@ -847,9 +850,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 					    tsk, ksig->sig, NULL,
 					    (unsigned long)ksig->ka.sa.sa_handler,
 					    msr);
-	} else
 #endif
-	{
+	} else {
 		err |= __put_user(0, &frame->uc.uc_link);
 		prepare_setup_sigcontext(tsk);
 		err |= setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig,
-- 
2.26.1


^ permalink raw reply related

* [PATCH v7 03/10] powerpc/signal64: Remove non-inline calls from setup_sigcontext()
From: Christopher M. Riedl @ 2021-02-27  1:12 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <20210227011259.11992-1-cmr@codefail.de>

The majority of setup_sigcontext() can be refactored to execute in an
"unsafe" context assuming an open uaccess window except for some
non-inline function calls. Move these out into a separate
prepare_setup_sigcontext() function which must be called first and
before opening up a uaccess window. Non-inline function calls should be
avoided during a uaccess window for a few reasons:

	- KUAP should be enabled for as much kernel code as possible.
	  Opening a uaccess window disables KUAP which means any code
	  executed during this time contributes to a potential attack
	  surface.

	- Non-inline functions default to traceable which means they are
	  instrumented for ftrace. This adds more code which could run
	  with KUAP disabled.

	- Powerpc does not currently support the objtool UACCESS checks.
	  All code running with uaccess must be audited manually which
	  means: less code -> less work -> fewer problems (in theory).

A follow-up commit converts setup_sigcontext() to be "unsafe".

Signed-off-by: Christopher M. Riedl <cmr@codefail.de>
---
 arch/powerpc/kernel/signal_64.c | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index f9e4a1ac440f..6ca546192cbf 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -79,6 +79,24 @@ static elf_vrreg_t __user *sigcontext_vmx_regs(struct sigcontext __user *sc)
 }
 #endif
 
+static void prepare_setup_sigcontext(struct task_struct *tsk)
+{
+#ifdef CONFIG_ALTIVEC
+	/* save altivec registers */
+	if (tsk->thread.used_vr)
+		flush_altivec_to_thread(tsk);
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		tsk->thread.vrsave = mfspr(SPRN_VRSAVE);
+#endif /* CONFIG_ALTIVEC */
+
+	flush_fp_to_thread(tsk);
+
+#ifdef CONFIG_VSX
+	if (tsk->thread.used_vsr)
+		flush_vsx_to_thread(tsk);
+#endif /* CONFIG_VSX */
+}
+
 /*
  * Set up the sigcontext for the signal frame.
  */
@@ -97,7 +115,6 @@ static long setup_sigcontext(struct sigcontext __user *sc,
 	 */
 #ifdef CONFIG_ALTIVEC
 	elf_vrreg_t __user *v_regs = sigcontext_vmx_regs(sc);
-	unsigned long vrsave;
 #endif
 	struct pt_regs *regs = tsk->thread.regs;
 	unsigned long msr = regs->msr;
@@ -112,7 +129,6 @@ static long setup_sigcontext(struct sigcontext __user *sc,
 
 	/* save altivec registers */
 	if (tsk->thread.used_vr) {
-		flush_altivec_to_thread(tsk);
 		/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
 		err |= __copy_to_user(v_regs, &tsk->thread.vr_state,
 				      33 * sizeof(vector128));
@@ -124,17 +140,10 @@ static long setup_sigcontext(struct sigcontext __user *sc,
 	/* We always copy to/from vrsave, it's 0 if we don't have or don't
 	 * use altivec.
 	 */
-	vrsave = 0;
-	if (cpu_has_feature(CPU_FTR_ALTIVEC)) {
-		vrsave = mfspr(SPRN_VRSAVE);
-		tsk->thread.vrsave = vrsave;
-	}
-
-	err |= __put_user(vrsave, (u32 __user *)&v_regs[33]);
+	err |= __put_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33]);
 #else /* CONFIG_ALTIVEC */
 	err |= __put_user(0, &sc->v_regs);
 #endif /* CONFIG_ALTIVEC */
-	flush_fp_to_thread(tsk);
 	/* copy fpr regs and fpscr */
 	err |= copy_fpr_to_user(&sc->fp_regs, tsk);
 
@@ -150,7 +159,6 @@ static long setup_sigcontext(struct sigcontext __user *sc,
 	 * VMX data.
 	 */
 	if (tsk->thread.used_vsr && ctx_has_vsx_region) {
-		flush_vsx_to_thread(tsk);
 		v_regs += ELF_NVRREG;
 		err |= copy_vsx_to_user(v_regs, tsk);
 		/* set MSR_VSX in the MSR value in the frame to
@@ -655,6 +663,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
 		ctx_has_vsx_region = 1;
 
 	if (old_ctx != NULL) {
+		prepare_setup_sigcontext(current);
 		if (!access_ok(old_ctx, ctx_size)
 		    || setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, 0,
 					ctx_has_vsx_region)
@@ -842,6 +851,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 #endif
 	{
 		err |= __put_user(0, &frame->uc.uc_link);
+		prepare_setup_sigcontext(tsk);
 		err |= setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig,
 					NULL, (unsigned long)ksig->ka.sa.sa_handler,
 					1);
-- 
2.26.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox