LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC 1/5] powerpc: Use regular rcu_dereference_raw API
From: Joel Fernandes (Google) @ 2019-05-24 23:49 UTC (permalink / raw)
  To: linux-kernel
  Cc: rcu, Jonathan Corbet, linux-doc, Lai Jiangshan, Josh Triplett,
	kvm-ppc, Miguel Ojeda, Ingo Molnar, Mathieu Desnoyers,
	Steven Rostedt, Joel Fernandes (Google), Paul E. McKenney,
	linuxppc-dev
In-Reply-To: <20190524234933.5133-1-joel@joelfernandes.org>

rcu_dereference_raw already does not do any tracing. There is no need to
use the _notrace variant of it and this series removes that API, so let us
use the regular variant here.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 21b1ed5df888..c15c9bbf0206 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -546,7 +546,7 @@ static inline void note_hpte_modification(struct kvm *kvm,
  */
 static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
 {
-	return rcu_dereference_raw_notrace(kvm->memslots[0]);
+	return rcu_dereference_raw(kvm->memslots[0]);
 }
 
 extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
-- 
2.22.0.rc1.257.g3120a18244-goog


^ permalink raw reply related

* [PATCH v2] mm: add account_locked_vm utility function
From: Daniel Jordan @ 2019-05-24 17:50 UTC (permalink / raw)
  To: akpm
  Cc: Mark Rutland, Davidlohr Bueso, kvm, Alan Tull,
	Alexey Kardashevskiy, linux-fpga, linux-kernel, kvm-ppc,
	Daniel Jordan, linux-mm, Alex Williamson, Jason Gunthorpe,
	Moritz Fischer, Steve Sistare, Christoph Lameter, linuxppc-dev,
	Wu Hao
In-Reply-To: <de375582-2c35-8e8a-4737-c816052a8e58@ozlabs.ru>

locked_vm accounting is done roughly the same way in five places, so
unify them in a helper.  Standardize the debug prints, which vary
slightly, but include the helper's caller to disambiguate between
callsites.

Error codes stay the same, so user-visible behavior does too.  The one
exception is that the -EPERM case in tce_account_locked_vm is removed
because Alexey has never seen it triggered.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Cc: Alan Tull <atull@kernel.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Moritz Fischer <mdf@kernel.org>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Steve Sistare <steven.sistare@oracle.com>
Cc: Wu Hao <hao.wu@intel.com>
Cc: linux-mm@kvack.org
Cc: kvm@vger.kernel.org
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-fpga@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---

Against v5.2-rc1.

v2:
 - applied review comments from Alexey
 - added _RET_IP_ to debug print to disambiguate callers

 arch/powerpc/kvm/book3s_64_vio.c     | 44 +++--------------------
 arch/powerpc/mm/book3s64/iommu_api.c | 41 +++------------------
 drivers/fpga/dfl-afu-dma-region.c    | 53 +++------------------------
 drivers/vfio/vfio_iommu_spapr_tce.c  | 54 +++-------------------------
 drivers/vfio/vfio_iommu_type1.c      | 17 ++-------
 include/linux/mm.h                   | 19 ++++++++++
 mm/util.c                            | 46 ++++++++++++++++++++++++
 7 files changed, 84 insertions(+), 190 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 66270e07449a..768b645c7edf 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -30,6 +30,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/iommu.h>
 #include <linux/file.h>
+#include <linux/mm.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -56,43 +57,6 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
 	return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
 }
 
-static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
-{
-	long ret = 0;
-
-	if (!current || !current->mm)
-		return ret; /* process exited */
-
-	down_write(&current->mm->mmap_sem);
-
-	if (inc) {
-		unsigned long locked, lock_limit;
-
-		locked = current->mm->locked_vm + stt_pages;
-		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			ret = -ENOMEM;
-		else
-			current->mm->locked_vm += stt_pages;
-	} else {
-		if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
-			stt_pages = current->mm->locked_vm;
-
-		current->mm->locked_vm -= stt_pages;
-	}
-
-	pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
-			inc ? '+' : '-',
-			stt_pages << PAGE_SHIFT,
-			current->mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK),
-			ret ? " - exceeded" : "");
-
-	up_write(&current->mm->mmap_sem);
-
-	return ret;
-}
-
 static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
 {
 	struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
@@ -302,7 +266,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 
 	kvm_put_kvm(stt->kvm);
 
-	kvmppc_account_memlimit(
+	account_locked_vm(current->mm,
 		kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
 	call_rcu(&stt->rcu, release_spapr_tce_table);
 
@@ -327,7 +291,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 		return -EINVAL;
 
 	npages = kvmppc_tce_pages(size);
-	ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+	ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true);
 	if (ret)
 		return ret;
 
@@ -373,7 +337,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 
 	kfree(stt);
  fail_acct:
-	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
+	account_locked_vm(current->mm, kvmppc_stt_pages(npages), false);
 	return ret;
 }
 
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index 5c521f3924a5..18d22eec0ebd 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -19,6 +19,7 @@
 #include <linux/hugetlb.h>
 #include <linux/swap.h>
 #include <linux/sizes.h>
+#include <linux/mm.h>
 #include <asm/mmu_context.h>
 #include <asm/pte-walk.h>
 #include <linux/mm_inline.h>
@@ -51,40 +52,6 @@ struct mm_iommu_table_group_mem_t {
 	u64 dev_hpa;		/* Device memory base address */
 };
 
-static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
-		unsigned long npages, bool incr)
-{
-	long ret = 0, locked, lock_limit;
-
-	if (!npages)
-		return 0;
-
-	down_write(&mm->mmap_sem);
-
-	if (incr) {
-		locked = mm->locked_vm + npages;
-		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			ret = -ENOMEM;
-		else
-			mm->locked_vm += npages;
-	} else {
-		if (WARN_ON_ONCE(npages > mm->locked_vm))
-			npages = mm->locked_vm;
-		mm->locked_vm -= npages;
-	}
-
-	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
-			current ? current->pid : 0,
-			incr ? '+' : '-',
-			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK));
-	up_write(&mm->mmap_sem);
-
-	return ret;
-}
-
 bool mm_iommu_preregistered(struct mm_struct *mm)
 {
 	return !list_empty(&mm->context.iommu_group_mem_list);
@@ -101,7 +68,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
 	unsigned long entry, chunk;
 
 	if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
-		ret = mm_iommu_adjust_locked_vm(mm, entries, true);
+		ret = account_locked_vm(mm, entries, true);
 		if (ret)
 			return ret;
 
@@ -216,7 +183,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
 	kfree(mem);
 
 unlock_exit:
-	mm_iommu_adjust_locked_vm(mm, locked_entries, false);
+	account_locked_vm(mm, locked_entries, false);
 
 	return ret;
 }
@@ -316,7 +283,7 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
 unlock_exit:
 	mutex_unlock(&mem_list_mutex);
 
-	mm_iommu_adjust_locked_vm(mm, unlock_entries, false);
+	account_locked_vm(mm, unlock_entries, false);
 
 	return ret;
 }
diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c
index c438722bf4e1..0a532c602d8f 100644
--- a/drivers/fpga/dfl-afu-dma-region.c
+++ b/drivers/fpga/dfl-afu-dma-region.c
@@ -12,6 +12,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/sched/signal.h>
 #include <linux/uaccess.h>
+#include <linux/mm.h>
 
 #include "dfl-afu.h"
 
@@ -31,52 +32,6 @@ void afu_dma_region_init(struct dfl_feature_platform_data *pdata)
 	afu->dma_regions = RB_ROOT;
 }
 
-/**
- * afu_dma_adjust_locked_vm - adjust locked memory
- * @dev: port device
- * @npages: number of pages
- * @incr: increase or decrease locked memory
- *
- * Increase or decrease the locked memory size with npages input.
- *
- * Return 0 on success.
- * Return -ENOMEM if locked memory size is over the limit and no CAP_IPC_LOCK.
- */
-static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr)
-{
-	unsigned long locked, lock_limit;
-	int ret = 0;
-
-	/* the task is exiting. */
-	if (!current->mm)
-		return 0;
-
-	down_write(&current->mm->mmap_sem);
-
-	if (incr) {
-		locked = current->mm->locked_vm + npages;
-		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			ret = -ENOMEM;
-		else
-			current->mm->locked_vm += npages;
-	} else {
-		if (WARN_ON_ONCE(npages > current->mm->locked_vm))
-			npages = current->mm->locked_vm;
-		current->mm->locked_vm -= npages;
-	}
-
-	dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %ld/%ld%s\n", current->pid,
-		incr ? '+' : '-', npages << PAGE_SHIFT,
-		current->mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK),
-		ret ? "- exceeded" : "");
-
-	up_write(&current->mm->mmap_sem);
-
-	return ret;
-}
-
 /**
  * afu_dma_pin_pages - pin pages of given dma memory region
  * @pdata: feature device platform data
@@ -92,7 +47,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
 	struct device *dev = &pdata->dev->dev;
 	int ret, pinned;
 
-	ret = afu_dma_adjust_locked_vm(dev, npages, true);
+	ret = account_locked_vm(current->mm, npages, true);
 	if (ret)
 		return ret;
 
@@ -121,7 +76,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
 free_pages:
 	kfree(region->pages);
 unlock_vm:
-	afu_dma_adjust_locked_vm(dev, npages, false);
+	account_locked_vm(current->mm, npages, false);
 	return ret;
 }
 
@@ -141,7 +96,7 @@ static void afu_dma_unpin_pages(struct dfl_feature_platform_data *pdata,
 
 	put_all_pages(region->pages, npages);
 	kfree(region->pages);
-	afu_dma_adjust_locked_vm(dev, npages, false);
+	account_locked_vm(current->mm, npages, false);
 
 	dev_dbg(dev, "%ld pages unpinned\n", npages);
 }
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 40ddc0c5f677..d06e8e291924 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
+#include <linux/mm.h>
 
 #include <asm/iommu.h>
 #include <asm/tce.h>
@@ -34,51 +35,6 @@
 static void tce_iommu_detach_group(void *iommu_data,
 		struct iommu_group *iommu_group);
 
-static long try_increment_locked_vm(struct mm_struct *mm, long npages)
-{
-	long ret = 0, locked, lock_limit;
-
-	if (WARN_ON_ONCE(!mm))
-		return -EPERM;
-
-	if (!npages)
-		return 0;
-
-	down_write(&mm->mmap_sem);
-	locked = mm->locked_vm + npages;
-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-		ret = -ENOMEM;
-	else
-		mm->locked_vm += npages;
-
-	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
-			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK),
-			ret ? " - exceeded" : "");
-
-	up_write(&mm->mmap_sem);
-
-	return ret;
-}
-
-static void decrement_locked_vm(struct mm_struct *mm, long npages)
-{
-	if (!mm || !npages)
-		return;
-
-	down_write(&mm->mmap_sem);
-	if (WARN_ON_ONCE(npages > mm->locked_vm))
-		npages = mm->locked_vm;
-	mm->locked_vm -= npages;
-	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
-			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK));
-	up_write(&mm->mmap_sem);
-}
-
 /*
  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  *
@@ -336,7 +292,7 @@ static int tce_iommu_enable(struct tce_container *container)
 		return ret;
 
 	locked = table_group->tce32_size >> PAGE_SHIFT;
-	ret = try_increment_locked_vm(container->mm, locked);
+	ret = account_locked_vm(container->mm, locked, true);
 	if (ret)
 		return ret;
 
@@ -355,7 +311,7 @@ static void tce_iommu_disable(struct tce_container *container)
 	container->enabled = false;
 
 	BUG_ON(!container->mm);
-	decrement_locked_vm(container->mm, container->locked_pages);
+	account_locked_vm(container->mm, container->locked_pages, false);
 }
 
 static void *tce_iommu_open(unsigned long arg)
@@ -659,7 +615,7 @@ static long tce_iommu_create_table(struct tce_container *container,
 	if (!table_size)
 		return -EINVAL;
 
-	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
+	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
 	if (ret)
 		return ret;
 
@@ -678,7 +634,7 @@ static void tce_iommu_free_table(struct tce_container *container,
 	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 
 	iommu_tce_table_put(tbl);
-	decrement_locked_vm(container->mm, pages);
+	account_locked_vm(container->mm, pages, false);
 }
 
 static long tce_iommu_create_window(struct tce_container *container,
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 3ddc375e7063..bf449ace1676 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -275,21 +275,8 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 
 	ret = down_write_killable(&mm->mmap_sem);
 	if (!ret) {
-		if (npage > 0) {
-			if (!dma->lock_cap) {
-				unsigned long limit;
-
-				limit = task_rlimit(dma->task,
-						RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-				if (mm->locked_vm + npage > limit)
-					ret = -ENOMEM;
-			}
-		}
-
-		if (!ret)
-			mm->locked_vm += npage;
-
+		ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
+					  dma->lock_cap);
 		up_write(&mm->mmap_sem);
 	}
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0e8834ac32b7..72c1034d2ec7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1564,6 +1564,25 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
 int get_user_pages_fast(unsigned long start, int nr_pages,
 			unsigned int gup_flags, struct page **pages);
 
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+			struct task_struct *task, bool bypass_rlim);
+
+static inline int account_locked_vm(struct mm_struct *mm, unsigned long pages,
+				    bool inc)
+{
+	int ret;
+
+	if (pages == 0 || !mm)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+	ret = __account_locked_vm(mm, pages, inc, current,
+				  capable(CAP_IPC_LOCK));
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+
 /* Container for pinned pfns / pages */
 struct frame_vector {
 	unsigned int nr_allocated;	/* Number of frames we have space for */
diff --git a/mm/util.c b/mm/util.c
index e2e4f8c3fa12..bd3bdf16a084 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,6 +6,7 @@
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/task_stack.h>
 #include <linux/security.h>
 #include <linux/swap.h>
@@ -346,6 +347,51 @@ int __weak get_user_pages_fast(unsigned long start,
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
+/**
+ * __account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against, may be NULL
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ * @task:        task used to check RLIMIT_MEMLOCK
+ * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
+ *
+ * Assumes @task and @mm are valid (i.e. at least one reference on each), and
+ * that mmap_sem is held as writer.
+ *
+ * Return:
+ * * 0       on success
+ * * 0       if @mm is NULL (can happen for example if the task is exiting)
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+			struct task_struct *task, bool bypass_rlim)
+{
+	unsigned long locked_vm, limit;
+	int ret = 0;
+
+	locked_vm = mm->locked_vm;
+	if (inc) {
+		if (!bypass_rlim) {
+			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+			if (locked_vm + pages > limit)
+				ret = -ENOMEM;
+		}
+		if (!ret)
+			mm->locked_vm = locked_vm + pages;
+	} else {
+		WARN_ON_ONCE(pages > locked_vm);
+		mm->locked_vm = locked_vm - pages;
+	}
+
+	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
+		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
+		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
+		 ret ? " - exceeded" : "");
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__account_locked_vm);
+
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
 	unsigned long flag, unsigned long pgoff)

base-commit: a188339ca5a396acc588e5851ed7e19f66b0ebd9
-- 
2.21.0


^ permalink raw reply related

* Re: [RFC PATCH v2] powerpc: fix kexec failure on book3s/32
From: Christophe Leroy @ 2019-05-24 13:38 UTC (permalink / raw)
  To: Aaro Koskinen; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <8164abbe117d8353bb88132d7cfa8bc26a60ca66.1558677767.git.christophe.leroy@c-s.fr>



Le 24/05/2019 à 08:05, Christophe Leroy a écrit :
> Fixes: 63b2bc619565 ("powerpc/mm/32s: Use BATs for STRICT_KERNEL_RWX")
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>

Reported-by: Aaro Koskinen <aaro.koskinen@iki.fi>

> ---
>   arch/powerpc/kernel/machine_kexec_32.c | 8 ++++++++
>   arch/powerpc/mm/book3s32/mmu.c         | 7 +++++--
>   arch/powerpc/mm/mmu_decl.h             | 2 ++
>   3 files changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c
> index affe5dcce7f4..83e61a8f8468 100644
> --- a/arch/powerpc/kernel/machine_kexec_32.c
> +++ b/arch/powerpc/kernel/machine_kexec_32.c
> @@ -15,6 +15,7 @@
>   #include <asm/cacheflush.h>
>   #include <asm/hw_irq.h>
>   #include <asm/io.h>
> +#include <mm/mmu_decl.h>
>   
>   typedef void (*relocate_new_kernel_t)(
>   				unsigned long indirection_page,
> @@ -35,6 +36,8 @@ void default_machine_kexec(struct kimage *image)
>   	unsigned long page_list;
>   	unsigned long reboot_code_buffer, reboot_code_buffer_phys;
>   	relocate_new_kernel_t rnk;
> +	unsigned long bat_size = 128 << 10;
> +	unsigned long bat_mask = ~(bat_size - 1);
>   
>   	/* Interrupts aren't acceptable while we reboot */
>   	local_irq_disable();
> @@ -54,6 +57,11 @@ void default_machine_kexec(struct kimage *image)
>   	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
>   						relocate_new_kernel_size);
>   
> +	printk(KERN_INFO "Reboot code buffer at %lx\n", reboot_code_buffer);
> +	mtsrin(mfsrin(reboot_code_buffer) & ~SR_NX, reboot_code_buffer);
> +	setibat(7, reboot_code_buffer & bat_mask, reboot_code_buffer_phys & bat_mask,
> +		bat_size, PAGE_KERNEL_TEXT);

A call to update_bats() have to be added here after setibat()

Christophe

> +
>   	flush_icache_range(reboot_code_buffer,
>   				reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE);
>   	printk(KERN_INFO "Bye!\n");
> diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
> index fc073cb2c517..7124700edb0f 100644
> --- a/arch/powerpc/mm/book3s32/mmu.c
> +++ b/arch/powerpc/mm/book3s32/mmu.c
> @@ -124,8 +124,8 @@ static unsigned int block_size(unsigned long base, unsigned long top)
>    * of 2 between 128k and 256M.
>    * Only for 603+ ...
>    */
> -static void setibat(int index, unsigned long virt, phys_addr_t phys,
> -		    unsigned int size, pgprot_t prot)
> +void setibat(int index, unsigned long virt, phys_addr_t phys,
> +	     unsigned int size, pgprot_t prot)
>   {
>   	unsigned int bl = (size >> 17) - 1;
>   	int wimgxpp;
> @@ -197,6 +197,9 @@ void mmu_mark_initmem_nx(void)
>   	if (cpu_has_feature(CPU_FTR_601))
>   		return;
>   
> +	if (IS_ENABLED(CONFIG_KEXEC))
> +		nb--;
> +
>   	for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) {
>   		size = block_size(base, top);
>   		setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
> diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
> index 7bac0aa2026a..478584d50cf2 100644
> --- a/arch/powerpc/mm/mmu_decl.h
> +++ b/arch/powerpc/mm/mmu_decl.h
> @@ -103,6 +103,8 @@ void print_system_hash_info(void);
>   extern void mapin_ram(void);
>   extern void setbat(int index, unsigned long virt, phys_addr_t phys,
>   		   unsigned int size, pgprot_t prot);
> +void setibat(int index, unsigned long virt, phys_addr_t phys,
> +	     unsigned int size, pgprot_t prot);
>   
>   extern int __map_without_bats;
>   extern unsigned int rtas_data, rtas_size;
> 

^ permalink raw reply

* Re: [BISECTED] kexec regression on PowerBook G4
From: Christophe Leroy @ 2019-05-24 13:35 UTC (permalink / raw)
  To: Aaro Koskinen; +Cc: linuxppc-dev
In-Reply-To: <20190524132907.GE5234@darkstar.musicnaut.iki.fi>



Le 24/05/2019 à 15:29, Aaro Koskinen a écrit :
> Hi,
> 
> On Fri, May 24, 2019 at 09:40:30AM +0200, Christophe Leroy wrote:
>> Le 24/05/2019 à 09:36, Aaro Koskinen a écrit :
>>> On Fri, May 24, 2019 at 08:08:36AM +0200, Christophe Leroy wrote:
>>>>> Le 24/05/2019 à 00:23, Aaro Koskinen a écrit :
>>>>>> Unfortunately still no luck... The crash is pretty much the same with
>>>>>> both
>>>>>> changes.
>>>>>
>>>>> Right. In fact change_page_attr() does nothing because this part of RAM is
>>>>> mapped by DBATs so v_block_mapped() returns not NULL.
>>>>>
>>>>> So, we have to set an IBAT for this area. I'll try and send you a new
>>>>> patch for that before noon (CET).
>>>>>
>>>>
>>>> patch sent out. In the patch I have also added a printk to print the buffer
>>>> address, so if the problem still occurs, we'll know if the problem is really
>>>> at the address of the buffer or if we are wrong from the beginning.
>>>
>>> Reboot code buffer at ef0c3000
>>> Bye!
>>> BUG: Unable to handle kernel instruction fetch
>>> Faulting instruction address: 0xef0c3000
>>>
>>
>> Oops, I forgot to call update_bats() after setibat().
>>
>> Can you add it and retry ?
> 
> Thanks, that was it, now it finally works!
> 

Thanks for reporting the issue and testing.

I'll work on a clean fix patch in the begining of June.

Christophe

^ permalink raw reply

* Re: [BISECTED] kexec regression on PowerBook G4
From: Aaro Koskinen @ 2019-05-24 13:29 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: linuxppc-dev
In-Reply-To: <969271d1-0943-42e6-8992-77b20e305e48@c-s.fr>

Hi,

On Fri, May 24, 2019 at 09:40:30AM +0200, Christophe Leroy wrote:
> Le 24/05/2019 à 09:36, Aaro Koskinen a écrit :
> >On Fri, May 24, 2019 at 08:08:36AM +0200, Christophe Leroy wrote:
> >>>Le 24/05/2019 à 00:23, Aaro Koskinen a écrit :
> >>>>Unfortunately still no luck... The crash is pretty much the same with
> >>>>both
> >>>>changes.
> >>>
> >>>Right. In fact change_page_attr() does nothing because this part of RAM is
> >>>mapped by DBATs so v_block_mapped() returns not NULL.
> >>>
> >>>So, we have to set an IBAT for this area. I'll try and send you a new
> >>>patch for that before noon (CET).
> >>>
> >>
> >>patch sent out. In the patch I have also added a printk to print the buffer
> >>address, so if the problem still occurs, we'll know if the problem is really
> >>at the address of the buffer or if we are wrong from the beginning.
> >
> >Reboot code buffer at ef0c3000
> >Bye!
> >BUG: Unable to handle kernel instruction fetch
> >Faulting instruction address: 0xef0c3000
> >
> 
> Oops, I forgot to call update_bats() after setibat().
> 
> Can you add it and retry ?

Thanks, that was it, now it finally works!

A.

^ permalink raw reply

* [RFC PATCH v2] powerpc/xmon: restrict when kernel is locked down
From: Christopher M. Riedl @ 2019-05-24 12:38 UTC (permalink / raw)
  To: linuxppc-dev, kernel-hardening; +Cc: Christopher M. Riedl, ajd, mjg59, dja

Xmon should be either fully or partially disabled depending on the
kernel lockdown state.

Put xmon into read-only mode for lockdown=integrity and completely
disable xmon when lockdown=confidentiality. Xmon checks the lockdown
state and takes appropriate action:

 (1) during xmon_setup to prevent early xmon'ing

 (2) when triggered via sysrq

 (3) when toggled via debugfs

 (4) when triggered via a previously enabled breakpoint

The following lockdown state transitions are handled:

 (1) lockdown=none -> lockdown=integrity
     clear all breakpoints, set xmon read-only mode

 (2) lockdown=none -> lockdown=confidentiality
     clear all breakpoints, prevent re-entry into xmon

 (3) lockdown=integrity -> lockdown=confidentiality
     prevent re-entry into xmon

Suggested-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
---

Applies on top of this series:
	https://patchwork.kernel.org/cover/10884631/

I've done some limited testing of the scenarios mentioned in the commit
message on a single CPU QEMU config.

v1->v2:
	Fix subject line
	Submit to linuxppc-dev and kernel-hardening

 arch/powerpc/xmon/xmon.c | 56 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 3e7be19aa208..8c4a5a0c28f0 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -191,6 +191,9 @@ static void dump_tlb_44x(void);
 static void dump_tlb_book3e(void);
 #endif
 
+static void clear_all_bpt(void);
+static void xmon_init(int);
+
 #ifdef CONFIG_PPC64
 #define REG		"%.16lx"
 #else
@@ -291,6 +294,39 @@ Commands:\n\
   zh	halt\n"
 ;
 
+#ifdef CONFIG_LOCK_DOWN_KERNEL
+static bool xmon_check_lockdown(void)
+{
+	static bool lockdown = false;
+
+	if (!lockdown) {
+		lockdown = kernel_is_locked_down("Using xmon",
+						 LOCKDOWN_CONFIDENTIALITY);
+		if (lockdown) {
+			printf("xmon: Disabled by strict kernel lockdown\n");
+			xmon_on = 0;
+			xmon_init(0);
+		}
+	}
+
+	if (!xmon_is_ro) {
+		xmon_is_ro = kernel_is_locked_down("Using xmon write-access",
+						   LOCKDOWN_INTEGRITY);
+		if (xmon_is_ro) {
+			printf("xmon: Read-only due to kernel lockdown\n");
+			clear_all_bpt();
+		}
+	}
+
+	return lockdown;
+}
+#else
+inline static bool xmon_check_lockdown(void)
+{
+	return false;
+}
+#endif /* CONFIG_LOCK_DOWN_KERNEL */
+
 static struct pt_regs *xmon_regs;
 
 static inline void sync(void)
@@ -708,6 +744,9 @@ static int xmon_bpt(struct pt_regs *regs)
 	struct bpt *bp;
 	unsigned long offset;
 
+	if (xmon_check_lockdown())
+		return 0;
+
 	if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) != (MSR_IR|MSR_64BIT))
 		return 0;
 
@@ -739,6 +778,9 @@ static int xmon_sstep(struct pt_regs *regs)
 
 static int xmon_break_match(struct pt_regs *regs)
 {
+	if (xmon_check_lockdown())
+		return 0;
+
 	if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) != (MSR_IR|MSR_64BIT))
 		return 0;
 	if (dabr.enabled == 0)
@@ -749,6 +791,9 @@ static int xmon_break_match(struct pt_regs *regs)
 
 static int xmon_iabr_match(struct pt_regs *regs)
 {
+	if (xmon_check_lockdown())
+		return 0;
+
 	if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) != (MSR_IR|MSR_64BIT))
 		return 0;
 	if (iabr == NULL)
@@ -3742,6 +3787,9 @@ static void xmon_init(int enable)
 #ifdef CONFIG_MAGIC_SYSRQ
 static void sysrq_handle_xmon(int key)
 {
+	if (xmon_check_lockdown())
+		return;
+
 	/* ensure xmon is enabled */
 	xmon_init(1);
 	debugger(get_irq_regs());
@@ -3763,7 +3811,6 @@ static int __init setup_xmon_sysrq(void)
 device_initcall(setup_xmon_sysrq);
 #endif /* CONFIG_MAGIC_SYSRQ */
 
-#ifdef CONFIG_DEBUG_FS
 static void clear_all_bpt(void)
 {
 	int i;
@@ -3785,8 +3832,12 @@ static void clear_all_bpt(void)
 	printf("xmon: All breakpoints cleared\n");
 }
 
+#ifdef CONFIG_DEBUG_FS
 static int xmon_dbgfs_set(void *data, u64 val)
 {
+	if (xmon_check_lockdown())
+		return 0;
+
 	xmon_on = !!val;
 	xmon_init(xmon_on);
 
@@ -3845,6 +3896,9 @@ early_param("xmon", early_parse_xmon);
 
 void __init xmon_setup(void)
 {
+	if (xmon_check_lockdown())
+		return;
+
 	if (xmon_on)
 		xmon_init(1);
 	if (xmon_early)
-- 
2.21.0


^ permalink raw reply related

* [PATCH v3 2/3] arch: wire-up close_range()
From: Christian Brauner @ 2019-05-24 11:10 UTC (permalink / raw)
  To: viro, linux-kernel, linux-fsdevel, torvalds, fweimer
  Cc: linux-ia64, linux-sh, ldv, dhowells, sparclinux, shuah,
	linux-arch, linux-s390, miklos, x86, Christian Brauner,
	linux-mips, linux-xtensa, tkjos, arnd, jannh, linux-m68k, tglx,
	linux-arm-kernel, linux-parisc, linux-api, oleg, linux-alpha,
	linuxppc-dev
In-Reply-To: <20190524111047.6892-1-christian@brauner.io>

This wires up the close_range() syscall into all arches at once.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Cc: Jann Horn <jannh@google.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Dmitry V. Levin <ldv@altlinux.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: linux-api@vger.kernel.org
Cc: linux-alpha@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-ia64@vger.kernel.org
Cc: linux-m68k@lists.linux-m68k.org
Cc: linux-mips@vger.kernel.org
Cc: linux-parisc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Cc: linux-sh@vger.kernel.org
Cc: sparclinux@vger.kernel.org
Cc: linux-xtensa@linux-xtensa.org
Cc: linux-arch@vger.kernel.org
Cc: x86@kernel.org
---
v1:
v2:
v3: added
- Arnd Bergmann <arnd@arndb.de>:
  - split into two patches:
    1. add close_range()
    2. add syscall to all arches at once
  - bump __NR_compat_syscalls in arch/arm64/include/asm/unistd.h
---
 arch/alpha/kernel/syscalls/syscall.tbl      | 1 +
 arch/arm/tools/syscall.tbl                  | 1 +
 arch/arm64/include/asm/unistd.h             | 2 +-
 arch/arm64/include/asm/unistd32.h           | 2 ++
 arch/ia64/kernel/syscalls/syscall.tbl       | 1 +
 arch/m68k/kernel/syscalls/syscall.tbl       | 1 +
 arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   | 1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   | 1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   | 1 +
 arch/parisc/kernel/syscalls/syscall.tbl     | 1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    | 1 +
 arch/s390/kernel/syscalls/syscall.tbl       | 1 +
 arch/sh/kernel/syscalls/syscall.tbl         | 1 +
 arch/sparc/kernel/syscalls/syscall.tbl      | 1 +
 arch/x86/entry/syscalls/syscall_32.tbl      | 1 +
 arch/x86/entry/syscalls/syscall_64.tbl      | 1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     | 1 +
 include/uapi/asm-generic/unistd.h           | 4 +++-
 19 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 9e7704e44f6d..b55d93af8096 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -473,3 +473,4 @@
 541	common	fsconfig			sys_fsconfig
 542	common	fsmount				sys_fsmount
 543	common	fspick				sys_fspick
+545	common	close_range			sys_close_range
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index aaf479a9e92d..0125c97c75dd 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -447,3 +447,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 70e6882853c0..d04eb26cfaeb 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -44,7 +44,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		434
+#define __NR_compat_syscalls		436
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index c39e90600bb3..9a3270d29b42 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -886,6 +886,8 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
 __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_close_range 435
+__SYSCALL(__NR_close_range, sys_close_range)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index e01df3f2f80d..1a90b464e96f 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -354,3 +354,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 7e3d0734b2f3..2dee2050f9ef 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -433,3 +433,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 26339e417695..923ef69e5a76 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -439,3 +439,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 0e2dd68ade57..967ed9de51cd 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -372,3 +372,4 @@
 431	n32	fsconfig			sys_fsconfig
 432	n32	fsmount				sys_fsmount
 433	n32	fspick				sys_fspick
+435	n32	close_range			sys_close_range
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 5eebfa0d155c..71de731102b1 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -348,3 +348,4 @@
 431	n64	fsconfig			sys_fsconfig
 432	n64	fsmount				sys_fsmount
 433	n64	fspick				sys_fspick
+435	n64	close_range			sys_close_range
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 3cc1374e02d0..5a325ab29f88 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -421,3 +421,4 @@
 431	o32	fsconfig			sys_fsconfig
 432	o32	fsmount				sys_fsmount
 433	o32	fspick				sys_fspick
+435	o32	close_range			sys_close_range
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index c9e377d59232..dcc0a0879139 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -430,3 +430,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 103655d84b4b..ba2c1f078cbd 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -515,3 +515,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index e822b2964a83..d7c9043d2902 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -436,3 +436,4 @@
 431  common	fsconfig		sys_fsconfig			sys_fsconfig
 432  common	fsmount			sys_fsmount			sys_fsmount
 433  common	fspick			sys_fspick			sys_fspick
+435  common	close_range		sys_close_range			sys_close_range
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 016a727d4357..9b5e6bf0ce32 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -436,3 +436,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index e047480b1605..8c674a1e0072 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -479,3 +479,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ad968b7bac72..7f7a89a96707 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -438,3 +438,4 @@
 431	i386	fsconfig		sys_fsconfig			__ia32_sys_fsconfig
 432	i386	fsmount			sys_fsmount			__ia32_sys_fsmount
 433	i386	fspick			sys_fspick			__ia32_sys_fspick
+435	i386	close_range		sys_close_range			__ia32_sys_close_range
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index b4e6f9e6204a..0f7d47ae921c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -355,6 +355,7 @@
 431	common	fsconfig		__x64_sys_fsconfig
 432	common	fsmount			__x64_sys_fsmount
 433	common	fspick			__x64_sys_fspick
+435	common	close_range		__x64_sys_close_range
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 5fa0ee1c8e00..b489532265d0 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -404,3 +404,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index a87904daf103..3f36c8745d24 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -844,9 +844,11 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
 __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_close_range 435
+__SYSCALL(__NR_close_range, sys_close_range)
 
 #undef __NR_syscalls
-#define __NR_syscalls 434
+#define __NR_syscalls 436
 
 /*
  * 32 bit systems traditionally used different
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH v3 1/2] pid: add pidfd_open()
From: Christian Brauner @ 2019-05-24 10:54 UTC (permalink / raw)
  To: jannh, oleg, viro, torvalds, arnd
  Cc: linux-ia64, linux-sh, linux-mips, dhowells, joel, linux-kselftest,
	sparclinux, elena.reshetova, linux-arch, linux-s390, dancol,
	kernel-team, serge, linux-xtensa, keescook, linux-m68k, luto,
	tglx, surenb, linux-arm-kernel, linux-parisc, linux-api, cyphar,
	luto, ebiederm, linux-alpha, akpm, linuxppc-dev
In-Reply-To: <20190521143220.crb2zyvdov3fl4g7@brauner.io>

On Tue, May 21, 2019 at 04:32:20PM +0200, Christian Brauner wrote:
> On Mon, May 20, 2019 at 05:56:29PM +0200, Christian Brauner wrote:
> > This adds the pidfd_open() syscall. It allows a caller to retrieve pollable
> > pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a
> > process that is created via traditional fork()/clone() calls that is only
> > referenced by a PID:
> > 
> > int pidfd = pidfd_open(1234, 0);
> > ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0);
> > 
> > With the introduction of pidfds through CLONE_PIDFD it is possible to
> > created pidfds at process creation time.
> > However, a lot of processes get created with traditional PID-based calls
> > such as fork() or clone() (without CLONE_PIDFD). For these processes a
> > caller can currently not create a pollable pidfd. This is a problem for
> > Android's low memory killer (LMK) and service managers such as systemd.
> > Both are examples of tools that want to make use of pidfds to get reliable
> > notification of process exit for non-parents (pidfd polling) and race-free
> > signal sending (pidfd_send_signal()). They intend to switch to this API for
> > process supervision/management as soon as possible. Having no way to get
> > pollable pidfds from PID-only processes is one of the biggest blockers for
> > them in adopting this api. With pidfd_open() making it possible to retrieve
> > pidfds for PID-based processes we enable them to adopt this api.
> > 
> > In line with Arnd's recent changes to consolidate syscall numbers across
> > architectures, I have added the pidfd_open() syscall to all architectures
> > at the same time.
> > 
> > Signed-off-by: Christian Brauner <christian@brauner.io>
> > Reviewed-by: Oleg Nesterov <oleg@redhat.com>
> 
> This now also carries a Reviewed-by from David.
> 
> > Acked-by: Arnd Bergmann <arnd@arndb.de>
> > Cc: "Eric W. Biederman" <ebiederm@xmission.com>
> > Cc: Kees Cook <keescook@chromium.org>
> > Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Jann Horn <jannh@google.com>
> > Cc: David Howells <dhowells@redhat.com>
> > Cc: Andy Lutomirsky <luto@kernel.org>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: Aleksa Sarai <cyphar@cyphar.com>
> > Cc: Linus Torvalds <torvalds@linux-foundation.org>
> > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > Cc: linux-api@vger.kernel.org
> 
> I've moved pidfd_open() into my for-next branch together with Joel's
> pidfd polling changes. Everything is based on v5.2-rc1.
> 
> The chosen syscall number for now is 434. David is going to send out
> another pile of mount api related syscalls. I'll coordinate with him
> accordingly prior to the 5.3 merge window.

After talking to Arnd, I split the syscall addition and the per-arch
wiring-up of pidfd_open() into two patches. There are no functional
changes and everything is still sitting in for-next.

Thanks!
Christian

^ permalink raw reply

* [PATCH 2/2] tools/perf: Set 'trace_cycles' as defaultevent for perf kvm record in powerpc
From: Anju T Sudhakar @ 2019-05-24 10:51 UTC (permalink / raw)
  To: mpe; +Cc: anju, linuxppc-dev, linux-kernel, maddy
In-Reply-To: <20190524105107.324-1-anju@linux.vnet.ibm.com>

Use 'trace_imc/trace_cycles' as the default event for 'perf kvm record'
in powerpc.

Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index 66f8fe500945..b552884263df 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -177,8 +177,9 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 /*
  * Incase of powerpc architecture, pmu registers are programmable
  * by guest kernel. So monitoring guest via host may not provide
- * valid samples. It is better to fail the "perf kvm record"
- * with default "cycles" event to monitor guest in powerpc.
+ * valid samples with default 'cycles' event. It is better to use
+ * 'trace_imc/trace_cycles' event for guest profiling, since it
+ * can track the guest instruction pointer in the trace-record.
  *
  * Function to parse the arguments and return appropriate values.
  */
@@ -202,8 +203,14 @@ int kvm_add_default_arch_event(int *argc, const char **argv)
 
 	parse_options(j, tmp, event_options, NULL, 0);
 	if (!event) {
-		free(tmp);
-		return -EINVAL;
+		if (pmu_have_event("trace_imc", "trace_cycles")) {
+			argv[j++] = strdup("-e");
+			argv[j++] = strdup("trace_imc/trace_cycles/");
+			*argc += 2;
+		} else {
+			free(tmp);
+			return -EINVAL;
+		}
 	}
 
 	free(tmp);
-- 
2.17.2


^ permalink raw reply related

* [PATCH 1/2] tools/perf: Add arch neutral function to choose event for perf kvm record
From: Anju T Sudhakar @ 2019-05-24 10:51 UTC (permalink / raw)
  To: mpe; +Cc: anju, linuxppc-dev, linux-kernel, maddy

'perf kvm record' uses 'cycles'(if the user did not specify any event) as
the default event to profile the guest.
This will not provide any proper samples from the guest incase of
powerpc architecture, since in powerpc the PMUs are controlled by
the guest rather than the host.

Patch adds a function to pick an arch specific event for 'perf kvm record',
instead of selecting 'cycles' as a default event for all architectures.

For powerpc this function checks for any user specified event, and if there
isn't any it returns invalid instead of proceeding with 'cycles' event.

Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
---
 tools/perf/arch/powerpc/util/kvm-stat.c | 37 +++++++++++++++++++++++++
 tools/perf/builtin-kvm.c                | 12 +++++++-
 tools/perf/util/kvm-stat.h              |  2 +-
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index f9db341c47b6..66f8fe500945 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -8,6 +8,7 @@
 
 #include "book3s_hv_exits.h"
 #include "book3s_hcalls.h"
+#include <subcmd/parse-options.h>
 
 #define NR_TPS 4
 
@@ -172,3 +173,39 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 
 	return ret;
 }
+
+/*
+ * Incase of powerpc architecture, pmu registers are programmable
+ * by guest kernel. So monitoring guest via host may not provide
+ * valid samples. It is better to fail the "perf kvm record"
+ * with default "cycles" event to monitor guest in powerpc.
+ *
+ * Function to parse the arguments and return appropriate values.
+ */
+int kvm_add_default_arch_event(int *argc, const char **argv)
+{
+	const char **tmp;
+	bool event = false;
+	int i, j = *argc;
+
+	const struct option event_options[] = {
+		OPT_BOOLEAN('e', "event", &event, NULL),
+		OPT_END()
+	};
+
+	tmp = calloc(j + 1, sizeof(char *));
+	if (!tmp)
+		return -EINVAL;
+
+	for (i = 0; i < j; i++)
+		tmp[i] = argv[i];
+
+	parse_options(j, tmp, event_options, NULL, 0);
+	if (!event) {
+		free(tmp);
+		return -EINVAL;
+	}
+
+	free(tmp);
+	return 0;
+}
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index dbb6f737a3e2..fe33b3ec55c9 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -1510,11 +1510,21 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv)
 }
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
+					const char **argv __maybe_unused)
+{
+	return 0;
+}
+
 static int __cmd_record(const char *file_name, int argc, const char **argv)
 {
-	int rec_argc, i = 0, j;
+	int rec_argc, i = 0, j, ret;
 	const char **rec_argv;
 
+	ret = kvm_add_default_arch_event(&argc, argv);
+	if (ret)
+		return -EINVAL;
+
 	rec_argc = argc + 2;
 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
 	rec_argv[i++] = strdup("record");
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index 1403dec189b4..da38b56c46cb 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -144,5 +144,5 @@ extern const int decode_str_len;
 extern const char *kvm_exit_reason;
 extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
-
+extern int kvm_add_default_arch_event(int *argc, const char **argv);
 #endif /* __PERF_KVM_STAT_H */
-- 
2.17.2


^ permalink raw reply related

* [PATCH v2] powerpc/power: Expose pfn_is_nosave prototype
From: Mathieu Malaterre @ 2019-05-24 10:44 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Rafael J. Wysocki, linux-s390, Len Brown, Mathieu Malaterre,
	linux-pm, Heiko Carstens, linux-kernel, Paul Mackerras,
	Pavel Machek, Martin Schwidefsky, linuxppc-dev
In-Reply-To: <20190523114736.30268-1-malat@debian.org>

The declaration for pfn_is_nosave is only available in
kernel/power/power.h. Since this function can be override in arch,
expose it globally. Having a prototype will make sure to avoid warning
(sometime treated as error with W=1) such as:

  arch/powerpc/kernel/suspend.c:18:5: error: no previous prototype for 'pfn_is_nosave' [-Werror=missing-prototypes]

This moves the declaration into a globally visible header file and add
missing include to avoid a warning on powerpc. Also remove the
duplicated prototypes since not required anymore.

Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Mathieu Malaterre <malat@debian.org>
---
v2: As suggestion by christophe remove duplicates prototypes

 arch/powerpc/kernel/suspend.c | 1 +
 arch/s390/kernel/entry.h      | 1 -
 include/linux/suspend.h       | 1 +
 kernel/power/power.h          | 2 --
 4 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c
index a531154cc0f3..9e1b6b894245 100644
--- a/arch/powerpc/kernel/suspend.c
+++ b/arch/powerpc/kernel/suspend.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/suspend.h>
 #include <asm/page.h>
 #include <asm/sections.h>
 
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index 20420c2b8a14..b2956d49b6ad 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -63,7 +63,6 @@ void __init startup_init(void);
 void die(struct pt_regs *regs, const char *str);
 int setup_profiling_timer(unsigned int multiplier);
 void __init time_init(void);
-int pfn_is_nosave(unsigned long);
 void s390_early_resume(void);
 unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip);
 
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 6b3ea9ea6a9e..e8b8a7bede90 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -395,6 +395,7 @@ extern bool system_entering_hibernation(void);
 extern bool hibernation_available(void);
 asmlinkage int swsusp_save(void);
 extern struct pbe *restore_pblist;
+int pfn_is_nosave(unsigned long pfn);
 #else /* CONFIG_HIBERNATION */
 static inline void register_nosave_region(unsigned long b, unsigned long e) {}
 static inline void register_nosave_region_late(unsigned long b, unsigned long e) {}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9e58bdc8a562..44bee462ff57 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -75,8 +75,6 @@ static inline void hibernate_reserved_size_init(void) {}
 static inline void hibernate_image_size_init(void) {}
 #endif /* !CONFIG_HIBERNATION */
 
-extern int pfn_is_nosave(unsigned long);
-
 #define power_attr(_name) \
 static struct kobj_attribute _name##_attr = {	\
 	.attr	= {				\
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH v1 1/2] open: add close_range()
From: Christian Brauner @ 2019-05-24 10:31 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-ia64, linux-sh, oleg, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, torvalds, linux-mips,
	linux-xtensa, tkjos, arnd, jannh, linux-m68k, viro, tglx, ldv,
	linux-arm-kernel, fweimer, linux-parisc, linux-api, linux-kernel,
	linux-alpha, linux-fsdevel, linuxppc-dev
In-Reply-To: <67e4458a-9cc4-d1aa-608c-73ebe9e2f7a3@yandex-team.ru>

On Thu, May 23, 2019 at 07:22:17PM +0300, Konstantin Khlebnikov wrote:
> On 22.05.2019 18:52, Christian Brauner wrote:> This adds the close_range() syscall. It allows to efficiently close a range
> > of file descriptors up to all file descriptors of a calling task.
> >
> > The syscall came up in a recent discussion around the new mount API and
> > making new file descriptor types cloexec by default. During this
> > discussion, Al suggested the close_range() syscall (cf. [1]). Note, a
> > syscall in this manner has been requested by various people over time.
> >
> > First, it helps to close all file descriptors of an exec()ing task. This
> > can be done safely via (quoting Al's example from [1] verbatim):
> >
> >          /* that exec is sensitive */
> >          unshare(CLONE_FILES);
> >          /* we don't want anything past stderr here */
> >          close_range(3, ~0U);
> >          execve(....);
> >
> > The code snippet above is one way of working around the problem that file
> > descriptors are not cloexec by default. This is aggravated by the fact that
> > we can't just switch them over without massively regressing userspace. For
> > a whole class of programs having an in-kernel method of closing all file
> > descriptors is very helpful (e.g. demons, service managers, programming
> > language standard libraries, container managers etc.).
> > (Please note, unshare(CLONE_FILES) should only be needed if the calling
> >   task is multi-threaded and shares the file descriptor table with another
> >   thread in which case two threads could race with one thread allocating
> >   file descriptors and the other one closing them via close_range(). For the
> >   general case close_range() before the execve() is sufficient.)
> >
> > Second, it allows userspace to avoid implementing closing all file
> > descriptors by parsing through /proc/<pid>/fd/* and calling close() on each
> > file descriptor. From looking at various large(ish) userspace code bases
> > this or similar patterns are very common in:
> > - service managers (cf. [4])
> > - libcs (cf. [6])
> > - container runtimes (cf. [5])
> > - programming language runtimes/standard libraries
> >    - Python (cf. [2])
> >    - Rust (cf. [7], [8])
> > As Dmitry pointed out there's even a long-standing glibc bug about missing
> > kernel support for this task (cf. [3]).
> > In addition, the syscall will also work for tasks that do not have procfs
> > mounted and on kernels that do not have procfs support compiled in. In such
> > situations the only way to make sure that all file descriptors are closed
> > is to call close() on each file descriptor up to UINT_MAX or RLIMIT_NOFILE,
> > OPEN_MAX trickery (cf. comment [8] on Rust).
> >
> > The performance is striking. For good measure, comparing the following
> > simple close_all_fds() userspace implementation that is essentially just
> > glibc's version in [6]:
> >
> > static int close_all_fds(void)
> > {
> >          int dir_fd;
> >          DIR *dir;
> >          struct dirent *direntp;
> >
> >          dir = opendir("/proc/self/fd");
> >          if (!dir)
> >                  return -1;
> >          dir_fd = dirfd(dir);
> >          while ((direntp = readdir(dir))) {
> >                  int fd;
> >                  if (strcmp(direntp->d_name, ".") == 0)
> >                          continue;
> >                  if (strcmp(direntp->d_name, "..") == 0)
> >                          continue;
> >                  fd = atoi(direntp->d_name);
> >                  if (fd == dir_fd || fd == 0 || fd == 1 || fd == 2)
> >                          continue;
> >                  close(fd);
> >          }
> >          closedir(dir);
> >          return 0;
> > }
> >
> > to close_range() yields:
> > 1. closing 4 open files:
> >     - close_all_fds(): ~280 us
> >     - close_range():    ~24 us
> >
> > 2. closing 1000 open files:
> >     - close_all_fds(): ~5000 us
> >     - close_range():   ~800 us
> >
> > close_range() is designed to allow for some flexibility. Specifically, it
> > does not simply always close all open file descriptors of a task. Instead,
> > callers can specify an upper bound.
> > This is e.g. useful for scenarios where specific file descriptors are
> > created with well-known numbers that are supposed to be excluded from
> > getting closed.
> > For extra paranoia close_range() comes with a flags argument. This can e.g.
> > be used to implement extension. Once can imagine userspace wanting to stop
> > at the first error instead of ignoring errors under certain circumstances.
> 
> > There might be other valid ideas in the future. In any case, a flag
> > argument doesn't hurt and keeps us on the safe side.
> 
> Here is another strange but real-live scenario: crash handler for dumping core.
> 
> If applications has network connections it would be better to close them all,
> otherwise clients will wait until end of dumping process or timeout.
> Also closing normal files might be a good idea for releasing locks.
> 
> But simple closing might race with other threads - closed fd will be reused
> while some code still thinks it refers to original file.
> 
> Our solution closes files without freeing fd: it opens /dev/null and
> replaces all opened descriptors using dup2.
> 
> So, special flag for close_range() could close files without clearing bitmap.
> Effect should be the same - fd wouldn't be reused.
> 
> Actually two flags for two phases: closing files and releasing fd.

Konstantin, I'm sorry, I totally missed that part of your mail
yesterday.
Without speaking to the feasibility of this it's at least a good
illustration that people really do have the possible need for a flag
argument.

Thanks!
Christian

^ permalink raw reply

* Re: [PATCH v1 1/2] open: add close_range()
From: Christian Brauner @ 2019-05-24  9:57 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linux-ia64, Linux-sh list, Konstantin Khlebnikov, Oleg Nesterov,
	David Howells, open list:KERNEL SELFTEST FRAMEWORK, sparclinux,
	Shuah Khan, linux-arch, linux-s390, Miklos Szeredi,
	the arch/x86 maintainers, Linus Torvalds, linux-mips,
	linux-xtensa, Todd Kjos, Jann Horn, linux-m68k, Al Viro,
	Thomas Gleixner, Dmitry V. Levin, Linux ARM, Florian Weimer,
	Parisc List, Linux API, Linux Kernel Mailing List, alpha,
	Linux FS-devel Mailing List, linuxppc-dev
In-Reply-To: <CAK8P3a26uvqmExJZsezhB+cp2ADM0Ai9jVUKWOFM6kg848bCKg@mail.gmail.com>

On Fri, May 24, 2019 at 09:43:53AM +0200, Arnd Bergmann wrote:
> On Thu, May 23, 2019 at 6:33 PM Christian Brauner <christian@brauner.io> wrote:
> > On Thu, May 23, 2019 at 07:22:17PM +0300, Konstantin Khlebnikov wrote:
> > > On 22.05.2019 18:52, Christian Brauner wrote:> This adds the close_range() syscall. It allows to efficiently close a range
> > > >   22 files changed, 100 insertions(+), 9 deletions(-)
> > > >
> > >
> > > It would be better to split arch/ wiring into separate patch for better readability.
> >
> > Ok. You mean only do x86 - seems to be the standard - and then move the
> > others into a separate patch? Doesn't seem worth to have a patch
> > per-arch, I'd think.
> 
> I think I would prefer the first patch to just add the call without wiring it up
> anywhere, and a second patch do add it on all architectures including x86.

I've split this into two patches and also bumped arm64
__NR_compat_syscalls that I've missed before as you mentioned!

Thanks!
Christian

^ permalink raw reply

* Re: [PATCH v1 1/2] open: add close_range()
From: Arnd Bergmann @ 2019-05-24  7:43 UTC (permalink / raw)
  To: Christian Brauner
  Cc: linux-ia64, Linux-sh list, Konstantin Khlebnikov, Oleg Nesterov,
	David Howells, open list:KERNEL SELFTEST FRAMEWORK, sparclinux,
	Shuah Khan, linux-arch, linux-s390, Miklos Szeredi,
	the arch/x86 maintainers, Linus Torvalds, linux-mips,
	linux-xtensa, Todd Kjos, Jann Horn, linux-m68k, Al Viro,
	Thomas Gleixner, Dmitry V. Levin, Linux ARM, Florian Weimer,
	Parisc List, Linux API, Linux Kernel Mailing List, alpha,
	Linux FS-devel Mailing List, linuxppc-dev
In-Reply-To: <20190523163345.q5ynd2ytk7nxcvqf@brauner.io>

On Thu, May 23, 2019 at 6:33 PM Christian Brauner <christian@brauner.io> wrote:
> On Thu, May 23, 2019 at 07:22:17PM +0300, Konstantin Khlebnikov wrote:
> > On 22.05.2019 18:52, Christian Brauner wrote:> This adds the close_range() syscall. It allows to efficiently close a range
> > >   22 files changed, 100 insertions(+), 9 deletions(-)
> > >
> >
> > It would be better to split arch/ wiring into separate patch for better readability.
>
> Ok. You mean only do x86 - seems to be the standard - and then move the
> others into a separate patch? Doesn't seem worth to have a patch
> per-arch, I'd think.

I think I would prefer the first patch to just add the call without wiring it up
anywhere, and a second patch do add it on all architectures including x86.

     Arnd

^ permalink raw reply

* Re: [BISECTED] kexec regression on PowerBook G4
From: Christophe Leroy @ 2019-05-24  7:40 UTC (permalink / raw)
  To: Aaro Koskinen; +Cc: linuxppc-dev
In-Reply-To: <20190524073615.GD5234@darkstar.musicnaut.iki.fi>



Le 24/05/2019 à 09:36, Aaro Koskinen a écrit :
> Hi,
> 
> On Fri, May 24, 2019 at 08:08:36AM +0200, Christophe Leroy wrote:
>>> Le 24/05/2019 à 00:23, Aaro Koskinen a écrit :
>>>> Unfortunately still no luck... The crash is pretty much the same with
>>>> both
>>>> changes.
>>>
>>> Right. In fact change_page_attr() does nothing because this part of RAM is
>>> mapped by DBATs so v_block_mapped() returns not NULL.
>>>
>>> So, we have to set an IBAT for this area. I'll try and send you a new
>>> patch for that before noon (CET).
>>>
>>
>> patch sent out. In the patch I have also added a printk to print the buffer
>> address, so if the problem still occurs, we'll know if the problem is really
>> at the address of the buffer or if we are wrong from the beginning.
> 
> Reboot code buffer at ef0c3000
> Bye!
> BUG: Unable to handle kernel instruction fetch
> Faulting instruction address: 0xef0c3000
> 

Oops, I forgot to call update_bats() after setibat().

Can you add it and retry ?

Thanks
Christophe

^ permalink raw reply

* [Bug 203699] Kernel 5.2-rc1 fails to boot on a Mac Mini G4: dt_headr_start=0x01501000
From: bugzilla-daemon @ 2019-05-24  7:36 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-203699-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=203699

--- Comment #1 from Mathieu Malaterre (mathieu.malaterre@gmail.com) ---
Created attachment 282929
  --> https://bugzilla.kernel.org/attachment.cgi?id=282929&action=edit
Fix symptoms

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: [BISECTED] kexec regression on PowerBook G4
From: Aaro Koskinen @ 2019-05-24  7:36 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: linuxppc-dev
In-Reply-To: <ed5f9739-7661-b32a-cb8f-157b516baafa@c-s.fr>

Hi,

On Fri, May 24, 2019 at 08:08:36AM +0200, Christophe Leroy wrote:
> >Le 24/05/2019 à 00:23, Aaro Koskinen a écrit :
> >>Unfortunately still no luck... The crash is pretty much the same with
> >>both
> >>changes.
> >
> >Right. In fact change_page_attr() does nothing because this part of RAM is
> >mapped by DBATs so v_block_mapped() returns not NULL.
> >
> >So, we have to set an IBAT for this area. I'll try and send you a new
> >patch for that before noon (CET).
> >
> 
> patch sent out. In the patch I have also added a printk to print the buffer
> address, so if the problem still occurs, we'll know if the problem is really
> at the address of the buffer or if we are wrong from the beginning.

Reboot code buffer at ef0c3000
Bye!
BUG: Unable to handle kernel instruction fetch
Faulting instruction address: 0xef0c3000

A.

^ permalink raw reply

* Re: Failure to boot G4: dt_headr_start=0x01501000
From: Mathieu Malaterre @ 2019-05-24  7:34 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: linuxppc-dev
In-Reply-To: <225acfb1-2eed-ed7b-fd82-732a3f8b746a@c-s.fr>

On Thu, May 23, 2019 at 8:12 PM Christophe Leroy
<christophe.leroy@c-s.fr> wrote:
>
>
>
> On 05/23/2019 10:16 AM, Mathieu Malaterre wrote:
> > On Thu, May 23, 2019 at 11:45 AM Christophe Leroy
> > <christophe.leroy@c-s.fr> wrote:
> >>
> >>
> >>
> >> Le 23/05/2019 à 10:53, Mathieu Malaterre a écrit :
> >>>
> >>> I confirm powerpc/merge does not boot for me (same config). Commit id:
> >>>
> >>> a27eaa62326d (powerpc/merge) Automatic merge of branches 'master',
> >>> 'next' and 'fixes' into merge
> >>
> >> I see in the config you sent me that you have selected CONFIG_KASAN,
> >> which is a big new stuff.
> >>
> >> Can you try without it ?
> >
> > With same config but CONFIG_KASAN=n (on top of a27eaa62326d), I can
> > reproduce the boot failure (no change).
> >
> > Time for bisect ?
> >
>
> I found the issue. In order to be able to support KASAN, the setup of
> segments have moved earlier in the boot. Your problem is a side effect
> of this change.
> Function setup_disp_bat() is supposed to setup BAT3 for btext data.
> But setup_disp_bat() rely on someone setting in disp_BAT the values to
> be loaded into BATs. This is done by btext_prepare_BAT() which is called
> by bootx_init().
> The problem is that bootx_init() is never called, so setup_disp_bat()
> does nothing and the access to btext data is possible because the
> bootloader has set an entry for it in the hash table.
>
> But by setting up the segment earlier, we break the bootloader hash
> table, which shouldn't be an issue if the BATs had been set properly as
> expected.
>
> The problematic commit is 215b823707ce ("powerpc/32s: set up an early
> static hash table for KASAN)"
>
> Here is a dirty fix that works for me when CONFIG_KASAN is NOT set.
> Of course, the real fix has to be to setup the BATs properly, but I
> won't have time to look at that before June. Maybe you can ?
>
> diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
> index 755fab9641d6..fba16970c028 100644
> --- a/arch/powerpc/kernel/head_32.S
> +++ b/arch/powerpc/kernel/head_32.S
> @@ -162,7 +162,6 @@ __after_mmu_off:
>         bl      flush_tlbs
>
>         bl      initial_bats
> -       bl      load_segment_registers
>   #ifdef CONFIG_KASAN
>         bl      early_hash_table
>   #endif
> @@ -920,6 +919,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
>         RFI
>   /* Load up the kernel context */
>   2:    bl      load_up_mmu
> +       bl      load_segment_registers
>
>   #ifdef CONFIG_BDI_SWITCH
>         /* Add helper information for the Abatron bdiGDB debugger.

With above patch I can boot powerpc/merge just fine. Thanks Christophe !

$ dmesg | head
[    0.000000] printk: bootconsole [udbg0] enabled
[    0.000000] Total memory = 512MB; using 1024kB for hash table
[    0.000000] Linux version 5.1.0+ (malat@debian.org) (gcc version
8.3.0 (Debian 8.3.0-7)) #12 Thu May 23 11:56:33 UTC 2019


> Christophe

^ permalink raw reply

* [Bug 203699] New: Kernel 5.2-rc1 fails to boot on a Mac Mini G4: dt_headr_start=0x01501000
From: bugzilla-daemon @ 2019-05-24  7:10 UTC (permalink / raw)
  To: linuxppc-dev

https://bugzilla.kernel.org/show_bug.cgi?id=203699

            Bug ID: 203699
           Summary: Kernel 5.2-rc1 fails to boot on a Mac Mini G4:
                    dt_headr_start=0x01501000
           Product: Platform Specific/Hardware
           Version: 2.5
    Kernel Version: Kernel 5.2
          Hardware: PPC-32
                OS: Linux
              Tree: Mainline
            Status: NEW
          Severity: blocking
          Priority: P1
         Component: PPC-32
          Assignee: platform_ppc-32@kernel-bugs.osdl.org
          Reporter: mathieu.malaterre@gmail.com
        Regression: No

From: Christophe Leroy <>

I found the issue. In order to be able to support KASAN, the setup of 
segments have moved earlier in the boot. Your problem is a side effect 
of this change.
Function setup_disp_bat() is supposed to setup BAT3 for btext data.
But setup_disp_bat() rely on someone setting in disp_BAT the values to 
be loaded into BATs. This is done by btext_prepare_BAT() which is called 
by bootx_init().
The problem is that bootx_init() is never called, so setup_disp_bat() 
does nothing and the access to btext data is possible because the 
bootloader has set an entry for it in the hash table.

But by setting up the segment earlier, we break the bootloader hash 
table, which shouldn't be an issue if the BATs had been set properly as 
expected.

The problematic commit is 215b823707ce ("powerpc/32s: set up an early 
static hash table for KASAN)"

Here is a dirty fix that works for me when CONFIG_KASAN is NOT set.
Of course, the real fix has to be to setup the BATs properly, but I 
won't have time to look at that before June. Maybe you can ?

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: [PATCH] mm: add account_locked_vm utility function
From: Alexey Kardashevskiy @ 2019-05-24  6:43 UTC (permalink / raw)
  To: Daniel Jordan
  Cc: Mark Rutland, Davidlohr Bueso, kvm, Alan Tull, linux-fpga,
	linux-kernel, kvm-ppc, linux-mm, Alex Williamson, Jason Gunthorpe,
	Moritz Fischer, Steve Sistare, akpm, linuxppc-dev,
	Christoph Lameter, Wu Hao
In-Reply-To: <20190520153020.mzvjsjwefwxz6cau@ca-dmjordan1.us.oracle.com>



On 21/05/2019 01:30, Daniel Jordan wrote:
> On Mon, May 20, 2019 at 04:19:34PM +1000, Alexey Kardashevskiy wrote:
>> On 04/05/2019 06:16, Daniel Jordan wrote:
>>> locked_vm accounting is done roughly the same way in five places, so
>>> unify them in a helper.  Standardize the debug prints, which vary
>>> slightly.
>>
>> And I rather liked that prints were different and tell precisely which
>> one of three each printk is.
> 
> I'm not following.  One of three...callsites?  But there were five callsites.


Well, 3 of them are mine, I was referring to them :)


> Anyway, I added a _RET_IP_ to the debug print so you can differentiate.


I did not know that existed, cool!


> 
>> I commented below but in general this seems working.
>>
>> Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> 
> Thanks!  And for the review as well.
> 
>>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>>> index 6b64e45a5269..d39a1b830d82 100644
>>> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
>>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>>> @@ -34,49 +35,13 @@
>>>  static void tce_iommu_detach_group(void *iommu_data,
>>>  		struct iommu_group *iommu_group);
>>>  
>>> -static long try_increment_locked_vm(struct mm_struct *mm, long npages)
>>> +static int tce_account_locked_vm(struct mm_struct *mm, unsigned long npages,
>>> +				 bool inc)
>>>  {
>>> -	long ret = 0, locked, lock_limit;
>>> -
>>>  	if (WARN_ON_ONCE(!mm))
>>>  		return -EPERM;
>>
>>
>> If this WARN_ON is the only reason for having tce_account_locked_vm()
>> instead of calling account_locked_vm() directly, you can then ditch the
>> check as I have never ever seen this triggered.
> 
> Great, will do.
> 
>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>> index d0f731c9920a..15ac76171ccd 100644
>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>> @@ -273,25 +273,14 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
>>>  		return -ESRCH; /* process exited */
>>>  
>>>  	ret = down_write_killable(&mm->mmap_sem);
>>> -	if (!ret) {
>>> -		if (npage > 0) {
>>> -			if (!dma->lock_cap) {
>>> -				unsigned long limit;
>>> -
>>> -				limit = task_rlimit(dma->task,
>>> -						RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>>> -
>>> -				if (mm->locked_vm + npage > limit)
>>> -					ret = -ENOMEM;
>>> -			}
>>> -		}
>>> +	if (ret)
>>> +		goto out;
>>
>>
>> A single "goto" to jump just 3 lines below seems unnecessary.
> 
> No strong preference here, I'll take out the goto.
> 
>>> +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
>>> +			struct task_struct *task, bool bypass_rlim)
>>> +{
>>> +	unsigned long locked_vm, limit;
>>> +	int ret = 0;
>>> +
>>> +	locked_vm = mm->locked_vm;
>>> +	if (inc) {
>>> +		if (!bypass_rlim) {
>>> +			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>>> +			if (locked_vm + pages > limit) {
>>> +				ret = -ENOMEM;
>>> +				goto out;
>>> +			}
>>> +		}
>>
>> Nit:
>>
>> if (!ret)
>>
>> and then you don't need "goto out".
> 
> Ok, sure.
> 
>>> +		mm->locked_vm = locked_vm + pages;
>>> +	} else {
>>> +		WARN_ON_ONCE(pages > locked_vm);
>>> +		mm->locked_vm = locked_vm - pages;
>>
>>
>> Can go negative here. Not a huge deal but inaccurate imo.
> 
> I hear you, but setting a negative value to zero, as we had done previously,
> doesn't make much sense to me.


Ok then. I have not seen these WARN_ON for a very long time anyway.


-- 
Alexey

^ permalink raw reply

* Re: [BISECTED] kexec regression on PowerBook G4
From: Christophe Leroy @ 2019-05-24  6:08 UTC (permalink / raw)
  To: Aaro Koskinen; +Cc: linuxppc-dev
In-Reply-To: <334b0aca-3731-5754-bd26-af147991345d@c-s.fr>



Le 24/05/2019 à 07:46, Christophe Leroy a écrit :
> Hi
> 
> Le 24/05/2019 à 00:23, Aaro Koskinen a écrit :
>> Hi,
>>
>> On Thu, May 23, 2019 at 08:58:11PM +0200, Christophe Leroy wrote:
>>> Le 23/05/2019 à 19:27, Aaro Koskinen a écrit :
>>>> On Thu, May 23, 2019 at 07:33:38AM +0200, Christophe Leroy wrote:
>>>>> Ok, the Oops confirms that the error is due to executing the kexec 
>>>>> control
>>>>> code which is located outside the kernel text area.
>>>>>
>>>>> My yesterday's proposed change doesn't work because on book3S/32, NX
>>>>> protection is based on setting segments to NX, and using IBATs for 
>>>>> kernel
>>>>> text.
>>>>>
>>>>> Can you try the patch I sent out a few minutes ago ?
>>>>> (https://patchwork.ozlabs.org/patch/1103827/)
>>>>
>>>> It now crashes with "BUG: Unable to handle kernel instruction fetch"
>>>> and the faulting address is 0xef13a000.
>>>
>>> Ok.
>>>
>>> Can you try with both changes at the same time, ie the mtsrin(...) 
>>> and the
>>> change_page_attr() ?
>>>
>>> I suspect that allthough the HW is not able to check EXEC flag, the 
>>> SW will
>>> check it before loading the hash entry.
>>
>> Unfortunately still no luck... The crash is pretty much the same with 
>> both
>> changes.
> 
> Right. In fact change_page_attr() does nothing because this part of RAM 
> is mapped by DBATs so v_block_mapped() returns not NULL.
> 
> So, we have to set an IBAT for this area. I'll try and send you a new 
> patch for that before noon (CET).
> 

patch sent out. In the patch I have also added a printk to print the 
buffer address, so if the problem still occurs, we'll know if the 
problem is really at the address of the buffer or if we are wrong from 
the beginning.

Christophe

^ permalink raw reply

* [RFC PATCH v2] powerpc: fix kexec failure on book3s/32
From: Christophe Leroy @ 2019-05-24  6:05 UTC (permalink / raw)
  To: Aaro Koskinen; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20190522211724.GC456@darkstar.musicnaut.iki.fi>

Fixes: 63b2bc619565 ("powerpc/mm/32s: Use BATs for STRICT_KERNEL_RWX")
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/kernel/machine_kexec_32.c | 8 ++++++++
 arch/powerpc/mm/book3s32/mmu.c         | 7 +++++--
 arch/powerpc/mm/mmu_decl.h             | 2 ++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c
index affe5dcce7f4..83e61a8f8468 100644
--- a/arch/powerpc/kernel/machine_kexec_32.c
+++ b/arch/powerpc/kernel/machine_kexec_32.c
@@ -15,6 +15,7 @@
 #include <asm/cacheflush.h>
 #include <asm/hw_irq.h>
 #include <asm/io.h>
+#include <mm/mmu_decl.h>
 
 typedef void (*relocate_new_kernel_t)(
 				unsigned long indirection_page,
@@ -35,6 +36,8 @@ void default_machine_kexec(struct kimage *image)
 	unsigned long page_list;
 	unsigned long reboot_code_buffer, reboot_code_buffer_phys;
 	relocate_new_kernel_t rnk;
+	unsigned long bat_size = 128 << 10;
+	unsigned long bat_mask = ~(bat_size - 1);
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
@@ -54,6 +57,11 @@ void default_machine_kexec(struct kimage *image)
 	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
 						relocate_new_kernel_size);
 
+	printk(KERN_INFO "Reboot code buffer at %lx\n", reboot_code_buffer);
+	mtsrin(mfsrin(reboot_code_buffer) & ~SR_NX, reboot_code_buffer);
+	setibat(7, reboot_code_buffer & bat_mask, reboot_code_buffer_phys & bat_mask,
+		bat_size, PAGE_KERNEL_TEXT);
+
 	flush_icache_range(reboot_code_buffer,
 				reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE);
 	printk(KERN_INFO "Bye!\n");
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index fc073cb2c517..7124700edb0f 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -124,8 +124,8 @@ static unsigned int block_size(unsigned long base, unsigned long top)
  * of 2 between 128k and 256M.
  * Only for 603+ ...
  */
-static void setibat(int index, unsigned long virt, phys_addr_t phys,
-		    unsigned int size, pgprot_t prot)
+void setibat(int index, unsigned long virt, phys_addr_t phys,
+	     unsigned int size, pgprot_t prot)
 {
 	unsigned int bl = (size >> 17) - 1;
 	int wimgxpp;
@@ -197,6 +197,9 @@ void mmu_mark_initmem_nx(void)
 	if (cpu_has_feature(CPU_FTR_601))
 		return;
 
+	if (IS_ENABLED(CONFIG_KEXEC))
+		nb--;
+
 	for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) {
 		size = block_size(base, top);
 		setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 7bac0aa2026a..478584d50cf2 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -103,6 +103,8 @@ void print_system_hash_info(void);
 extern void mapin_ram(void);
 extern void setbat(int index, unsigned long virt, phys_addr_t phys,
 		   unsigned int size, pgprot_t prot);
+void setibat(int index, unsigned long virt, phys_addr_t phys,
+	     unsigned int size, pgprot_t prot);
 
 extern int __map_without_bats;
 extern unsigned int rtas_data, rtas_size;
-- 
2.13.3


^ permalink raw reply related

* Re: [BISECTED] kexec regression on PowerBook G4
From: Christophe Leroy @ 2019-05-24  5:46 UTC (permalink / raw)
  To: Aaro Koskinen; +Cc: linuxppc-dev
In-Reply-To: <20190523222344.GC5234@darkstar.musicnaut.iki.fi>

Hi

Le 24/05/2019 à 00:23, Aaro Koskinen a écrit :
> Hi,
> 
> On Thu, May 23, 2019 at 08:58:11PM +0200, Christophe Leroy wrote:
>> Le 23/05/2019 à 19:27, Aaro Koskinen a écrit :
>>> On Thu, May 23, 2019 at 07:33:38AM +0200, Christophe Leroy wrote:
>>>> Ok, the Oops confirms that the error is due to executing the kexec control
>>>> code which is located outside the kernel text area.
>>>>
>>>> My yesterday's proposed change doesn't work because on book3S/32, NX
>>>> protection is based on setting segments to NX, and using IBATs for kernel
>>>> text.
>>>>
>>>> Can you try the patch I sent out a few minutes ago ?
>>>> (https://patchwork.ozlabs.org/patch/1103827/)
>>>
>>> It now crashes with "BUG: Unable to handle kernel instruction fetch"
>>> and the faulting address is 0xef13a000.
>>
>> Ok.
>>
>> Can you try with both changes at the same time, ie the mtsrin(...) and the
>> change_page_attr() ?
>>
>> I suspect that allthough the HW is not able to check EXEC flag, the SW will
>> check it before loading the hash entry.
> 
> Unfortunately still no luck... The crash is pretty much the same with both
> changes.

Right. In fact change_page_attr() does nothing because this part of RAM 
is mapped by DBATs so v_block_mapped() returns not NULL.

So, we have to set an IBAT for this area. I'll try and send you a new 
patch for that before noon (CET).

Christophe

^ permalink raw reply

* [PATCH] powerpc/powernv: Update firmware archaeology around OPAL_HANDLE_HMI
From: Stewart Smith @ 2019-05-24  5:09 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Stewart Smith

The first machines to ship with OPAL firmware all got firmware updates
that have the new call, but just in case someone is foolish enough to
believe the first 4 months of firmware is the best, we keep this code
around.

Comment is updated to not refer to late 2014 as recent or the future.

Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
---
 arch/powerpc/platforms/powernv/opal.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index f2b063b027f0..89b6ddc3ed38 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -206,16 +206,18 @@ static int __init opal_register_exception_handlers(void)
 	glue = 0x7000;
 
 	/*
-	 * Check if we are running on newer firmware that exports
-	 * OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to patch
-	 * the HMI interrupt and we catch it directly in Linux.
+	 * Only ancient OPAL firmware requires this.
+	 * Specifically, firmware from FW810.00 (released June 2014)
+	 * through FW810.20 (Released October 2014).
 	 *
-	 * For older firmware (i.e currently released POWER8 System Firmware
-	 * as of today <= SV810_087), we fallback to old behavior and let OPAL
-	 * patch the HMI vector and handle it inside OPAL firmware.
+	 * Check if we are running on newer (post Oct 2014) firmware that
+	 * exports the OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to
+	 * patch the HMI interrupt and we catch it directly in Linux.
 	 *
-	 * For newer firmware (in development/yet to be released) we will
-	 * start catching/handling HMI directly in Linux.
+	 * For older firmware (i.e < FW810.20), we fallback to old behavior and
+	 * let OPAL patch the HMI vector and handle it inside OPAL firmware.
+	 *
+	 * For newer firmware we catch/handle the HMI directly in Linux.
 	 */
 	if (!opal_check_token(OPAL_HANDLE_HMI)) {
 		pr_info("Old firmware detected, OPAL handles HMIs.\n");
@@ -225,6 +227,11 @@ static int __init opal_register_exception_handlers(void)
 		glue += 128;
 	}
 
+	/*
+	 * Only applicable to ancient firmware, all modern
+	 * (post March 2015/skiboot 5.0) firmware will just return
+	 * OPAL_UNSUPPORTED.
+	 */
 	opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue);
 #endif
 
-- 
2.21.0


^ permalink raw reply related

* [RFC] powerpc/xmon: restrict when kernel is locked down
From: Christopher M. Riedl @ 2019-05-24  3:30 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Christopher M. Riedl, ajd

Xmon should be either fully or partially disabled depending on the
kernel lockdown state.

Put xmon into read-only mode for lockdown=integrity and completely
disable xmon when lockdown=confidentiality. Xmon checks the lockdown
state and takes appropriate action:

 (1) during xmon_setup to prevent early xmon'ing

 (2) when triggered via sysrq

 (3) when toggled via debugfs

 (4) when triggered via a previously enabled breakpoint

The following lockdown state transitions are handled:

 (1) lockdown=none -> lockdown=integrity
     clear all breakpoints, set xmon read-only mode

 (2) lockdown=none -> lockdown=confidentiality
     clear all breakpoints, prevent re-entry into xmon

 (3) lockdown=integrity -> lockdown=confidentiality
     prevent re-entry into xmon

Suggested-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
---
Applies on top of this series:
https://patchwork.kernel.org/patch/10870173/

I've done some limited testing using a single CPU QEMU config.

 arch/powerpc/xmon/xmon.c | 56 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 3e7be19aa208..8c4a5a0c28f0 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -191,6 +191,9 @@ static void dump_tlb_44x(void);
 static void dump_tlb_book3e(void);
 #endif
 
+static void clear_all_bpt(void);
+static void xmon_init(int);
+
 #ifdef CONFIG_PPC64
 #define REG		"%.16lx"
 #else
@@ -291,6 +294,39 @@ Commands:\n\
   zh	halt\n"
 ;
 
+#ifdef CONFIG_LOCK_DOWN_KERNEL
+static bool xmon_check_lockdown(void)
+{
+	static bool lockdown = false;
+
+	if (!lockdown) {
+		lockdown = kernel_is_locked_down("Using xmon",
+						 LOCKDOWN_CONFIDENTIALITY);
+		if (lockdown) {
+			printf("xmon: Disabled by strict kernel lockdown\n");
+			xmon_on = 0;
+			xmon_init(0);
+		}
+	}
+
+	if (!xmon_is_ro) {
+		xmon_is_ro = kernel_is_locked_down("Using xmon write-access",
+						   LOCKDOWN_INTEGRITY);
+		if (xmon_is_ro) {
+			printf("xmon: Read-only due to kernel lockdown\n");
+			clear_all_bpt();
+		}
+	}
+
+	return lockdown;
+}
+#else
+inline static bool xmon_check_lockdown(void)
+{
+	return false;
+}
+#endif /* CONFIG_LOCK_DOWN_KERNEL */
+
 static struct pt_regs *xmon_regs;
 
 static inline void sync(void)
@@ -708,6 +744,9 @@ static int xmon_bpt(struct pt_regs *regs)
 	struct bpt *bp;
 	unsigned long offset;
 
+	if (xmon_check_lockdown())
+		return 0;
+
 	if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) != (MSR_IR|MSR_64BIT))
 		return 0;
 
@@ -739,6 +778,9 @@ static int xmon_sstep(struct pt_regs *regs)
 
 static int xmon_break_match(struct pt_regs *regs)
 {
+	if (xmon_check_lockdown())
+		return 0;
+
 	if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) != (MSR_IR|MSR_64BIT))
 		return 0;
 	if (dabr.enabled == 0)
@@ -749,6 +791,9 @@ static int xmon_break_match(struct pt_regs *regs)
 
 static int xmon_iabr_match(struct pt_regs *regs)
 {
+	if (xmon_check_lockdown())
+		return 0;
+
 	if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) != (MSR_IR|MSR_64BIT))
 		return 0;
 	if (iabr == NULL)
@@ -3742,6 +3787,9 @@ static void xmon_init(int enable)
 #ifdef CONFIG_MAGIC_SYSRQ
 static void sysrq_handle_xmon(int key)
 {
+	if (xmon_check_lockdown())
+		return;
+
 	/* ensure xmon is enabled */
 	xmon_init(1);
 	debugger(get_irq_regs());
@@ -3763,7 +3811,6 @@ static int __init setup_xmon_sysrq(void)
 device_initcall(setup_xmon_sysrq);
 #endif /* CONFIG_MAGIC_SYSRQ */
 
-#ifdef CONFIG_DEBUG_FS
 static void clear_all_bpt(void)
 {
 	int i;
@@ -3785,8 +3832,12 @@ static void clear_all_bpt(void)
 	printf("xmon: All breakpoints cleared\n");
 }
 
+#ifdef CONFIG_DEBUG_FS
 static int xmon_dbgfs_set(void *data, u64 val)
 {
+	if (xmon_check_lockdown())
+		return 0;
+
 	xmon_on = !!val;
 	xmon_init(xmon_on);
 
@@ -3845,6 +3896,9 @@ early_param("xmon", early_parse_xmon);
 
 void __init xmon_setup(void)
 {
+	if (xmon_check_lockdown())
+		return;
+
 	if (xmon_on)
 		xmon_init(1);
 	if (xmon_early)
-- 
2.21.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox