* [RFC PATCH 4/6] mm/mremap: Use mmu gather interface instead of flush_tlb_range
From: Aneesh Kumar K.V @ 2021-02-02 9:11 UTC (permalink / raw)
To: linux-mm, akpm; +Cc: peterz, kaleshsingh, Aneesh Kumar K.V, joel, linuxppc-dev
In-Reply-To: <20210202091116.196134-1-aneesh.kumar@linux.ibm.com>
Some architectures have the concept of a page walk cache, and only the mmu gather
interface supports flushing it. A fast mremap that involves moving page
table pages instead of copying pte entries should flush the page walk cache, since
the old translation cache is no longer valid. Hence switch to mmu gather to flush
the TLB and mark tlb.freed_tables = 1. No page table pages need to be freed here.
With this, the TLB flush is done outside the page table lock (ptl).
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
mm/mremap.c | 33 +++++++++++++++++++++++++++++----
1 file changed, 29 insertions(+), 4 deletions(-)
diff --git a/mm/mremap.c b/mm/mremap.c
index 54fd2302b99d..14778d215011 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -217,6 +217,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
{
spinlock_t *old_ptl, *new_ptl;
struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
pmd_t pmd;
/*
@@ -245,11 +246,12 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
+ tlb_gather_mmu(&tlb, mm, old_addr, PMD_SIZE);
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+ old_ptl = pmd_lock(mm, old_pmd);
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -258,13 +260,23 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pmd = *old_pmd;
pmd_clear(old_pmd);
+ /*
+ * Mark the range. We are not freeing page table pages nor
+ * regular pages. Hence we don't need to call tlb_remove_table()
+ * or tlb_remove_page().
+ */
+ tlb_flush_pte_range(&tlb, old_addr, PMD_SIZE);
+ tlb.freed_tables = 1;
VM_BUG_ON(!pmd_none(*new_pmd));
pmd_populate(mm, new_pmd, (pgtable_t)pmd_page_vaddr(pmd));
- flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
+ /*
+ * This will invalidate both the old TLB and page table walk caches.
+ */
+ tlb_finish_mmu(&tlb, old_addr, PMD_SIZE);
return true;
}
@@ -283,6 +295,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
{
spinlock_t *old_ptl, *new_ptl;
struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
pud_t pud;
/*
@@ -292,11 +305,12 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pud_none(*new_pud)))
return false;
+ tlb_gather_mmu(&tlb, mm, old_addr, PUD_SIZE);
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pud_lock(vma->vm_mm, old_pud);
+ old_ptl = pud_lock(mm, old_pud);
new_ptl = pud_lockptr(mm, new_pud);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -305,14 +319,25 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
pud = *old_pud;
pud_clear(old_pud);
+ /*
+ * Mark the range. We are not freeing page table pages nor
+ * regular pages. Hence we don't need to call tlb_remove_table()
+ * or tlb_remove_page().
+ */
+ tlb_flush_pte_range(&tlb, old_addr, PUD_SIZE);
+ tlb.freed_tables = 1;
VM_BUG_ON(!pud_none(*new_pud));
pud_populate(mm, new_pud, (pmd_t *)pud_page_vaddr(pud));
- flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
+
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
+ /*
+ * This will invalidate both the old TLB and page table walk caches.
+ */
+ tlb_finish_mmu(&tlb, old_addr, PUD_SIZE);
return true;
}
#else
--
2.29.2
^ permalink raw reply related
* [RFC PATCH 1/6] selftest/mremap_test: Update the test to handle pagesize other than 4K
From: Aneesh Kumar K.V @ 2021-02-02 9:11 UTC (permalink / raw)
To: linux-mm, akpm; +Cc: peterz, kaleshsingh, Aneesh Kumar K.V, joel, linuxppc-dev
Instead of hardcoding a 4K page size, fetch it using sysconf(). For the performance
measurements, the test still assumes that 2M and 1G are the hugepage sizes.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
tools/testing/selftests/vm/mremap_test.c | 113 ++++++++++++-----------
1 file changed, 61 insertions(+), 52 deletions(-)
diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c
index 9c391d016922..c9a5461eb786 100644
--- a/tools/testing/selftests/vm/mremap_test.c
+++ b/tools/testing/selftests/vm/mremap_test.c
@@ -45,14 +45,15 @@ enum {
_4MB = 4ULL << 20,
_1GB = 1ULL << 30,
_2GB = 2ULL << 30,
- PTE = _4KB,
PMD = _2MB,
PUD = _1GB,
};
+#define PTE page_size
+
#define MAKE_TEST(source_align, destination_align, size, \
overlaps, should_fail, test_name) \
-{ \
+(struct test){ \
.name = test_name, \
.config = { \
.src_alignment = source_align, \
@@ -252,12 +253,17 @@ static int parse_args(int argc, char **argv, unsigned int *threshold_mb,
return 0;
}
+#define MAX_TEST 13
+#define MAX_PERF_TEST 3
int main(int argc, char **argv)
{
int failures = 0;
int i, run_perf_tests;
unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
unsigned int pattern_seed;
+ struct test test_cases[MAX_TEST];
+ struct test perf_test_cases[MAX_PERF_TEST];
+ int page_size;
time_t t;
pattern_seed = (unsigned int) time(&t);
@@ -268,56 +274,59 @@ int main(int argc, char **argv)
ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n",
threshold_mb, pattern_seed);
- struct test test_cases[] = {
- /* Expected mremap failures */
- MAKE_TEST(_4KB, _4KB, _4KB, OVERLAPPING, EXPECT_FAILURE,
- "mremap - Source and Destination Regions Overlapping"),
- MAKE_TEST(_4KB, _1KB, _4KB, NON_OVERLAPPING, EXPECT_FAILURE,
- "mremap - Destination Address Misaligned (1KB-aligned)"),
- MAKE_TEST(_1KB, _4KB, _4KB, NON_OVERLAPPING, EXPECT_FAILURE,
- "mremap - Source Address Misaligned (1KB-aligned)"),
-
- /* Src addr PTE aligned */
- MAKE_TEST(PTE, PTE, _8KB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "8KB mremap - Source PTE-aligned, Destination PTE-aligned"),
-
- /* Src addr 1MB aligned */
- MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"),
- MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"),
-
- /* Src addr PMD aligned */
- MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "4MB mremap - Source PMD-aligned, Destination PTE-aligned"),
- MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"),
- MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "4MB mremap - Source PMD-aligned, Destination PMD-aligned"),
-
- /* Src addr PUD aligned */
- MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2GB mremap - Source PUD-aligned, Destination PTE-aligned"),
- MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2GB mremap - Source PUD-aligned, Destination 1MB-aligned"),
- MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2GB mremap - Source PUD-aligned, Destination PMD-aligned"),
- MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2GB mremap - Source PUD-aligned, Destination PUD-aligned"),
- };
-
- struct test perf_test_cases[] = {
- /*
- * mremap 1GB region - Page table level aligned time
- * comparison.
- */
- MAKE_TEST(PTE, PTE, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "1GB mremap - Source PTE-aligned, Destination PTE-aligned"),
- MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "1GB mremap - Source PMD-aligned, Destination PMD-aligned"),
- MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "1GB mremap - Source PUD-aligned, Destination PUD-aligned"),
- };
+ page_size = sysconf(_SC_PAGESIZE);
+
+ /* Expected mremap failures */
+ test_cases[0] = MAKE_TEST(page_size, page_size, page_size,
+ OVERLAPPING, EXPECT_FAILURE,
+ "mremap - Source and Destination Regions Overlapping");
+
+ test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size,
+ NON_OVERLAPPING, EXPECT_FAILURE,
+ "mremap - Destination Address Misaligned (1KB-aligned)");
+ test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size,
+ NON_OVERLAPPING, EXPECT_FAILURE,
+ "mremap - Source Address Misaligned (1KB-aligned)");
+
+ /* Src addr PTE aligned */
+ test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2,
+ NON_OVERLAPPING, EXPECT_SUCCESS,
+ "8KB mremap - Source PTE-aligned, Destination PTE-aligned");
+
+ /* Src addr 1MB aligned */
+ test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "2MB mremap - Source 1MB-aligned, Destination PTE-aligned");
+ test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned");
+
+ /* Src addr PMD aligned */
+ test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "4MB mremap - Source PMD-aligned, Destination PTE-aligned");
+ test_cases[7] = MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "4MB mremap - Source PMD-aligned, Destination 1MB-aligned");
+ test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "4MB mremap - Source PMD-aligned, Destination PMD-aligned");
+
+ /* Src addr PUD aligned */
+ test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "2GB mremap - Source PUD-aligned, Destination PTE-aligned");
+ test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "2GB mremap - Source PUD-aligned, Destination 1MB-aligned");
+ test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "2GB mremap - Source PUD-aligned, Destination PMD-aligned");
+ test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "2GB mremap - Source PUD-aligned, Destination PUD-aligned");
+
+ perf_test_cases[0] = MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "1GB mremap - Source PTE-aligned, Destination PTE-aligned");
+ /*
+ * mremap 1GB region - Page table level aligned time
+ * comparison.
+ */
+ perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "1GB mremap - Source PMD-aligned, Destination PMD-aligned");
+ perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "1GB mremap - Source PUD-aligned, Destination PUD-aligned");
run_perf_tests = (threshold_mb == VALIDATION_NO_THRESHOLD) ||
(threshold_mb * _1MB >= _1GB);
--
2.29.2
^ permalink raw reply related
* [RFC PATCH 3/6] mm/mremap: Use pmd/pud_populate to update page table entries
From: Aneesh Kumar K.V @ 2021-02-02 9:11 UTC (permalink / raw)
To: linux-mm, akpm; +Cc: peterz, kaleshsingh, Aneesh Kumar K.V, joel, linuxppc-dev
In-Reply-To: <20210202091116.196134-1-aneesh.kumar@linux.ibm.com>
pmd/pud_populate is the right interface to use for setting the respective
page table entries. Some architectures assume that set_pmd/pud_at
can only be used to set a hugepage PTE. Since we are not setting up a hugepage
PTE here, use the pmd/pud_populate interface.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
mm/mremap.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/mm/mremap.c b/mm/mremap.c
index f554320281cc..54fd2302b99d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -27,6 +27,7 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
#include "internal.h"
@@ -258,9 +259,8 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pmd_clear(old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
+ pmd_populate(mm, new_pmd, (pgtable_t)pmd_page_vaddr(pmd));
- /* Set the new pmd */
- set_pmd_at(mm, new_addr, new_pmd, pmd);
flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
@@ -307,8 +307,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
VM_BUG_ON(!pud_none(*new_pud));
- /* Set the new pud */
- set_pud_at(mm, new_addr, new_pud, pud);
+ pud_populate(mm, new_pud, (pmd_t *)pud_page_vaddr(pud));
flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
--
2.29.2
^ permalink raw reply related
* [powerpc:next-test] BUILD SUCCESS WITH WARNING 30133c32d19c678dbd9da28ace3aac35eb5dd4c9
From: kernel test robot @ 2021-02-02 7:58 UTC (permalink / raw)
To: Michael Ellerman; +Cc: linuxppc-dev
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next-test
branch HEAD: 30133c32d19c678dbd9da28ace3aac35eb5dd4c9 Fixup powermac PCI
Warning in current branch:
arch/powerpc/kernel/pci-common.c:1704:12: warning: no previous prototype for 'discover_phbs' [-Wmissing-prototypes]
arch/powerpc/kernel/pci-common.c:1704:12: warning: no previous prototype for function 'discover_phbs' [-Wmissing-prototypes]
Warning ids grouped by kconfigs:
gcc_recent_errors
`-- powerpc-allyesconfig
`-- arch-powerpc-kernel-pci-common.c:warning:no-previous-prototype-for-discover_phbs
clang_recent_errors
`-- powerpc-randconfig-r033-20210131
`-- arch-powerpc-kernel-pci-common.c:warning:no-previous-prototype-for-function-discover_phbs
elapsed time: 2644m
configs tested: 151
configs skipped: 2
gcc tested configs:
arm defconfig
arm64 allyesconfig
arm64 defconfig
arm allyesconfig
arm allmodconfig
arm vt8500_v6_v7_defconfig
powerpc tqm8xx_defconfig
sh r7785rp_defconfig
arm mvebu_v5_defconfig
sh rts7751r2dplus_defconfig
mips workpad_defconfig
powerpc bluestone_defconfig
powerpc walnut_defconfig
sh se7721_defconfig
sh sh2007_defconfig
ia64 alldefconfig
c6x evmc6457_defconfig
sh rsk7203_defconfig
powerpc arches_defconfig
m68k mvme147_defconfig
powerpc tqm8548_defconfig
sh se7780_defconfig
powerpc motionpro_defconfig
arm ep93xx_defconfig
arm multi_v5_defconfig
mips cobalt_defconfig
arm pleb_defconfig
arm stm32_defconfig
powerpc tqm8555_defconfig
powerpc taishan_defconfig
arm alldefconfig
powerpc maple_defconfig
arm at91_dt_defconfig
powerpc mpc8313_rdb_defconfig
powerpc ppc64e_defconfig
arc axs103_smp_defconfig
sh sh03_defconfig
mips mtx1_defconfig
mips malta_defconfig
powerpc mpc8272_ads_defconfig
sh se7751_defconfig
arc hsdk_defconfig
arm mini2440_defconfig
arm assabet_defconfig
s390 zfcpdump_defconfig
arm gemini_defconfig
alpha alldefconfig
mips omega2p_defconfig
mips bmips_stb_defconfig
xtensa audio_kc705_defconfig
ia64 allmodconfig
ia64 defconfig
ia64 allyesconfig
m68k allmodconfig
m68k defconfig
m68k allyesconfig
nds32 defconfig
nios2 allyesconfig
csky defconfig
alpha defconfig
alpha allyesconfig
xtensa allyesconfig
h8300 allyesconfig
arc defconfig
sh allmodconfig
parisc defconfig
s390 allyesconfig
parisc allyesconfig
s390 defconfig
i386 allyesconfig
sparc allyesconfig
sparc defconfig
i386 tinyconfig
i386 defconfig
nios2 defconfig
arc allyesconfig
nds32 allnoconfig
c6x allyesconfig
mips allyesconfig
mips allmodconfig
powerpc allyesconfig
powerpc allmodconfig
powerpc allnoconfig
i386 randconfig-a001-20210201
i386 randconfig-a005-20210201
i386 randconfig-a003-20210201
i386 randconfig-a006-20210201
i386 randconfig-a002-20210201
i386 randconfig-a004-20210201
i386 randconfig-a005-20210131
i386 randconfig-a003-20210131
i386 randconfig-a002-20210131
i386 randconfig-a001-20210131
i386 randconfig-a004-20210131
i386 randconfig-a006-20210131
x86_64 randconfig-a015-20210131
x86_64 randconfig-a011-20210131
x86_64 randconfig-a014-20210131
x86_64 randconfig-a016-20210131
x86_64 randconfig-a012-20210131
x86_64 randconfig-a013-20210131
x86_64 randconfig-a013-20210202
x86_64 randconfig-a014-20210202
x86_64 randconfig-a015-20210202
x86_64 randconfig-a016-20210202
x86_64 randconfig-a011-20210202
x86_64 randconfig-a012-20210202
i386 randconfig-a013-20210131
i386 randconfig-a011-20210131
i386 randconfig-a015-20210131
i386 randconfig-a012-20210131
i386 randconfig-a014-20210131
i386 randconfig-a016-20210131
i386 randconfig-a013-20210201
i386 randconfig-a016-20210201
i386 randconfig-a014-20210201
i386 randconfig-a012-20210201
i386 randconfig-a015-20210201
i386 randconfig-a011-20210201
x86_64 randconfig-a006-20210201
x86_64 randconfig-a001-20210201
x86_64 randconfig-a005-20210201
x86_64 randconfig-a002-20210201
x86_64 randconfig-a004-20210201
x86_64 randconfig-a003-20210201
riscv nommu_k210_defconfig
riscv allyesconfig
riscv nommu_virt_defconfig
riscv allnoconfig
riscv defconfig
riscv rv32_defconfig
riscv allmodconfig
x86_64 rhel
x86_64 allyesconfig
x86_64 rhel-7.6-kselftests
x86_64 defconfig
x86_64 rhel-8.3
x86_64 rhel-8.3-kbuiltin
x86_64 kexec
clang tested configs:
x86_64 randconfig-a004-20210131
x86_64 randconfig-a002-20210131
x86_64 randconfig-a001-20210131
x86_64 randconfig-a005-20210131
x86_64 randconfig-a006-20210131
x86_64 randconfig-a003-20210131
x86_64 randconfig-a013-20210201
x86_64 randconfig-a014-20210201
x86_64 randconfig-a015-20210201
x86_64 randconfig-a016-20210201
x86_64 randconfig-a011-20210201
x86_64 randconfig-a012-20210201
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
^ permalink raw reply
* [powerpc:next] BUILD SUCCESS 6895c5ba7bdcc55eacad03cf309ab23be63b9cac
From: kernel test robot @ 2021-02-02 7:58 UTC (permalink / raw)
To: Michael Ellerman; +Cc: linuxppc-dev
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
branch HEAD: 6895c5ba7bdcc55eacad03cf309ab23be63b9cac powerpc/xmon: Select CONSOLE_POLL for the 8xx
elapsed time: 2644m
configs tested: 197
configs skipped: 4
The following configs have been built successfully.
More configs may be tested in the coming days.
gcc tested configs:
arm defconfig
arm64 allyesconfig
arm64 defconfig
arm allyesconfig
arm allmodconfig
arm vt8500_v6_v7_defconfig
powerpc tqm8xx_defconfig
sh r7785rp_defconfig
arm mvebu_v5_defconfig
sh rts7751r2dplus_defconfig
openrisc alldefconfig
powerpc ppc6xx_defconfig
arm vexpress_defconfig
mips bmips_be_defconfig
h8300 h8300h-sim_defconfig
mips vocore2_defconfig
mips cavium_octeon_defconfig
mips db1xxx_defconfig
arm assabet_defconfig
mips ar7_defconfig
sh secureedge5410_defconfig
powerpc fsp2_defconfig
mips mpc30x_defconfig
powerpc tqm8541_defconfig
xtensa virt_defconfig
arm mxs_defconfig
powerpc acadia_defconfig
sh sh2007_defconfig
mips e55_defconfig
c6x evmc6457_defconfig
sh rsk7203_defconfig
powerpc arches_defconfig
m68k mvme147_defconfig
powerpc tqm8548_defconfig
sh se7780_defconfig
sh rts7751r2d1_defconfig
powerpc amigaone_defconfig
sh r7780mp_defconfig
arm s5pv210_defconfig
arm ezx_defconfig
arm cm_x300_defconfig
xtensa xip_kc705_defconfig
m68k m5475evb_defconfig
arm pleb_defconfig
sh migor_defconfig
arm nhk8815_defconfig
powerpc motionpro_defconfig
arm ep93xx_defconfig
arm multi_v5_defconfig
mips cobalt_defconfig
powerpc pasemi_defconfig
mips malta_qemu_32r6_defconfig
powerpc mpc85xx_cds_defconfig
mips sb1250_swarm_defconfig
xtensa allyesconfig
powerpc katmai_defconfig
powerpc wii_defconfig
mips ip32_defconfig
mips ath25_defconfig
xtensa cadence_csp_defconfig
powerpc maple_defconfig
arm at91_dt_defconfig
powerpc walnut_defconfig
powerpc mpc8313_rdb_defconfig
powerpc ppc64e_defconfig
arm clps711x_defconfig
powerpc tqm8555_defconfig
arm multi_v4t_defconfig
powerpc skiroot_defconfig
powerpc mpc8272_ads_defconfig
sh se7751_defconfig
arc hsdk_defconfig
arm mini2440_defconfig
s390 zfcpdump_defconfig
arm gemini_defconfig
alpha alldefconfig
mips omega2p_defconfig
mips bmips_stb_defconfig
xtensa audio_kc705_defconfig
ia64 allmodconfig
ia64 defconfig
ia64 allyesconfig
m68k allmodconfig
m68k defconfig
m68k allyesconfig
nios2 defconfig
arc allyesconfig
nds32 allnoconfig
c6x allyesconfig
nds32 defconfig
nios2 allyesconfig
csky defconfig
alpha defconfig
alpha allyesconfig
h8300 allyesconfig
arc defconfig
sh allmodconfig
parisc defconfig
s390 allyesconfig
parisc allyesconfig
s390 defconfig
i386 allyesconfig
sparc allyesconfig
sparc defconfig
i386 tinyconfig
i386 defconfig
mips allyesconfig
mips allmodconfig
powerpc allyesconfig
powerpc allmodconfig
powerpc allnoconfig
x86_64 randconfig-a006-20210201
x86_64 randconfig-a001-20210201
x86_64 randconfig-a005-20210201
x86_64 randconfig-a002-20210201
x86_64 randconfig-a004-20210201
x86_64 randconfig-a003-20210201
i386 randconfig-a001-20210202
i386 randconfig-a005-20210202
i386 randconfig-a003-20210202
i386 randconfig-a006-20210202
i386 randconfig-a002-20210202
i386 randconfig-a004-20210202
i386 randconfig-a005-20210131
i386 randconfig-a003-20210131
i386 randconfig-a002-20210131
i386 randconfig-a001-20210131
i386 randconfig-a004-20210131
i386 randconfig-a006-20210131
i386 randconfig-a001-20210201
i386 randconfig-a005-20210201
i386 randconfig-a003-20210201
i386 randconfig-a006-20210201
i386 randconfig-a002-20210201
i386 randconfig-a004-20210201
x86_64 randconfig-a015-20210131
x86_64 randconfig-a011-20210131
x86_64 randconfig-a014-20210131
x86_64 randconfig-a016-20210131
x86_64 randconfig-a012-20210131
x86_64 randconfig-a013-20210131
x86_64 randconfig-a013-20210202
x86_64 randconfig-a014-20210202
x86_64 randconfig-a015-20210202
x86_64 randconfig-a016-20210202
x86_64 randconfig-a011-20210202
x86_64 randconfig-a012-20210202
i386 randconfig-a013-20210202
i386 randconfig-a016-20210202
i386 randconfig-a014-20210202
i386 randconfig-a012-20210202
i386 randconfig-a015-20210202
i386 randconfig-a011-20210202
i386 randconfig-a013-20210131
i386 randconfig-a011-20210131
i386 randconfig-a015-20210131
i386 randconfig-a012-20210131
i386 randconfig-a014-20210131
i386 randconfig-a016-20210131
i386 randconfig-a013-20210201
i386 randconfig-a016-20210201
i386 randconfig-a014-20210201
i386 randconfig-a012-20210201
i386 randconfig-a015-20210201
i386 randconfig-a011-20210201
riscv nommu_k210_defconfig
riscv allyesconfig
riscv nommu_virt_defconfig
riscv allnoconfig
riscv defconfig
riscv rv32_defconfig
riscv allmodconfig
x86_64 rhel
x86_64 allyesconfig
x86_64 rhel-7.6-kselftests
x86_64 defconfig
x86_64 rhel-8.3
x86_64 rhel-8.3-kbuiltin
x86_64 kexec
clang tested configs:
x86_64 randconfig-a006-20210202
x86_64 randconfig-a001-20210202
x86_64 randconfig-a005-20210202
x86_64 randconfig-a002-20210202
x86_64 randconfig-a004-20210202
x86_64 randconfig-a003-20210202
x86_64 randconfig-a004-20210131
x86_64 randconfig-a002-20210131
x86_64 randconfig-a001-20210131
x86_64 randconfig-a005-20210131
x86_64 randconfig-a006-20210131
x86_64 randconfig-a003-20210131
x86_64 randconfig-a013-20210201
x86_64 randconfig-a014-20210201
x86_64 randconfig-a015-20210201
x86_64 randconfig-a016-20210201
x86_64 randconfig-a011-20210201
x86_64 randconfig-a012-20210201
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
^ permalink raw reply
* [powerpc:fixes-test] BUILD SUCCESS 66f0a9e058fad50e569ad752be72e52701991fd5
From: kernel test robot @ 2021-02-02 7:57 UTC (permalink / raw)
To: Michael Ellerman; +Cc: linuxppc-dev
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git fixes-test
branch HEAD: 66f0a9e058fad50e569ad752be72e52701991fd5 powerpc/vdso64: remove meaningless vgettimeofday.o build rule
elapsed time: 2646m
configs tested: 195
configs skipped: 4
The following configs have been built successfully.
More configs may be tested in the coming days.
gcc tested configs:
arm defconfig
arm64 allyesconfig
arm64 defconfig
arm allyesconfig
arm allmodconfig
powerpc64 defconfig
x86_64 alldefconfig
arm ezx_defconfig
powerpc canyonlands_defconfig
mips maltasmvp_defconfig
arm vt8500_v6_v7_defconfig
powerpc tqm8xx_defconfig
sh r7785rp_defconfig
arm mvebu_v5_defconfig
sh rts7751r2dplus_defconfig
mips workpad_defconfig
powerpc bluestone_defconfig
powerpc walnut_defconfig
sh se7721_defconfig
powerpc tqm8541_defconfig
powerpc amigaone_defconfig
powerpc cell_defconfig
sh sh7785lcr_defconfig
mips cavium_octeon_defconfig
mips db1xxx_defconfig
arm assabet_defconfig
h8300 h8300h-sim_defconfig
arm footbridge_defconfig
arm h5000_defconfig
arm multi_v4t_defconfig
sh sh2007_defconfig
sh se7619_defconfig
mips ar7_defconfig
sh secureedge5410_defconfig
powerpc fsp2_defconfig
mips mpc30x_defconfig
powerpc adder875_defconfig
powerpc powernv_defconfig
sh j2_defconfig
arm mvebu_v7_defconfig
um kunit_defconfig
c6x evmc6457_defconfig
sh rsk7203_defconfig
powerpc arches_defconfig
m68k mvme147_defconfig
powerpc tqm8548_defconfig
sh se7780_defconfig
sh rts7751r2d1_defconfig
sh r7780mp_defconfig
arm s5pv210_defconfig
powerpc motionpro_defconfig
arm ep93xx_defconfig
arm multi_v5_defconfig
mips cobalt_defconfig
xtensa allyesconfig
powerpc katmai_defconfig
powerpc wii_defconfig
mips ip32_defconfig
mips ath25_defconfig
xtensa cadence_csp_defconfig
powerpc maple_defconfig
arm at91_dt_defconfig
powerpc mpc8313_rdb_defconfig
powerpc ppc64e_defconfig
arm clps711x_defconfig
powerpc tqm8555_defconfig
powerpc skiroot_defconfig
powerpc mpc8272_ads_defconfig
sh se7751_defconfig
arc hsdk_defconfig
arm mini2440_defconfig
s390 zfcpdump_defconfig
arm gemini_defconfig
alpha alldefconfig
mips omega2p_defconfig
mips bmips_stb_defconfig
xtensa audio_kc705_defconfig
ia64 allmodconfig
ia64 defconfig
ia64 allyesconfig
m68k allmodconfig
m68k defconfig
m68k allyesconfig
nds32 defconfig
nios2 allyesconfig
csky defconfig
alpha defconfig
alpha allyesconfig
h8300 allyesconfig
arc defconfig
sh allmodconfig
parisc defconfig
s390 allyesconfig
parisc allyesconfig
s390 defconfig
i386 allyesconfig
sparc allyesconfig
sparc defconfig
i386 tinyconfig
i386 defconfig
nios2 defconfig
arc allyesconfig
nds32 allnoconfig
c6x allyesconfig
mips allyesconfig
mips allmodconfig
powerpc allyesconfig
powerpc allmodconfig
powerpc allnoconfig
x86_64 randconfig-a001-20210201
x86_64 randconfig-a002-20210201
x86_64 randconfig-a004-20210201
x86_64 randconfig-a003-20210201
i386 randconfig-a001-20210202
i386 randconfig-a005-20210202
i386 randconfig-a003-20210202
i386 randconfig-a006-20210202
i386 randconfig-a002-20210202
i386 randconfig-a004-20210202
i386 randconfig-a005-20210131
i386 randconfig-a003-20210131
i386 randconfig-a002-20210131
i386 randconfig-a001-20210131
i386 randconfig-a004-20210131
i386 randconfig-a006-20210131
i386 randconfig-a001-20210201
i386 randconfig-a005-20210201
i386 randconfig-a003-20210201
i386 randconfig-a002-20210201
i386 randconfig-a004-20210201
i386 randconfig-a006-20210201
x86_64 randconfig-a015-20210131
x86_64 randconfig-a011-20210131
x86_64 randconfig-a014-20210131
x86_64 randconfig-a016-20210131
x86_64 randconfig-a012-20210131
x86_64 randconfig-a013-20210131
x86_64 randconfig-a013-20210202
x86_64 randconfig-a014-20210202
x86_64 randconfig-a015-20210202
x86_64 randconfig-a016-20210202
x86_64 randconfig-a011-20210202
x86_64 randconfig-a012-20210202
i386 randconfig-a013-20210202
i386 randconfig-a016-20210202
i386 randconfig-a014-20210202
i386 randconfig-a012-20210202
i386 randconfig-a015-20210202
i386 randconfig-a011-20210202
i386 randconfig-a013-20210131
i386 randconfig-a011-20210131
i386 randconfig-a015-20210131
i386 randconfig-a012-20210131
i386 randconfig-a014-20210131
i386 randconfig-a016-20210131
i386 randconfig-a013-20210201
i386 randconfig-a014-20210201
i386 randconfig-a012-20210201
i386 randconfig-a011-20210201
i386 randconfig-a016-20210201
i386 randconfig-a015-20210201
x86_64 randconfig-a006-20210201
x86_64 randconfig-a005-20210201
riscv nommu_k210_defconfig
riscv allyesconfig
riscv nommu_virt_defconfig
riscv allnoconfig
riscv defconfig
riscv rv32_defconfig
riscv allmodconfig
x86_64 rhel
x86_64 allyesconfig
x86_64 rhel-7.6-kselftests
x86_64 defconfig
x86_64 rhel-8.3
x86_64 rhel-8.3-kbuiltin
x86_64 kexec
clang tested configs:
x86_64 randconfig-a006-20210202
x86_64 randconfig-a001-20210202
x86_64 randconfig-a005-20210202
x86_64 randconfig-a002-20210202
x86_64 randconfig-a004-20210202
x86_64 randconfig-a003-20210202
x86_64 randconfig-a004-20210131
x86_64 randconfig-a002-20210131
x86_64 randconfig-a001-20210131
x86_64 randconfig-a005-20210131
x86_64 randconfig-a006-20210131
x86_64 randconfig-a003-20210131
x86_64 randconfig-a013-20210201
x86_64 randconfig-a014-20210201
x86_64 randconfig-a015-20210201
x86_64 randconfig-a016-20210201
x86_64 randconfig-a011-20210201
x86_64 randconfig-a012-20210201
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
^ permalink raw reply
* [powerpc:merge] BUILD SUCCESS a2311d1e2b5ea0e77dcdd35fffb58b035da202b6
From: kernel test robot @ 2021-02-02 7:57 UTC (permalink / raw)
To: Michael Ellerman; +Cc: linuxppc-dev
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git merge
branch HEAD: a2311d1e2b5ea0e77dcdd35fffb58b035da202b6 Automatic merge of 'fixes' into merge (2021-01-31 22:08)
elapsed time: 2645m
configs tested: 136
configs skipped: 2
The following configs have been built successfully.
More configs may be tested in the coming days.
gcc tested configs:
arm defconfig
arm64 allyesconfig
arm64 defconfig
arm allyesconfig
arm allmodconfig
arm vt8500_v6_v7_defconfig
powerpc tqm8xx_defconfig
sh r7785rp_defconfig
arm mvebu_v5_defconfig
sh rts7751r2dplus_defconfig
powerpc redwood_defconfig
arm shmobile_defconfig
arm sama5_defconfig
sparc64 defconfig
mips decstation_r4k_defconfig
mips ar7_defconfig
sh secureedge5410_defconfig
powerpc fsp2_defconfig
mips mpc30x_defconfig
c6x evmc6457_defconfig
sh rsk7203_defconfig
powerpc arches_defconfig
m68k mvme147_defconfig
powerpc tqm8548_defconfig
sh se7780_defconfig
powerpc motionpro_defconfig
arm ep93xx_defconfig
arm multi_v5_defconfig
mips cobalt_defconfig
h8300 defconfig
mips loongson3_defconfig
powerpc mpc8272_ads_defconfig
sh se7751_defconfig
arc hsdk_defconfig
arm mini2440_defconfig
arm assabet_defconfig
mips ip27_defconfig
arm realview_defconfig
m68k sun3x_defconfig
powerpc taishan_defconfig
arm integrator_defconfig
ia64 allmodconfig
ia64 defconfig
ia64 allyesconfig
m68k allmodconfig
m68k defconfig
m68k allyesconfig
nds32 defconfig
nios2 allyesconfig
csky defconfig
alpha defconfig
alpha allyesconfig
xtensa allyesconfig
h8300 allyesconfig
arc defconfig
sh allmodconfig
parisc defconfig
s390 allyesconfig
parisc allyesconfig
s390 defconfig
i386 allyesconfig
sparc allyesconfig
sparc defconfig
i386 tinyconfig
i386 defconfig
nios2 defconfig
arc allyesconfig
nds32 allnoconfig
c6x allyesconfig
mips allyesconfig
mips allmodconfig
powerpc allyesconfig
powerpc allmodconfig
powerpc allnoconfig
x86_64 randconfig-a006-20210201
x86_64 randconfig-a001-20210201
x86_64 randconfig-a005-20210201
x86_64 randconfig-a002-20210201
x86_64 randconfig-a004-20210201
x86_64 randconfig-a003-20210201
i386 randconfig-a005-20210131
i386 randconfig-a003-20210131
i386 randconfig-a002-20210131
i386 randconfig-a001-20210131
i386 randconfig-a004-20210131
i386 randconfig-a006-20210131
i386 randconfig-a001-20210201
i386 randconfig-a005-20210201
i386 randconfig-a003-20210201
i386 randconfig-a006-20210201
i386 randconfig-a002-20210201
i386 randconfig-a004-20210201
x86_64 randconfig-a015-20210131
x86_64 randconfig-a011-20210131
x86_64 randconfig-a014-20210131
x86_64 randconfig-a016-20210131
x86_64 randconfig-a012-20210131
x86_64 randconfig-a013-20210131
i386 randconfig-a013-20210201
i386 randconfig-a016-20210201
i386 randconfig-a014-20210201
i386 randconfig-a012-20210201
i386 randconfig-a015-20210201
i386 randconfig-a011-20210201
i386 randconfig-a013-20210131
i386 randconfig-a011-20210131
i386 randconfig-a015-20210131
i386 randconfig-a012-20210131
i386 randconfig-a014-20210131
i386 randconfig-a016-20210131
riscv nommu_k210_defconfig
riscv allyesconfig
riscv nommu_virt_defconfig
riscv allnoconfig
riscv defconfig
riscv rv32_defconfig
riscv allmodconfig
x86_64 rhel
x86_64 allyesconfig
x86_64 rhel-7.6-kselftests
x86_64 defconfig
x86_64 rhel-8.3
x86_64 rhel-8.3-kbuiltin
x86_64 kexec
clang tested configs:
x86_64 randconfig-a004-20210131
x86_64 randconfig-a002-20210131
x86_64 randconfig-a001-20210131
x86_64 randconfig-a005-20210131
x86_64 randconfig-a006-20210131
x86_64 randconfig-a003-20210131
x86_64 randconfig-a013-20210201
x86_64 randconfig-a014-20210201
x86_64 randconfig-a015-20210201
x86_64 randconfig-a016-20210201
x86_64 randconfig-a011-20210201
x86_64 randconfig-a012-20210201
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
^ permalink raw reply
* Re: [PATCH] powerpc/64/signal: Fix regression in __kernel_sigtramp_rt64 semantics
From: Nicholas Piggin @ 2021-02-02 7:41 UTC (permalink / raw)
To: linuxppc-dev, Raoni Fassina Firmino
In-Reply-To: <20210201200505.iz46ubcizipnkcxe@work-tp>
Excerpts from Raoni Fassina Firmino's message of February 2, 2021 6:05 am:
> Tested on powerpc64 and powerpc64le, with a glibc build and running the
> affected glibc's testcase[2], inspected that glibc's backtrace() now gives
> the correct result and gdb backtrace also keeps working as before.
>
> I believe this should be backported to releases 5.9 and 5.10 as userspace
> is affected in these releases.
>
> ---- 8< ----
Thanks for this, I don't know the glibc code but the kernel change seems
okay to me.
Thanks,
Nick
>
> A Change[1] in __kernel_sigtramp_rt64 VDSO and trampoline code introduced a
> regression in the way glibc's backtrace()[2] detects the signal-handler
> stack frame. Apart from the practical implications, __kernel_sigtramp_rt64
> was a VDSO with the semantics that it is a function you can call from
> userspace to end a signal handling. Now these semantics are no longer
> valid.
>
> I believe the aforementioned change affects all releases since 5.9.
>
> This patch tries to fix both the semantics and practical aspect of
> __kernel_sigtramp_rt64 returning it to the previous code, whilst keeping
> the intended behavior from[1] by adding a new symbol to serve as the jump
> target from the kernel to the trampoline. Now the trampoline has two parts,
> a new entry point and the old return point.
>
> [1] commit 0138ba5783ae0dcc799ad401a1e8ac8333790df9 ("powerpc/64/signal:
> Balance return predictor stack in signal trampoline")
> [2] https://lists.ozlabs.org/pipermail/linuxppc-dev/2021-January/223194.html
>
> Fixes: 0138ba5783ae ("powerpc/64/signal: Balance return predictor stack in signal trampoline")
> Signed-off-by: Raoni Fassina Firmino <raoni@linux.ibm.com>
> ---
> arch/powerpc/kernel/vdso64/sigtramp.S | 9 ++++++++-
> arch/powerpc/kernel/vdso64/vdso64.lds.S | 2 +-
> 2 files changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/kernel/vdso64/sigtramp.S b/arch/powerpc/kernel/vdso64/sigtramp.S
> index bbf68cd01088..f0fd8d2a9fc4 100644
> --- a/arch/powerpc/kernel/vdso64/sigtramp.S
> +++ b/arch/powerpc/kernel/vdso64/sigtramp.S
> @@ -15,11 +15,18 @@
>
> .text
>
> +/* __kernel_start_sigtramp_rt64 and __kernel_sigtramp_rt64 together
> + are one function split in two parts. The kernel jumps to the former
> + and the signal handler indirectly (by blr) returns to the latter.
> + __kernel_sigtramp_rt64 needs to point to the return address so
> + glibc can correctly identify the trampoline stack frame. */
Are you planning to update glibc to cope with this as well? Any idea
about musl? If so, including version numbers would be good (not that
it's really a problem to carry this patch around).
I was just about to ask to turn the comment into kernel style, but the
whole file has this style so nevermind about that! :)
Thanks,
Nick
^ permalink raw reply
* Re: [RFC 11/20] mm/tlb: remove arch-specific tlb_start/end_vma()
From: Nadav Amit @ 2021-02-02 7:20 UTC (permalink / raw)
To: Nicholas Piggin
Cc: Andrea Arcangeli, linux-s390, X86 ML, Yu Zhao, Peter Zijlstra,
Will Deacon, Dave Hansen, LKML, linux-csky@vger.kernel.org,
Linux-MM, Andy Lutomirski, Andrew Morton, linuxppc-dev,
Thomas Gleixner
In-Reply-To: <1612247956.0a1r1yjmm3.astroid@bobo.none>
> On Feb 1, 2021, at 10:41 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>
> Excerpts from Peter Zijlstra's message of February 1, 2021 10:09 pm:
>> I also don't think AGRESSIVE_FLUSH_BATCHING quite captures what it does.
>> How about:
>>
>> CONFIG_MMU_GATHER_NO_PER_VMA_FLUSH
>
> Yes please, have to have descriptive names.
Point taken. I will fix it.
>
> I didn't quite see why this was much of an improvement though. Maybe
> follow up patches take advantage of it? I didn't see how they all fit
> together.
They do, but I realized as I said in other emails that I have a serious bug
in the deferred invalidation scheme.
Having said that, I think there is an advantage of having an explicit config
option instead of relying on whether tlb_end_vma is defined. For instance,
Arm does not define tlb_end_vma, and consequently it flushes the TLB after
each VMA. I suspect it is not intentional.
^ permalink raw reply
* Re: [RFC 00/20] TLB batching consolidation and enhancements
From: Nicholas Piggin @ 2021-02-02 7:14 UTC (permalink / raw)
To: Nadav Amit, Peter Zijlstra
Cc: Andrea Arcangeli, linux-s390, X86 ML, Yu Zhao, Will Deacon,
Mel Gorman, Dave Hansen, LKML, linux-csky@vger.kernel.org,
Linux-MM, Andy Lutomirski, Andrew Morton, linuxppc-dev,
Thomas Gleixner
In-Reply-To: <YBf3sl3M+j3hJRoM@hirez.programming.kicks-ass.net>
Excerpts from Peter Zijlstra's message of February 1, 2021 10:44 pm:
> On Sun, Jan 31, 2021 at 07:57:01AM +0000, Nadav Amit wrote:
>> > On Jan 30, 2021, at 7:30 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>
>> > I'll go through the patches a bit more closely when they all come
>> > through. Sparc and powerpc of course need the arch lazy mode to get
>> > per-page/pte information for operations that are not freeing pages,
>> > which is what mmu gather is designed for.
>>
>> IIUC you mean any PTE change requires a TLB flush. Even setting up a new PTE
>> where no previous PTE was set, right?
In cases of increasing permissiveness of access, yes it may want to
update the "TLB" (read hash table) to avoid taking hash table faults.
But whatever the reason for the flush, there may have to be more
data carried than just the virtual address range and/or physical
pages.
If you clear out the PTE then you have no guarantee of actually being
able to go back and address the in-memory or in-hardware translation
structures to update them, depending on what exact scheme is used
(powerpc probably could if all page sizes were the same, but THP or
64k/4k sub pages would throw a spanner in those works).
> These are the HASH architectures. Their hardware doesn't walk the
> page-tables, but it consults a hash-table to resolve page translations.
Yeah, it's very cool in a masochistic way.
I actually don't know if it's worth doing a big rework of it, as much
as I'd like to. Rather than just keep it in place and eventually
dismantling some of the go-fast hooks from core code if we can one day
deprecate it in favour of the much easier radix mode.
The whole thing is like a big steam train, years ago Paul and Ben and
Anton and co got the boiler stoked up and set all the valves just right
so it runs unbelievably well for what it's actually doing but look at it
the wrong way and the whole thing could blow up. (at least that's what
it feels like to me probably because I don't know the code that well).
Sparc could probably do the same, not sure about Xen. I don't suppose
vmware is intending to add any kind of paravirt mode related to this stuff?
Thanks,
Nick
^ permalink raw reply
* Re: [PATCH v4 11/23] powerpc/syscall: Rename syscall_64.c into syscall.c
From: Christophe Leroy @ 2021-02-02 6:58 UTC (permalink / raw)
To: Nicholas Piggin, Benjamin Herrenschmidt, David Laight,
Michael Ellerman, msuchanek@suse.de, Paul Mackerras
Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org
In-Reply-To: <1612247170.ea0f766ml4.astroid@bobo.none>
Le 02/02/2021 à 07:38, Nicholas Piggin a écrit :
> Excerpts from Christophe Leroy's message of February 2, 2021 4:15 pm:
>>
>>
>> Le 28/01/2021 à 00:50, Nicholas Piggin a écrit :
>>> Excerpts from David Laight's message of January 26, 2021 8:28 pm:
>>>> From: Nicholas Piggin
>>>>> Sent: 26 January 2021 10:21
>>>>>
>>>>> Excerpts from Christophe Leroy's message of January 26, 2021 12:48 am:
>>>>>> syscall_64.c will be reused almost as is for PPC32.
>>>>>>
>>>>>> Rename it syscall.c
>>>>>
>>>>> Could you rename it to interrupt.c instead? A system call is an
>>>>> interrupt, and the file now also has code to return from other
>>>>> interrupts as well, and it matches the new asm/interrupt.h from
>>>>> the interrupts series.
>>>>
>>>> Hmmm....
>>>>
>>>> That might make it harder for someone looking for the system call
>>>> entry code to find it.
>>>
>>> It's very grep'able.
>>>
>>>> In some sense interrupts are the simpler case.
>>>>
>>>> Especially when comparing with other architectures which have
>>>> special instructions for syscall entry.
>>>
>>> powerpc does have a special instruction for syscall, and it causes a
>>> system call interrupt.
>>>
>>> I'm not sure about other architectures, but for powerpc its more
>>> sensible to call it interrupt.c than syscall.c.
>>
>> Many other architectures have a syscall.c but for a different purpose: it contains arch specific
>> system calls. We have that in powerpc as well, it is called syscalls.c
>>
>> So to avoid confusion, I'll rename it. But I think "interrupt" is maybe not the right name. An
>> interrupt most of the time refers to IRQ.
>
> That depends what you mean by interrupt and IRQ.
>
> Linux kind of considers any asynchronous maskable interrupt an irq
> (local_irq_disable()). But if you say irq it's more likely to mean
> a device interrupt, and "interrupt" usually refers to the asynch
> ones.
>
> But Linux doesn't really assign names to synchronous interrupts in
> core code. It doesn't say they aren't interrupts, it just doesn't
> really have a convention for them at all.
>
> Other architectures e.g., x86 also have things like interrupt
> descriptor table for synchronous interrupts as well. That's where
> I got the interrupt wrappers code from actually.
>
> So it's really fine to use the proper arch-specific names for things
> in arch code. I'm trying to slowly change names from exception to
> interrupt.
>
>> For me system call is not an interrupt in the way it
>> doesn't unexpectedly interrupt a program flow. In powerpc manuals it is generally called exceptions,
>> so I'm more inclined to call it exception.c
>
> Actually that's backwards. Powerpc manuals (at least the one I look at)
> calls them all interrupts including system calls, and also the system
> call interrupt is actually the only one that doesn't appear to be
> associated with an exception.
>
> Also there is no distinction about expected/unexpected -- a data storage
> interrupt is expected if you access a location without the right access
> permissions for example, but it is still an interrupt.
>
> These handlers very specifically deal with the change to execution flow
> (i.e., the interrupt), they do *not* deal with the exception which may
> be associated with it (that is the job of the handler).
>
> And on the other hand you can deal with exceptions in some cases without
> taking an interrupt at all. For example if you had MSR[EE]=0 you could
> change the decrementer or execute msgclr or change HMER SPR etc to clear
> various exceptions without ever taking the interrupt.
>
Ok, let's call it interrupt.c then, to be consistent with the interrupt wrapper story.
Christophe
^ permalink raw reply
* Re: [RFC 11/20] mm/tlb: remove arch-specific tlb_start/end_vma()
From: Nicholas Piggin @ 2021-02-02 6:41 UTC (permalink / raw)
To: Nadav Amit, Peter Zijlstra
Cc: Andrea Arcangeli, linux-s390, x86, Yu Zhao, Will Deacon,
Dave Hansen, linux-kernel, linux-csky, linux-mm, Nadav Amit,
Andy Lutomirski, Andrew Morton, linuxppc-dev, Thomas Gleixner
In-Reply-To: <YBfvh1Imz6RRTUDV@hirez.programming.kicks-ass.net>
Excerpts from Peter Zijlstra's message of February 1, 2021 10:09 pm:
> On Sat, Jan 30, 2021 at 04:11:23PM -0800, Nadav Amit wrote:
>
>> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
>> index 427bfcc6cdec..b97136b7010b 100644
>> --- a/include/asm-generic/tlb.h
>> +++ b/include/asm-generic/tlb.h
>> @@ -334,8 +334,8 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
>>
>> #ifdef CONFIG_MMU_GATHER_NO_RANGE
>>
>> -#if defined(tlb_flush) || defined(tlb_start_vma) || defined(tlb_end_vma)
>> -#error MMU_GATHER_NO_RANGE relies on default tlb_flush(), tlb_start_vma() and tlb_end_vma()
>> +#if defined(tlb_flush)
>> +#error MMU_GATHER_NO_RANGE relies on default tlb_flush()
>> #endif
>>
>> /*
>> @@ -362,10 +362,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
>>
>> #ifndef tlb_flush
>>
>> -#if defined(tlb_start_vma) || defined(tlb_end_vma)
>> -#error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma()
>> -#endif
>
> #ifdef CONFIG_ARCH_WANT_AGGRESSIVE_TLB_FLUSH_BATCHING
> #error ....
> #endif
>
> goes here...
>
>
>> static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
>> {
>> if (tlb->fullmm)
>> return;
>>
>> + if (IS_ENABLED(CONFIG_ARCH_WANT_AGGRESSIVE_TLB_FLUSH_BATCHING))
>> + return;
>
> Also, can you please stick to the CONFIG_MMU_GATHER_* namespace?
>
> I also don't think AGRESSIVE_FLUSH_BATCHING quite captures what it does.
> How about:
>
> CONFIG_MMU_GATHER_NO_PER_VMA_FLUSH
Yes please, have to have descriptive names.
I didn't quite see why this was much of an improvement though. Maybe
follow up patches take advantage of it? I didn't see how they all fit
together.
Thanks,
Nick
^ permalink raw reply
* [PATCH] powerpc/perf: Fix the guest crash issue with trace-imc
From: Athira Rajeev @ 2021-02-02 6:39 UTC (permalink / raw)
To: mpe; +Cc: maddy, linuxppc-dev
When perf kvm record with a trace_imc event is attached to a guest
pid (with the -p option), the qemu process gets killed with a permission
issue. This happens because the trace_imc event requires admin privileges
to monitor the process. If qemu creates threads, by default
child tasks also inherit the counters, and if there is no permission
to monitor qemu threads, we return permission denied (EACCES).
Fix this by returning EACCES only if there is no CAP_SYS_ADMIN and the
event doesn’t have inheritance.
Fixes: 012ae244845f ("powerpc/perf: Trace imc PMU functions")
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
---
arch/powerpc/perf/imc-pmu.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index e106909ff9c3..cc5679bfd28b 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1429,7 +1429,7 @@ static int trace_imc_event_init(struct perf_event *event)
if (event->attr.type != event->pmu->type)
return -ENOENT;
- if (!perfmon_capable())
+ if (!perfmon_capable() && !event->attr.inherit)
return -EACCES;
/* Return if this is a couting event */
--
1.8.3.1
^ permalink raw reply related
* Re: [PATCH v4 11/23] powerpc/syscall: Rename syscall_64.c into syscall.c
From: Nicholas Piggin @ 2021-02-02 6:38 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Christophe Leroy, David Laight,
Michael Ellerman, msuchanek@suse.de, Paul Mackerras
Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org
In-Reply-To: <0cf90825-da89-6464-98d4-dc7490bff557@csgroup.eu>
Excerpts from Christophe Leroy's message of February 2, 2021 4:15 pm:
>
>
> Le 28/01/2021 à 00:50, Nicholas Piggin a écrit :
>> Excerpts from David Laight's message of January 26, 2021 8:28 pm:
>>> From: Nicholas Piggin
>>>> Sent: 26 January 2021 10:21
>>>>
>>>> Excerpts from Christophe Leroy's message of January 26, 2021 12:48 am:
>>>>> syscall_64.c will be reused almost as is for PPC32.
>>>>>
>>>>> Rename it syscall.c
>>>>
>>>> Could you rename it to interrupt.c instead? A system call is an
>>>> interrupt, and the file now also has code to return from other
>>>> interrupts as well, and it matches the new asm/interrupt.h from
>>>> the interrupts series.
>>>
>>> Hmmm....
>>>
>>> That might make it harder for someone looking for the system call
>>> entry code to find it.
>>
>> It's very grep'able.
>>
>>> In some sense interrupts are the simpler case.
>>>
>>> Especially when comparing with other architectures which have
>>> special instructions for syscall entry.
>>
>> powerpc does have a special instruction for syscall, and it causes a
>> system call interrupt.
>>
>> I'm not sure about other architectures, but for powerpc its more
>> sensible to call it interrupt.c than syscall.c.
>
> Many other architectures have a syscall.c but for a different purpose: it contains arch specific
> system calls. We have that in powerpc as well, it is called syscalls.c
>
> So to avoid confusion, I'll rename it. But I think "interrupt" is maybe not the right name. An
> interrupt most of the time refers to IRQ.
That depends what you mean by interrupt and IRQ.
Linux kind of considers any asynchronous maskable interrupt an irq
(local_irq_disable()). But if you say irq it's more likely to mean
a device interrupt, and "interrupt" usually refers to the asynch
ones.
But Linux doesn't really assign names to synchronous interrupts in
core code. It doesn't say they aren't interrupts, it just doesn't
really have a convention for them at all.
Other architectures e.g., x86 also have things like interrupt
descriptor table for synchronous interrupts as well. That's where
I got the interrupt wrappers code from actually.
So it's really fine to use the proper arch-specific names for things
in arch code. I'm trying to slowly change names from exception to
interrupt.
> For me system call is not an interrupt in the way it
> doesn't unexpectedly interrupt a program flow. In powerpc manuals it is generally called exceptions,
> so I'm more inclined to call it exception.c
Actually that's backwards. Powerpc manuals (at least the one I look at)
calls them all interrupts including system calls, and also the system
call interrupt is actually the only one that doesn't appear to be
associated with an exception.
Also there is no distinction about expected/unexpected -- a data storage
interrupt is expected if you access a location without the right access
permissions for example, but it is still an interrupt.
These handlers very specifically deal with the change to execution flow
(i.e., the interrupt), they do *not* deal with the exception which may
be associated with it (that is the job of the handler).
And on the other hand you can deal with exceptions in some cases without
taking an interrupt at all. For example if you had MSR[EE]=0 you could
change the decrementer or execute msgclr or change HMER SPR etc to clear
various exceptions without ever taking the interrupt.
Thanks,
Nick
^ permalink raw reply
* Re: [PATCH] powerpc/fault: fix wrong KUAP fault for IO_URING
From: Aneesh Kumar K.V @ 2021-02-02 6:30 UTC (permalink / raw)
To: Christophe Leroy, Nicholas Piggin, Michael Ellerman, Zorro Lang
Cc: Jens Axboe, linuxppc-dev
In-Reply-To: <7c06ba68-9959-44bc-233b-473d7cbc574a@csgroup.eu>
On 2/2/21 11:50 AM, Christophe Leroy wrote:
>
>
> Le 02/02/2021 à 07:16, Aneesh Kumar K.V a écrit :
>> On 2/2/21 11:32 AM, Christophe Leroy wrote:
>>>
>>>
>>> Le 02/02/2021 à 06:55, Aneesh Kumar K.V a écrit :
>>>> Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> writes:
>>>>
>>>>> Nicholas Piggin <npiggin@gmail.com> writes:
>>>>>
>>>>>> Excerpts from Michael Ellerman's message of January 30, 2021 9:22 pm:
>>>>>>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>>>>>>> +Aneesh
>>>>>>>>
>>>>>>>> Le 29/01/2021 à 07:52, Zorro Lang a écrit :
>>>>>>> ..
>>>>>>>>> [ 96.200296] ------------[ cut here ]------------
>>>>>>>>> [ 96.200304] Bug: Read fault blocked by KUAP!
>>>>>>>>> [ 96.200309] WARNING: CPU: 3 PID: 1876 at
>>>>>>>>> arch/powerpc/mm/fault.c:229 bad_kernel_fault+0x180/0x310
>>>>>>>>
>>>>>>>>> [ 96.200734] NIP [c000000000849424]
>>>>>>>>> fault_in_pages_readable+0x104/0x350
>>>>>>>>> [ 96.200741] LR [c00000000084952c]
>>>>>>>>> fault_in_pages_readable+0x20c/0x350
>>>>>>>>> [ 96.200747] --- interrupt: 300
>>>>>>>>
>>>>>>>>
>>>>>>>> Problem happens in a section where userspace access is supposed
>>>>>>>> to be granted, so the patch you
>>>>>>>> proposed is definitely not the right fix.
>>>>>>>>
>>>>>>>> c000000000849408: 2c 01 00 4c isync
>>>>>>>> c00000000084940c: a6 03 3d 7d mtspr 29,r9 <== granting
>>>>>>>> userspace access permission
>>>>>>>> c000000000849410: 2c 01 00 4c isync
>>>>>>>> c000000000849414: 00 00 36 e9 ld r9,0(r22)
>>>>>>>> c000000000849418: 20 00 29 81 lwz r9,32(r9)
>>>>>>>> c00000000084941c: 00 02 29 71 andi. r9,r9,512
>>>>>>>> c000000000849420: 78 d3 5e 7f mr r30,r26
>>>>>>>> ==> c000000000849424: 00 00 bf 8b lbz r29,0(r31) <==
>>>>>>>> accessing userspace
>>>>>>>> c000000000849428: 10 00 82 41 beq c000000000849438
>>>>>>>> <fault_in_pages_readable+0x118>
>>>>>>>> c00000000084942c: 2c 01 00 4c isync
>>>>>>>> c000000000849430: a6 03 bd 7e mtspr 29,r21 <==
>>>>>>>> clearing userspace access permission
>>>>>>>> c000000000849434: 2c 01 00 4c isync
>>>>>>>>
>>>>>>>> My first guess is that the problem is linked to the following
>>>>>>>> function, see the comment
>>>>>>>>
>>>>>>>> /*
>>>>>>>> * For kernel thread that doesn't have thread.regs return
>>>>>>>> * default AMR/IAMR values.
>>>>>>>> */
>>>>>>>> static inline u64 current_thread_amr(void)
>>>>>>>> {
>>>>>>>> if (current->thread.regs)
>>>>>>>> return current->thread.regs->amr;
>>>>>>>> return AMR_KUAP_BLOCKED;
>>>>>>>> }
>>>>>>>>
>>>>>>>> Above function was introduced by commit 48a8ab4eeb82
>>>>>>>> ("powerpc/book3s64/pkeys: Don't update SPRN_AMR
>>>>>>>> when in kernel mode")
>>>>>>>
>>>>>>> Yeah that's a bit of a curly one.
>>>>>>>
>>>>>>> At some point io_uring did kthread_use_mm(), which is supposed to
>>>>>>> mean
>>>>>>> the kthread can operate on behalf of the original process that
>>>>>>> submitted
>>>>>>> the IO.
>>>>>>>
>>>>>>> But because KUAP is implemented using memory protection keys, it
>>>>>>> depends
>>>>>>> on the value of the AMR register, which is not part of the mm,
>>>>>>> it's in
>>>>>>> thread.regs->amr.
>>>>>>>
>>>>>>> And what's worse by the time we're in kthread_use_mm() we no
>>>>>>> longer have
>>>>>>> access to the thread.regs->amr of the original process that
>>>>>>> submitted
>>>>>>> the IO.
>>>>>>>
>>>>>>> We also can't simply move the AMR into the mm, precisely because
>>>>>>> it's
>>>>>>> per thread, not per mm.
>>>>>>>
>>>>>>> So TBH I don't know how we're going to fix this.
>>>>>>>
>>>>>>> I guess we could return AMR=unblocked for kernel threads, but that's
>>>>>>> arguably a bug because it allows a process to circumvent memory
>>>>>>> keys by
>>>>>>> asking the kernel to do the access.
>>>>>>
>>>>>> We shouldn't need to inherit AMR should we? We only need it to be
>>>>>> locked
>>>>>> for kernel threads until it's explicitly unlocked -- nothing mm
>>>>>> specific
>>>>>> there. I think current_thread_amr could return 0 for kernel
>>>>>> threads? Or
>>>>>> I would even avoid using that function for allow_user_access and open
>>>>>> code the kthread case and remove it from current_thread_amr().
>>>>>>
>>>>>> Thanks,
>>>>>> Nick
>>>>>
>>>>
>>>> updated one
>>>>
>>>> From 8fdb0680f983940d61f91da8252b13c8d3e8ebee Mon Sep 17 00:00:00 2001
>>>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
>>>> Date: Tue, 2 Feb 2021 09:23:38 +0530
>>>> Subject: [PATCH v2] powerpc/kuap: Allow kernel thread to access
>>>> userspace
>>>> after kthread_use_mm
>>>>
>>>> This fix the bad fault reported by KUAP when io_wqe_worker access
>>>> userspace.
>>>>
>>>> Bug: Read fault blocked by KUAP!
>>>> WARNING: CPU: 1 PID: 101841 at arch/powerpc/mm/fault.c:229
>>>> __do_page_fault+0x6b4/0xcd0
>>>> NIP [c00000000009e7e4] __do_page_fault+0x6b4/0xcd0
>>>> LR [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0
>>>> ..........
>>>> Call Trace:
>>>> [c000000016367330] [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0
>>>> (unreliable)
>>>> [c0000000163673e0] [c00000000009ee3c] do_page_fault+0x3c/0x120
>>>> [c000000016367430] [c00000000000c848] handle_page_fault+0x10/0x2c
>>>> --- interrupt: 300 at iov_iter_fault_in_readable+0x148/0x6f0
>>>> ..........
>>>> NIP [c0000000008e8228] iov_iter_fault_in_readable+0x148/0x6f0
>>>> LR [c0000000008e834c] iov_iter_fault_in_readable+0x26c/0x6f0
>>>> interrupt: 300
>>>> [c0000000163677e0] [c0000000007154a0] iomap_write_actor+0xc0/0x280
>>>> [c000000016367880] [c00000000070fc94] iomap_apply+0x1c4/0x780
>>>> [c000000016367990] [c000000000710330]
>>>> iomap_file_buffered_write+0xa0/0x120
>>>> [c0000000163679e0] [c00800000040791c]
>>>> xfs_file_buffered_aio_write+0x314/0x5e0 [xfs]
>>>> [c000000016367a90] [c0000000006d74bc] io_write+0x10c/0x460
>>>> [c000000016367bb0] [c0000000006d80e4] io_issue_sqe+0x8d4/0x1200
>>>> [c000000016367c70] [c0000000006d8ad0] io_wq_submit_work+0xc0/0x250
>>>> [c000000016367cb0] [c0000000006e2578]
>>>> io_worker_handle_work+0x498/0x800
>>>> [c000000016367d40] [c0000000006e2cdc] io_wqe_worker+0x3fc/0x4f0
>>>> [c000000016367da0] [c0000000001cb0a4] kthread+0x1c4/0x1d0
>>>> [c000000016367e10] [c00000000000dbf0]
>>>> ret_from_kernel_thread+0x5c/0x6c
>>>>
>>>> The kernel consider thread AMR value for kernel thread to be
>>>> AMR_KUAP_BLOCKED. Hence access to userspace is denied. This
>>>> of course not correct and we should allow userspace access after
>>>> kthread_use_mm(). To be precise, kthread_use_mm() should inherit the
>>>> AMR value of the operating address space. But, the AMR value is
>>>> thread-specific and we inherit the address space and not thread
>>>> access restrictions. Because of this ignore AMR value when accessing
>>>> userspace via kernel thread.
>>>>
>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>> ---
>>>> Changes from v1:
>>>> * Address review feedback from Nick
>>>>
>>>> arch/powerpc/include/asm/book3s/64/kup.h | 8 +++++++-
>>>> 1 file changed, 7 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/arch/powerpc/include/asm/book3s/64/kup.h
>>>> b/arch/powerpc/include/asm/book3s/64/kup.h
>>>> index f50f72e535aa..95f4df99249e 100644
>>>> --- a/arch/powerpc/include/asm/book3s/64/kup.h
>>>> +++ b/arch/powerpc/include/asm/book3s/64/kup.h
>>>> @@ -384,7 +384,13 @@ static __always_inline void
>>>> allow_user_access(void __user *to, const void __user
>>>> // This is written so we can resolve to a single case at build
>>>> time
>>>> BUILD_BUG_ON(!__builtin_constant_p(dir));
>>>> - if (mmu_has_feature(MMU_FTR_PKEY))
>>>> + /*
>>>> + * if it is a kthread that did kthread_use_mm() don't
>>>> + * use current_thread_amr().
>>>
>>> According to include/linux/sched.h, PF_KTHREAD means /* I am a kernel
>>> thread */
>>> It doesn't seem to be related to kthread_use_mm()
>>
>> That should be a sufficient check here. if we did reach here without
>> calling kthread_use_mm, we will crash on access because we don't have
>> a mm attached to the current process. a kernel thread with
>> kthread_use_mm has
>
> Ok but then the comment doesn't match the check.
I was trying to be explicit in the comment that we expect the thread to
have done kthread_use_mm().
>
> And also the comment in current_thread_amr() is then misleading.
>
> Why not do the current->flags & PF_KTHREAD check in current_thread_amr()
> and return 0 in that case instead of BLOCKED ?
In my view current_thread_amr() is more generic and we want to be
explicit there that a kernel thread AMR is KUAP_BLOCKED. Only when we
call allow user access, we relax the AMR value.
>
>>
>> current->mm == current->active_mm && current->flags & PF_KTHREAD.
>>
>> The first part is true for every other process too.
>>
>>>
>>>> + */
>>>> + if (current->flags & PF_KTHREAD)
>>>> + thread_amr = 0;
>>>> + else if (mmu_has_feature(MMU_FTR_PKEY))
>>>> thread_amr = current_thread_amr();
>>>> if (dir == KUAP_READ)
>>>>
>>>
>>> Christophe
>>
>>
>> -aneesh
^ permalink raw reply
* Re: [PATCH] powerpc/fault: fix wrong KUAP fault for IO_URING
From: Christophe Leroy @ 2021-02-02 6:20 UTC (permalink / raw)
To: Aneesh Kumar K.V, Nicholas Piggin, Michael Ellerman, Zorro Lang
Cc: Jens Axboe, linuxppc-dev
In-Reply-To: <7c48c517-700d-e114-503d-e68e0e73c534@linux.ibm.com>
Le 02/02/2021 à 07:16, Aneesh Kumar K.V a écrit :
> On 2/2/21 11:32 AM, Christophe Leroy wrote:
>>
>>
>> Le 02/02/2021 à 06:55, Aneesh Kumar K.V a écrit :
>>> Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> writes:
>>>
>>>> Nicholas Piggin <npiggin@gmail.com> writes:
>>>>
>>>>> Excerpts from Michael Ellerman's message of January 30, 2021 9:22 pm:
>>>>>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>>>>>> +Aneesh
>>>>>>>
>>>>>>> Le 29/01/2021 à 07:52, Zorro Lang a écrit :
>>>>>> ..
>>>>>>>> [ 96.200296] ------------[ cut here ]------------
>>>>>>>> [ 96.200304] Bug: Read fault blocked by KUAP!
>>>>>>>> [ 96.200309] WARNING: CPU: 3 PID: 1876 at arch/powerpc/mm/fault.c:229
>>>>>>>> bad_kernel_fault+0x180/0x310
>>>>>>>
>>>>>>>> [ 96.200734] NIP [c000000000849424] fault_in_pages_readable+0x104/0x350
>>>>>>>> [ 96.200741] LR [c00000000084952c] fault_in_pages_readable+0x20c/0x350
>>>>>>>> [ 96.200747] --- interrupt: 300
>>>>>>>
>>>>>>>
>>>>>>> Problem happens in a section where userspace access is supposed to be granted, so the patch you
>>>>>>> proposed is definitely not the right fix.
>>>>>>>
>>>>>>> c000000000849408: 2c 01 00 4c isync
>>>>>>> c00000000084940c: a6 03 3d 7d mtspr 29,r9 <== granting userspace access permission
>>>>>>> c000000000849410: 2c 01 00 4c isync
>>>>>>> c000000000849414: 00 00 36 e9 ld r9,0(r22)
>>>>>>> c000000000849418: 20 00 29 81 lwz r9,32(r9)
>>>>>>> c00000000084941c: 00 02 29 71 andi. r9,r9,512
>>>>>>> c000000000849420: 78 d3 5e 7f mr r30,r26
>>>>>>> ==> c000000000849424: 00 00 bf 8b lbz r29,0(r31) <== accessing userspace
>>>>>>> c000000000849428: 10 00 82 41 beq c000000000849438 <fault_in_pages_readable+0x118>
>>>>>>> c00000000084942c: 2c 01 00 4c isync
>>>>>>> c000000000849430: a6 03 bd 7e mtspr 29,r21 <== clearing userspace access permission
>>>>>>> c000000000849434: 2c 01 00 4c isync
>>>>>>>
>>>>>>> My first guess is that the problem is linked to the following function, see the comment
>>>>>>>
>>>>>>> /*
>>>>>>> * For kernel thread that doesn't have thread.regs return
>>>>>>> * default AMR/IAMR values.
>>>>>>> */
>>>>>>> static inline u64 current_thread_amr(void)
>>>>>>> {
>>>>>>> if (current->thread.regs)
>>>>>>> return current->thread.regs->amr;
>>>>>>> return AMR_KUAP_BLOCKED;
>>>>>>> }
>>>>>>>
>>>>>>> Above function was introduced by commit 48a8ab4eeb82 ("powerpc/book3s64/pkeys: Don't update
>>>>>>> SPRN_AMR
>>>>>>> when in kernel mode")
>>>>>>
>>>>>> Yeah that's a bit of a curly one.
>>>>>>
>>>>>> At some point io_uring did kthread_use_mm(), which is supposed to mean
>>>>>> the kthread can operate on behalf of the original process that submitted
>>>>>> the IO.
>>>>>>
>>>>>> But because KUAP is implemented using memory protection keys, it depends
>>>>>> on the value of the AMR register, which is not part of the mm, it's in
>>>>>> thread.regs->amr.
>>>>>>
>>>>>> And what's worse by the time we're in kthread_use_mm() we no longer have
>>>>>> access to the thread.regs->amr of the original process that submitted
>>>>>> the IO.
>>>>>>
>>>>>> We also can't simply move the AMR into the mm, precisely because it's
>>>>>> per thread, not per mm.
>>>>>>
>>>>>> So TBH I don't know how we're going to fix this.
>>>>>>
>>>>>> I guess we could return AMR=unblocked for kernel threads, but that's
>>>>>> arguably a bug because it allows a process to circumvent memory keys by
>>>>>> asking the kernel to do the access.
>>>>>
>>>>> We shouldn't need to inherit AMR should we? We only need it to be locked
>>>>> for kernel threads until it's explicitly unlocked -- nothing mm specific
>>>>> there. I think current_thread_amr could return 0 for kernel threads? Or
>>>>> I would even avoid using that function for allow_user_access and open
>>>>> code the kthread case and remove it from current_thread_amr().
>>>>>
>>>>> Thanks,
>>>>> Nick
>>>>
>>>
>>> updated one
>>>
>>> From 8fdb0680f983940d61f91da8252b13c8d3e8ebee Mon Sep 17 00:00:00 2001
>>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
>>> Date: Tue, 2 Feb 2021 09:23:38 +0530
>>> Subject: [PATCH v2] powerpc/kuap: Allow kernel thread to access userspace
>>> after kthread_use_mm
>>>
>>> This fix the bad fault reported by KUAP when io_wqe_worker access userspace.
>>>
>>> Bug: Read fault blocked by KUAP!
>>> WARNING: CPU: 1 PID: 101841 at arch/powerpc/mm/fault.c:229 __do_page_fault+0x6b4/0xcd0
>>> NIP [c00000000009e7e4] __do_page_fault+0x6b4/0xcd0
>>> LR [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0
>>> ..........
>>> Call Trace:
>>> [c000000016367330] [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0 (unreliable)
>>> [c0000000163673e0] [c00000000009ee3c] do_page_fault+0x3c/0x120
>>> [c000000016367430] [c00000000000c848] handle_page_fault+0x10/0x2c
>>> --- interrupt: 300 at iov_iter_fault_in_readable+0x148/0x6f0
>>> ..........
>>> NIP [c0000000008e8228] iov_iter_fault_in_readable+0x148/0x6f0
>>> LR [c0000000008e834c] iov_iter_fault_in_readable+0x26c/0x6f0
>>> interrupt: 300
>>> [c0000000163677e0] [c0000000007154a0] iomap_write_actor+0xc0/0x280
>>> [c000000016367880] [c00000000070fc94] iomap_apply+0x1c4/0x780
>>> [c000000016367990] [c000000000710330] iomap_file_buffered_write+0xa0/0x120
>>> [c0000000163679e0] [c00800000040791c] xfs_file_buffered_aio_write+0x314/0x5e0 [xfs]
>>> [c000000016367a90] [c0000000006d74bc] io_write+0x10c/0x460
>>> [c000000016367bb0] [c0000000006d80e4] io_issue_sqe+0x8d4/0x1200
>>> [c000000016367c70] [c0000000006d8ad0] io_wq_submit_work+0xc0/0x250
>>> [c000000016367cb0] [c0000000006e2578] io_worker_handle_work+0x498/0x800
>>> [c000000016367d40] [c0000000006e2cdc] io_wqe_worker+0x3fc/0x4f0
>>> [c000000016367da0] [c0000000001cb0a4] kthread+0x1c4/0x1d0
>>> [c000000016367e10] [c00000000000dbf0] ret_from_kernel_thread+0x5c/0x6c
>>>
>>> The kernel consider thread AMR value for kernel thread to be
>>> AMR_KUAP_BLOCKED. Hence access to userspace is denied. This
>>> of course not correct and we should allow userspace access after
>>> kthread_use_mm(). To be precise, kthread_use_mm() should inherit the
>>> AMR value of the operating address space. But, the AMR value is
>>> thread-specific and we inherit the address space and not thread
>>> access restrictions. Because of this ignore AMR value when accessing
>>> userspace via kernel thread.
>>>
>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>> ---
>>> Changes from v1:
>>> * Address review feedback from Nick
>>>
>>> arch/powerpc/include/asm/book3s/64/kup.h | 8 +++++++-
>>> 1 file changed, 7 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
>>> index f50f72e535aa..95f4df99249e 100644
>>> --- a/arch/powerpc/include/asm/book3s/64/kup.h
>>> +++ b/arch/powerpc/include/asm/book3s/64/kup.h
>>> @@ -384,7 +384,13 @@ static __always_inline void allow_user_access(void __user *to, const void
>>> __user
>>> // This is written so we can resolve to a single case at build time
>>> BUILD_BUG_ON(!__builtin_constant_p(dir));
>>> - if (mmu_has_feature(MMU_FTR_PKEY))
>>> + /*
>>> + * if it is a kthread that did kthread_use_mm() don't
>>> + * use current_thread_amr().
>>
>> According to include/linux/sched.h, PF_KTHREAD means /* I am a kernel thread */
>> It doesn't seem to be related to kthread_use_mm()
>
> That should be a sufficient check here. if we did reach here without calling kthread_user_mm, we
> will crash on access because we don't have a mm attached to the current process. a kernel thread
> with kthread_use_mm has
Ok but then the comment doesn't match the check.
And also the comment in current_thread_amr() is then misleading.
Why not do the current->flags & PF_KTHREAD check in current_thread_amr() and return 0 in that case
instead of BLOCKED ?
>
> current->mm == current->active_mm && current->flags & PF_KTHREAD.
>
> The first part is true for every other process too.
>
>>
>>> + */
>>> + if (current->flags & PF_KTHREAD)
>>> + thread_amr = 0;
>>> + else if (mmu_has_feature(MMU_FTR_PKEY))
>>> thread_amr = current_thread_amr();
>>> if (dir == KUAP_READ)
>>>
>>
>> Christophe
>
>
> -aneesh
^ permalink raw reply
* Re: [PATCH] powerpc/fault: fix wrong KUAP fault for IO_URING
From: Aneesh Kumar K.V @ 2021-02-02 6:16 UTC (permalink / raw)
To: Christophe Leroy, Nicholas Piggin, Michael Ellerman, Zorro Lang
Cc: Jens Axboe, linuxppc-dev
In-Reply-To: <6b081ea7-e4ee-21bb-7085-e33b4e5c6205@csgroup.eu>
On 2/2/21 11:32 AM, Christophe Leroy wrote:
>
>
> Le 02/02/2021 à 06:55, Aneesh Kumar K.V a écrit :
>> Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> writes:
>>
>>> Nicholas Piggin <npiggin@gmail.com> writes:
>>>
>>>> Excerpts from Michael Ellerman's message of January 30, 2021 9:22 pm:
>>>>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>>>>> +Aneesh
>>>>>>
>>>>>> Le 29/01/2021 à 07:52, Zorro Lang a écrit :
>>>>> ..
>>>>>>> [ 96.200296] ------------[ cut here ]------------
>>>>>>> [ 96.200304] Bug: Read fault blocked by KUAP!
>>>>>>> [ 96.200309] WARNING: CPU: 3 PID: 1876 at
>>>>>>> arch/powerpc/mm/fault.c:229 bad_kernel_fault+0x180/0x310
>>>>>>
>>>>>>> [ 96.200734] NIP [c000000000849424]
>>>>>>> fault_in_pages_readable+0x104/0x350
>>>>>>> [ 96.200741] LR [c00000000084952c]
>>>>>>> fault_in_pages_readable+0x20c/0x350
>>>>>>> [ 96.200747] --- interrupt: 300
>>>>>>
>>>>>>
>>>>>> Problem happens in a section where userspace access is supposed to
>>>>>> be granted, so the patch you
>>>>>> proposed is definitely not the right fix.
>>>>>>
>>>>>> c000000000849408: 2c 01 00 4c isync
>>>>>> c00000000084940c: a6 03 3d 7d mtspr 29,r9 <== granting
>>>>>> userspace access permission
>>>>>> c000000000849410: 2c 01 00 4c isync
>>>>>> c000000000849414: 00 00 36 e9 ld r9,0(r22)
>>>>>> c000000000849418: 20 00 29 81 lwz r9,32(r9)
>>>>>> c00000000084941c: 00 02 29 71 andi. r9,r9,512
>>>>>> c000000000849420: 78 d3 5e 7f mr r30,r26
>>>>>> ==> c000000000849424: 00 00 bf 8b lbz r29,0(r31) <==
>>>>>> accessing userspace
>>>>>> c000000000849428: 10 00 82 41 beq c000000000849438
>>>>>> <fault_in_pages_readable+0x118>
>>>>>> c00000000084942c: 2c 01 00 4c isync
>>>>>> c000000000849430: a6 03 bd 7e mtspr 29,r21 <== clearing
>>>>>> userspace access permission
>>>>>> c000000000849434: 2c 01 00 4c isync
>>>>>>
>>>>>> My first guess is that the problem is linked to the following
>>>>>> function, see the comment
>>>>>>
>>>>>> /*
>>>>>> * For kernel thread that doesn't have thread.regs return
>>>>>> * default AMR/IAMR values.
>>>>>> */
>>>>>> static inline u64 current_thread_amr(void)
>>>>>> {
>>>>>> if (current->thread.regs)
>>>>>> return current->thread.regs->amr;
>>>>>> return AMR_KUAP_BLOCKED;
>>>>>> }
>>>>>>
>>>>>> Above function was introduced by commit 48a8ab4eeb82
>>>>>> ("powerpc/book3s64/pkeys: Don't update SPRN_AMR
>>>>>> when in kernel mode")
>>>>>
>>>>> Yeah that's a bit of a curly one.
>>>>>
>>>>> At some point io_uring did kthread_use_mm(), which is supposed to mean
>>>>> the kthread can operate on behalf of the original process that
>>>>> submitted
>>>>> the IO.
>>>>>
>>>>> But because KUAP is implemented using memory protection keys, it
>>>>> depends
>>>>> on the value of the AMR register, which is not part of the mm, it's in
>>>>> thread.regs->amr.
>>>>>
>>>>> And what's worse by the time we're in kthread_use_mm() we no longer
>>>>> have
>>>>> access to the thread.regs->amr of the original process that submitted
>>>>> the IO.
>>>>>
>>>>> We also can't simply move the AMR into the mm, precisely because it's
>>>>> per thread, not per mm.
>>>>>
>>>>> So TBH I don't know how we're going to fix this.
>>>>>
>>>>> I guess we could return AMR=unblocked for kernel threads, but that's
>>>>> arguably a bug because it allows a process to circumvent memory
>>>>> keys by
>>>>> asking the kernel to do the access.
>>>>
>>>> We shouldn't need to inherit AMR should we? We only need it to be
>>>> locked
>>>> for kernel threads until it's explicitly unlocked -- nothing mm
>>>> specific
>>>> there. I think current_thread_amr could return 0 for kernel threads? Or
>>>> I would even avoid using that function for allow_user_access and open
>>>> code the kthread case and remove it from current_thread_amr().
>>>>
>>>> Thanks,
>>>> Nick
>>>
>>
>> updated one
>>
>> From 8fdb0680f983940d61f91da8252b13c8d3e8ebee Mon Sep 17 00:00:00 2001
>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
>> Date: Tue, 2 Feb 2021 09:23:38 +0530
>> Subject: [PATCH v2] powerpc/kuap: Allow kernel thread to access userspace
>> after kthread_use_mm
>>
>> This fix the bad fault reported by KUAP when io_wqe_worker access
>> userspace.
>>
>> Bug: Read fault blocked by KUAP!
>> WARNING: CPU: 1 PID: 101841 at arch/powerpc/mm/fault.c:229
>> __do_page_fault+0x6b4/0xcd0
>> NIP [c00000000009e7e4] __do_page_fault+0x6b4/0xcd0
>> LR [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0
>> ..........
>> Call Trace:
>> [c000000016367330] [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0
>> (unreliable)
>> [c0000000163673e0] [c00000000009ee3c] do_page_fault+0x3c/0x120
>> [c000000016367430] [c00000000000c848] handle_page_fault+0x10/0x2c
>> --- interrupt: 300 at iov_iter_fault_in_readable+0x148/0x6f0
>> ..........
>> NIP [c0000000008e8228] iov_iter_fault_in_readable+0x148/0x6f0
>> LR [c0000000008e834c] iov_iter_fault_in_readable+0x26c/0x6f0
>> interrupt: 300
>> [c0000000163677e0] [c0000000007154a0] iomap_write_actor+0xc0/0x280
>> [c000000016367880] [c00000000070fc94] iomap_apply+0x1c4/0x780
>> [c000000016367990] [c000000000710330]
>> iomap_file_buffered_write+0xa0/0x120
>> [c0000000163679e0] [c00800000040791c]
>> xfs_file_buffered_aio_write+0x314/0x5e0 [xfs]
>> [c000000016367a90] [c0000000006d74bc] io_write+0x10c/0x460
>> [c000000016367bb0] [c0000000006d80e4] io_issue_sqe+0x8d4/0x1200
>> [c000000016367c70] [c0000000006d8ad0] io_wq_submit_work+0xc0/0x250
>> [c000000016367cb0] [c0000000006e2578] io_worker_handle_work+0x498/0x800
>> [c000000016367d40] [c0000000006e2cdc] io_wqe_worker+0x3fc/0x4f0
>> [c000000016367da0] [c0000000001cb0a4] kthread+0x1c4/0x1d0
>> [c000000016367e10] [c00000000000dbf0] ret_from_kernel_thread+0x5c/0x6c
>>
>> The kernel consider thread AMR value for kernel thread to be
>> AMR_KUAP_BLOCKED. Hence access to userspace is denied. This
>> of course not correct and we should allow userspace access after
>> kthread_use_mm(). To be precise, kthread_use_mm() should inherit the
>> AMR value of the operating address space. But, the AMR value is
>> thread-specific and we inherit the address space and not thread
>> access restrictions. Because of this ignore AMR value when accessing
>> userspace via kernel thread.
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>> Changes from v1:
>> * Address review feedback from Nick
>>
>> arch/powerpc/include/asm/book3s/64/kup.h | 8 +++++++-
>> 1 file changed, 7 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/kup.h
>> b/arch/powerpc/include/asm/book3s/64/kup.h
>> index f50f72e535aa..95f4df99249e 100644
>> --- a/arch/powerpc/include/asm/book3s/64/kup.h
>> +++ b/arch/powerpc/include/asm/book3s/64/kup.h
>> @@ -384,7 +384,13 @@ static __always_inline void
>> allow_user_access(void __user *to, const void __user
>> // This is written so we can resolve to a single case at build time
>> BUILD_BUG_ON(!__builtin_constant_p(dir));
>> - if (mmu_has_feature(MMU_FTR_PKEY))
>> + /*
>> + * if it is a kthread that did kthread_use_mm() don't
>> + * use current_thread_amr().
>
> According to include/linux/sched.h, PF_KTHREAD means /* I am a kernel
> thread */
> It doesn't seem to be related to kthread_use_mm()
That should be a sufficient check here. If we did reach here without
calling kthread_use_mm(), we will crash on access because we don't have an
mm attached to the current process. A kernel thread that called kthread_use_mm() has
current->mm == current->active_mm && current->flags & PF_KTHREAD.
The first part is true for every other process too.
>
>> + */
>> + if (current->flags & PF_KTHREAD)
>> + thread_amr = 0;
>> + else if (mmu_has_feature(MMU_FTR_PKEY))
>> thread_amr = current_thread_amr();
>> if (dir == KUAP_READ)
>>
>
> Christophe
-aneesh
^ permalink raw reply
* Re: [PATCH v4 11/23] powerpc/syscall: Rename syscall_64.c into syscall.c
From: Christophe Leroy @ 2021-02-02 6:15 UTC (permalink / raw)
To: Nicholas Piggin, Benjamin Herrenschmidt, David Laight,
Michael Ellerman, msuchanek@suse.de, Paul Mackerras
Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org
In-Reply-To: <1611791083.sqnnh21vv0.astroid@bobo.none>
Le 28/01/2021 à 00:50, Nicholas Piggin a écrit :
> Excerpts from David Laight's message of January 26, 2021 8:28 pm:
>> From: Nicholas Piggin
>>> Sent: 26 January 2021 10:21
>>>
>>> Excerpts from Christophe Leroy's message of January 26, 2021 12:48 am:
>>>> syscall_64.c will be reused almost as is for PPC32.
>>>>
>>>> Rename it syscall.c
>>>
>>> Could you rename it to interrupt.c instead? A system call is an
>>> interrupt, and the file now also has code to return from other
>>> interrupts as well, and it matches the new asm/interrupt.h from
>>> the interrupts series.
>>
>> Hmmm....
>>
>> That might make it harder for someone looking for the system call
>> entry code to find it.
>
> It's very grep'able.
>
>> In some sense interrupts are the simpler case.
>>
>> Especially when comparing with other architectures which have
>> special instructions for syscall entry.
>
> powerpc does have a special instruction for syscall, and it causes a
> system call interrupt.
>
> I'm not sure about other architectures, but for powerpc its more
> sensible to call it interrupt.c than syscall.c.
Many other architectures have a syscall.c but for a different purpose: it contains arch specific
system calls. We have that in powerpc as well, it is called syscalls.c
So to avoid confusion, I'll rename it. But I think "interrupt" is maybe not the right name. An
interrupt most of the time refers to an IRQ. For me a system call is not an interrupt, in that it
doesn't unexpectedly interrupt a program flow. In powerpc manuals these are generally called exceptions,
so I'm more inclined to call it exception.c
Christophe
^ permalink raw reply
* Re: [PATCH] powerpc/fault: fix wrong KUAP fault for IO_URING
From: Christophe Leroy @ 2021-02-02 6:02 UTC (permalink / raw)
To: Aneesh Kumar K.V, Nicholas Piggin, Michael Ellerman, Zorro Lang
Cc: Jens Axboe, linuxppc-dev
In-Reply-To: <874kivrp2v.fsf@linux.ibm.com>
Le 02/02/2021 à 06:55, Aneesh Kumar K.V a écrit :
> Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> writes:
>
>> Nicholas Piggin <npiggin@gmail.com> writes:
>>
>>> Excerpts from Michael Ellerman's message of January 30, 2021 9:22 pm:
>>>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>>>> +Aneesh
>>>>>
>>>>> Le 29/01/2021 à 07:52, Zorro Lang a écrit :
>>>> ..
>>>>>> [ 96.200296] ------------[ cut here ]------------
>>>>>> [ 96.200304] Bug: Read fault blocked by KUAP!
>>>>>> [ 96.200309] WARNING: CPU: 3 PID: 1876 at arch/powerpc/mm/fault.c:229 bad_kernel_fault+0x180/0x310
>>>>>
>>>>>> [ 96.200734] NIP [c000000000849424] fault_in_pages_readable+0x104/0x350
>>>>>> [ 96.200741] LR [c00000000084952c] fault_in_pages_readable+0x20c/0x350
>>>>>> [ 96.200747] --- interrupt: 300
>>>>>
>>>>>
>>>>> Problem happens in a section where userspace access is supposed to be granted, so the patch you
>>>>> proposed is definitely not the right fix.
>>>>>
>>>>> c000000000849408: 2c 01 00 4c isync
>>>>> c00000000084940c: a6 03 3d 7d mtspr 29,r9 <== granting userspace access permission
>>>>> c000000000849410: 2c 01 00 4c isync
>>>>> c000000000849414: 00 00 36 e9 ld r9,0(r22)
>>>>> c000000000849418: 20 00 29 81 lwz r9,32(r9)
>>>>> c00000000084941c: 00 02 29 71 andi. r9,r9,512
>>>>> c000000000849420: 78 d3 5e 7f mr r30,r26
>>>>> ==> c000000000849424: 00 00 bf 8b lbz r29,0(r31) <== accessing userspace
>>>>> c000000000849428: 10 00 82 41 beq c000000000849438 <fault_in_pages_readable+0x118>
>>>>> c00000000084942c: 2c 01 00 4c isync
>>>>> c000000000849430: a6 03 bd 7e mtspr 29,r21 <== clearing userspace access permission
>>>>> c000000000849434: 2c 01 00 4c isync
>>>>>
>>>>> My first guess is that the problem is linked to the following function, see the comment
>>>>>
>>>>> /*
>>>>> * For kernel thread that doesn't have thread.regs return
>>>>> * default AMR/IAMR values.
>>>>> */
>>>>> static inline u64 current_thread_amr(void)
>>>>> {
>>>>> if (current->thread.regs)
>>>>> return current->thread.regs->amr;
>>>>> return AMR_KUAP_BLOCKED;
>>>>> }
>>>>>
>>>>> Above function was introduced by commit 48a8ab4eeb82 ("powerpc/book3s64/pkeys: Don't update SPRN_AMR
>>>>> when in kernel mode")
>>>>
>>>> Yeah that's a bit of a curly one.
>>>>
>>>> At some point io_uring did kthread_use_mm(), which is supposed to mean
>>>> the kthread can operate on behalf of the original process that submitted
>>>> the IO.
>>>>
>>>> But because KUAP is implemented using memory protection keys, it depends
>>>> on the value of the AMR register, which is not part of the mm, it's in
>>>> thread.regs->amr.
>>>>
>>>> And what's worse by the time we're in kthread_use_mm() we no longer have
>>>> access to the thread.regs->amr of the original process that submitted
>>>> the IO.
>>>>
>>>> We also can't simply move the AMR into the mm, precisely because it's
>>>> per thread, not per mm.
>>>>
>>>> So TBH I don't know how we're going to fix this.
>>>>
>>>> I guess we could return AMR=unblocked for kernel threads, but that's
>>>> arguably a bug because it allows a process to circumvent memory keys by
>>>> asking the kernel to do the access.
>>>
>>> We shouldn't need to inherit AMR should we? We only need it to be locked
>>> for kernel threads until it's explicitly unlocked -- nothing mm specific
>>> there. I think current_thread_amr could return 0 for kernel threads? Or
>>> I would even avoid using that function for allow_user_access and open
>>> code the kthread case and remove it from current_thread_amr().
>>>
>>> Thanks,
>>> Nick
>>
>
> updated one
>
> From 8fdb0680f983940d61f91da8252b13c8d3e8ebee Mon Sep 17 00:00:00 2001
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
> Date: Tue, 2 Feb 2021 09:23:38 +0530
> Subject: [PATCH v2] powerpc/kuap: Allow kernel thread to access userspace
> after kthread_use_mm
>
> This fix the bad fault reported by KUAP when io_wqe_worker access userspace.
>
> Bug: Read fault blocked by KUAP!
> WARNING: CPU: 1 PID: 101841 at arch/powerpc/mm/fault.c:229 __do_page_fault+0x6b4/0xcd0
> NIP [c00000000009e7e4] __do_page_fault+0x6b4/0xcd0
> LR [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0
> ..........
> Call Trace:
> [c000000016367330] [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0 (unreliable)
> [c0000000163673e0] [c00000000009ee3c] do_page_fault+0x3c/0x120
> [c000000016367430] [c00000000000c848] handle_page_fault+0x10/0x2c
> --- interrupt: 300 at iov_iter_fault_in_readable+0x148/0x6f0
> ..........
> NIP [c0000000008e8228] iov_iter_fault_in_readable+0x148/0x6f0
> LR [c0000000008e834c] iov_iter_fault_in_readable+0x26c/0x6f0
> interrupt: 300
> [c0000000163677e0] [c0000000007154a0] iomap_write_actor+0xc0/0x280
> [c000000016367880] [c00000000070fc94] iomap_apply+0x1c4/0x780
> [c000000016367990] [c000000000710330] iomap_file_buffered_write+0xa0/0x120
> [c0000000163679e0] [c00800000040791c] xfs_file_buffered_aio_write+0x314/0x5e0 [xfs]
> [c000000016367a90] [c0000000006d74bc] io_write+0x10c/0x460
> [c000000016367bb0] [c0000000006d80e4] io_issue_sqe+0x8d4/0x1200
> [c000000016367c70] [c0000000006d8ad0] io_wq_submit_work+0xc0/0x250
> [c000000016367cb0] [c0000000006e2578] io_worker_handle_work+0x498/0x800
> [c000000016367d40] [c0000000006e2cdc] io_wqe_worker+0x3fc/0x4f0
> [c000000016367da0] [c0000000001cb0a4] kthread+0x1c4/0x1d0
> [c000000016367e10] [c00000000000dbf0] ret_from_kernel_thread+0x5c/0x6c
>
> The kernel consider thread AMR value for kernel thread to be
> AMR_KUAP_BLOCKED. Hence access to userspace is denied. This
> of course not correct and we should allow userspace access after
> kthread_use_mm(). To be precise, kthread_use_mm() should inherit the
> AMR value of the operating address space. But, the AMR value is
> thread-specific and we inherit the address space and not thread
> access restrictions. Because of this ignore AMR value when accessing
> userspace via kernel thread.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
> Changes from v1:
> * Address review feedback from Nick
>
> arch/powerpc/include/asm/book3s/64/kup.h | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
> index f50f72e535aa..95f4df99249e 100644
> --- a/arch/powerpc/include/asm/book3s/64/kup.h
> +++ b/arch/powerpc/include/asm/book3s/64/kup.h
> @@ -384,7 +384,13 @@ static __always_inline void allow_user_access(void __user *to, const void __user
> // This is written so we can resolve to a single case at build time
> BUILD_BUG_ON(!__builtin_constant_p(dir));
>
> - if (mmu_has_feature(MMU_FTR_PKEY))
> + /*
> + * if it is a kthread that did kthread_use_mm() don't
> + * use current_thread_amr().
According to include/linux/sched.h, PF_KTHREAD means /* I am a kernel thread */
It doesn't seem to be related to kthread_use_mm()
> + */
> + if (current->flags & PF_KTHREAD)
> + thread_amr = 0;
> + else if (mmu_has_feature(MMU_FTR_PKEY))
> thread_amr = current_thread_amr();
>
> if (dir == KUAP_READ)
>
Christophe
^ permalink raw reply
* Re: [PATCH v7 00/42] powerpc: interrupt wrappers
From: Christophe Leroy @ 2021-02-02 5:57 UTC (permalink / raw)
To: Nicholas Piggin, linuxppc-dev; +Cc: Athira Rajeev
In-Reply-To: <20210130130852.2952424-1-npiggin@gmail.com>
Le 30/01/2021 à 14:08, Nicholas Piggin a écrit :
> This adds interrupt handler wrapper functions, similar to the
> generic / x86 code, and moves several common operations into them
> from either asm or open coded in the individual handlers.
>
> This series is based on powerpc fixes-test tree, there's another
> unrelated pending fix in patch 1 of the series which clashes a
> bit.
This series trivially conflicts with
https://github.com/linuxppc/linux/commit/11f9c1d2fb497f69f83d4fab6fb7fc8a6884eded on powerpc next tree.
>
> This includes more changes and fixes suggested by Christophe,
> a few minor bug fixes and compile fix noticed by kbuild, and
> some NMI changes Athira asked about -- PMI interrupts don't
> block tracing when they are soft-NMI.
>
> Since v1:
> - Fixed a couple of compile issues
> - Fixed perf weirdness (sometimes NMI, sometimes not)
> - Also move irq_enter/exit into wrappers
>
> Since v2:
> - Rebased upstream
> - Took code in patch 3 from Christophe
> - Fixed some compile errors from 0day
>
> Since v3:
> - Rebased
> - Split Christophe's 32s DABR patch into its own patch
> - Fixed missing asm from 32s on patch 3 noticed by Christophe.
> - Moved changes around, split out one more patch (patch 9) to make
> changes more logical and atomic.
> - Add comments explaining _RAW handlers (SLB, HPTE) interrupts better
>
> Since v4:
> - Rebased (on top of scv fallback flush fix)
> - Rearranged a few changes into different patches from Christophe,
> e.g., the ___do_page_fault change from patch 2 to 10. I didn't
> do everything (e.g., splitting to update __hash_page to drop the
> msr argument before the bulk of patch 2 seemed like churn without
> much improvement), and also other things like removing the new
> ___do_page_fault variant if we can change hash fault context tracking
> I didn't get time to completely investigate and implement. I think
> this shouldn't be a showstopper though we can make more improvements
> as we go.
>
> Since v5:
> - Lots of good review suggestions from Christophe, see v5 email threads.
> - Major change being do_break is left in asm and selected early as an
> alternate interrupt handler now, which is a smaller step and matches
> other subarchs better.
> - Rearranged patches, split, moved things, bug fixes, etc.
> - Converted a few more missed exception handlers for debug and ras
>
> Since v6:
> - Move related interrupt handler de-argify patches together [Christophe]
> - Split do_bad_page_fault patch [Christophe]
> - Change do_page_fault cleanup patch [Christophe]
> - entry_32.S can't avoid saving r4/r5 until later in the series [Christophe]
> - Soft-NMI decrementer and perf don't block ftrace [Athira]
> - Rebased on some fixes
> - Fixed mismerge / duplicate line in patch 40
> - Fix kbuild hash missing declaration bug
>
> Christophe Leroy (1):
> powerpc/32s: move DABR match out of handle_page_fault
>
> Nicholas Piggin (41):
> powerpc/64s: interrupt exit improve bounding of interrupt recursion
> KVM: PPC: Book3S HV: Context tracking exit guest context before
> enabling irqs
> powerpc/64s: move DABR match out of handle_page_fault
> powerpc/64s: move the hash fault handling logic to C
> powerpc: remove arguments from fault handler functions
> powerpc/fsl_booke/32: CacheLockingException remove args
> powerpc: do_break get registers from regs
> powerpc: DebugException remove args
> powerpc/32: transfer can avoid saving r4/r5 over trace call
> powerpc: bad_page_fault get registers from regs
> powerpc/64s: add do_bad_page_fault_segv handler
> powerpc: rearrange do_page_fault error case to be inside
> exception_enter
> powerpc/64s: move bad_page_fault handling to C
> powerpc/64s: split do_hash_fault
> powerpc/mm: Remove stale do_page_fault comment referring to SLB faults
> powerpc/64s: slb comment update
> powerpc/traps: add NOKPROBE_SYMBOL for sreset and mce
> powerpc/perf: move perf irq/nmi handling details into traps.c
> powerpc/time: move timer_broadcast_interrupt prototype to asm/time.h
> powerpc: add and use unknown_async_exception
> powerpc/cell: tidy up pervasive declarations
> powerpc: introduce die_mce
> powerpc/mce: ensure machine check handler always tests RI
> powerpc: improve handling of unrecoverable system reset
> powerpc: interrupt handler wrapper functions
> powerpc: add interrupt wrapper entry / exit stub functions
> powerpc: convert interrupt handlers to use wrappers
> powerpc: add interrupt_cond_local_irq_enable helper
> powerpc/64: context tracking remove _TIF_NOHZ
> powerpc/64s/hash: improve context tracking of hash faults
> powerpc/64: context tracking move to interrupt wrappers
> powerpc/64: add context tracking to asynchronous interrupts
> powerpc: handle irq_enter/irq_exit in interrupt handler wrappers
> powerpc/64s: move context tracking exit to interrupt exit path
> powerpc/64s: reconcile interrupts in C
> powerpc/64: move account_stolen_time into its own function
> powerpc/64: entry cpu time accounting in C
> powerpc: move NMI entry/exit code into wrapper
> powerpc/64s: move NMI soft-mask handling to C
> powerpc/64s: runlatch interrupt handling in C
> powerpc/64s: power4 nap fixup in C
>
> arch/powerpc/Kconfig | 1 -
> arch/powerpc/include/asm/asm-prototypes.h | 29 --
> arch/powerpc/include/asm/bug.h | 9 +-
> arch/powerpc/include/asm/cputime.h | 14 +
> arch/powerpc/include/asm/debug.h | 4 -
> arch/powerpc/include/asm/hw_irq.h | 9 -
> arch/powerpc/include/asm/interrupt.h | 437 +++++++++++++++++++++
> arch/powerpc/include/asm/ppc_asm.h | 24 --
> arch/powerpc/include/asm/processor.h | 1 +
> arch/powerpc/include/asm/thread_info.h | 10 +-
> arch/powerpc/include/asm/time.h | 2 +
> arch/powerpc/kernel/dbell.c | 9 +-
> arch/powerpc/kernel/entry_32.S | 25 +-
> arch/powerpc/kernel/exceptions-64e.S | 8 +-
> arch/powerpc/kernel/exceptions-64s.S | 310 ++-------------
> arch/powerpc/kernel/head_40x.S | 11 +-
> arch/powerpc/kernel/head_8xx.S | 11 +-
> arch/powerpc/kernel/head_book3s_32.S | 14 +-
> arch/powerpc/kernel/head_booke.h | 6 +-
> arch/powerpc/kernel/head_fsl_booke.S | 6 +-
> arch/powerpc/kernel/idle_book3s.S | 4 +
> arch/powerpc/kernel/irq.c | 7 +-
> arch/powerpc/kernel/mce.c | 16 +-
> arch/powerpc/kernel/process.c | 8 +-
> arch/powerpc/kernel/ptrace/ptrace.c | 4 -
> arch/powerpc/kernel/signal.c | 4 -
> arch/powerpc/kernel/syscall_64.c | 90 +++--
> arch/powerpc/kernel/tau_6xx.c | 5 +-
> arch/powerpc/kernel/time.c | 7 +-
> arch/powerpc/kernel/traps.c | 265 ++++++-------
> arch/powerpc/kernel/watchdog.c | 15 +-
> arch/powerpc/kvm/book3s_hv.c | 7 +-
> arch/powerpc/kvm/book3s_hv_builtin.c | 1 +
> arch/powerpc/kvm/booke.c | 1 +
> arch/powerpc/mm/book3s64/hash_utils.c | 97 +++--
> arch/powerpc/mm/book3s64/slb.c | 40 +-
> arch/powerpc/mm/fault.c | 76 ++--
> arch/powerpc/perf/core-book3s.c | 35 +-
> arch/powerpc/perf/core-fsl-emb.c | 25 --
> arch/powerpc/platforms/8xx/machine_check.c | 2 +-
> arch/powerpc/platforms/cell/pervasive.c | 1 +
> arch/powerpc/platforms/cell/pervasive.h | 3 -
> arch/powerpc/platforms/cell/ras.c | 6 +-
> arch/powerpc/platforms/cell/ras.h | 9 +-
> arch/powerpc/platforms/powernv/idle.c | 1 +
> arch/powerpc/platforms/powernv/opal.c | 2 +-
> arch/powerpc/platforms/pseries/ras.c | 2 +-
> 47 files changed, 914 insertions(+), 759 deletions(-)
> create mode 100644 arch/powerpc/include/asm/interrupt.h
>
^ permalink raw reply
* Re: [PATCH] powerpc/fault: fix wrong KUAP fault for IO_URING
From: Aneesh Kumar K.V @ 2021-02-02 5:55 UTC (permalink / raw)
To: Nicholas Piggin, Christophe Leroy, Michael Ellerman, Zorro Lang
Cc: Jens Axboe, linuxppc-dev
In-Reply-To: <877dnrrsbu.fsf@linux.ibm.com>
Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> writes:
> Nicholas Piggin <npiggin@gmail.com> writes:
>
>> Excerpts from Michael Ellerman's message of January 30, 2021 9:22 pm:
>>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>>> +Aneesh
>>>>
>>>> Le 29/01/2021 à 07:52, Zorro Lang a écrit :
>>> ..
>>>>> [ 96.200296] ------------[ cut here ]------------
>>>>> [ 96.200304] Bug: Read fault blocked by KUAP!
>>>>> [ 96.200309] WARNING: CPU: 3 PID: 1876 at arch/powerpc/mm/fault.c:229 bad_kernel_fault+0x180/0x310
>>>>
>>>>> [ 96.200734] NIP [c000000000849424] fault_in_pages_readable+0x104/0x350
>>>>> [ 96.200741] LR [c00000000084952c] fault_in_pages_readable+0x20c/0x350
>>>>> [ 96.200747] --- interrupt: 300
>>>>
>>>>
>>>> Problem happens in a section where userspace access is supposed to be granted, so the patch you
>>>> proposed is definitely not the right fix.
>>>>
>>>> c000000000849408: 2c 01 00 4c isync
>>>> c00000000084940c: a6 03 3d 7d mtspr 29,r9 <== granting userspace access permission
>>>> c000000000849410: 2c 01 00 4c isync
>>>> c000000000849414: 00 00 36 e9 ld r9,0(r22)
>>>> c000000000849418: 20 00 29 81 lwz r9,32(r9)
>>>> c00000000084941c: 00 02 29 71 andi. r9,r9,512
>>>> c000000000849420: 78 d3 5e 7f mr r30,r26
>>>> ==> c000000000849424: 00 00 bf 8b lbz r29,0(r31) <== accessing userspace
>>>> c000000000849428: 10 00 82 41 beq c000000000849438 <fault_in_pages_readable+0x118>
>>>> c00000000084942c: 2c 01 00 4c isync
>>>> c000000000849430: a6 03 bd 7e mtspr 29,r21 <== clearing userspace access permission
>>>> c000000000849434: 2c 01 00 4c isync
>>>>
>>>> My first guess is that the problem is linked to the following function, see the comment
>>>>
>>>> /*
>>>> * For kernel thread that doesn't have thread.regs return
>>>> * default AMR/IAMR values.
>>>> */
>>>> static inline u64 current_thread_amr(void)
>>>> {
>>>> if (current->thread.regs)
>>>> return current->thread.regs->amr;
>>>> return AMR_KUAP_BLOCKED;
>>>> }
>>>>
>>>> Above function was introduced by commit 48a8ab4eeb82 ("powerpc/book3s64/pkeys: Don't update SPRN_AMR
>>>> when in kernel mode")
>>>
>>> Yeah that's a bit of a curly one.
>>>
>>> At some point io_uring did kthread_use_mm(), which is supposed to mean
>>> the kthread can operate on behalf of the original process that submitted
>>> the IO.
>>>
>>> But because KUAP is implemented using memory protection keys, it depends
>>> on the value of the AMR register, which is not part of the mm, it's in
>>> thread.regs->amr.
>>>
>>> And what's worse by the time we're in kthread_use_mm() we no longer have
>>> access to the thread.regs->amr of the original process that submitted
>>> the IO.
>>>
>>> We also can't simply move the AMR into the mm, precisely because it's
>>> per thread, not per mm.
>>>
>>> So TBH I don't know how we're going to fix this.
>>>
>>> I guess we could return AMR=unblocked for kernel threads, but that's
>>> arguably a bug because it allows a process to circumvent memory keys by
>>> asking the kernel to do the access.
>>
>> We shouldn't need to inherit AMR should we? We only need it to be locked
>> for kernel threads until it's explicitly unlocked -- nothing mm specific
>> there. I think current_thread_amr could return 0 for kernel threads? Or
>> I would even avoid using that function for allow_user_access and open
>> code the kthread case and remove it from current_thread_amr().
>>
>> Thanks,
>> Nick
>
updated one
From 8fdb0680f983940d61f91da8252b13c8d3e8ebee Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Tue, 2 Feb 2021 09:23:38 +0530
Subject: [PATCH v2] powerpc/kuap: Allow kernel thread to access userspace
after kthread_use_mm
This fixes the bad fault reported by KUAP when io_wqe_worker accesses userspace.
Bug: Read fault blocked by KUAP!
WARNING: CPU: 1 PID: 101841 at arch/powerpc/mm/fault.c:229 __do_page_fault+0x6b4/0xcd0
NIP [c00000000009e7e4] __do_page_fault+0x6b4/0xcd0
LR [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0
..........
Call Trace:
[c000000016367330] [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0 (unreliable)
[c0000000163673e0] [c00000000009ee3c] do_page_fault+0x3c/0x120
[c000000016367430] [c00000000000c848] handle_page_fault+0x10/0x2c
--- interrupt: 300 at iov_iter_fault_in_readable+0x148/0x6f0
..........
NIP [c0000000008e8228] iov_iter_fault_in_readable+0x148/0x6f0
LR [c0000000008e834c] iov_iter_fault_in_readable+0x26c/0x6f0
interrupt: 300
[c0000000163677e0] [c0000000007154a0] iomap_write_actor+0xc0/0x280
[c000000016367880] [c00000000070fc94] iomap_apply+0x1c4/0x780
[c000000016367990] [c000000000710330] iomap_file_buffered_write+0xa0/0x120
[c0000000163679e0] [c00800000040791c] xfs_file_buffered_aio_write+0x314/0x5e0 [xfs]
[c000000016367a90] [c0000000006d74bc] io_write+0x10c/0x460
[c000000016367bb0] [c0000000006d80e4] io_issue_sqe+0x8d4/0x1200
[c000000016367c70] [c0000000006d8ad0] io_wq_submit_work+0xc0/0x250
[c000000016367cb0] [c0000000006e2578] io_worker_handle_work+0x498/0x800
[c000000016367d40] [c0000000006e2cdc] io_wqe_worker+0x3fc/0x4f0
[c000000016367da0] [c0000000001cb0a4] kthread+0x1c4/0x1d0
[c000000016367e10] [c00000000000dbf0] ret_from_kernel_thread+0x5c/0x6c
The kernel considers the thread AMR value for a kernel thread to be
AMR_KUAP_BLOCKED. Hence access to userspace is denied. This is
of course not correct and we should allow userspace access after
kthread_use_mm(). To be precise, kthread_use_mm() should inherit the
AMR value of the operating address space. But, the AMR value is
thread-specific and we inherit the address space and not thread
access restrictions. Because of this, ignore the AMR value when accessing
userspace via a kernel thread.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
Changes from v1:
* Address review feedback from Nick
arch/powerpc/include/asm/book3s/64/kup.h | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
index f50f72e535aa..95f4df99249e 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -384,7 +384,13 @@ static __always_inline void allow_user_access(void __user *to, const void __user
// This is written so we can resolve to a single case at build time
BUILD_BUG_ON(!__builtin_constant_p(dir));
- if (mmu_has_feature(MMU_FTR_PKEY))
+ /*
+ * if it is a kthread that did kthread_use_mm() don't
+ * use current_thread_amr().
+ */
+ if (current->flags & PF_KTHREAD)
+ thread_amr = 0;
+ else if (mmu_has_feature(MMU_FTR_PKEY))
thread_amr = current_thread_amr();
if (dir == KUAP_READ)
--
2.29.2
^ permalink raw reply related
* [PATCH] cpufreq: Remove unused flag CPUFREQ_PM_NO_WARN
From: Viresh Kumar @ 2021-02-02 5:41 UTC (permalink / raw)
To: Rafael Wysocki, Viresh Kumar, Michael Ellerman,
Benjamin Herrenschmidt, Paul Mackerras
Cc: linuxppc-dev, Vincent Guittot, linux-kernel, linux-pm
This flag is set by one of the drivers but it isn't used in the code
otherwise. Remove the unused flag and update the driver.
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
Rebased over:
https://lore.kernel.org/lkml/a59bb322b22c247d570b70a8e94067804287623b.1612241683.git.viresh.kumar@linaro.org/
drivers/cpufreq/pmac32-cpufreq.c | 3 +--
include/linux/cpufreq.h | 13 +++++--------
2 files changed, 6 insertions(+), 10 deletions(-)
diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c
index 73621bc11976..4f20c6a9108d 100644
--- a/drivers/cpufreq/pmac32-cpufreq.c
+++ b/drivers/cpufreq/pmac32-cpufreq.c
@@ -439,8 +439,7 @@ static struct cpufreq_driver pmac_cpufreq_driver = {
.init = pmac_cpufreq_cpu_init,
.suspend = pmac_cpufreq_suspend,
.resume = pmac_cpufreq_resume,
- .flags = CPUFREQ_PM_NO_WARN |
- CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING,
+ .flags = CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING,
.attr = cpufreq_generic_attr,
.name = "powermac",
};
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index c8e40e91fe9b..353969c7acd3 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -398,8 +398,11 @@ struct cpufreq_driver {
/* loops_per_jiffy or other kernel "constants" aren't affected by frequency transitions */
#define CPUFREQ_CONST_LOOPS BIT(1)
-/* don't warn on suspend/resume speed mismatches */
-#define CPUFREQ_PM_NO_WARN BIT(2)
+/*
+ * Set by drivers that want the core to automatically register the cpufreq
+ * driver as a thermal cooling device.
+ */
+#define CPUFREQ_IS_COOLING_DEV BIT(2)
/*
* This should be set by platforms having multiple clock-domains, i.e.
@@ -431,12 +434,6 @@ struct cpufreq_driver {
*/
#define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING BIT(6)
-/*
- * Set by drivers that want the core to automatically register the cpufreq
- * driver as a thermal cooling device.
- */
-#define CPUFREQ_IS_COOLING_DEV BIT(7)
-
int cpufreq_register_driver(struct cpufreq_driver *driver_data);
int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
--
2.25.0.rc1.19.g042ed3e048af
^ permalink raw reply related
* Re: [PATCH] powerpc/fault: fix wrong KUAP fault for IO_URING
From: Aneesh Kumar K.V @ 2021-02-02 4:45 UTC (permalink / raw)
To: Nicholas Piggin, Christophe Leroy, Michael Ellerman, Zorro Lang
Cc: Jens Axboe, linuxppc-dev
In-Reply-To: <1612014260.b4fac0liie.astroid@bobo.none>
Nicholas Piggin <npiggin@gmail.com> writes:
> Excerpts from Michael Ellerman's message of January 30, 2021 9:22 pm:
>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>> +Aneesh
>>>
>>> Le 29/01/2021 à 07:52, Zorro Lang a écrit :
>> ..
>>>> [ 96.200296] ------------[ cut here ]------------
>>>> [ 96.200304] Bug: Read fault blocked by KUAP!
>>>> [ 96.200309] WARNING: CPU: 3 PID: 1876 at arch/powerpc/mm/fault.c:229 bad_kernel_fault+0x180/0x310
>>>
>>>> [ 96.200734] NIP [c000000000849424] fault_in_pages_readable+0x104/0x350
>>>> [ 96.200741] LR [c00000000084952c] fault_in_pages_readable+0x20c/0x350
>>>> [ 96.200747] --- interrupt: 300
>>>
>>>
>>> Problem happens in a section where userspace access is supposed to be granted, so the patch you
>>> proposed is definitely not the right fix.
>>>
>>> c000000000849408: 2c 01 00 4c isync
>>> c00000000084940c: a6 03 3d 7d mtspr 29,r9 <== granting userspace access permission
>>> c000000000849410: 2c 01 00 4c isync
>>> c000000000849414: 00 00 36 e9 ld r9,0(r22)
>>> c000000000849418: 20 00 29 81 lwz r9,32(r9)
>>> c00000000084941c: 00 02 29 71 andi. r9,r9,512
>>> c000000000849420: 78 d3 5e 7f mr r30,r26
>>> ==> c000000000849424: 00 00 bf 8b lbz r29,0(r31) <== accessing userspace
>>> c000000000849428: 10 00 82 41 beq c000000000849438 <fault_in_pages_readable+0x118>
>>> c00000000084942c: 2c 01 00 4c isync
>>> c000000000849430: a6 03 bd 7e mtspr 29,r21 <== clearing userspace access permission
>>> c000000000849434: 2c 01 00 4c isync
>>>
>>> My first guess is that the problem is linked to the following function, see the comment
>>>
>>> /*
>>> * For kernel thread that doesn't have thread.regs return
>>> * default AMR/IAMR values.
>>> */
>>> static inline u64 current_thread_amr(void)
>>> {
>>> if (current->thread.regs)
>>> return current->thread.regs->amr;
>>> return AMR_KUAP_BLOCKED;
>>> }
>>>
>>> Above function was introduced by commit 48a8ab4eeb82 ("powerpc/book3s64/pkeys: Don't update SPRN_AMR
>>> when in kernel mode")
>>
>> Yeah that's a bit of a curly one.
>>
>> At some point io_uring did kthread_use_mm(), which is supposed to mean
>> the kthread can operate on behalf of the original process that submitted
>> the IO.
>>
>> But because KUAP is implemented using memory protection keys, it depends
>> on the value of the AMR register, which is not part of the mm, it's in
>> thread.regs->amr.
>>
>> And what's worse by the time we're in kthread_use_mm() we no longer have
>> access to the thread.regs->amr of the original process that submitted
>> the IO.
>>
>> We also can't simply move the AMR into the mm, precisely because it's
>> per thread, not per mm.
>>
>> So TBH I don't know how we're going to fix this.
>>
>> I guess we could return AMR=unblocked for kernel threads, but that's
>> arguably a bug because it allows a process to circumvent memory keys by
>> asking the kernel to do the access.
>
> We shouldn't need to inherit AMR should we? We only need it to be locked
> for kernel threads until it's explicitly unlocked -- nothing mm specific
> there. I think current_thread_amr could return 0 for kernel threads? Or
> I would even avoid using that function for allow_user_access and open
> code the kthread case and remove it from current_thread_amr().
>
> Thanks,
> Nick
Can we try this?
commit 9a193d38b1a0a52364bc70ec953e0685993603b6
Author: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Date: Tue Feb 2 09:23:38 2021 +0530
powerpc/kuap: Allow kernel thread to access userspace after kthread_use_mm
This fixes the bad fault reported by KUAP when io_wqe_worker accesses userspace.
Bug: Read fault blocked by KUAP!
WARNING: CPU: 1 PID: 101841 at arch/powerpc/mm/fault.c:229 __do_page_fault+0x6b4/0xcd0
NIP [c00000000009e7e4] __do_page_fault+0x6b4/0xcd0
LR [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0
..........
Call Trace:
[c000000016367330] [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0 (unreliable)
[c0000000163673e0] [c00000000009ee3c] do_page_fault+0x3c/0x120
[c000000016367430] [c00000000000c848] handle_page_fault+0x10/0x2c
--- interrupt: 300 at iov_iter_fault_in_readable+0x148/0x6f0
..........
NIP [c0000000008e8228] iov_iter_fault_in_readable+0x148/0x6f0
LR [c0000000008e834c] iov_iter_fault_in_readable+0x26c/0x6f0
interrupt: 300
[c0000000163677e0] [c0000000007154a0] iomap_write_actor+0xc0/0x280
[c000000016367880] [c00000000070fc94] iomap_apply+0x1c4/0x780
[c000000016367990] [c000000000710330] iomap_file_buffered_write+0xa0/0x120
[c0000000163679e0] [c00800000040791c] xfs_file_buffered_aio_write+0x314/0x5e0 [xfs]
[c000000016367a90] [c0000000006d74bc] io_write+0x10c/0x460
[c000000016367bb0] [c0000000006d80e4] io_issue_sqe+0x8d4/0x1200
[c000000016367c70] [c0000000006d8ad0] io_wq_submit_work+0xc0/0x250
[c000000016367cb0] [c0000000006e2578] io_worker_handle_work+0x498/0x800
[c000000016367d40] [c0000000006e2cdc] io_wqe_worker+0x3fc/0x4f0
[c000000016367da0] [c0000000001cb0a4] kthread+0x1c4/0x1d0
[c000000016367e10] [c00000000000dbf0] ret_from_kernel_thread+0x5c/0x6c
The kernel considers the thread AMR value for a kernel thread to be
AMR_KUAP_BLOCKED. Hence access to userspace is denied. This is
of course not correct, and we should allow userspace access after
kthread_use_mm(). To be precise, kthread_use_mm() should inherit the
AMR value of the operating address space. But the AMR value is
thread-specific, and we inherit the address space and not the thread
access restrictions. Because of this, ignore the AMR value when accessing
userspace via a kernel thread.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
index f50f72e535aa..7457d80ba0bb 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -384,7 +384,13 @@ static __always_inline void allow_user_access(void __user *to, const void __user
// This is written so we can resolve to a single case at build time
BUILD_BUG_ON(!__builtin_constant_p(dir));
- if (mmu_has_feature(MMU_FTR_PKEY))
+ /*
+ * if it is a kthread that did kthread_use_mm() don't
+ * use current_thread_amr().
+ */
+ if (!current->mm && current->active_mm != &init_mm)
+ thread_amr = 0;
+ else if (mmu_has_feature(MMU_FTR_PKEY))
thread_amr = current_thread_amr();
if (dir == KUAP_READ)
^ permalink raw reply related
* Re: [PATCH] dma-mapping: remove unneeded semicolon
From: Geoff Levand @ 2021-02-02 4:14 UTC (permalink / raw)
To: Yang Li; +Cc: paulus, linuxppc-dev, linux-kernel
In-Reply-To: <1612237276-111378-1-git-send-email-yang.lee@linux.alibaba.com>
On 2/1/21 7:41 PM, Yang Li wrote:
> Eliminate the following coccicheck warning:
> ./arch/powerpc/platforms/ps3/system-bus.c:606:2-3: Unneeded semicolon
> ./arch/powerpc/platforms/ps3/system-bus.c:765:2-3: Unneeded semicolon
>
> Reported-by: Abaci Robot <abaci@linux.alibaba.com>
> Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
> ---
> arch/powerpc/platforms/ps3/system-bus.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
Thanks for your patch, it looks good.
Acked-by: Geoff Levand <geoff@infradead.org>
-Geoff
^ permalink raw reply
* [PATCH] powerpc/book3s64: remove unneeded semicolon
From: Yang Li @ 2021-02-02 3:51 UTC (permalink / raw)
To: mpe; +Cc: Yang Li, paulus, linuxppc-dev, linux-kernel
Eliminate the following coccicheck warnings:
./arch/powerpc/platforms/pseries/lpar.c:1632:2-3: Unneeded semicolon
./arch/powerpc/platforms/pseries/lpar.c:1663:2-3: Unneeded semicolon
Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
---
arch/powerpc/platforms/pseries/lpar.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 764170f..24889b8 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -1629,7 +1629,7 @@ static int pseries_lpar_resize_hpt(unsigned long shift)
}
msleep(delay);
rc = plpar_resize_hpt_prepare(0, shift);
- };
+ }
switch (rc) {
case H_SUCCESS:
@@ -1663,7 +1663,7 @@ static int pseries_lpar_resize_hpt(unsigned long shift)
pr_warn("Unexpected error %d from H_RESIZE_HPT_COMMIT\n",
state.commit_rc);
return -EIO;
- };
+ }
}
pr_info("HPT resize to shift %lu complete (%lld ms / %lld ms)\n",
--
1.8.3.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox