BUG: unable to handle kernel paging request in fuse_copy

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* BUG: unable to handle kernel paging request in fuse_copy_do
@ 2024-03-21  7:52 xingwei lee
  2024-03-22 13:50 ` Miklos Szeredi
  0 siblings, 1 reply; 12+ messages in thread
From: xingwei lee @ 2024-03-21  7:52 UTC (permalink / raw)
  To: linux-fsdevel, miklos; +Cc: linux-kernel, samsun1006219, syzkaller-bugs

Hello I found a bug titled "BUG: unable to handle kernel paging
request in fuse_copy_do” with modified syzkaller, and maybe it is
related to fs/fuse.
I also confirmed in the latest upstream.

If you fix this issue, please add the following tag to the commit:
Reported-by: xingwei lee <xrivendell7@gmail.com>
Reported-by: yue sun <samsun1006219@gmail.com>

kernel: upstream 23956900041d968f9ad0f30db6dede4daccd7aa9
kernel config: https://syzkaller.appspot.com/text?tag=KernelConfig&x=9f47e8dfa53b0b11
with KASAN enabled
compiler: gcc (Debian 12.2.0-14) 12.2.0

BUG: unable to handle kernel paging request in fuse_copy_do
UDPLite: UDP-Lite is deprecated and scheduled to be removed in 2025,
please contact the netdev mailing list
BUG: unable to handle page fault for address: ffff88802c29c000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 13001067 P4D 13001067 PUD 13002067 PMD 24c8d063 PTE 800fffffd3d63060
Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI
CPU: 1 PID: 8221 Comm: 1e9 Not tainted 6.8.0-05202-g9187210eee7d-dirty #21
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
1.16.2-1.fc38 04/01/2014
RIP: 0010:memcpy+0xc/0x20 arch/x86/lib/memcpy_64.S:38
Code: 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90 90 90 90 90 90
90 90 90 90 90 90 90 90 90 f3 0f 1e fa 66 90 48 89 f80
RSP: 0018:ffffc9001065f9c8 EFLAGS: 00010246
RAX: ffffc9001065fb10 RBX: ffffc9001065fc78 RCX: 0000000000000010
RDX: 0000000000000010 RSI: ffff88802c29c000 RDI: ffffc9001065fb10
RBP: 0000000000000010 R08: ffff88802c29c000 R09: 0000000000000001
R10: ffffffff8ea82ed7 R11: ffffc9001065fd98 R12: ffffc9001065fac0
R13: 0000000000000010 R14: ffffc9001065faf0 R15: ffffc9001065fcbc
FS: 000000000f82d480(0000) GS:ffff88823bc00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffff88802c29c000 CR3: 000000002dd7c000 CR4: 0000000000750ef0
PKRU: 55555554
Call Trace:
<TASK>
fuse_copy_do+0x152/0x340 fs/fuse/dev.c:758
fuse_copy_one fs/fuse/dev.c:1007 [inline]
fuse_dev_do_write+0x1df/0x26a0 fs/fuse/dev.c:1863
fuse_dev_write+0x129/0x1b0 fs/fuse/dev.c:1960
call_write_iter include/linux/fs.h:2108 [inline]
new_sync_write fs/read_write.c:497 [inline]
vfs_write+0x62e/0x10a0 fs/read_write.c:590
ksys_write+0xf6/0x1d0 fs/read_write.c:643
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x7c/0x1d0 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x6c/0x74

=* repro.c =*
#define _GNU_SOURCE

#include <dirent.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#ifndef __NR_memfd_secret
#define __NR_memfd_secret 447
#endif

static __thread int clone_ongoing;
static __thread int skip_segv;
static __thread jmp_buf segv_env;

static void segv_handler(int sig, siginfo_t* info, void* ctx) {
 if (__atomic_load_n(&clone_ongoing, __ATOMIC_RELAXED) != 0) {
   exit(sig);
 }
 uintptr_t addr = (uintptr_t)info->si_addr;
 const uintptr_t prog_start = 1 << 20;
 const uintptr_t prog_end = 100 << 20;
 int skip = __atomic_load_n(&skip_segv, __ATOMIC_RELAXED) != 0;
 int valid = addr < prog_start || addr > prog_end;
 if (skip && valid) {
   _longjmp(segv_env, 1);
 }
 exit(sig);
}

static void install_segv_handler(void) {
 struct sigaction sa;
 memset(&sa, 0, sizeof(sa));
 sa.sa_handler = SIG_IGN;
 syscall(SYS_rt_sigaction, 0x20, &sa, NULL, 8);
 syscall(SYS_rt_sigaction, 0x21, &sa, NULL, 8);
 memset(&sa, 0, sizeof(sa));
 sa.sa_sigaction = segv_handler;
 sa.sa_flags = SA_NODEFER | SA_SIGINFO;
 sigaction(SIGSEGV, &sa, NULL);
 sigaction(SIGBUS, &sa, NULL);
}

#define NONFAILING(...)                                  \
 ({                                                     \
   int ok = 1;                                          \
   __atomic_fetch_add(&skip_segv, 1, __ATOMIC_SEQ_CST); \
   if (_setjmp(segv_env) == 0) {                        \
     __VA_ARGS__;                                       \
   } else                                               \
     ok = 0;                                            \
   __atomic_fetch_sub(&skip_segv, 1, __ATOMIC_SEQ_CST); \
   ok;                                                  \
 })

static void sleep_ms(uint64_t ms) {
 usleep(ms * 1000);
}

static uint64_t current_time_ms(void) {
 struct timespec ts;
 if (clock_gettime(CLOCK_MONOTONIC, &ts))
   exit(1);
 return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}

static bool write_file(const char* file, const char* what, ...) {
 char buf[1024];
 va_list args;
 va_start(args, what);
 vsnprintf(buf, sizeof(buf), what, args);
 va_end(args);
 buf[sizeof(buf) - 1] = 0;
 int len = strlen(buf);
 int fd = open(file, O_WRONLY | O_CLOEXEC);
 if (fd == -1)
   return false;
 if (write(fd, buf, len) != len) {
   int err = errno;
   close(fd);
   errno = err;
   return false;
 }
 close(fd);
 return true;
}

static void kill_and_wait(int pid, int* status) {
 kill(-pid, SIGKILL);
 kill(pid, SIGKILL);
 for (int i = 0; i < 100; i++) {
   if (waitpid(-1, status, WNOHANG | __WALL) == pid)
     return;
   usleep(1000);
 }
 DIR* dir = opendir("/sys/fs/fuse/connections");
 if (dir) {
   for (;;) {
     struct dirent* ent = readdir(dir);
     if (!ent)
       break;
     if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
       continue;
     char abort[300];
     snprintf(abort, sizeof(abort), "/sys/fs/fuse/connections/%s/abort",
              ent->d_name);
     int fd = open(abort, O_WRONLY);
     if (fd == -1) {
       continue;
     }
     if (write(fd, abort, 1) < 0) {
     }
     close(fd);
   }
   closedir(dir);
 } else {
 }
 while (waitpid(-1, status, __WALL) != pid) {
 }
}

static void setup_test() {
 prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
 setpgrp();
 write_file("/proc/self/oom_score_adj", "1000");
}

static void execute_one(void);

#define WAIT_FLAGS __WALL

static void loop(void) {
 int iter = 0;
 for (;; iter++) {
   int pid = fork();
   if (pid < 0)
     exit(1);
   if (pid == 0) {
     setup_test();
     execute_one();
     exit(0);
   }
   int status = 0;
   uint64_t start = current_time_ms();
   for (;;) {
     if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid)
       break;
     sleep_ms(1);
     if (current_time_ms() - start < 5000)
       continue;
     kill_and_wait(pid, &status);
     break;
   }
 }
}

uint64_t r[3] = {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff};

void execute_one(void) {
 intptr_t res = 0;
 NONFAILING(memcpy((void*)0x20002040, "./file0\000", 8));
 syscall(__NR_mkdirat, /*fd=*/0xffffff9c, /*path=*/0x20002040ul, /*mode=*/0ul);
 NONFAILING(memcpy((void*)0x20002080, "/dev/fuse\000", 10));
 res = syscall(__NR_openat, /*fd=*/0xffffffffffffff9cul, /*file=*/0x20002080ul,
               /*flags=*/2ul, /*mode=*/0ul);
 if (res != -1)
   r[0] = res;
 NONFAILING(memcpy((void*)0x200020c0, "./file0\000", 8));
 NONFAILING(memcpy((void*)0x20002100, "fuse\000", 5));
 NONFAILING(memcpy((void*)0x20002140, "fd", 2));
 NONFAILING(*(uint8_t*)0x20002142 = 0x3d);
 NONFAILING(sprintf((char*)0x20002143, "0x%016llx", (long long)r[0]));
 NONFAILING(*(uint8_t*)0x20002155 = 0x2c);
 NONFAILING(memcpy((void*)0x20002156, "rootmode", 8));
 NONFAILING(*(uint8_t*)0x2000215e = 0x3d);
 NONFAILING(sprintf((char*)0x2000215f, "%023llo", (long long)0x4000));
 NONFAILING(*(uint8_t*)0x20002176 = 0x2c);
 NONFAILING(memcpy((void*)0x20002177, "user_id", 7));
 NONFAILING(*(uint8_t*)0x2000217e = 0x3d);
 NONFAILING(sprintf((char*)0x2000217f, "%020llu", (long long)0));
 NONFAILING(*(uint8_t*)0x20002193 = 0x2c);
 NONFAILING(memcpy((void*)0x20002194, "group_id", 8));
 NONFAILING(*(uint8_t*)0x2000219c = 0x3d);
 NONFAILING(sprintf((char*)0x2000219d, "%020llu", (long long)0));
 NONFAILING(*(uint8_t*)0x200021b1 = 0x2c);
 NONFAILING(*(uint8_t*)0x200021b2 = 0);
 syscall(__NR_mount, /*src=*/0ul, /*dst=*/0x200020c0ul, /*type=*/0x20002100ul,
         /*flags=*/0ul, /*opts=*/0x20002140ul);
 res = syscall(__NR_memfd_secret, /*flags=*/0ul);
 if (res != -1)
   r[1] = res;
 syscall(__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0xb36000ul,
         /*prot=PROT_GROWSUP|PROT_READ*/ 0x2000001ul,
         /*flags=MAP_STACK|MAP_POPULATE|MAP_FIXED|MAP_SHARED*/ 0x28011ul,
         /*fd=*/r[1], /*offset=*/0ul);
 syscall(__NR_ftruncate, /*fd=*/r[1], /*len=*/7ul);
 res = syscall(__NR_socket, /*domain=*/2ul, /*type=*/2ul, /*proto=*/0x88);
 if (res != -1)
   r[2] = res;
 NONFAILING(*(uint32_t*)0x20000280 = 0);
 syscall(__NR_getsockopt, /*fd=*/r[2], /*level=*/1, /*optname=*/0x11,
         /*optval=*/0ul, /*optlen=*/0x20000280ul);
 NONFAILING(*(uint32_t*)0x20000000 = 0x50);
 NONFAILING(*(uint32_t*)0x20000004 = 0);
 NONFAILING(*(uint64_t*)0x20000008 = 0);
 NONFAILING(*(uint32_t*)0x20000010 = 7);
 NONFAILING(*(uint32_t*)0x20000014 = 0x27);
 NONFAILING(*(uint32_t*)0x20000018 = 0);
 NONFAILING(*(uint32_t*)0x2000001c = 0);
 NONFAILING(*(uint16_t*)0x20000020 = 0);
 NONFAILING(*(uint16_t*)0x20000022 = 0);
 NONFAILING(*(uint32_t*)0x20000024 = 0);
 NONFAILING(*(uint32_t*)0x20000028 = 0);
 NONFAILING(*(uint16_t*)0x2000002c = 0);
 NONFAILING(*(uint16_t*)0x2000002e = 0);
 NONFAILING(memset((void*)0x20000030, 0, 32));
 syscall(__NR_write, /*fd=*/r[0], /*arg=*/0x20000000ul, /*len=*/0x50ul);
}
int main(void) {
 syscall(__NR_mmap, /*addr=*/0x1ffff000ul, /*len=*/0x1000ul, /*prot=*/0ul,
         /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
         /*offset=*/0ul);
 syscall(__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0x1000000ul,
         /*prot=PROT_WRITE|PROT_READ|PROT_EXEC*/ 7ul,
         /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
         /*offset=*/0ul);
 syscall(__NR_mmap, /*addr=*/0x21000000ul, /*len=*/0x1000ul, /*prot=*/0ul,
         /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
         /*offset=*/0ul);
 install_segv_handler();
 loop();
 return 0;
}

=* repro.txt =*
mkdirat(0xffffffffffffff9c, &(0x7f0000002040)='./file0\x00', 0x0)
r0 = openat$fuse(0xffffffffffffff9c, &(0x7f0000002080), 0x2, 0x0)
mount$fuse(0x0, &(0x7f00000020c0)='./file0\x00', &(0x7f0000002100),
0x0, &(0x7f0000002140)={{'fd', 0x3d, r0}, 0x2c, {'rootmode', 0x3d,
0x4000}})
r1 = memfd_secret(0x0)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0x2000001, 0x28011, r1, 0x0)
ftruncate(r1, 0x7)
r2 = socket$inet_udplite(0x2, 0x2, 0x88)
getsockopt$sock_cred(r2, 0x1, 0x11, 0x0, &(0x7f0000000280))
write$FUSE_INIT(r0, &(0x7f0000000000)={0x50}, 0x50)


see aslo https://gist.github.com/xrivendell7/961be96ae091c9671bb56efea902cec4.

I hope it helps.
best regards.
xingwei Lee

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: BUG: unable to handle kernel paging request in fuse_copy_do
  2024-03-21  7:52 BUG: unable to handle kernel paging request in fuse_copy_do xingwei lee
@ 2024-03-22 13:50 ` Miklos Szeredi
  2024-03-22 15:41   ` David Hildenbrand
  0 siblings, 1 reply; 12+ messages in thread
From: Miklos Szeredi @ 2024-03-22 13:50 UTC (permalink / raw)
  To: xingwei lee
  Cc: linux-fsdevel, linux-kernel, samsun1006219, syzkaller-bugs,
	linux-mm, Mike Rapoport

[MM list + secretmem author CC-d]

On Thu, 21 Mar 2024 at 08:52, xingwei lee <xrivendell7@gmail.com> wrote:
>
> Hello I found a bug titled "BUG: unable to handle kernel paging
> request in fuse_copy_do” with modified syzkaller, and maybe it is
> related to fs/fuse.
> I also confirmed in the latest upstream.
>
> If you fix this issue, please add the following tag to the commit:
> Reported-by: xingwei lee <xrivendell7@gmail.com>
> Reported-by: yue sun <samsun1006219@gmail.com>

Thanks for the report.   This looks like a secretmem vs get_user_pages issue.

I reduced the syz reproducer to a minimal one that isn't dependent on fuse:

=== repro.c ===
#define _GNU_SOURCE

#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/socket.h>

int main(void)
{
        int fd1, fd2, fd3;
        int pip[2];
        struct iovec iov;
        void *addr;

        fd1 = syscall(__NR_memfd_secret, 0);
        addr = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd1, 0);
        ftruncate(fd1, 7);
        fd2 = socket(AF_INET, SOCK_DGRAM, 0);
        getsockopt(fd2, 0, 0, NULL, addr);

        pipe(pip);
        iov.iov_base = addr;
        iov.iov_len = 0x50;
        vmsplice(pip[1], &iov, 1, 0);

        fd3 = open("/tmp/repro-secretmem.test", O_RDWR | O_CREAT, 0x600);
        splice(pip[0], NULL, fd3, NULL, 0x50, 0);

        return 0;
}
=======

Thanks,
Miklos

>
> kernel: upstream 23956900041d968f9ad0f30db6dede4daccd7aa9
> kernel config: https://syzkaller.appspot.com/text?tag=KernelConfig&x=9f47e8dfa53b0b11
> with KASAN enabled
> compiler: gcc (Debian 12.2.0-14) 12.2.0
>
> BUG: unable to handle kernel paging request in fuse_copy_do
> UDPLite: UDP-Lite is deprecated and scheduled to be removed in 2025,
> please contact the netdev mailing list
> BUG: unable to handle page fault for address: ffff88802c29c000
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 13001067 P4D 13001067 PUD 13002067 PMD 24c8d063 PTE 800fffffd3d63060
> Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI
> CPU: 1 PID: 8221 Comm: 1e9 Not tainted 6.8.0-05202-g9187210eee7d-dirty #21
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> 1.16.2-1.fc38 04/01/2014
> RIP: 0010:memcpy+0xc/0x20 arch/x86/lib/memcpy_64.S:38
> Code: 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90 90 90 90 90 90
> 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 66 90 48 89 f80
> RSP: 0018:ffffc9001065f9c8 EFLAGS: 00010246
> RAX: ffffc9001065fb10 RBX: ffffc9001065fc78 RCX: 0000000000000010
> RDX: 0000000000000010 RSI: ffff88802c29c000 RDI: ffffc9001065fb10
> RBP: 0000000000000010 R08: ffff88802c29c000 R09: 0000000000000001
> R10: ffffffff8ea82ed7 R11: ffffc9001065fd98 R12: ffffc9001065fac0
> R13: 0000000000000010 R14: ffffc9001065faf0 R15: ffffc9001065fcbc
> FS: 000000000f82d480(0000) GS:ffff88823bc00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: ffff88802c29c000 CR3: 000000002dd7c000 CR4: 0000000000750ef0
> PKRU: 55555554
> Call Trace:
> <TASK>
> fuse_copy_do+0x152/0x340 fs/fuse/dev.c:758
> fuse_copy_one fs/fuse/dev.c:1007 [inline]
> fuse_dev_do_write+0x1df/0x26a0 fs/fuse/dev.c:1863
> fuse_dev_write+0x129/0x1b0 fs/fuse/dev.c:1960
> call_write_iter include/linux/fs.h:2108 [inline]
> new_sync_write fs/read_write.c:497 [inline]
> vfs_write+0x62e/0x10a0 fs/read_write.c:590
> ksys_write+0xf6/0x1d0 fs/read_write.c:643
> do_syscall_x64 arch/x86/entry/common.c:52 [inline]
> do_syscall_64+0x7c/0x1d0 arch/x86/entry/common.c:83
> entry_SYSCALL_64_after_hwframe+0x6c/0x74
>
> =* repro.c =*
> #define _GNU_SOURCE
>
> #include <dirent.h>
> #include <endian.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <setjmp.h>
> #include <signal.h>
> #include <stdarg.h>
> #include <stdbool.h>
> #include <stdint.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <sys/prctl.h>
> #include <sys/stat.h>
> #include <sys/syscall.h>
> #include <sys/types.h>
> #include <sys/wait.h>
> #include <time.h>
> #include <unistd.h>
>
> #ifndef __NR_memfd_secret
> #define __NR_memfd_secret 447
> #endif
>
> static __thread int clone_ongoing;
> static __thread int skip_segv;
> static __thread jmp_buf segv_env;
>
> static void segv_handler(int sig, siginfo_t* info, void* ctx) {
>  if (__atomic_load_n(&clone_ongoing, __ATOMIC_RELAXED) != 0) {
>    exit(sig);
>  }
>  uintptr_t addr = (uintptr_t)info->si_addr;
>  const uintptr_t prog_start = 1 << 20;
>  const uintptr_t prog_end = 100 << 20;
>  int skip = __atomic_load_n(&skip_segv, __ATOMIC_RELAXED) != 0;
>  int valid = addr < prog_start || addr > prog_end;
>  if (skip && valid) {
>    _longjmp(segv_env, 1);
>  }
>  exit(sig);
> }
>
> static void install_segv_handler(void) {
>  struct sigaction sa;
>  memset(&sa, 0, sizeof(sa));
>  sa.sa_handler = SIG_IGN;
>  syscall(SYS_rt_sigaction, 0x20, &sa, NULL, 8);
>  syscall(SYS_rt_sigaction, 0x21, &sa, NULL, 8);
>  memset(&sa, 0, sizeof(sa));
>  sa.sa_sigaction = segv_handler;
>  sa.sa_flags = SA_NODEFER | SA_SIGINFO;
>  sigaction(SIGSEGV, &sa, NULL);
>  sigaction(SIGBUS, &sa, NULL);
> }
>
> #define NONFAILING(...)                                  \
>  ({                                                     \
>    int ok = 1;                                          \
>    __atomic_fetch_add(&skip_segv, 1, __ATOMIC_SEQ_CST); \
>    if (_setjmp(segv_env) == 0) {                        \
>      __VA_ARGS__;                                       \
>    } else                                               \
>      ok = 0;                                            \
>    __atomic_fetch_sub(&skip_segv, 1, __ATOMIC_SEQ_CST); \
>    ok;                                                  \
>  })
>
> static void sleep_ms(uint64_t ms) {
>  usleep(ms * 1000);
> }
>
> static uint64_t current_time_ms(void) {
>  struct timespec ts;
>  if (clock_gettime(CLOCK_MONOTONIC, &ts))
>    exit(1);
>  return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
> }
>
> static bool write_file(const char* file, const char* what, ...) {
>  char buf[1024];
>  va_list args;
>  va_start(args, what);
>  vsnprintf(buf, sizeof(buf), what, args);
>  va_end(args);
>  buf[sizeof(buf) - 1] = 0;
>  int len = strlen(buf);
>  int fd = open(file, O_WRONLY | O_CLOEXEC);
>  if (fd == -1)
>    return false;
>  if (write(fd, buf, len) != len) {
>    int err = errno;
>    close(fd);
>    errno = err;
>    return false;
>  }
>  close(fd);
>  return true;
> }
>
> static void kill_and_wait(int pid, int* status) {
>  kill(-pid, SIGKILL);
>  kill(pid, SIGKILL);
>  for (int i = 0; i < 100; i++) {
>    if (waitpid(-1, status, WNOHANG | __WALL) == pid)
>      return;
>    usleep(1000);
>  }
>  DIR* dir = opendir("/sys/fs/fuse/connections");
>  if (dir) {
>    for (;;) {
>      struct dirent* ent = readdir(dir);
>      if (!ent)
>        break;
>      if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
>        continue;
>      char abort[300];
>      snprintf(abort, sizeof(abort), "/sys/fs/fuse/connections/%s/abort",
>               ent->d_name);
>      int fd = open(abort, O_WRONLY);
>      if (fd == -1) {
>        continue;
>      }
>      if (write(fd, abort, 1) < 0) {
>      }
>      close(fd);
>    }
>    closedir(dir);
>  } else {
>  }
>  while (waitpid(-1, status, __WALL) != pid) {
>  }
> }
>
> static void setup_test() {
>  prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
>  setpgrp();
>  write_file("/proc/self/oom_score_adj", "1000");
> }
>
> static void execute_one(void);
>
> #define WAIT_FLAGS __WALL
>
> static void loop(void) {
>  int iter = 0;
>  for (;; iter++) {
>    int pid = fork();
>    if (pid < 0)
>      exit(1);
>    if (pid == 0) {
>      setup_test();
>      execute_one();
>      exit(0);
>    }
>    int status = 0;
>    uint64_t start = current_time_ms();
>    for (;;) {
>      if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid)
>        break;
>      sleep_ms(1);
>      if (current_time_ms() - start < 5000)
>        continue;
>      kill_and_wait(pid, &status);
>      break;
>    }
>  }
> }
>
> uint64_t r[3] = {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff};
>
> void execute_one(void) {
>  intptr_t res = 0;
>  NONFAILING(memcpy((void*)0x20002040, "./file0\000", 8));
>  syscall(__NR_mkdirat, /*fd=*/0xffffff9c, /*path=*/0x20002040ul, /*mode=*/0ul);
>  NONFAILING(memcpy((void*)0x20002080, "/dev/fuse\000", 10));
>  res = syscall(__NR_openat, /*fd=*/0xffffffffffffff9cul, /*file=*/0x20002080ul,
>                /*flags=*/2ul, /*mode=*/0ul);
>  if (res != -1)
>    r[0] = res;
>  NONFAILING(memcpy((void*)0x200020c0, "./file0\000", 8));
>  NONFAILING(memcpy((void*)0x20002100, "fuse\000", 5));
>  NONFAILING(memcpy((void*)0x20002140, "fd", 2));
>  NONFAILING(*(uint8_t*)0x20002142 = 0x3d);
>  NONFAILING(sprintf((char*)0x20002143, "0x%016llx", (long long)r[0]));
>  NONFAILING(*(uint8_t*)0x20002155 = 0x2c);
>  NONFAILING(memcpy((void*)0x20002156, "rootmode", 8));
>  NONFAILING(*(uint8_t*)0x2000215e = 0x3d);
>  NONFAILING(sprintf((char*)0x2000215f, "%023llo", (long long)0x4000));
>  NONFAILING(*(uint8_t*)0x20002176 = 0x2c);
>  NONFAILING(memcpy((void*)0x20002177, "user_id", 7));
>  NONFAILING(*(uint8_t*)0x2000217e = 0x3d);
>  NONFAILING(sprintf((char*)0x2000217f, "%020llu", (long long)0));
>  NONFAILING(*(uint8_t*)0x20002193 = 0x2c);
>  NONFAILING(memcpy((void*)0x20002194, "group_id", 8));
>  NONFAILING(*(uint8_t*)0x2000219c = 0x3d);
>  NONFAILING(sprintf((char*)0x2000219d, "%020llu", (long long)0));
>  NONFAILING(*(uint8_t*)0x200021b1 = 0x2c);
>  NONFAILING(*(uint8_t*)0x200021b2 = 0);
>  syscall(__NR_mount, /*src=*/0ul, /*dst=*/0x200020c0ul, /*type=*/0x20002100ul,
>          /*flags=*/0ul, /*opts=*/0x20002140ul);
>  res = syscall(__NR_memfd_secret, /*flags=*/0ul);
>  if (res != -1)
>    r[1] = res;
>  syscall(__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0xb36000ul,
>          /*prot=PROT_GROWSUP|PROT_READ*/ 0x2000001ul,
>          /*flags=MAP_STACK|MAP_POPULATE|MAP_FIXED|MAP_SHARED*/ 0x28011ul,
>          /*fd=*/r[1], /*offset=*/0ul);
>  syscall(__NR_ftruncate, /*fd=*/r[1], /*len=*/7ul);
>  res = syscall(__NR_socket, /*domain=*/2ul, /*type=*/2ul, /*proto=*/0x88);
>  if (res != -1)
>    r[2] = res;
>  NONFAILING(*(uint32_t*)0x20000280 = 0);
>  syscall(__NR_getsockopt, /*fd=*/r[2], /*level=*/1, /*optname=*/0x11,
>          /*optval=*/0ul, /*optlen=*/0x20000280ul);
>  NONFAILING(*(uint32_t*)0x20000000 = 0x50);
>  NONFAILING(*(uint32_t*)0x20000004 = 0);
>  NONFAILING(*(uint64_t*)0x20000008 = 0);
>  NONFAILING(*(uint32_t*)0x20000010 = 7);
>  NONFAILING(*(uint32_t*)0x20000014 = 0x27);
>  NONFAILING(*(uint32_t*)0x20000018 = 0);
>  NONFAILING(*(uint32_t*)0x2000001c = 0);
>  NONFAILING(*(uint16_t*)0x20000020 = 0);
>  NONFAILING(*(uint16_t*)0x20000022 = 0);
>  NONFAILING(*(uint32_t*)0x20000024 = 0);
>  NONFAILING(*(uint32_t*)0x20000028 = 0);
>  NONFAILING(*(uint16_t*)0x2000002c = 0);
>  NONFAILING(*(uint16_t*)0x2000002e = 0);
>  NONFAILING(memset((void*)0x20000030, 0, 32));
>  syscall(__NR_write, /*fd=*/r[0], /*arg=*/0x20000000ul, /*len=*/0x50ul);
> }
> int main(void) {
>  syscall(__NR_mmap, /*addr=*/0x1ffff000ul, /*len=*/0x1000ul, /*prot=*/0ul,
>          /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
>          /*offset=*/0ul);
>  syscall(__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0x1000000ul,
>          /*prot=PROT_WRITE|PROT_READ|PROT_EXEC*/ 7ul,
>          /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
>          /*offset=*/0ul);
>  syscall(__NR_mmap, /*addr=*/0x21000000ul, /*len=*/0x1000ul, /*prot=*/0ul,
>          /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1,
>          /*offset=*/0ul);
>  install_segv_handler();
>  loop();
>  return 0;
> }
>
> =* repro.txt =*
> mkdirat(0xffffffffffffff9c, &(0x7f0000002040)='./file0\x00', 0x0)
> r0 = openat$fuse(0xffffffffffffff9c, &(0x7f0000002080), 0x2, 0x0)
> mount$fuse(0x0, &(0x7f00000020c0)='./file0\x00', &(0x7f0000002100),
> 0x0, &(0x7f0000002140)={{'fd', 0x3d, r0}, 0x2c, {'rootmode', 0x3d,
> 0x4000}})
> r1 = memfd_secret(0x0)
> mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0x2000001, 0x28011, r1, 0x0)
> ftruncate(r1, 0x7)
> r2 = socket$inet_udplite(0x2, 0x2, 0x88)
> getsockopt$sock_cred(r2, 0x1, 0x11, 0x0, &(0x7f0000000280))
> write$FUSE_INIT(r0, &(0x7f0000000000)={0x50}, 0x50)
>
>
> see aslo https://gist.github.com/xrivendell7/961be96ae091c9671bb56efea902cec4.
>
> I hope it helps.
> best regards.
> xingwei Lee

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: BUG: unable to handle kernel paging request in fuse_copy_do
  2024-03-22 13:50 ` Miklos Szeredi
@ 2024-03-22 15:41   ` David Hildenbrand
  2024-03-22 19:46     ` Miklos Szeredi
  0 siblings, 1 reply; 12+ messages in thread
From: David Hildenbrand @ 2024-03-22 15:41 UTC (permalink / raw)
  To: Miklos Szeredi, xingwei lee
  Cc: linux-fsdevel, linux-kernel, samsun1006219, syzkaller-bugs,
	linux-mm, Mike Rapoport

On 22.03.24 14:50, Miklos Szeredi wrote:
> [MM list + secretmem author CC-d]
> 
> On Thu, 21 Mar 2024 at 08:52, xingwei lee <xrivendell7@gmail.com> wrote:
>>
>> Hello I found a bug titled "BUG: unable to handle kernel paging
>> request in fuse_copy_do” with modified syzkaller, and maybe it is
>> related to fs/fuse.
>> I also confirmed in the latest upstream.
>>
>> If you fix this issue, please add the following tag to the commit:
>> Reported-by: xingwei lee <xrivendell7@gmail.com>
>> Reported-by: yue sun <samsun1006219@gmail.com>
> 
> Thanks for the report.   This looks like a secretmem vs get_user_pages issue.
> 
> I reduced the syz reproducer to a minimal one that isn't dependent on fuse:
> 
> === repro.c ===
> #define _GNU_SOURCE
> 
> #include <fcntl.h>
> #include <unistd.h>
> #include <sys/mman.h>
> #include <sys/syscall.h>
> #include <sys/socket.h>
> 
> int main(void)
> {
>          int fd1, fd2, fd3;
>          int pip[2];
>          struct iovec iov;
>          void *addr;
> 
>          fd1 = syscall(__NR_memfd_secret, 0);
>          addr = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd1, 0);
>          ftruncate(fd1, 7);
>          fd2 = socket(AF_INET, SOCK_DGRAM, 0);
>          getsockopt(fd2, 0, 0, NULL, addr);
> 
>          pipe(pip);
>          iov.iov_base = addr;
>          iov.iov_len = 0x50;
>          vmsplice(pip[1], &iov, 1, 0);

pip[1] should be the write end. So it will be used as the source.

I assume we go the ITER_SOURCE path in vmsplice, and call 
vmsplice_to_pipe(). Then we call iter_to_pipe().

I would expect iov_iter_get_pages2() -> get_user_pages_fast() to fail on 
secretmem pages?

But at least the vmsplice() just seems to work. Which is weird, because 
GUP-fast should not apply (page not faulted in?) and check_vma_flags() 
bails out early on vma_is_secretmem(vma).

So something is not quite right.

> 
>          fd3 = open("/tmp/repro-secretmem.test", O_RDWR | O_CREAT, 0x600);
>          splice(pip[0], NULL, fd3, NULL, 0x50, 0);
> 
>          return 0;
> }




-- 
Cheers,

David / dhildenb


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: BUG: unable to handle kernel paging request in fuse_copy_do
  2024-03-22 15:41   ` David Hildenbrand
@ 2024-03-22 19:46     ` Miklos Szeredi
  2024-03-22 21:08       ` David Hildenbrand
  0 siblings, 1 reply; 12+ messages in thread
From: Miklos Szeredi @ 2024-03-22 19:46 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: xingwei lee, linux-fsdevel, linux-kernel, samsun1006219,
	syzkaller-bugs, linux-mm, Mike Rapoport

On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <david@redhat.com> wrote:

> But at least the vmsplice() just seems to work. Which is weird, because
> GUP-fast should not apply (page not faulted in?)

But it is faulted in, and that indeed seems to be the root cause.
Improved repro:

#define _GNU_SOURCE

#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/syscall.h>

int main(void)
{
        int fd1, fd2;
        int pip[2];
        struct iovec iov;
        char *addr;
        int ret;

        fd1 = syscall(__NR_memfd_secret, 0);
        addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
        ftruncate(fd1, 7);
        addr[0] = 1; /* fault in page */
        pipe(pip);
        iov.iov_base = addr;
        iov.iov_len = 0x50;
        ret = vmsplice(pip[1], &iov, 1, 0);
        if (ret == -1 && errno == EFAULT) {
                printf("Success\n");
                return 0;
        }

        fd2 = open("/tmp/repro-secretmem.test", O_RDWR | O_CREAT, 0x600);
        splice(pip[0], NULL, fd2, NULL, 0x50, 0);

        return 0;
}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: BUG: unable to handle kernel paging request in fuse_copy_do
  2024-03-22 19:46     ` Miklos Szeredi
@ 2024-03-22 21:08       ` David Hildenbrand
  2024-03-22 21:13         ` Miklos Szeredi
  0 siblings, 1 reply; 12+ messages in thread
From: David Hildenbrand @ 2024-03-22 21:08 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: xingwei lee, linux-fsdevel, linux-kernel, samsun1006219,
	syzkaller-bugs, linux-mm, Mike Rapoport

On 22.03.24 20:46, Miklos Szeredi wrote:
> On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <david@redhat.com> wrote:
> 
>> But at least the vmsplice() just seems to work. Which is weird, because
>> GUP-fast should not apply (page not faulted in?)
> 
> But it is faulted in, and that indeed seems to be the root cause.

secretmem mmap() won't populate the page tables. So it's not faulted in yet.

When we GUP via vmsplice, GUP-fast should not find it in the page tables 
and fallback to slow GUP.

There, we seem to pass check_vma_flags(), trigger faultin_page() to 
fault it in, and then find it via follow_page_mask().

... and I wonder how we manage to skip check_vma_flags(), or otherwise 
managed to GUP it.

vmsplice() should, in theory, never succeed here.

Weird :/

> Improved repro:
> 
> #define _GNU_SOURCE
> 
> #include <fcntl.h>
> #include <unistd.h>
> #include <stdio.h>
> #include <errno.h>
> #include <sys/mman.h>
> #include <sys/syscall.h>
> 
> int main(void)
> {
>          int fd1, fd2;
>          int pip[2];
>          struct iovec iov;
>          char *addr;
>          int ret;
> 
>          fd1 = syscall(__NR_memfd_secret, 0);
>          addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
>          ftruncate(fd1, 7);
>          addr[0] = 1; /* fault in page */
>          pipe(pip);
>          iov.iov_base = addr;
>          iov.iov_len = 0x50;
>          ret = vmsplice(pip[1], &iov, 1, 0);
>          if (ret == -1 && errno == EFAULT) {
>                  printf("Success\n");
>                  return 0;
>          }
> 
>          fd2 = open("/tmp/repro-secretmem.test", O_RDWR | O_CREAT, 0x600);
>          splice(pip[0], NULL, fd2, NULL, 0x50, 0);
> 
>          return 0;
> }
> 

-- 
Cheers,

David / dhildenb


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: BUG: unable to handle kernel paging request in fuse_copy_do
  2024-03-22 21:08       ` David Hildenbrand
@ 2024-03-22 21:13         ` Miklos Szeredi
  2024-03-22 21:18           ` David Hildenbrand
  0 siblings, 1 reply; 12+ messages in thread
From: Miklos Szeredi @ 2024-03-22 21:13 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: xingwei lee, linux-fsdevel, linux-kernel, samsun1006219,
	syzkaller-bugs, linux-mm, Mike Rapoport

On Fri, 22 Mar 2024 at 22:08, David Hildenbrand <david@redhat.com> wrote:
>
> On 22.03.24 20:46, Miklos Szeredi wrote:
> > On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <david@redhat.com> wrote:
> >
> >> But at least the vmsplice() just seems to work. Which is weird, because
> >> GUP-fast should not apply (page not faulted in?)
> >
> > But it is faulted in, and that indeed seems to be the root cause.
>
> secretmem mmap() won't populate the page tables. So it's not faulted in yet.
>
> When we GUP via vmsplice, GUP-fast should not find it in the page tables
> and fallback to slow GUP.
>
> There, we seem to pass check_vma_flags(), trigger faultin_page() to
> fault it in, and then find it via follow_page_mask().
>
> ... and I wonder how we manage to skip check_vma_flags(), or otherwise
> managed to GUP it.
>
> vmsplice() should, in theory, never succeed here.
>
> Weird :/
>
> > Improved repro:
> >
> > #define _GNU_SOURCE
> >
> > #include <fcntl.h>
> > #include <unistd.h>
> > #include <stdio.h>
> > #include <errno.h>
> > #include <sys/mman.h>
> > #include <sys/syscall.h>
> >
> > int main(void)
> > {
> >          int fd1, fd2;
> >          int pip[2];
> >          struct iovec iov;
> >          char *addr;
> >          int ret;
> >
> >          fd1 = syscall(__NR_memfd_secret, 0);
> >          addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
> >          ftruncate(fd1, 7);
> >          addr[0] = 1; /* fault in page */

Here the page is faulted in and GUP-fast will find it.  It's not in
the kernel page table, but it is in the user page table, which is what
matter for GUP.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: BUG: unable to handle kernel paging request in fuse_copy_do
  2024-03-22 21:13         ` Miklos Szeredi
@ 2024-03-22 21:18           ` David Hildenbrand
  2024-03-22 21:33             ` David Hildenbrand
  0 siblings, 1 reply; 12+ messages in thread
From: David Hildenbrand @ 2024-03-22 21:18 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: xingwei lee, linux-fsdevel, linux-kernel, samsun1006219,
	syzkaller-bugs, linux-mm, Mike Rapoport

On 22.03.24 22:13, Miklos Szeredi wrote:
> On Fri, 22 Mar 2024 at 22:08, David Hildenbrand <david@redhat.com> wrote:
>>
>> On 22.03.24 20:46, Miklos Szeredi wrote:
>>> On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <david@redhat.com> wrote:
>>>
>>>> But at least the vmsplice() just seems to work. Which is weird, because
>>>> GUP-fast should not apply (page not faulted in?)
>>>
>>> But it is faulted in, and that indeed seems to be the root cause.
>>
>> secretmem mmap() won't populate the page tables. So it's not faulted in yet.
>>
>> When we GUP via vmsplice, GUP-fast should not find it in the page tables
>> and fallback to slow GUP.
>>
>> There, we seem to pass check_vma_flags(), trigger faultin_page() to
>> fault it in, and then find it via follow_page_mask().
>>
>> ... and I wonder how we manage to skip check_vma_flags(), or otherwise
>> managed to GUP it.
>>
>> vmsplice() should, in theory, never succeed here.
>>
>> Weird :/
>>
>>> Improved repro:
>>>
>>> #define _GNU_SOURCE
>>>
>>> #include <fcntl.h>
>>> #include <unistd.h>
>>> #include <stdio.h>
>>> #include <errno.h>
>>> #include <sys/mman.h>
>>> #include <sys/syscall.h>
>>>
>>> int main(void)
>>> {
>>>           int fd1, fd2;
>>>           int pip[2];
>>>           struct iovec iov;
>>>           char *addr;
>>>           int ret;
>>>
>>>           fd1 = syscall(__NR_memfd_secret, 0);
>>>           addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
>>>           ftruncate(fd1, 7);
>>>           addr[0] = 1; /* fault in page */
> 
> Here the page is faulted in and GUP-fast will find it.  It's not in
> the kernel page table, but it is in the user page table, which is what
> matter for GUP.

Trust me, I know the GUP code very well :P

gup_pte_range -- GUP fast -- contains:

if (unlikely(folio_is_secretmem(folio))) {
	gup_put_folio(folio, 1, flags);
	goto pte_unmap;
}

So we "should" be rejecting any secretmem folios and fallback to GUP slow.


... we don't check the same in gup_huge_pmd(), but we shouldn't ever see 
THP in secretmem code.

-- 
Cheers,

David / dhildenb


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: BUG: unable to handle kernel paging request in fuse_copy_do
  2024-03-22 21:18           ` David Hildenbrand
@ 2024-03-22 21:33             ` David Hildenbrand
  2024-03-22 21:37               ` David Hildenbrand
  0 siblings, 1 reply; 12+ messages in thread
From: David Hildenbrand @ 2024-03-22 21:33 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: xingwei lee, linux-fsdevel, linux-kernel, samsun1006219,
	syzkaller-bugs, linux-mm, Mike Rapoport

On 22.03.24 22:18, David Hildenbrand wrote:
> On 22.03.24 22:13, Miklos Szeredi wrote:
>> On Fri, 22 Mar 2024 at 22:08, David Hildenbrand <david@redhat.com> wrote:
>>>
>>> On 22.03.24 20:46, Miklos Szeredi wrote:
>>>> On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>>> But at least the vmsplice() just seems to work. Which is weird, because
>>>>> GUP-fast should not apply (page not faulted in?)
>>>>
>>>> But it is faulted in, and that indeed seems to be the root cause.
>>>
>>> secretmem mmap() won't populate the page tables. So it's not faulted in yet.
>>>
>>> When we GUP via vmsplice, GUP-fast should not find it in the page tables
>>> and fallback to slow GUP.
>>>
>>> There, we seem to pass check_vma_flags(), trigger faultin_page() to
>>> fault it in, and then find it via follow_page_mask().
>>>
>>> ... and I wonder how we manage to skip check_vma_flags(), or otherwise
>>> managed to GUP it.
>>>
>>> vmsplice() should, in theory, never succeed here.
>>>
>>> Weird :/
>>>
>>>> Improved repro:
>>>>
>>>> #define _GNU_SOURCE
>>>>
>>>> #include <fcntl.h>
>>>> #include <unistd.h>
>>>> #include <stdio.h>
>>>> #include <errno.h>
>>>> #include <sys/mman.h>
>>>> #include <sys/syscall.h>
>>>>
>>>> int main(void)
>>>> {
>>>>            int fd1, fd2;
>>>>            int pip[2];
>>>>            struct iovec iov;
>>>>            char *addr;
>>>>            int ret;
>>>>
>>>>            fd1 = syscall(__NR_memfd_secret, 0);
>>>>            addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
>>>>            ftruncate(fd1, 7);
>>>>            addr[0] = 1; /* fault in page */
>>
>> Here the page is faulted in and GUP-fast will find it.  It's not in
>> the kernel page table, but it is in the user page table, which is what
>> matter for GUP.
> 
> Trust me, I know the GUP code very well :P
> 
> gup_pte_range -- GUP fast -- contains:
> 
> if (unlikely(folio_is_secretmem(folio))) {
> 	gup_put_folio(folio, 1, flags);
> 	goto pte_unmap;
> }
> 
> So we "should" be rejecting any secretmem folios and fallback to GUP slow.
> 
> 
> ... we don't check the same in gup_huge_pmd(), but we shouldn't ever see
> THP in secretmem code.
> 

Ehm:

[   29.441405] Secretmem fault: PFN: 1096177
[   29.442092] GUP-fast: PFN: 1096177


... is folio_is_secretmem() broken?

... is it something "obvious" like:

diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
index 35f3a4a8ceb1e..6996f1f53f147 100644
--- a/include/linux/secretmem.h
+++ b/include/linux/secretmem.h
@@ -16,7 +16,7 @@ static inline bool folio_is_secretmem(struct folio *folio)
          * We know that secretmem pages are not compound and LRU so we can
          * save a couple of cycles here.
          */
-       if (folio_test_large(folio) || !folio_test_lru(folio))
+       if (folio_test_large(folio) || folio_test_lru(folio))
                 return false;
  
         mapping = (struct address_space *)


-- 
Cheers,

David / dhildenb


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: BUG: unable to handle kernel paging request in fuse_copy_do
  2024-03-22 21:33             ` David Hildenbrand
@ 2024-03-22 21:37               ` David Hildenbrand
  2024-03-22 21:56                 ` David Hildenbrand
  0 siblings, 1 reply; 12+ messages in thread
From: David Hildenbrand @ 2024-03-22 21:37 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: xingwei lee, linux-fsdevel, linux-kernel, samsun1006219,
	syzkaller-bugs, linux-mm, Mike Rapoport

On 22.03.24 22:33, David Hildenbrand wrote:
> On 22.03.24 22:18, David Hildenbrand wrote:
>> On 22.03.24 22:13, Miklos Szeredi wrote:
>>> On Fri, 22 Mar 2024 at 22:08, David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>> On 22.03.24 20:46, Miklos Szeredi wrote:
>>>>> On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <david@redhat.com> wrote:
>>>>>
>>>>>> But at least the vmsplice() just seems to work. Which is weird, because
>>>>>> GUP-fast should not apply (page not faulted in?)
>>>>>
>>>>> But it is faulted in, and that indeed seems to be the root cause.
>>>>
>>>> secretmem mmap() won't populate the page tables. So it's not faulted in yet.
>>>>
>>>> When we GUP via vmsplice, GUP-fast should not find it in the page tables
>>>> and fallback to slow GUP.
>>>>
>>>> There, we seem to pass check_vma_flags(), trigger faultin_page() to
>>>> fault it in, and then find it via follow_page_mask().
>>>>
>>>> ... and I wonder how we manage to skip check_vma_flags(), or otherwise
>>>> managed to GUP it.
>>>>
>>>> vmsplice() should, in theory, never succeed here.
>>>>
>>>> Weird :/
>>>>
>>>>> Improved repro:
>>>>>
>>>>> #define _GNU_SOURCE
>>>>>
>>>>> #include <fcntl.h>
>>>>> #include <unistd.h>
>>>>> #include <stdio.h>
>>>>> #include <errno.h>
>>>>> #include <sys/mman.h>
>>>>> #include <sys/syscall.h>
>>>>>
>>>>> int main(void)
>>>>> {
>>>>>             int fd1, fd2;
>>>>>             int pip[2];
>>>>>             struct iovec iov;
>>>>>             char *addr;
>>>>>             int ret;
>>>>>
>>>>>             fd1 = syscall(__NR_memfd_secret, 0);
>>>>>             addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
>>>>>             ftruncate(fd1, 7);
>>>>>             addr[0] = 1; /* fault in page */
>>>
>>> Here the page is faulted in and GUP-fast will find it.  It's not in
>>> the kernel page table, but it is in the user page table, which is what
>>> matter for GUP.
>>
>> Trust me, I know the GUP code very well :P
>>
>> gup_pte_range -- GUP fast -- contains:
>>
>> if (unlikely(folio_is_secretmem(folio))) {
>> 	gup_put_folio(folio, 1, flags);
>> 	goto pte_unmap;
>> }
>>
>> So we "should" be rejecting any secretmem folios and fallback to GUP slow.
>>
>>
>> ... we don't check the same in gup_huge_pmd(), but we shouldn't ever see
>> THP in secretmem code.
>>
> 
> Ehm:
> 
> [   29.441405] Secretmem fault: PFN: 1096177
> [   29.442092] GUP-fast: PFN: 1096177
> 
> 
> ... is folio_is_secretmem() broken?
> 
> ... is it something "obvious" like:
> 
> diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
> index 35f3a4a8ceb1e..6996f1f53f147 100644
> --- a/include/linux/secretmem.h
> +++ b/include/linux/secretmem.h
> @@ -16,7 +16,7 @@ static inline bool folio_is_secretmem(struct folio *folio)
>            * We know that secretmem pages are not compound and LRU so we can
>            * save a couple of cycles here.
>            */
> -       if (folio_test_large(folio) || !folio_test_lru(folio))
> +       if (folio_test_large(folio) || folio_test_lru(folio))
>                   return false;
>    
>           mapping = (struct address_space *)

... yes, that does the trick!

-- 
Cheers,

David / dhildenb


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: BUG: unable to handle kernel paging request in fuse_copy_do
  2024-03-22 21:37               ` David Hildenbrand
@ 2024-03-22 21:56                 ` David Hildenbrand
  2024-03-24 10:29                   ` Mike Rapoport
  2024-03-25 11:21                   ` Miklos Szeredi
  0 siblings, 2 replies; 12+ messages in thread
From: David Hildenbrand @ 2024-03-22 21:56 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: xingwei lee, linux-fsdevel, linux-kernel, samsun1006219,
	syzkaller-bugs, linux-mm, Mike Rapoport

On 22.03.24 22:37, David Hildenbrand wrote:
> On 22.03.24 22:33, David Hildenbrand wrote:
>> On 22.03.24 22:18, David Hildenbrand wrote:
>>> On 22.03.24 22:13, Miklos Szeredi wrote:
>>>> On Fri, 22 Mar 2024 at 22:08, David Hildenbrand <david@redhat.com> wrote:
>>>>>
>>>>> On 22.03.24 20:46, Miklos Szeredi wrote:
>>>>>> On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <david@redhat.com> wrote:
>>>>>>
>>>>>>> But at least the vmsplice() just seems to work. Which is weird, because
>>>>>>> GUP-fast should not apply (page not faulted in?)
>>>>>>
>>>>>> But it is faulted in, and that indeed seems to be the root cause.
>>>>>
>>>>> secretmem mmap() won't populate the page tables. So it's not faulted in yet.
>>>>>
>>>>> When we GUP via vmsplice, GUP-fast should not find it in the page tables
>>>>> and fallback to slow GUP.
>>>>>
>>>>> There, we seem to pass check_vma_flags(), trigger faultin_page() to
>>>>> fault it in, and then find it via follow_page_mask().
>>>>>
>>>>> ... and I wonder how we manage to skip check_vma_flags(), or otherwise
>>>>> managed to GUP it.
>>>>>
>>>>> vmsplice() should, in theory, never succeed here.
>>>>>
>>>>> Weird :/
>>>>>
>>>>>> Improved repro:
>>>>>>
>>>>>> #define _GNU_SOURCE
>>>>>>
>>>>>> #include <fcntl.h>
>>>>>> #include <unistd.h>
>>>>>> #include <stdio.h>
>>>>>> #include <errno.h>
>>>>>> #include <sys/mman.h>
>>>>>> #include <sys/syscall.h>
>>>>>>
>>>>>> int main(void)
>>>>>> {
>>>>>>              int fd1, fd2;
>>>>>>              int pip[2];
>>>>>>              struct iovec iov;
>>>>>>              char *addr;
>>>>>>              int ret;
>>>>>>
>>>>>>              fd1 = syscall(__NR_memfd_secret, 0);
>>>>>>              addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
>>>>>>              ftruncate(fd1, 7);
>>>>>>              addr[0] = 1; /* fault in page */
>>>>
>>>> Here the page is faulted in and GUP-fast will find it.  It's not in
>>>> the kernel page table, but it is in the user page table, which is what
>>>> matter for GUP.
>>>
>>> Trust me, I know the GUP code very well :P
>>>
>>> gup_pte_range -- GUP fast -- contains:
>>>
>>> if (unlikely(folio_is_secretmem(folio))) {
>>> 	gup_put_folio(folio, 1, flags);
>>> 	goto pte_unmap;
>>> }
>>>
>>> So we "should" be rejecting any secretmem folios and fallback to GUP slow.
>>>
>>>
>>> ... we don't check the same in gup_huge_pmd(), but we shouldn't ever see
>>> THP in secretmem code.
>>>
>>
>> Ehm:
>>
>> [   29.441405] Secretmem fault: PFN: 1096177
>> [   29.442092] GUP-fast: PFN: 1096177
>>
>>
>> ... is folio_is_secretmem() broken?
>>
>> ... is it something "obvious" like:
>>
>> diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
>> index 35f3a4a8ceb1e..6996f1f53f147 100644
>> --- a/include/linux/secretmem.h
>> +++ b/include/linux/secretmem.h
>> @@ -16,7 +16,7 @@ static inline bool folio_is_secretmem(struct folio *folio)
>>             * We know that secretmem pages are not compound and LRU so we can
>>             * save a couple of cycles here.
>>             */
>> -       if (folio_test_large(folio) || !folio_test_lru(folio))
>> +       if (folio_test_large(folio) || folio_test_lru(folio))
>>                    return false;
>>     
>>            mapping = (struct address_space *)
> 
> ... yes, that does the trick!
> 

Proper patch (I might send out again on Monday "officially"). There are
other improvements we want to do to folio_is_secretmem() in the light of
folio_fast_pin_allowed(), that I wanted to do a while ago. I might send
a patch for that as well now that I'm at it.


 From 85558a46d9f249f26bd77dd3b18d14f248464845 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 22 Mar 2024 22:45:36 +0100
Subject: [PATCH] mm/secretmem: fix GUP-fast succeeding on secretmem folios

folio_is_secretmem() states that secretmem folios cannot be LRU folios:
so we may only exit early if we find an LRU folio. Yet, we exit early if
we find a folio that is not a secretmem folio.

Consequently, folio_is_secretmem() fails to detect secretmem folios and,
therefore, we can succeed in grabbing a secretmem folio during GUP-fast,
crashing the kernel when we later try reading/writing to the folio, because
the folio has been unmapped from the directmap.

Reported-by: xingwei lee <xrivendell7@gmail.com>
Reported-by: yue sun <samsun1006219@gmail.com>
Debugged-by: Miklos Szeredi <miklos@szeredi.hu>
Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
Cc: <stable@vger.kernel.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
  include/linux/secretmem.h | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
index 35f3a4a8ceb1..6996f1f53f14 100644
--- a/include/linux/secretmem.h
+++ b/include/linux/secretmem.h
@@ -16,7 +16,7 @@ static inline bool folio_is_secretmem(struct folio *folio)
  	 * We know that secretmem pages are not compound and LRU so we can
  	 * save a couple of cycles here.
  	 */
-	if (folio_test_large(folio) || !folio_test_lru(folio))
+	if (folio_test_large(folio) || folio_test_lru(folio))
  		return false;
  
  	mapping = (struct address_space *)
-- 
2.43.2


-- 
Cheers,

David / dhildenb


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: BUG: unable to handle kernel paging request in fuse_copy_do
  2024-03-22 21:56                 ` David Hildenbrand
@ 2024-03-24 10:29                   ` Mike Rapoport
  2024-03-25 11:21                   ` Miklos Szeredi
  1 sibling, 0 replies; 12+ messages in thread
From: Mike Rapoport @ 2024-03-24 10:29 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Miklos Szeredi, xingwei lee, linux-fsdevel, linux-kernel,
	samsun1006219, syzkaller-bugs, linux-mm

On Fri, Mar 22, 2024 at 10:56:08PM +0100, David Hildenbrand wrote:
> On 22.03.24 22:37, David Hildenbrand wrote:
> > On 22.03.24 22:33, David Hildenbrand wrote:
> > > On 22.03.24 22:18, David Hildenbrand wrote:
> > > > On 22.03.24 22:13, Miklos Szeredi wrote:
> > > > > On Fri, 22 Mar 2024 at 22:08, David Hildenbrand <david@redhat.com> wrote:
> > > > > > 
> > > > > > On 22.03.24 20:46, Miklos Szeredi wrote:
> > > > > > > On Fri, 22 Mar 2024 at 16:41, David Hildenbrand <david@redhat.com> wrote:
> > > > > > > 
> > > > > > > > But at least the vmsplice() just seems to work. Which is weird, because
> > > > > > > > GUP-fast should not apply (page not faulted in?)
> > > > > > > 
> > > > > > > But it is faulted in, and that indeed seems to be the root cause.
> > > > > > 
> > > > > > secretmem mmap() won't populate the page tables. So it's not faulted in yet.
> > > > > > 
> > > > > > When we GUP via vmsplice, GUP-fast should not find it in the page tables
> > > > > > and fallback to slow GUP.
> > > > > > 
> > > > > > There, we seem to pass check_vma_flags(), trigger faultin_page() to
> > > > > > fault it in, and then find it via follow_page_mask().
> > > > > > 
> > > > > > ... and I wonder how we manage to skip check_vma_flags(), or otherwise
> > > > > > managed to GUP it.
> > > > > > 
> > > > > > vmsplice() should, in theory, never succeed here.
> > > > > > 
> > > > > > Weird :/
> > > > > > 
> > > > > > > Improved repro:
> > > > > > > 
> > > > > > > #define _GNU_SOURCE
> > > > > > > 
> > > > > > > #include <fcntl.h>
> > > > > > > #include <unistd.h>
> > > > > > > #include <stdio.h>
> > > > > > > #include <errno.h>
> > > > > > > #include <sys/mman.h>
> > > > > > > #include <sys/syscall.h>
> > > > > > > 
> > > > > > > int main(void)
> > > > > > > {
> > > > > > >              int fd1, fd2;
> > > > > > >              int pip[2];
> > > > > > >              struct iovec iov;
> > > > > > >              char *addr;
> > > > > > >              int ret;
> > > > > > > 
> > > > > > >              fd1 = syscall(__NR_memfd_secret, 0);
> > > > > > >              addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
> > > > > > >              ftruncate(fd1, 7);
> > > > > > >              addr[0] = 1; /* fault in page */
> > > > > 
> > > > > Here the page is faulted in and GUP-fast will find it.  It's not in
> > > > > the kernel page table, but it is in the user page table, which is what
> > > > > matter for GUP.
> > > > 
> > > > Trust me, I know the GUP code very well :P
> > > > 
> > > > gup_pte_range -- GUP fast -- contains:
> > > > 
> > > > if (unlikely(folio_is_secretmem(folio))) {
> > > > 	gup_put_folio(folio, 1, flags);
> > > > 	goto pte_unmap;
> > > > }
> > > > 
> > > > So we "should" be rejecting any secretmem folios and fallback to GUP slow.
> > > > 
> > > > 
> > > > ... we don't check the same in gup_huge_pmd(), but we shouldn't ever see
> > > > THP in secretmem code.
> > > > 
> > > 
> > > Ehm:
> > > 
> > > [   29.441405] Secretmem fault: PFN: 1096177
> > > [   29.442092] GUP-fast: PFN: 1096177
> > > 
> > > 
> > > ... is folio_is_secretmem() broken?
> > > 
> > > ... is it something "obvious" like:
> > > 
> > > diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
> > > index 35f3a4a8ceb1e..6996f1f53f147 100644
> > > --- a/include/linux/secretmem.h
> > > +++ b/include/linux/secretmem.h
> > > @@ -16,7 +16,7 @@ static inline bool folio_is_secretmem(struct folio *folio)
> > >             * We know that secretmem pages are not compound and LRU so we can
> > >             * save a couple of cycles here.
> > >             */
> > > -       if (folio_test_large(folio) || !folio_test_lru(folio))
> > > +       if (folio_test_large(folio) || folio_test_lru(folio))
> > >                    return false;
> > >            mapping = (struct address_space *)
> > 
> > ... yes, that does the trick!
> > 
> 
> Proper patch (I might send out again on Monday "officially"). There are
> other improvements we want to do to folio_is_secretmem() in the light of
> folio_fast_pin_allowed(), that I wanted to do a while ago. I might send
> a patch for that as well now that I'm at it.
 
The most robust but a bit slower solution is to make folio_is_secretmem()
call folio_mapping() rather than open code the check.

What improvements did you have in mind?
 
> From 85558a46d9f249f26bd77dd3b18d14f248464845 Mon Sep 17 00:00:00 2001
> From: David Hildenbrand <david@redhat.com>
> Date: Fri, 22 Mar 2024 22:45:36 +0100
> Subject: [PATCH] mm/secretmem: fix GUP-fast succeeding on secretmem folios
> 
> folio_is_secretmem() states that secretmem folios cannot be LRU folios:
> so we may only exit early if we find an LRU folio. Yet, we exit early if
> we find a folio that is not a secretmem folio.
>
> Consequently, folio_is_secretmem() fails to detect secretmem folios and,
> therefore, we can succeed in grabbing a secretmem folio during GUP-fast,
> crashing the kernel when we later try reading/writing to the folio, because
> the folio has been unmapped from the directmap.
> 
> Reported-by: xingwei lee <xrivendell7@gmail.com>
> Reported-by: yue sun <samsun1006219@gmail.com>
> Debugged-by: Miklos Szeredi <miklos@szeredi.hu>
> Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: David Hildenbrand <david@redhat.com>

Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>

> ---
>  include/linux/secretmem.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
> index 35f3a4a8ceb1..6996f1f53f14 100644
> --- a/include/linux/secretmem.h
> +++ b/include/linux/secretmem.h
> @@ -16,7 +16,7 @@ static inline bool folio_is_secretmem(struct folio *folio)
>  	 * We know that secretmem pages are not compound and LRU so we can
>  	 * save a couple of cycles here.
>  	 */
> -	if (folio_test_large(folio) || !folio_test_lru(folio))
> +	if (folio_test_large(folio) || folio_test_lru(folio))
>  		return false;
>  	mapping = (struct address_space *)
> -- 
> 2.43.2
> 
> 
> -- 
> Cheers,
> 
> David / dhildenb
> 

-- 
Sincerely yours,
Mike.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: BUG: unable to handle kernel paging request in fuse_copy_do
  2024-03-22 21:56                 ` David Hildenbrand
  2024-03-24 10:29                   ` Mike Rapoport
@ 2024-03-25 11:21                   ` Miklos Szeredi
  1 sibling, 0 replies; 12+ messages in thread
From: Miklos Szeredi @ 2024-03-25 11:21 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: xingwei lee, linux-fsdevel, linux-kernel, samsun1006219,
	syzkaller-bugs, linux-mm, Mike Rapoport

On Fri, 22 Mar 2024 at 22:56, David Hildenbrand <david@redhat.com> wrote:

>  From 85558a46d9f249f26bd77dd3b18d14f248464845 Mon Sep 17 00:00:00 2001
> From: David Hildenbrand <david@redhat.com>
> Date: Fri, 22 Mar 2024 22:45:36 +0100
> Subject: [PATCH] mm/secretmem: fix GUP-fast succeeding on secretmem folios
>
> folio_is_secretmem() states that secretmem folios cannot be LRU folios:
> so we may only exit early if we find an LRU folio. Yet, we exit early if
> we find a folio that is not a secretmem folio.
>
> Consequently, folio_is_secretmem() fails to detect secretmem folios and,
> therefore, we can succeed in grabbing a secretmem folio during GUP-fast,
> crashing the kernel when we later try reading/writing to the folio, because
> the folio has been unmapped from the directmap.
>
> Reported-by: xingwei lee <xrivendell7@gmail.com>
> Reported-by: yue sun <samsun1006219@gmail.com>
> Debugged-by: Miklos Szeredi <miklos@szeredi.hu>
> Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: David Hildenbrand <david@redhat.com>

Verified that it's no longer crashing with the reproducers.

Tested-by: Miklos Szeredi <mszeredi@redhat.com>

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2024-03-25 11:21 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-03-21  7:52 BUG: unable to handle kernel paging request in fuse_copy_do xingwei lee
2024-03-22 13:50 ` Miklos Szeredi
2024-03-22 15:41   ` David Hildenbrand
2024-03-22 19:46     ` Miklos Szeredi
2024-03-22 21:08       ` David Hildenbrand
2024-03-22 21:13         ` Miklos Szeredi
2024-03-22 21:18           ` David Hildenbrand
2024-03-22 21:33             ` David Hildenbrand
2024-03-22 21:37               ` David Hildenbrand
2024-03-22 21:56                 ` David Hildenbrand
2024-03-24 10:29                   ` Mike Rapoport
2024-03-25 11:21                   ` Miklos Szeredi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).