public inbox for linux-btrfs@vger.kernel.org
 help / color / mirror / Atom feed
From: Josef Bacik <josef@toxicpanda.com>
To: Omar Sandoval <osandov@osandov.com>
Cc: linux-btrfs@vger.kernel.org, kernel-team@fb.com
Subject: Re: [PATCH v2] btrfs: fix deadlock when defragging transparent huge pages
Date: Wed, 20 Oct 2021 09:47:37 -0400	[thread overview]
Message-ID: <YXAd+Y6Tp32RV2im@localhost.localdomain> (raw)
In-Reply-To: <321ffae6065e36a6c698912a86f8019ea00f43b4.1634700835.git.osandov@fb.com>

On Tue, Oct 19, 2021 at 08:35:01PM -0700, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> Attempting to defragment a Btrfs file containing a transparent huge page
> immediately deadlocks with the following stack trace:
> 
>   #0  context_switch (kernel/sched/core.c:4940:2)
>   #1  __schedule (kernel/sched/core.c:6287:8)
>   #2  schedule (kernel/sched/core.c:6366:3)
>   #3  io_schedule (kernel/sched/core.c:8389:2)
>   #4  wait_on_page_bit_common (mm/filemap.c:1356:4)
>   #5  __lock_page (mm/filemap.c:1648:2)
>   #6  lock_page (./include/linux/pagemap.h:625:3)
>   #7  pagecache_get_page (mm/filemap.c:1910:4)
>   #8  find_or_create_page (./include/linux/pagemap.h:420:9)
>   #9  defrag_prepare_one_page (fs/btrfs/ioctl.c:1068:9)
>   #10 defrag_one_range (fs/btrfs/ioctl.c:1326:14)
>   #11 defrag_one_cluster (fs/btrfs/ioctl.c:1421:9)
>   #12 btrfs_defrag_file (fs/btrfs/ioctl.c:1523:9)
>   #13 btrfs_ioctl_defrag (fs/btrfs/ioctl.c:3117:9)
>   #14 btrfs_ioctl (fs/btrfs/ioctl.c:4872:10)
>   #15 vfs_ioctl (fs/ioctl.c:51:10)
>   #16 __do_sys_ioctl (fs/ioctl.c:874:11)
>   #17 __se_sys_ioctl (fs/ioctl.c:860:1)
>   #18 __x64_sys_ioctl (fs/ioctl.c:860:1)
>   #19 do_syscall_x64 (arch/x86/entry/common.c:50:14)
>   #20 do_syscall_64 (arch/x86/entry/common.c:80:7)
>   #21 entry_SYSCALL_64+0x7c/0x15b (arch/x86/entry/entry_64.S:113)
> 
> A huge page is represented by a compound page, which consists of a
> struct page for each PAGE_SIZE page within the huge page. The first
> struct page is the "head page", and the remaining are "tail pages".
> 
> Defragmentation attempts to lock each page in the range. However,
> lock_page() on a tail page actually locks the corresponding head page.
> So, if defragmentation tries to lock more than one struct page in a
> compound page, it tries to lock the same head page twice and deadlocks
> with itself.
> 
> Ideally, we should be able to defragment transparent huge pages.
> However, THP for filesystems is currently read-only, so a lot of code is
> not ready to use huge pages for I/O. For now, let's just return
> ETXTBSY.
> 
> This can be reproduced with the following on a kernel with
> CONFIG_READ_ONLY_THP_FOR_FS=y:
> 
>   $ cat create_thp_file.c
>   #include <fcntl.h>
>   #include <stdbool.h>
>   #include <stdio.h>
>   #include <stdint.h>
>   #include <stdlib.h>
>   #include <unistd.h>
>   #include <sys/mman.h>
> 
>   static const char zeroes[1024 * 1024];
>   static const size_t FILE_SIZE = 2 * 1024 * 1024;
> 
>   int main(int argc, char **argv)
>   {
>           if (argc != 2) {
>                   fprintf(stderr, "usage: %s PATH\n", argv[0]);
>                   return EXIT_FAILURE;
>           }
>           int fd = creat(argv[1], 0777);
>           if (fd == -1) {
>                   perror("creat");
>                   return EXIT_FAILURE;
>           }
>           size_t written = 0;
>           while (written < FILE_SIZE) {
>                   ssize_t ret = write(fd, zeroes,
>                                       sizeof(zeroes) < FILE_SIZE - written ?
>                                       sizeof(zeroes) : FILE_SIZE - written);
>                   if (ret < 0) {
>                           perror("write");
>                           return EXIT_FAILURE;
>                   }
>                   written += ret;
>           }
>           close(fd);
>           fd = open(argv[1], O_RDONLY);
>           if (fd == -1) {
>                   perror("open");
>                   return EXIT_FAILURE;
>           }
> 
>           /*
>            * Reserve some address space so that we can align the file mapping to
>            * the huge page size.
>            */
>           void *placeholder_map = mmap(NULL, FILE_SIZE * 2, PROT_NONE,
>                                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>           if (placeholder_map == MAP_FAILED) {
>                   perror("mmap (placeholder)");
>                   return EXIT_FAILURE;
>           }
> 
>           void *aligned_address =
>                   (void *)(((uintptr_t)placeholder_map + FILE_SIZE - 1) & ~(FILE_SIZE - 1));
> 
>           void *map = mmap(aligned_address, FILE_SIZE, PROT_READ | PROT_EXEC,
>                            MAP_SHARED | MAP_FIXED, fd, 0);
>           if (map == MAP_FAILED) {
>                   perror("mmap");
>                   return EXIT_FAILURE;
>           }
>           if (madvise(map, FILE_SIZE, MADV_HUGEPAGE) < 0) {
>                   perror("madvise");
>                   return EXIT_FAILURE;
>           }
> 
>           char *line = NULL;
>           size_t line_capacity = 0;
>           FILE *smaps_file = fopen("/proc/self/smaps", "r");
>           if (!smaps_file) {
>                   perror("fopen");
>                   return EXIT_FAILURE;
>           }
>           for (;;) {
>                   for (size_t off = 0; off < FILE_SIZE; off += 4096)
>                           ((volatile char *)map)[off];
> 
>                   ssize_t ret;
>                   bool this_mapping = false;
>                   while ((ret = getline(&line, &line_capacity, smaps_file)) > 0) {
>                           unsigned long start, end, huge;
>                           if (sscanf(line, "%lx-%lx", &start, &end) == 2) {
>                                   this_mapping = (start <= (uintptr_t)map &&
>                                                   (uintptr_t)map < end);
>                           } else if (this_mapping &&
>                                      sscanf(line, "FilePmdMapped: %ld", &huge) == 1 &&
>                                      huge > 0) {
>                                   return EXIT_SUCCESS;
>                           }
>                   }
> 
>                   sleep(6);
>                   rewind(smaps_file);
>                   fflush(smaps_file);
>           }
>   }
>   $ ./create_thp_file huge
>   $ btrfs fi defrag -czstd ./huge
> 
> Signed-off-by: Omar Sandoval <osandov@fb.com>

Reviewed-by: Josef Bacik <josef@toxicpanda.com>

Thanks,

Josef

  reply	other threads:[~2021-10-20 13:47 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-10-20  3:35 [PATCH v2] btrfs: fix deadlock when defragging transparent huge pages Omar Sandoval
2021-10-20 13:47 ` Josef Bacik [this message]
2021-10-21 16:28 ` David Sterba

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YXAd+Y6Tp32RV2im@localhost.localdomain \
    --to=josef@toxicpanda.com \
    --cc=kernel-team@fb.com \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=osandov@osandov.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox