From: Josef Bacik <josef@toxicpanda.com>
To: Omar Sandoval <osandov@osandov.com>
Cc: linux-btrfs@vger.kernel.org, kernel-team@fb.com
Subject: Re: [PATCH v2] btrfs: fix deadlock when defragging transparent huge pages
Date: Wed, 20 Oct 2021 09:47:37 -0400 [thread overview]
Message-ID: <YXAd+Y6Tp32RV2im@localhost.localdomain> (raw)
In-Reply-To: <321ffae6065e36a6c698912a86f8019ea00f43b4.1634700835.git.osandov@fb.com>
On Tue, Oct 19, 2021 at 08:35:01PM -0700, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
>
> Attempting to defragment a Btrfs file containing a transparent huge page
> immediately deadlocks with the following stack trace:
>
> #0 context_switch (kernel/sched/core.c:4940:2)
> #1 __schedule (kernel/sched/core.c:6287:8)
> #2 schedule (kernel/sched/core.c:6366:3)
> #3 io_schedule (kernel/sched/core.c:8389:2)
> #4 wait_on_page_bit_common (mm/filemap.c:1356:4)
> #5 __lock_page (mm/filemap.c:1648:2)
> #6 lock_page (./include/linux/pagemap.h:625:3)
> #7 pagecache_get_page (mm/filemap.c:1910:4)
> #8 find_or_create_page (./include/linux/pagemap.h:420:9)
> #9 defrag_prepare_one_page (fs/btrfs/ioctl.c:1068:9)
> #10 defrag_one_range (fs/btrfs/ioctl.c:1326:14)
> #11 defrag_one_cluster (fs/btrfs/ioctl.c:1421:9)
> #12 btrfs_defrag_file (fs/btrfs/ioctl.c:1523:9)
> #13 btrfs_ioctl_defrag (fs/btrfs/ioctl.c:3117:9)
> #14 btrfs_ioctl (fs/btrfs/ioctl.c:4872:10)
> #15 vfs_ioctl (fs/ioctl.c:51:10)
> #16 __do_sys_ioctl (fs/ioctl.c:874:11)
> #17 __se_sys_ioctl (fs/ioctl.c:860:1)
> #18 __x64_sys_ioctl (fs/ioctl.c:860:1)
> #19 do_syscall_x64 (arch/x86/entry/common.c:50:14)
> #20 do_syscall_64 (arch/x86/entry/common.c:80:7)
> #21 entry_SYSCALL_64+0x7c/0x15b (arch/x86/entry/entry_64.S:113)
>
> A huge page is represented by a compound page, which consists of a
> struct page for each PAGE_SIZE page within the huge page. The first
> struct page is the "head page", and the remaining are "tail pages".
>
> Defragmentation attempts to lock each page in the range. However,
> lock_page() on a tail page actually locks the corresponding head page.
> So, if defragmentation tries to lock more than one struct page in a
> compound page, it tries to lock the same head page twice and deadlocks
> with itself.
>
> Ideally, we should be able to defragment transparent huge pages.
> However, THP for filesystems is currently read-only, so a lot of code is
> not ready to use huge pages for I/O. For now, let's just return
> ETXTBSY.
>
> This can be reproduced with the following on a kernel with
> CONFIG_READ_ONLY_THP_FOR_FS=y:
>
> $ cat create_thp_file.c
> #include <fcntl.h>
> #include <stdbool.h>
> #include <stdio.h>
> #include <stdint.h>
> #include <stdlib.h>
> #include <unistd.h>
> #include <sys/mman.h>
>
> static const char zeroes[1024 * 1024];
> static const size_t FILE_SIZE = 2 * 1024 * 1024;
>
> int main(int argc, char **argv)
> {
> if (argc != 2) {
> fprintf(stderr, "usage: %s PATH\n", argv[0]);
> return EXIT_FAILURE;
> }
> int fd = creat(argv[1], 0777);
> if (fd == -1) {
> perror("creat");
> return EXIT_FAILURE;
> }
> size_t written = 0;
> while (written < FILE_SIZE) {
> ssize_t ret = write(fd, zeroes,
> sizeof(zeroes) < FILE_SIZE - written ?
> sizeof(zeroes) : FILE_SIZE - written);
> if (ret < 0) {
> perror("write");
> return EXIT_FAILURE;
> }
> written += ret;
> }
> close(fd);
> fd = open(argv[1], O_RDONLY);
> if (fd == -1) {
> perror("open");
> return EXIT_FAILURE;
> }
>
> /*
> * Reserve some address space so that we can align the file mapping to
> * the huge page size.
> */
> void *placeholder_map = mmap(NULL, FILE_SIZE * 2, PROT_NONE,
> MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> if (placeholder_map == MAP_FAILED) {
> perror("mmap (placeholder)");
> return EXIT_FAILURE;
> }
>
> void *aligned_address =
> (void *)(((uintptr_t)placeholder_map + FILE_SIZE - 1) & ~(FILE_SIZE - 1));
>
> void *map = mmap(aligned_address, FILE_SIZE, PROT_READ | PROT_EXEC,
> MAP_SHARED | MAP_FIXED, fd, 0);
> if (map == MAP_FAILED) {
> perror("mmap");
> return EXIT_FAILURE;
> }
> if (madvise(map, FILE_SIZE, MADV_HUGEPAGE) < 0) {
> perror("madvise");
> return EXIT_FAILURE;
> }
>
> char *line = NULL;
> size_t line_capacity = 0;
> FILE *smaps_file = fopen("/proc/self/smaps", "r");
> if (!smaps_file) {
> perror("fopen");
> return EXIT_FAILURE;
> }
> for (;;) {
> for (size_t off = 0; off < FILE_SIZE; off += 4096)
> ((volatile char *)map)[off];
>
> ssize_t ret;
> bool this_mapping = false;
> while ((ret = getline(&line, &line_capacity, smaps_file)) > 0) {
> unsigned long start, end, huge;
> if (sscanf(line, "%lx-%lx", &start, &end) == 2) {
> this_mapping = (start <= (uintptr_t)map &&
> (uintptr_t)map < end);
> } else if (this_mapping &&
> sscanf(line, "FilePmdMapped: %ld", &huge) == 1 &&
> huge > 0) {
> return EXIT_SUCCESS;
> }
> }
>
> sleep(6);
> rewind(smaps_file);
> fflush(smaps_file);
> }
> }
> $ ./create_thp_file huge
> $ btrfs fi defrag -czstd ./huge
>
> Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Thanks,
Josef
next prev parent reply other threads:[~2021-10-20 13:47 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-10-20 3:35 [PATCH v2] btrfs: fix deadlock when defragging transparent huge pages Omar Sandoval
2021-10-20 13:47 ` Josef Bacik [this message]
2021-10-21 16:28 ` David Sterba
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=YXAd+Y6Tp32RV2im@localhost.localdomain \
--to=josef@toxicpanda.com \
--cc=kernel-team@fb.com \
--cc=linux-btrfs@vger.kernel.org \
--cc=osandov@osandov.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox