From: Liu Bo <bo.li.liu@oracle.com>
To: josef@toxicpanda.com
Cc: linux-btrfs@vger.kernel.org, kernel-team@fb.com,
Josef Bacik <jbacik@fb.com>,
jeffm@suse.com
Subject: Re: [PATCH 1/2][v2] btrfs: fix readdir deadlock with pagefault
Date: Fri, 28 Jul 2017 11:33:14 -0600 [thread overview]
Message-ID: <20170728173314.GA15815@localhost.localdomain> (raw)
In-Reply-To: <1500923666-29670-1-git-send-email-jbacik@fb.com>
On Mon, Jul 24, 2017 at 03:14:25PM -0400, josef@toxicpanda.com wrote:
> From: Josef Bacik <jbacik@fb.com>
>
> Readdir does dir_emit while under the btree lock. dir_emit can trigger
> the page fault which means we can deadlock. Fix this by allocating a
> buffer on opening a directory and copying the readdir into this buffer
> and doing dir_emit from outside of the tree lock.
>
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
> v1->v2:
> - use kzalloc instead of alloc_page().
> - make struct btrfs_file_private so you can still start a userspace trans on a
> directory.
>
> fs/btrfs/ctree.h | 5 +++
> fs/btrfs/file.c | 9 ++++-
> fs/btrfs/inode.c | 107 +++++++++++++++++++++++++++++++++++++++++--------------
> fs/btrfs/ioctl.c | 19 ++++++----
> 4 files changed, 107 insertions(+), 33 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 5ee9f10..33e942b 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1264,6 +1264,11 @@ struct btrfs_root {
> atomic64_t qgroup_meta_rsv;
> };
>
> +struct btrfs_file_private {
> + struct btrfs_trans_handle *trans;
> + void *filldir_buf;
> +};
> +
> static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
> {
> return btrfs_sb(inode->i_sb)->sectorsize;
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 0f102a1..1897c3b 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1973,8 +1973,15 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
>
> int btrfs_release_file(struct inode *inode, struct file *filp)
> {
> - if (filp->private_data)
> + struct btrfs_file_private *private = filp->private_data;
> +
> + if (private && private->trans)
> btrfs_ioctl_trans_end(filp);
> + if (private && private->filldir_buf)
> + kfree(private->filldir_buf);
> + kfree(private);
> + filp->private_data = NULL;
> +
> /*
> * ordered_data_close is set by settattr when we are about to truncate
> * a file from a non-zero size to a zero size. This tries to
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 9a4413a..bbdbeea 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -5877,25 +5877,73 @@ unsigned char btrfs_filetype_table[] = {
> DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
> };
>
> +/*
> + * All this infrastructure exists because dir_emit can fault, and we are holding
> + * the tree lock when doing readdir. For now just allocate a buffer and copy
> + * our information into that, and then dir_emit from the buffer. This is
> + * similar to what NFS does, only we don't keep the buffer around in pagecache
> + * because I'm afraid I'll fuck that up. Long term we need to make filldir do
> + * copy_to_user_inatomic so we don't have to worry about page faulting under the
> + * tree lock.
> + */
> +static int btrfs_opendir(struct inode *inode, struct file *file)
> +{
> + struct btrfs_file_private *private;
> +
> + private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
> + if (!private)
> + return -ENOMEM;
> + private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
> + if (!private->filldir_buf) {
> + kfree(private);
> + return -ENOMEM;
> + }
> + file->private_data = private;
> + return 0;
> +}
> +
> +struct dir_entry {
> + u64 ino;
> + u64 offset;
> + unsigned type;
> + int name_len;
> +};
> +
> +static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
> +{
> + while (entries--) {
> + struct dir_entry *entry = addr;
> + char *name = (char *)(entry + 1);
> + ctx->pos = entry->offset;
> + if (!dir_emit(ctx, name, entry->name_len, entry->ino,
> + entry->type))
> + return 1;
> + addr += sizeof(struct dir_entry) + entry->name_len;
> + ctx->pos++;
> + }
> + return 0;
> +}
> +
> static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
> {
> struct inode *inode = file_inode(file);
> struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> struct btrfs_root *root = BTRFS_I(inode)->root;
> + struct btrfs_file_private *private = file->private_data;
> struct btrfs_dir_item *di;
> struct btrfs_key key;
> struct btrfs_key found_key;
> struct btrfs_path *path;
> + void *addr;
> struct list_head ins_list;
> struct list_head del_list;
> int ret;
> struct extent_buffer *leaf;
> int slot;
> - unsigned char d_type;
> - int over = 0;
> - char tmp_name[32];
> char *name_ptr;
> int name_len;
> + int entries = 0;
> + int total_len = 0;
> bool put = false;
> struct btrfs_key location;
>
> @@ -5906,12 +5954,14 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
> if (!path)
> return -ENOMEM;
>
> + addr = private->filldir_buf;
> path->reada = READA_FORWARD;
>
> INIT_LIST_HEAD(&ins_list);
> INIT_LIST_HEAD(&del_list);
> put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
>
> +again:
> key.type = BTRFS_DIR_INDEX_KEY;
> key.offset = ctx->pos;
> key.objectid = btrfs_ino(BTRFS_I(inode));
> @@ -5921,6 +5971,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
> goto err;
>
> while (1) {
> + struct dir_entry *entry;
> leaf = path->nodes[0];
> slot = path->slots[0];
> if (slot >= btrfs_header_nritems(leaf)) {
> @@ -5942,41 +5993,44 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
> goto next;
> if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
> goto next;
> -
> - ctx->pos = found_key.offset;
> -
> di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
> if (verify_dir_item(fs_info, leaf, slot, di))
> goto next;
>
> name_len = btrfs_dir_name_len(leaf, di);
> - if (name_len <= sizeof(tmp_name)) {
> - name_ptr = tmp_name;
> - } else {
> - name_ptr = kmalloc(name_len, GFP_KERNEL);
> - if (!name_ptr) {
> - ret = -ENOMEM;
> - goto err;
> - }
> + if ((total_len + sizeof(struct dir_entry) + name_len) >=
> + PAGE_SIZE) {
> + btrfs_release_path(path);
> + ret = btrfs_filldir(private->filldir_buf, entries,
> + ctx);
> + if (ret)
> + goto nopos;
> + addr = private->filldir_buf;
> + entries = 0;
> + total_len = 0;
> + goto again;
> }
> +
> + entry = addr;
> + entry->name_len = name_len;
> + name_ptr = (char *)(entry + 1);
> read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
> name_len);
> -
> - d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
> + entry->type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
> btrfs_dir_item_key_to_cpu(leaf, di, &location);
> -
> - over = !dir_emit(ctx, name_ptr, name_len, location.objectid,
> - d_type);
> -
> - if (name_ptr != tmp_name)
> - kfree(name_ptr);
> -
> - if (over)
> - goto nopos;
> - ctx->pos++;
> + entry->ino = location.objectid;
> + entry->offset = found_key.offset;
> + entries++;
> + addr += sizeof(struct dir_entry) + name_len;
> + total_len += sizeof(struct dir_entry) + name_len;
To Jeff,
As you've fixed a loop bug in getdents
(commit d2fbb2b589ece9060635b43c2b2333d0b0a0fbf2 btrfs: increment ctx->pos for every emitted or skipped dirent in readdir),
could you please check if this patch is OK with that bug?
(It looks good to me.)
> next:
> path->slots[0]++;
> }
> + btrfs_release_path(path);
> +
> + ret = btrfs_filldir(private->filldir_buf, entries, ctx);
> + if (ret)
> + goto nopos;
>
> ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
> if (ret)
> @@ -10777,6 +10831,7 @@ static const struct file_operations btrfs_dir_file_operations = {
> .llseek = generic_file_llseek,
> .read = generic_read_dir,
> .iterate_shared = btrfs_real_readdir,
> + .open = btrfs_opendir,
> .unlocked_ioctl = btrfs_ioctl,
> #ifdef CONFIG_COMPAT
> .compat_ioctl = btrfs_compat_ioctl,
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index bedeec6..6bb1054 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -3966,6 +3966,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
> struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> struct btrfs_root *root = BTRFS_I(inode)->root;
> struct btrfs_trans_handle *trans;
> + struct btrfs_file_private *private;
> int ret;
>
> ret = -EPERM;
> @@ -3973,8 +3974,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
> goto out;
>
> ret = -EINPROGRESS;
> - if (file->private_data)
> + private = file->private_data;
> + if (private && private->trans)
> goto out;
> + if (!private) {
> + private = kzalloc(sizeof(struct btrfs_file_private),
> + GFP_KERNEL);
> + if (!private)
> + return -ENOMEM;
> + }
>
Missing 'private->trans = trans'?
Thanks,
-liubo
> ret = -EROFS;
> if (btrfs_root_readonly(root))
> @@ -4246,14 +4254,13 @@ long btrfs_ioctl_trans_end(struct file *file)
> {
> struct inode *inode = file_inode(file);
> struct btrfs_root *root = BTRFS_I(inode)->root;
> - struct btrfs_trans_handle *trans;
> + struct btrfs_file_private *private = file->private_data;
>
> - trans = file->private_data;
> - if (!trans)
> + if (!private || !private->trans)
> return -EINVAL;
> - file->private_data = NULL;
>
> - btrfs_end_transaction(trans);
> + btrfs_end_transaction(private->trans);
> + private->trans = NULL;
>
> atomic_dec(&root->fs_info->open_ioctl_trans);
>
> --
> 2.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
prev parent reply other threads:[~2017-07-28 18:34 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-07-24 19:14 [PATCH 1/2][v2] btrfs: fix readdir deadlock with pagefault josef
2017-07-24 19:14 ` [PATCH 2/2] btrfs: increase ctx->pos for delayed dir index josef
2017-07-28 17:33 ` Liu Bo
2017-07-28 15:27 ` [PATCH 1/2][v2] btrfs: fix readdir deadlock with pagefault David Sterba
2017-07-28 17:33 ` Liu Bo [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20170728173314.GA15815@localhost.localdomain \
--to=bo.li.liu@oracle.com \
--cc=jbacik@fb.com \
--cc=jeffm@suse.com \
--cc=josef@toxicpanda.com \
--cc=kernel-team@fb.com \
--cc=linux-btrfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).