From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from userp1040.oracle.com ([156.151.31.81]:32056 "EHLO userp1040.oracle.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752229AbdG1Se1 (ORCPT ); Fri, 28 Jul 2017 14:34:27 -0400 Date: Fri, 28 Jul 2017 11:33:14 -0600 From: Liu Bo To: josef@toxicpanda.com Cc: linux-btrfs@vger.kernel.org, kernel-team@fb.com, Josef Bacik , jeffm@suse.com Subject: Re: [PATCH 1/2][v2] btrfs: fix readdir deadlock with pagefault Message-ID: <20170728173314.GA15815@localhost.localdomain> Reply-To: bo.li.liu@oracle.com References: <1500923666-29670-1-git-send-email-jbacik@fb.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii In-Reply-To: <1500923666-29670-1-git-send-email-jbacik@fb.com> Sender: linux-btrfs-owner@vger.kernel.org List-ID: On Mon, Jul 24, 2017 at 03:14:25PM -0400, josef@toxicpanda.com wrote: > From: Josef Bacik > > Readdir does dir_emit while under the btree lock. dir_emit can trigger > the page fault which means we can deadlock. Fix this by allocating a > buffer on opening a directory and copying the readdir into this buffer > and doing dir_emit from outside of the tree lock. > > Signed-off-by: Josef Bacik > --- > v1->v2: > - use kzalloc instead of alloc_page(). > - make struct btrfs_file_private so you can still start a userspace trans on a > directory. > > fs/btrfs/ctree.h | 5 +++ > fs/btrfs/file.c | 9 ++++- > fs/btrfs/inode.c | 107 +++++++++++++++++++++++++++++++++++++++++-------------- > fs/btrfs/ioctl.c | 19 ++++++---- > 4 files changed, 107 insertions(+), 33 deletions(-) > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index 5ee9f10..33e942b 100644 > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -1264,6 +1264,11 @@ struct btrfs_root { > atomic64_t qgroup_meta_rsv; > }; > > +struct btrfs_file_private { > + struct btrfs_trans_handle *trans; > + void *filldir_buf; > +}; > + > static inline u32 btrfs_inode_sectorsize(const struct inode *inode) > { > return btrfs_sb(inode->i_sb)->sectorsize; > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c > index 0f102a1..1897c3b 100644 > --- a/fs/btrfs/file.c > +++ b/fs/btrfs/file.c > @@ -1973,8 +1973,15 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, > > int btrfs_release_file(struct inode *inode, struct file *filp) > { > - if (filp->private_data) > + struct btrfs_file_private *private = filp->private_data; > + > + if (private && private->trans) > btrfs_ioctl_trans_end(filp); > + if (private && private->filldir_buf) > + kfree(private->filldir_buf); > + kfree(private); > + filp->private_data = NULL; > + > /* > * ordered_data_close is set by settattr when we are about to truncate > * a file from a non-zero size to a zero size. This tries to > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index 9a4413a..bbdbeea 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -5877,25 +5877,73 @@ unsigned char btrfs_filetype_table[] = { > DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK > }; > > +/* > + * All this infrastructure exists because dir_emit can fault, and we are holding > + * the tree lock when doing readdir. For now just allocate a buffer and copy > + * our information into that, and then dir_emit from the buffer. This is > + * similar to what NFS does, only we don't keep the buffer around in pagecache > + * because I'm afraid I'll fuck that up. Long term we need to make filldir do > + * copy_to_user_inatomic so we don't have to worry about page faulting under the > + * tree lock. > + */ > +static int btrfs_opendir(struct inode *inode, struct file *file) > +{ > + struct btrfs_file_private *private; > + > + private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); > + if (!private) > + return -ENOMEM; > + private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); > + if (!private->filldir_buf) { > + kfree(private); > + return -ENOMEM; > + } > + file->private_data = private; > + return 0; > +} > + > +struct dir_entry { > + u64 ino; > + u64 offset; > + unsigned type; > + int name_len; > +}; > + > +static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) > +{ > + while (entries--) { > + struct dir_entry *entry = addr; > + char *name = (char *)(entry + 1); > + ctx->pos = entry->offset; > + if (!dir_emit(ctx, name, entry->name_len, entry->ino, > + entry->type)) > + return 1; > + addr += sizeof(struct dir_entry) + entry->name_len; > + ctx->pos++; > + } > + return 0; > +} > + > static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) > { > struct inode *inode = file_inode(file); > struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); > struct btrfs_root *root = BTRFS_I(inode)->root; > + struct btrfs_file_private *private = file->private_data; > struct btrfs_dir_item *di; > struct btrfs_key key; > struct btrfs_key found_key; > struct btrfs_path *path; > + void *addr; > struct list_head ins_list; > struct list_head del_list; > int ret; > struct extent_buffer *leaf; > int slot; > - unsigned char d_type; > - int over = 0; > - char tmp_name[32]; > char *name_ptr; > int name_len; > + int entries = 0; > + int total_len = 0; > bool put = false; > struct btrfs_key location; > > @@ -5906,12 +5954,14 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) > if (!path) > return -ENOMEM; > > + addr = private->filldir_buf; > path->reada = READA_FORWARD; > > INIT_LIST_HEAD(&ins_list); > INIT_LIST_HEAD(&del_list); > put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list); > > +again: > key.type = BTRFS_DIR_INDEX_KEY; > key.offset = ctx->pos; > key.objectid = btrfs_ino(BTRFS_I(inode)); > @@ -5921,6 +5971,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) > goto err; > > while (1) { > + struct dir_entry *entry; > leaf = path->nodes[0]; > slot = path->slots[0]; > if (slot >= btrfs_header_nritems(leaf)) { > @@ -5942,41 +5993,44 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) > goto next; > if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) > goto next; > - > - ctx->pos = found_key.offset; > - > di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); > if (verify_dir_item(fs_info, leaf, slot, di)) > goto next; > > name_len = btrfs_dir_name_len(leaf, di); > - if (name_len <= sizeof(tmp_name)) { > - name_ptr = tmp_name; > - } else { > - name_ptr = kmalloc(name_len, GFP_KERNEL); > - if (!name_ptr) { > - ret = -ENOMEM; > - goto err; > - } > + if ((total_len + sizeof(struct dir_entry) + name_len) >= > + PAGE_SIZE) { > + btrfs_release_path(path); > + ret = btrfs_filldir(private->filldir_buf, entries, > + ctx); > + if (ret) > + goto nopos; > + addr = private->filldir_buf; > + entries = 0; > + total_len = 0; > + goto again; > } > + > + entry = addr; > + entry->name_len = name_len; > + name_ptr = (char *)(entry + 1); > read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), > name_len); > - > - d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; > + entry->type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; > btrfs_dir_item_key_to_cpu(leaf, di, &location); > - > - over = !dir_emit(ctx, name_ptr, name_len, location.objectid, > - d_type); > - > - if (name_ptr != tmp_name) > - kfree(name_ptr); > - > - if (over) > - goto nopos; > - ctx->pos++; > + entry->ino = location.objectid; > + entry->offset = found_key.offset; > + entries++; > + addr += sizeof(struct dir_entry) + name_len; > + total_len += sizeof(struct dir_entry) + name_len; To Jeff, As you've fixed a loop bug in getdents (commit d2fbb2b589ece9060635b43c2b2333d0b0a0fbf2 btrfs: increment ctx->pos for every emitted or skipped dirent in readdir), could you please check if this patch is OK with that bug? (It looks good to me.) > next: > path->slots[0]++; > } > + btrfs_release_path(path); > + > + ret = btrfs_filldir(private->filldir_buf, entries, ctx); > + if (ret) > + goto nopos; > > ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); > if (ret) > @@ -10777,6 +10831,7 @@ static const struct file_operations btrfs_dir_file_operations = { > .llseek = generic_file_llseek, > .read = generic_read_dir, > .iterate_shared = btrfs_real_readdir, > + .open = btrfs_opendir, > .unlocked_ioctl = btrfs_ioctl, > #ifdef CONFIG_COMPAT > .compat_ioctl = btrfs_compat_ioctl, > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c > index bedeec6..6bb1054 100644 > --- a/fs/btrfs/ioctl.c > +++ b/fs/btrfs/ioctl.c > @@ -3966,6 +3966,7 @@ static long btrfs_ioctl_trans_start(struct file *file) > struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); > struct btrfs_root *root = BTRFS_I(inode)->root; > struct btrfs_trans_handle *trans; > + struct btrfs_file_private *private; > int ret; > > ret = -EPERM; > @@ -3973,8 +3974,15 @@ static long btrfs_ioctl_trans_start(struct file *file) > goto out; > > ret = -EINPROGRESS; > - if (file->private_data) > + private = file->private_data; > + if (private && private->trans) > goto out; > + if (!private) { > + private = kzalloc(sizeof(struct btrfs_file_private), > + GFP_KERNEL); > + if (!private) > + return -ENOMEM; > + } > Missing 'private->trans = trans'? Thanks, -liubo > ret = -EROFS; > if (btrfs_root_readonly(root)) > @@ -4246,14 +4254,13 @@ long btrfs_ioctl_trans_end(struct file *file) > { > struct inode *inode = file_inode(file); > struct btrfs_root *root = BTRFS_I(inode)->root; > - struct btrfs_trans_handle *trans; > + struct btrfs_file_private *private = file->private_data; > > - trans = file->private_data; > - if (!trans) > + if (!private || !private->trans) > return -EINVAL; > - file->private_data = NULL; > > - btrfs_end_transaction(trans); > + btrfs_end_transaction(private->trans); > + private->trans = NULL; > > atomic_dec(&root->fs_info->open_ioctl_trans); > > -- > 2.7.4 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html