From: Chandan Babu R <chandanrlinux@gmail.com>
To: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: linux-xfs@vger.kernel.org, hch@lst.de
Subject: Re: [PATCH 2/2] xfs: fix deadlock and streamline xfs_getfsmap performance
Date: Wed, 07 Oct 2020 14:10:36 +0530 [thread overview]
Message-ID: <5181946.o3KTjg5V7k@garuda> (raw)
In-Reply-To: <160192209677.2569942.16673759463630442919.stgit@magnolia>
On Monday 5 October 2020 11:51:36 PM IST Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> Refactor xfs_getfsmap to improve its performance: instead of indirectly
> calling a function that copies one record to userspace at a time, create
> a shadow buffer in the kernel and copy the whole array once at the end.
> On the author's computer, this reduces the runtime on his /home by ~20%.
>
> This also eliminates a deadlock when running GETFSMAP against the
> realtime device. The current code locks the rtbitmap to create
> fsmappings and copies them into userspace, having not released the
> rtbitmap lock. If the userspace buffer is an mmap of a sparse file that
> itself resides on the realtime device, the write page fault will recurse
> into the fs for allocation, which will deadlock on the rtbitmap lock.
>
The changes look good to me,
Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com>
> Fixes: 4c934c7dd60c ("xfs: report realtime space information via the rtbitmap")
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/xfs/xfs_fsmap.c | 45 +++++++++-------
> fs/xfs/xfs_fsmap.h | 6 --
> fs/xfs/xfs_ioctl.c | 146 +++++++++++++++++++++++++++++++++++-----------------
> 3 files changed, 125 insertions(+), 72 deletions(-)
>
>
> diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
> index aa36e7daf82c..9ce5e7d5bf8f 100644
> --- a/fs/xfs/xfs_fsmap.c
> +++ b/fs/xfs/xfs_fsmap.c
> @@ -26,7 +26,7 @@
> #include "xfs_rtalloc.h"
>
> /* Convert an xfs_fsmap to an fsmap. */
> -void
> +static void
> xfs_fsmap_from_internal(
> struct fsmap *dest,
> struct xfs_fsmap *src)
> @@ -155,8 +155,7 @@ xfs_fsmap_owner_from_rmap(
> /* getfsmap query state */
> struct xfs_getfsmap_info {
> struct xfs_fsmap_head *head;
> - xfs_fsmap_format_t formatter; /* formatting fn */
> - void *format_arg; /* format buffer */
> + struct fsmap *fsmap_recs; /* mapping records */
> struct xfs_buf *agf_bp; /* AGF, for refcount queries */
> xfs_daddr_t next_daddr; /* next daddr we expect */
> u64 missing_owner; /* owner of holes */
> @@ -224,6 +223,20 @@ xfs_getfsmap_is_shared(
> return 0;
> }
>
> +static inline void
> +xfs_getfsmap_format(
> + struct xfs_mount *mp,
> + struct xfs_fsmap *xfm,
> + struct xfs_getfsmap_info *info)
> +{
> + struct fsmap *rec;
> +
> + trace_xfs_getfsmap_mapping(mp, xfm);
> +
> + rec = &info->fsmap_recs[info->head->fmh_entries++];
> + xfs_fsmap_from_internal(rec, xfm);
> +}
> +
> /*
> * Format a reverse mapping for getfsmap, having translated rm_startblock
> * into the appropriate daddr units.
> @@ -288,10 +301,7 @@ xfs_getfsmap_helper(
> fmr.fmr_offset = 0;
> fmr.fmr_length = rec_daddr - info->next_daddr;
> fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
> - error = info->formatter(&fmr, info->format_arg);
> - if (error)
> - return error;
> - info->head->fmh_entries++;
> + xfs_getfsmap_format(mp, &fmr, info);
> }
>
> if (info->last)
> @@ -323,11 +333,8 @@ xfs_getfsmap_helper(
> if (shared)
> fmr.fmr_flags |= FMR_OF_SHARED;
> }
> - error = info->formatter(&fmr, info->format_arg);
> - if (error)
> - return error;
> - info->head->fmh_entries++;
>
> + xfs_getfsmap_format(mp, &fmr, info);
> out:
> rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
> if (info->next_daddr < rec_daddr)
> @@ -795,11 +802,11 @@ xfs_getfsmap_check_keys(
> #endif /* CONFIG_XFS_RT */
>
> /*
> - * Get filesystem's extents as described in head, and format for
> - * output. Calls formatter to fill the user's buffer until all
> - * extents are mapped, until the passed-in head->fmh_count slots have
> - * been filled, or until the formatter short-circuits the loop, if it
> - * is tracking filled-in extents on its own.
> + * Get filesystem's extents as described in head, and format for output. Fills
> + * in the supplied records array until there are no more reverse mappings to
> + * return or head.fmh_entries == head.fmh_count. In the second case, this
> + * function returns -ECANCELED to indicate that more records would have been
> + * returned.
> *
> * Key to Confusion
> * ----------------
> @@ -819,8 +826,7 @@ int
> xfs_getfsmap(
> struct xfs_mount *mp,
> struct xfs_fsmap_head *head,
> - xfs_fsmap_format_t formatter,
> - void *arg)
> + struct fsmap *fsmap_recs)
> {
> struct xfs_trans *tp = NULL;
> struct xfs_fsmap dkeys[2]; /* per-dev keys */
> @@ -895,8 +901,7 @@ xfs_getfsmap(
>
> info.next_daddr = head->fmh_keys[0].fmr_physical +
> head->fmh_keys[0].fmr_length;
> - info.formatter = formatter;
> - info.format_arg = arg;
> + info.fsmap_recs = fsmap_recs;
> info.head = head;
>
> /*
> diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h
> index c6c57739b862..a0775788e7b1 100644
> --- a/fs/xfs/xfs_fsmap.h
> +++ b/fs/xfs/xfs_fsmap.h
> @@ -27,13 +27,9 @@ struct xfs_fsmap_head {
> struct xfs_fsmap fmh_keys[2]; /* low and high keys */
> };
>
> -void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src);
> void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src);
>
> -/* fsmap to userspace formatter - copy to user & advance pointer */
> -typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *);
> -
> int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head,
> - xfs_fsmap_format_t formatter, void *arg);
> + struct fsmap *out_recs);
>
> #endif /* __XFS_FSMAP_H__ */
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index bca7659fb5c6..3fbd98f61ea5 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -1716,39 +1716,17 @@ xfs_ioc_getbmap(
> return error;
> }
>
> -struct getfsmap_info {
> - struct xfs_mount *mp;
> - struct fsmap_head __user *data;
> - unsigned int idx;
> - __u32 last_flags;
> -};
> -
> -STATIC int
> -xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv)
> -{
> - struct getfsmap_info *info = priv;
> - struct fsmap fm;
> -
> - trace_xfs_getfsmap_mapping(info->mp, xfm);
> -
> - info->last_flags = xfm->fmr_flags;
> - xfs_fsmap_from_internal(&fm, xfm);
> - if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm,
> - sizeof(struct fsmap)))
> - return -EFAULT;
> -
> - return 0;
> -}
> -
> STATIC int
> xfs_ioc_getfsmap(
> struct xfs_inode *ip,
> struct fsmap_head __user *arg)
> {
> - struct getfsmap_info info = { NULL };
> struct xfs_fsmap_head xhead = {0};
> struct fsmap_head head;
> - bool aborted = false;
> + struct fsmap *recs;
> + unsigned int count;
> + __u32 last_flags = 0;
> + bool done = false;
> int error;
>
> if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
> @@ -1760,38 +1738,112 @@ xfs_ioc_getfsmap(
> sizeof(head.fmh_keys[1].fmr_reserved)))
> return -EINVAL;
>
> + /*
> + * Use an internal memory buffer so that we don't have to copy fsmap
> + * data to userspace while holding locks. Start by trying to allocate
> + * up to 128k for the buffer, but fall back to a single page if needed.
> + */
> + count = min_t(unsigned int, head.fmh_count,
> + 131072 / sizeof(struct fsmap));
> + recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL);
> + if (!recs) {
> + count = min_t(unsigned int, head.fmh_count,
> + PAGE_SIZE / sizeof(struct fsmap));
> + recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL);
> + if (!recs)
> + return -ENOMEM;
> + }
> +
> xhead.fmh_iflags = head.fmh_iflags;
> - xhead.fmh_count = head.fmh_count;
> xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
> xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
>
> trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
> trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
>
> - info.mp = ip->i_mount;
> - info.data = arg;
> - error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
> - if (error == -ECANCELED) {
> - error = 0;
> - aborted = true;
> - } else if (error)
> - return error;
> -
> - /* If we didn't abort, set the "last" flag in the last fmx */
> - if (!aborted && info.idx) {
> - info.last_flags |= FMR_OF_LAST;
> - if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags,
> - &info.last_flags, sizeof(info.last_flags)))
> - return -EFAULT;
> + head.fmh_entries = 0;
> + do {
> + struct fsmap __user *user_recs;
> + struct fsmap *last_rec;
> +
> + user_recs = &arg->fmh_recs[head.fmh_entries];
> + xhead.fmh_entries = 0;
> + xhead.fmh_count = min_t(unsigned int, count,
> + head.fmh_count - head.fmh_entries);
> +
> + /* Run query, record how many entries we got. */
> + error = xfs_getfsmap(ip->i_mount, &xhead, recs);
> + switch (error) {
> + case 0:
> + /*
> + * There are no more records in the result set. Copy
> + * whatever we got to userspace and break out.
> + */
> + done = true;
> + break;
> + case -ECANCELED:
> + /*
> + * The internal memory buffer is full. Copy whatever
> + * records we got to userspace and go again if we have
> + * not yet filled the userspace buffer.
> + */
> + error = 0;
> + break;
> + default:
> + goto out_free;
> + }
> + head.fmh_entries += xhead.fmh_entries;
> + head.fmh_oflags = xhead.fmh_oflags;
> +
> + /*
> + * If the caller wanted a record count or there aren't any
> + * new records to return, we're done.
> + */
> + if (head.fmh_count == 0 || xhead.fmh_entries == 0)
> + break;
> +
> + /* Copy all the records we got out to userspace. */
> + if (copy_to_user(user_recs, recs,
> + xhead.fmh_entries * sizeof(struct fsmap))) {
> + error = -EFAULT;
> + goto out_free;
> + }
> +
> + /* Remember the last record flags we copied to userspace. */
> + last_rec = &recs[xhead.fmh_entries - 1];
> + last_flags = last_rec->fmr_flags;
> +
> + /* Set up the low key for the next iteration. */
> + xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec);
> + trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
> + } while (!done && head.fmh_entries < head.fmh_count);
> +
> + /*
> + * If there are no more records in the query result set and we're not
> + * in counting mode, mark the last record returned with the LAST flag.
> + */
> + if (done && head.fmh_count > 0 && head.fmh_entries > 0) {
> + struct fsmap __user *user_rec;
> +
> + last_flags |= FMR_OF_LAST;
> + user_rec = &arg->fmh_recs[head.fmh_entries - 1];
> +
> + if (copy_to_user(&user_rec->fmr_flags, &last_flags,
> + sizeof(last_flags))) {
> + error = -EFAULT;
> + goto out_free;
> + }
> }
>
> /* copy back header */
> - head.fmh_entries = xhead.fmh_entries;
> - head.fmh_oflags = xhead.fmh_oflags;
> - if (copy_to_user(arg, &head, sizeof(struct fsmap_head)))
> - return -EFAULT;
> + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) {
> + error = -EFAULT;
> + goto out_free;
> + }
>
> - return 0;
> +out_free:
> + kmem_free(recs);
> + return error;
> }
>
> STATIC int
>
>
--
chandan
prev parent reply other threads:[~2020-10-07 8:40 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-10-05 18:21 [PATCH v2 0/2] xfs: a few fixes and cleanups to GETFSMAP Darrick J. Wong
2020-10-05 18:21 ` [PATCH 1/2] xfs: limit entries returned when counting fsmap records Darrick J. Wong
2020-10-05 18:21 ` [PATCH 2/2] xfs: fix deadlock and streamline xfs_getfsmap performance Darrick J. Wong
2020-10-06 6:26 ` Christoph Hellwig
2020-10-07 8:40 ` Chandan Babu R [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=5181946.o3KTjg5V7k@garuda \
--to=chandanrlinux@gmail.com \
--cc=darrick.wong@oracle.com \
--cc=hch@lst.de \
--cc=linux-xfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox