From: "Darrick J. Wong" <djwong@kernel.org>
To: Wengang Wang <wen.gang.wang@oracle.com>
Cc: linux-xfs@vger.kernel.org
Subject: Re: [PATCH 6/9] spaceman/defrag: workaround kernel xfs_reflink_try_clear_inode_flag()
Date: Tue, 9 Jul 2024 13:51:21 -0700 [thread overview]
Message-ID: <20240709205121.GV612460@frogsfrogsfrogs> (raw)
In-Reply-To: <20240709191028.2329-7-wen.gang.wang@oracle.com>
On Tue, Jul 09, 2024 at 12:10:25PM -0700, Wengang Wang wrote:
> xfs_reflink_try_clear_inode_flag() takes a very long time when a file has a
> huge number of extents and none of the extents are shared.
>
> Workaround:
> Share the first real extent so that xfs_reflink_try_clear_inode_flag() returns
> quickly, which saves CPU time and speeds up defrag significantly.
I wonder if a better solution would be to change xfs_reflink_unshare() so
that it only tries to clear the reflink iflag when offset/len cover the
entire file?  It's a pity we can't set time budgets on fallocate requests.
--D
> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> ---
> spaceman/defrag.c | 174 +++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 172 insertions(+), 2 deletions(-)
>
> diff --git a/spaceman/defrag.c b/spaceman/defrag.c
> index f8e6713c..b5c5b187 100644
> --- a/spaceman/defrag.c
> +++ b/spaceman/defrag.c
> @@ -327,6 +327,155 @@ defrag_fs_limit_hit(int fd)
> return statfs_s.f_bsize * statfs_s.f_bavail < g_limit_free_bytes;
> }
>
> +static bool g_enable_first_ext_share = true;
> +
> +static int
> +defrag_get_first_real_ext(int fd, struct getbmapx *mapx)
> +{
> + int err;
> +
> + while (1) {
> + err = defrag_get_next_extent(fd, mapx);
> + if (err)
> + break;
> +
> + defrag_move_next_extent();
> + if (!(mapx->bmv_oflags & BMV_OF_PREALLOC))
> + break;
> + }
> + return err;
> +}
> +
> +static __u64 g_share_offset = -1ULL;
> +static __u64 g_share_len = 0ULL;
> +#define SHARE_MAX_SIZE 32768 /* 32KiB */
> +
> +/* share the first real extent with scrach */
> +static void
> +defrag_share_first_extent(int defrag_fd, int scratch_fd)
> +{
> +#define OFFSET_1PB 0x4000000000000LL
> + struct file_clone_range clone;
> + struct getbmapx mapx;
> + int err;
> +
> + if (g_enable_first_ext_share == false)
> + return;
> +
> + err = defrag_get_first_real_ext(defrag_fd, &mapx);
> + if (err)
> + return;
> +
> + clone.src_fd = defrag_fd;
> + clone.src_offset = mapx.bmv_offset * 512;
> + clone.src_length = mapx.bmv_length * 512;
> + /* shares at most SHARE_MAX_SIZE length */
> + if (clone.src_length > SHARE_MAX_SIZE)
> + clone.src_length = SHARE_MAX_SIZE;
> + clone.dest_offset = OFFSET_1PB + clone.src_offset;
> + /* if the first is extent is reaching the EoF, no need to share */
> + if (clone.src_offset + clone.src_length >= g_defrag_file_size)
> + return;
> + err = ioctl(scratch_fd, FICLONERANGE, &clone);
> + if (err != 0) {
> + fprintf(stderr, "cloning first extent failed: %s\n",
> + strerror(errno));
> + return;
> + }
> +
> + /* safe the offset and length for re-share */
> + g_share_offset = clone.src_offset;
> + g_share_len = clone.src_length;
> +}
> +
> +/* re-share the blocks we shared previous if then are no longer shared */
> +static void
> +defrag_reshare_blocks_in_front(int defrag_fd, int scratch_fd)
> +{
> +#define NR_GET_EXT 9
> + struct getbmapx mapx[NR_GET_EXT];
> + struct file_clone_range clone;
> + __u64 new_share_len;
> + int idx, err;
> +
> + if (g_enable_first_ext_share == false)
> + return;
> +
> + if (g_share_len == 0ULL)
> + return;
> +
> + /*
> + * check if previous shareing still exist
> + * we are done if (partially) so.
> + */
> + mapx[0].bmv_offset = g_share_offset;
> + mapx[0].bmv_length = g_share_len;
> + mapx[0].bmv_count = NR_GET_EXT;
> + mapx[0].bmv_iflags = BMV_IF_NO_HOLES | BMV_IF_PREALLOC;
> + err = ioctl(defrag_fd, XFS_IOC_GETBMAPX, mapx);
> + if (err) {
> + fprintf(stderr, "XFS_IOC_GETBMAPX failed %s\n",
> + strerror(errno));
> + /* won't try share again */
> + g_share_len = 0ULL;
> + return;
> + }
> +
> + if (mapx[0].bmv_entries == 0) {
> + /* shared blocks all became hole, won't try share again */
> + g_share_len = 0ULL;
> + return;
> + }
> +
> + if (g_share_offset != 512 * mapx[1].bmv_offset) {
> + /* first shared block became hole, won't try share again */
> + g_share_len = 0ULL;
> + return;
> + }
> +
> + /* we check up to only the first NR_GET_EXT - 1 extents */
> + for (idx = 1; idx <= mapx[0].bmv_entries; idx++) {
> + if (mapx[idx].bmv_oflags & BMV_OF_SHARED) {
> + /* some blocks still shared, done */
> + return;
> + }
> + }
> +
> + /*
> + * The previously shared blocks are no longer shared, re-share.
> + * deallocate the blocks in scrath file first
> + */
> + err = fallocate(scratch_fd,
> + FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
> + OFFSET_1PB + g_share_offset, g_share_len);
> + if (err != 0) {
> + fprintf(stderr, "punch hole failed %s\n",
> + strerror(errno));
> + g_share_len = 0;
> + return;
> + }
> +
> + new_share_len = 512 * mapx[1].bmv_length;
> + if (new_share_len > SHARE_MAX_SIZE)
> + new_share_len = SHARE_MAX_SIZE;
> +
> + clone.src_fd = defrag_fd;
> + /* keep starting offset unchanged */
> + clone.src_offset = g_share_offset;
> + clone.src_length = new_share_len;
> + clone.dest_offset = OFFSET_1PB + clone.src_offset;
> +
> + err = ioctl(scratch_fd, FICLONERANGE, &clone);
> + if (err) {
> + fprintf(stderr, "FICLONERANGE failed %s\n",
> + strerror(errno));
> + g_share_len = 0;
> + return;
> + }
> +
> + g_share_len = new_share_len;
> + }
> +
> /*
> * defragment a file
> * return 0 if successfully done, 1 otherwise
> @@ -377,6 +526,12 @@ defrag_xfs_defrag(char *file_path) {
>
> signal(SIGINT, defrag_sigint_handler);
>
> + /*
> + * share the first extent to work around kernel consuming time
> + * in xfs_reflink_try_clear_inode_flag()
> + */
> + defrag_share_first_extent(defrag_fd, scratch_fd);
> +
> do {
> struct timeval t_clone, t_unshare, t_punch_hole;
> struct defrag_segment segment;
> @@ -454,6 +609,15 @@ defrag_xfs_defrag(char *file_path) {
> if (time_delta > max_unshare_us)
> max_unshare_us = time_delta;
>
> + /*
> + * if unshare used more than 1 second, time is very possibly
> + * used in checking if the file is sharing extents now.
> + * to avoid that happen again we re-share the blocks in front
> + * to workaround that.
> + */
> + if (time_delta > 1000000)
> + defrag_reshare_blocks_in_front(defrag_fd, scratch_fd);
> +
> /*
> * Punch out the original extents we shared to the
> * scratch file so they are returned to free space.
> @@ -514,6 +678,8 @@ static void defrag_help(void)
> " -f free_space -- specify shrethod of the XFS free space in MiB, when\n"
> " XFS free space is lower than that, shared segments \n"
> " are excluded from defragmentation, 1024 by default\n"
> +" -n -- disable the \"share first extent\" featue, it's\n"
> +" enabled by default to speed up\n"
> ));
> }
>
> @@ -525,7 +691,7 @@ defrag_f(int argc, char **argv)
> int i;
> int c;
>
> - while ((c = getopt(argc, argv, "s:f:")) != EOF) {
> + while ((c = getopt(argc, argv, "s:f:n")) != EOF) {
> switch(c) {
> case 's':
> g_segment_size_lmt = atoi(optarg) * 1024 * 1024 / 512;
> @@ -539,6 +705,10 @@ defrag_f(int argc, char **argv)
> g_limit_free_bytes = atol(optarg) * 1024 * 1024;
> break;
>
> + case 'n':
> + g_enable_first_ext_share = false;
> + break;
> +
> default:
> command_usage(&defrag_cmd);
> return 1;
> @@ -556,7 +726,7 @@ void defrag_init(void)
> defrag_cmd.cfunc = defrag_f;
> defrag_cmd.argmin = 0;
> defrag_cmd.argmax = 4;
> - defrag_cmd.args = "[-s segment_size] [-f free_space]";
> + defrag_cmd.args = "[-s segment_size] [-f free_space] [-n]";
> defrag_cmd.flags = CMD_FLAG_ONESHOT;
> defrag_cmd.oneline = _("Defragment XFS files");
> defrag_cmd.help = defrag_help;
> --
> 2.39.3 (Apple Git-146)
>
>
next prev parent reply other threads:[~2024-07-09 20:51 UTC|newest]
Thread overview: 60+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-07-09 19:10 [PATCH 0/9] introduce defrag to xfs_spaceman Wengang Wang
2024-07-09 19:10 ` [PATCH 1/9] xfsprogs: introduce defrag command to spaceman Wengang Wang
2024-07-09 21:18 ` Darrick J. Wong
2024-07-11 21:54 ` Wengang Wang
2024-07-15 21:30 ` Wengang Wang
2024-07-15 22:44 ` Darrick J. Wong
2024-07-09 19:10 ` [PATCH 2/9] spaceman/defrag: pick up segments from target file Wengang Wang
2024-07-09 21:50 ` [PATCH 2/9] spaceman/defrag: pick up segments from target fileOM Darrick J. Wong
2024-07-11 22:37 ` Wengang Wang
2024-07-15 23:40 ` [PATCH 2/9] spaceman/defrag: pick up segments from target file Dave Chinner
2024-07-16 20:23 ` Wengang Wang
2024-07-17 4:11 ` Dave Chinner
2024-07-18 19:03 ` Wengang Wang
2024-07-19 4:59 ` Dave Chinner
2024-07-19 4:01 ` Christoph Hellwig
2024-07-24 19:22 ` Wengang Wang
2024-07-30 22:13 ` Dave Chinner
2024-07-09 19:10 ` [PATCH 3/9] spaceman/defrag: defrag segments Wengang Wang
2024-07-09 21:57 ` Darrick J. Wong
2024-07-11 22:49 ` Wengang Wang
2024-07-12 19:07 ` Wengang Wang
2024-07-15 22:42 ` Darrick J. Wong
2024-07-16 0:08 ` Dave Chinner
2024-07-18 18:06 ` Wengang Wang
2024-07-09 19:10 ` [PATCH 4/9] spaceman/defrag: ctrl-c handler Wengang Wang
2024-07-09 21:08 ` Darrick J. Wong
2024-07-11 22:58 ` Wengang Wang
2024-07-15 22:56 ` Darrick J. Wong
2024-07-16 16:21 ` Wengang Wang
2024-07-09 19:10 ` [PATCH 5/9] spaceman/defrag: exclude shared segments on low free space Wengang Wang
2024-07-09 21:05 ` Darrick J. Wong
2024-07-11 23:08 ` Wengang Wang
2024-07-15 22:58 ` Darrick J. Wong
2024-07-09 19:10 ` [PATCH 6/9] spaceman/defrag: workaround kernel xfs_reflink_try_clear_inode_flag() Wengang Wang
2024-07-09 20:51 ` Darrick J. Wong [this message]
2024-07-11 23:11 ` Wengang Wang
2024-07-16 0:25 ` Dave Chinner
2024-07-18 18:24 ` Wengang Wang
2024-07-31 22:25 ` Dave Chinner
2024-07-09 19:10 ` [PATCH 7/9] spaceman/defrag: sleeps between segments Wengang Wang
2024-07-09 20:46 ` Darrick J. Wong
2024-07-11 23:26 ` Wengang Wang
2024-07-11 23:30 ` Wengang Wang
2024-07-09 19:10 ` [PATCH 8/9] spaceman/defrag: readahead for better performance Wengang Wang
2024-07-09 20:27 ` Darrick J. Wong
2024-07-11 23:29 ` Wengang Wang
2024-07-16 0:56 ` Dave Chinner
2024-07-18 18:40 ` Wengang Wang
2024-07-31 3:10 ` Dave Chinner
2024-08-02 18:31 ` Wengang Wang
2024-07-09 19:10 ` [PATCH 9/9] spaceman/defrag: warn on extsize Wengang Wang
2024-07-09 20:21 ` Darrick J. Wong
2024-07-11 23:36 ` Wengang Wang
2024-07-16 0:29 ` Dave Chinner
2024-07-22 18:01 ` Wengang Wang
2024-07-30 22:43 ` Dave Chinner
2024-07-15 23:03 ` [PATCH 0/9] introduce defrag to xfs_spaceman Dave Chinner
2024-07-16 19:45 ` Wengang Wang
2024-07-31 2:51 ` Dave Chinner
2024-08-02 18:14 ` Wengang Wang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240709205121.GV612460@frogsfrogsfrogs \
--to=djwong@kernel.org \
--cc=linux-xfs@vger.kernel.org \
--cc=wen.gang.wang@oracle.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.