From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5503D1B86DD for ; Tue, 9 Jul 2024 20:51:22 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1720558282; cv=none; b=FXb1A+cvZfx1zlMHQgr5EDjS4nH9ykKhRV2ToDzH4ghkRZnPwMtL3xRK0vbtOIqIh8RoJMs4HOgum2iDQ/bpSgpL26z3WkuHyLqvlrg/NUMPAVpKjr42S4juIMqgLAgAM8wePDcgyGSHocHEyGWzMt2ZX9dgQf9uSo0JXZ5oR8Y= ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1720558282; c=relaxed/simple; bh=yIR7RcUyE5kmnxE22lm4QrDMm1vG/gqPJGkG0cVhQSQ=; h=Date:From:To:Cc:Subject:Message-ID:References:MIME-Version: Content-Type:Content-Disposition:In-Reply-To; b=bZnG8t4yitK+expjEBtqa8CeyakYDwRpFrbBna5VxIsFex8sgEuS1w9bIjou2r4rRZOkpSHPq8C6nioYuz/HBPid9kuunQAqjyADu6+JIoi/6acXE7yW5XLSvDzVMxlAePx13f0KTFTnJ2ZqTAnyd8rKYJW2dmKBZW4siQwsD8Y= ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=WZrPnaRQ; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="WZrPnaRQ" Received: by smtp.kernel.org (Postfix) with ESMTPSA id EEF59C3277B; Tue, 9 Jul 2024 20:51:21 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1720558282; bh=yIR7RcUyE5kmnxE22lm4QrDMm1vG/gqPJGkG0cVhQSQ=; h=Date:From:To:Cc:Subject:References:In-Reply-To:From; b=WZrPnaRQJ6ezwEj3uDpUXL47SaE2mJlP/WmOyNjg2aeH5juh80HN+eLHgVFu94T17 od0s/aKFQyghtuXYGNMkdjda27l/oZk+T8BH+7WZo6X+nSbxOGhk0G6U2KVtxeG0fl Ja4ocksnfJEb8E72MdqcA/yCNMn9zbn4njm0feWvLmxRR+EH4EXhxVWHprDUsUMF5L 
klFnHjm6aoZ9VHFc7W3OUK3gmOZn4KIb9e6c2A3juaERQ3i1Sf1NdHL+Bdn+Z2QC7B qyv4reDjfVABQ16JGjLyV3Kb3YVdDWYRJCQ/DnC7aBc3JMGFHZYdFitQ8+i7qCUg1K Mz1BW5uRgIjcQ== Date: Tue, 9 Jul 2024 13:51:21 -0700 From: "Darrick J. Wong" To: Wengang Wang Cc: linux-xfs@vger.kernel.org Subject: Re: [PATCH 6/9] spaceman/defrag: workaround kernel xfs_reflink_try_clear_inode_flag() Message-ID: <20240709205121.GV612460@frogsfrogsfrogs> References: <20240709191028.2329-1-wen.gang.wang@oracle.com> <20240709191028.2329-7-wen.gang.wang@oracle.com> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20240709191028.2329-7-wen.gang.wang@oracle.com> On Tue, Jul 09, 2024 at 12:10:25PM -0700, Wengang Wang wrote: > xfs_reflink_try_clear_inode_flag() takes very long in case file has huge number > of extents and none of the extents are shared. > > workaround: > share the first real extent so that xfs_reflink_try_clear_inode_flag() returns > quickly to save cpu times and speed up defrag significantly. I wonder if a better solution would be to change xfs_reflink_unshare to try to clear the reflink iflag only if offset/len cover the entire file? It's a pity we can't set time budgets on fallocate requests. 
--D > Signed-off-by: Wengang Wang > --- > spaceman/defrag.c | 174 +++++++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 172 insertions(+), 2 deletions(-) > > diff --git a/spaceman/defrag.c b/spaceman/defrag.c > index f8e6713c..b5c5b187 100644 > --- a/spaceman/defrag.c > +++ b/spaceman/defrag.c > @@ -327,6 +327,155 @@ defrag_fs_limit_hit(int fd) > return statfs_s.f_bsize * statfs_s.f_bavail < g_limit_free_bytes; > } > > +static bool g_enable_first_ext_share = true; > + > +static int > +defrag_get_first_real_ext(int fd, struct getbmapx *mapx) > +{ > + int err; > + > + while (1) { > + err = defrag_get_next_extent(fd, mapx); > + if (err) > + break; > + > + defrag_move_next_extent(); > + if (!(mapx->bmv_oflags & BMV_OF_PREALLOC)) > + break; > + } > + return err; > +} > + > +static __u64 g_share_offset = -1ULL; > +static __u64 g_share_len = 0ULL; > +#define SHARE_MAX_SIZE 32768 /* 32KiB */ > + > +/* share the first real extent with scrach */ > +static void > +defrag_share_first_extent(int defrag_fd, int scratch_fd) > +{ > +#define OFFSET_1PB 0x4000000000000LL > + struct file_clone_range clone; > + struct getbmapx mapx; > + int err; > + > + if (g_enable_first_ext_share == false) > + return; > + > + err = defrag_get_first_real_ext(defrag_fd, &mapx); > + if (err) > + return; > + > + clone.src_fd = defrag_fd; > + clone.src_offset = mapx.bmv_offset * 512; > + clone.src_length = mapx.bmv_length * 512; > + /* shares at most SHARE_MAX_SIZE length */ > + if (clone.src_length > SHARE_MAX_SIZE) > + clone.src_length = SHARE_MAX_SIZE; > + clone.dest_offset = OFFSET_1PB + clone.src_offset; > + /* if the first is extent is reaching the EoF, no need to share */ > + if (clone.src_offset + clone.src_length >= g_defrag_file_size) > + return; > + err = ioctl(scratch_fd, FICLONERANGE, &clone); > + if (err != 0) { > + fprintf(stderr, "cloning first extent failed: %s\n", > + strerror(errno)); > + return; > + } > + > + /* safe the offset and length for re-share */ > + g_share_offset 
= clone.src_offset; > + g_share_len = clone.src_length; > +} > + > +/* re-share the blocks we shared previous if then are no longer shared */ > +static void > +defrag_reshare_blocks_in_front(int defrag_fd, int scratch_fd) > +{ > +#define NR_GET_EXT 9 > + struct getbmapx mapx[NR_GET_EXT]; > + struct file_clone_range clone; > + __u64 new_share_len; > + int idx, err; > + > + if (g_enable_first_ext_share == false) > + return; > + > + if (g_share_len == 0ULL) > + return; > + > + /* > + * check if previous shareing still exist > + * we are done if (partially) so. > + */ > + mapx[0].bmv_offset = g_share_offset; > + mapx[0].bmv_length = g_share_len; > + mapx[0].bmv_count = NR_GET_EXT; > + mapx[0].bmv_iflags = BMV_IF_NO_HOLES | BMV_IF_PREALLOC; > + err = ioctl(defrag_fd, XFS_IOC_GETBMAPX, mapx); > + if (err) { > + fprintf(stderr, "XFS_IOC_GETBMAPX failed %s\n", > + strerror(errno)); > + /* won't try share again */ > + g_share_len = 0ULL; > + return; > + } > + > + if (mapx[0].bmv_entries == 0) { > + /* shared blocks all became hole, won't try share again */ > + g_share_len = 0ULL; > + return; > + } > + > + if (g_share_offset != 512 * mapx[1].bmv_offset) { > + /* first shared block became hole, won't try share again */ > + g_share_len = 0ULL; > + return; > + } > + > + /* we check up to only the first NR_GET_EXT - 1 extents */ > + for (idx = 1; idx <= mapx[0].bmv_entries; idx++) { > + if (mapx[idx].bmv_oflags & BMV_OF_SHARED) { > + /* some blocks still shared, done */ > + return; > + } > + } > + > + /* > + * The previously shared blocks are no longer shared, re-share. 
> + * deallocate the blocks in scrath file first > + */ > + err = fallocate(scratch_fd, > + FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, > + OFFSET_1PB + g_share_offset, g_share_len); > + if (err != 0) { > + fprintf(stderr, "punch hole failed %s\n", > + strerror(errno)); > + g_share_len = 0; > + return; > + } > + > + new_share_len = 512 * mapx[1].bmv_length; > + if (new_share_len > SHARE_MAX_SIZE) > + new_share_len = SHARE_MAX_SIZE; > + > + clone.src_fd = defrag_fd; > + /* keep starting offset unchanged */ > + clone.src_offset = g_share_offset; > + clone.src_length = new_share_len; > + clone.dest_offset = OFFSET_1PB + clone.src_offset; > + > + err = ioctl(scratch_fd, FICLONERANGE, &clone); > + if (err) { > + fprintf(stderr, "FICLONERANGE failed %s\n", > + strerror(errno)); > + g_share_len = 0; > + return; > + } > + > + g_share_len = new_share_len; > + } > + > /* > * defragment a file > * return 0 if successfully done, 1 otherwise > @@ -377,6 +526,12 @@ defrag_xfs_defrag(char *file_path) { > > signal(SIGINT, defrag_sigint_handler); > > + /* > + * share the first extent to work around kernel consuming time > + * in xfs_reflink_try_clear_inode_flag() > + */ > + defrag_share_first_extent(defrag_fd, scratch_fd); > + > do { > struct timeval t_clone, t_unshare, t_punch_hole; > struct defrag_segment segment; > @@ -454,6 +609,15 @@ defrag_xfs_defrag(char *file_path) { > if (time_delta > max_unshare_us) > max_unshare_us = time_delta; > > + /* > + * if unshare used more than 1 second, time is very possibly > + * used in checking if the file is sharing extents now. > + * to avoid that happen again we re-share the blocks in front > + * to workaround that. > + */ > + if (time_delta > 1000000) > + defrag_reshare_blocks_in_front(defrag_fd, scratch_fd); > + > /* > * Punch out the original extents we shared to the > * scratch file so they are returned to free space. 
> @@ -514,6 +678,8 @@ static void defrag_help(void) > " -f free_space -- specify shrethod of the XFS free space in MiB, when\n" > " XFS free space is lower than that, shared segments \n" > " are excluded from defragmentation, 1024 by default\n" > +" -n -- disable the \"share first extent\" featue, it's\n" > +" enabled by default to speed up\n" > )); > } > > @@ -525,7 +691,7 @@ defrag_f(int argc, char **argv) > int i; > int c; > > - while ((c = getopt(argc, argv, "s:f:")) != EOF) { > + while ((c = getopt(argc, argv, "s:f:n")) != EOF) { > switch(c) { > case 's': > g_segment_size_lmt = atoi(optarg) * 1024 * 1024 / 512; > @@ -539,6 +705,10 @@ defrag_f(int argc, char **argv) > g_limit_free_bytes = atol(optarg) * 1024 * 1024; > break; > > + case 'n': > + g_enable_first_ext_share = false; > + break; > + > default: > command_usage(&defrag_cmd); > return 1; > @@ -556,7 +726,7 @@ void defrag_init(void) > defrag_cmd.cfunc = defrag_f; > defrag_cmd.argmin = 0; > defrag_cmd.argmax = 4; > - defrag_cmd.args = "[-s segment_size] [-f free_space]"; > + defrag_cmd.args = "[-s segment_size] [-f free_space] [-n]"; > defrag_cmd.flags = CMD_FLAG_ONESHOT; > defrag_cmd.oneline = _("Defragment XFS files"); > defrag_cmd.help = defrag_help; > -- > 2.39.3 (Apple Git-146) > >