* [PATCH 1/7] libfrog: move statx.h from io/ to libfrog/
2025-07-01 18:05 ` [PATCHSET 2/3] xfsprogs: atomic writes Darrick J. Wong
@ 2025-07-01 18:07 ` Darrick J. Wong
2025-07-02 9:13 ` John Garry
2025-07-01 18:07 ` [PATCH 2/7] xfs_db: create an untorn_max subcommand Darrick J. Wong
` (5 subsequent siblings)
6 siblings, 1 reply; 33+ messages in thread
From: Darrick J. Wong @ 2025-07-01 18:07 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: catherine.hoang, john.g.garry, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Move this header file so we can use it elsewhere.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
libfrog/statx.h | 17 +++++++++++++++++
io/stat.c | 20 ++------------------
libfrog/Makefile | 1 +
3 files changed, 20 insertions(+), 18 deletions(-)
rename io/statx.h => libfrog/statx.h (96%)
diff --git a/io/statx.h b/libfrog/statx.h
similarity index 96%
rename from io/statx.h
rename to libfrog/statx.h
index f7ef1d2784a2a9..b76dfae21e7092 100644
--- a/io/statx.h
+++ b/libfrog/statx.h
@@ -146,7 +146,24 @@ struct statx {
__u64 __spare3[9]; /* Spare space for future expansion */
/* 0x100 */
};
+
+static inline ssize_t
+statx(
+ int dfd,
+ const char *filename,
+ unsigned int flags,
+ unsigned int mask,
+ struct statx *buffer)
+{
+#ifdef __NR_statx
+ return syscall(__NR_statx, dfd, filename, flags, mask, buffer);
+#else
+ errno = ENOSYS;
+ return -1;
#endif
+}
+
+#endif /* OVERRIDE_SYSTEM_STATX */
#ifndef STATX_TYPE
/*
diff --git a/io/stat.c b/io/stat.c
index c3a4bb15229ee5..46475df343470c 100644
--- a/io/stat.c
+++ b/io/stat.c
@@ -14,7 +14,7 @@
#include "input.h"
#include "init.h"
#include "io.h"
-#include "statx.h"
+#include "libfrog/statx.h"
#include "libxfs.h"
#include "libfrog/logging.h"
#include "libfrog/fsgeom.h"
@@ -305,22 +305,6 @@ statfs_f(
return 0;
}
-static ssize_t
-_statx(
- int dfd,
- const char *filename,
- unsigned int flags,
- unsigned int mask,
- struct statx *buffer)
-{
-#ifdef __NR_statx
- return syscall(__NR_statx, dfd, filename, flags, mask, buffer);
-#else
- errno = ENOSYS;
- return -1;
-#endif
-}
-
struct statx_masks {
const char *name;
unsigned int mask;
@@ -525,7 +509,7 @@ statx_f(
return command_usage(&statx_cmd);
memset(&stx, 0xbf, sizeof(stx));
- if (_statx(file->fd, "", atflag | AT_EMPTY_PATH, mask, &stx) < 0) {
+ if (statx(file->fd, "", atflag | AT_EMPTY_PATH, mask, &stx) < 0) {
perror("statx");
exitcode = 1;
return 0;
diff --git a/libfrog/Makefile b/libfrog/Makefile
index b64ca4597f4ea9..560bad417ee434 100644
--- a/libfrog/Makefile
+++ b/libfrog/Makefile
@@ -62,6 +62,7 @@ ptvar.h \
radix-tree.h \
randbytes.h \
scrub.h \
+statx.h \
workqueue.h
GETTEXT_PY = \
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [PATCH 2/7] xfs_db: create an untorn_max subcommand
2025-07-01 18:05 ` [PATCHSET 2/3] xfsprogs: atomic writes Darrick J. Wong
2025-07-01 18:07 ` [PATCH 1/7] libfrog: move statx.h from io/ to libfrog/ Darrick J. Wong
@ 2025-07-01 18:07 ` Darrick J. Wong
2025-07-09 15:39 ` John Garry
2025-07-01 18:07 ` [PATCH 3/7] xfs_io: dump new atomic_write_unit_max_opt statx field Darrick J. Wong
` (4 subsequent siblings)
6 siblings, 1 reply; 33+ messages in thread
From: Darrick J. Wong @ 2025-07-01 18:07 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: catherine.hoang, john.g.garry, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Create a debugger command to compute either the logres needed to
perform an untorn cow write completion for a given number of blocks; or
the number of blocks that can be completed given a log reservation.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
include/libxfs.h | 1
libxfs/libxfs_api_defs.h | 4 +
db/logformat.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++
man/man8/xfs_db.8 | 10 ++++
4 files changed, 144 insertions(+)
diff --git a/include/libxfs.h b/include/libxfs.h
index b968a2b88da372..1e0d1a48fbb698 100644
--- a/include/libxfs.h
+++ b/include/libxfs.h
@@ -102,6 +102,7 @@ struct iomap;
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_ag_resv.h"
+#include "defer_item.h"
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index c5fcb5e3229ae4..4bd02c57b496e6 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -108,6 +108,10 @@
#define xfs_bunmapi libxfs_bunmapi
#define xfs_bwrite libxfs_bwrite
#define xfs_calc_dquots_per_chunk libxfs_calc_dquots_per_chunk
+#define xfs_calc_finish_bui_reservation libxfs_calc_finish_bui_reservation
+#define xfs_calc_finish_cui_reservation libxfs_calc_finish_cui_reservation
+#define xfs_calc_finish_efi_reservation libxfs_calc_finish_efi_reservation
+#define xfs_calc_finish_rui_reservation libxfs_calc_finish_rui_reservation
#define xfs_cntbt_init_cursor libxfs_cntbt_init_cursor
#define xfs_compute_rextslog libxfs_compute_rextslog
#define xfs_compute_rgblklog libxfs_compute_rgblklog
diff --git a/db/logformat.c b/db/logformat.c
index 5edaa5494637c8..454ea20c0c7d5c 100644
--- a/db/logformat.c
+++ b/db/logformat.c
@@ -192,8 +192,137 @@ static const struct cmdinfo logres_cmd = {
.help = logres_help,
};
+STATIC void
+untorn_cow_limits(
+ struct xfs_mount *mp,
+ unsigned int logres,
+ unsigned int desired_max)
+{
+ const unsigned int efi = xfs_efi_log_space(1);
+ const unsigned int efd = xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1);
+ const unsigned int rud = xfs_rud_log_space();
+ const unsigned int cui = xfs_cui_log_space(1);
+ const unsigned int cud = xfs_cud_log_space();
+ const unsigned int bui = xfs_bui_log_space(1);
+ const unsigned int bud = xfs_bud_log_space();
+
+ /*
+ * Maximum overhead to complete an untorn write ioend in software:
+ * remove data fork extent + remove cow fork extent + map extent into
+ * data fork.
+ *
+ * tx0: Creates a BUI and a CUI and that's all it needs.
+ *
+ * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and
+ * enough space to relog the CUI (== CUI + CUD).
+ *
+ * tx2: Roll again to finish the RUI. Need space for the RUD and space
+ * to relog the CUI.
+ *
+ * tx3: Roll again, need space for the CUD and possibly a new EFI.
+ *
+ * tx4: Roll again, need space for an EFD.
+ *
+ * If the extent referenced by the pair of BUI/CUI items is not the one
+ * being currently processed, then we need to reserve space to relog
+ * both items.
+ */
+ const unsigned int tx0 = bui + cui;
+ const unsigned int tx1 = bud + rui + cui + cud;
+ const unsigned int tx2 = rud + cui + cud;
+ const unsigned int tx3 = cud + efi;
+ const unsigned int tx4 = efd;
+ const unsigned int relog = bui + bud + cui + cud;
+
+ const unsigned int per_intent = max(max3(tx0, tx1, tx2),
+ max3(tx3, tx4, relog));
+
+ /* Overhead to finish one step of each intent item type */
+ const unsigned int f1 = libxfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = libxfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int f3 = libxfs_calc_finish_cui_reservation(mp, 1);
+ const unsigned int f4 = libxfs_calc_finish_bui_reservation(mp, 1);
+
+ /* We only finish one item per transaction in a chain */
+ const unsigned int step_size = max(f4, max3(f1, f2, f3));
+
+ if (desired_max) {
+ dbprintf(
+ "desired_max: %u\nstep_size: %u\nper_intent: %u\nlogres: %u\n",
+ desired_max, step_size, per_intent,
+ (desired_max * per_intent) + step_size);
+ } else if (logres) {
+ dbprintf(
+ "logres: %u\nstep_size: %u\nper_intent: %u\nmax_awu: %u\n",
+ logres, step_size, per_intent,
+ logres >= step_size ? (logres - step_size) / per_intent : 0);
+ }
+}
+
+static void
+untorn_max_help(void)
+{
+ dbprintf(_(
+"\n"
+" The 'untorn_max' command computes either the log reservation needed to\n"
+" complete an untorn write of a given block count; or the maximum number of\n"
+" blocks that can be completed given a specific log reservation.\n"
+"\n"
+ ));
+}
+
+static int
+untorn_max_f(
+ int argc,
+ char **argv)
+{
+ unsigned int logres = 0;
+ unsigned int desired_max = 0;
+ int c;
+
+ while ((c = getopt(argc, argv, "l:b:")) != EOF) {
+ switch (c) {
+ case 'l':
+ logres = atoi(optarg);
+ break;
+ case 'b':
+ desired_max = atoi(optarg);
+ break;
+ default:
+ untorn_max_help();
+ return 0;
+ }
+ }
+
+ if (!logres && !desired_max) {
+ dbprintf("untorn_max needs -l or -b option\n");
+ return 0;
+ }
+
+ if (xfs_has_reflink(mp))
+ untorn_cow_limits(mp, logres, desired_max);
+ else
+ dbprintf("untorn write emulation not supported\n");
+
+ return 0;
+}
+
+static const struct cmdinfo untorn_max_cmd = {
+ .name = "untorn_max",
+ .altname = NULL,
+ .cfunc = untorn_max_f,
+ .argmin = 0,
+ .argmax = -1,
+ .canpush = 0,
+ .args = NULL,
+ .oneline = N_("compute untorn write max"),
+ .help = untorn_max_help, /* this command's own help text */
+};
+
void
logres_init(void)
{
add_command(&logres_cmd);
+ add_command(&untorn_max_cmd);
}
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index 2a9322560584b0..d4531fc0e380a3 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -1366,6 +1366,16 @@ .SH COMMANDS
.IR name .
The file being targetted will not be put on the iunlink list.
.TP
+.BI "untorn_max [\-b " blockcount "|\-l " logres "]"
+If
+.B -l
+is specified, compute the maximum (in fsblocks) untorn write that we can
+emulate with copy on write given a log reservation size (in bytes).
+If
+.B -b
+is specified, compute the log reservation size that would be needed to
+emulate an untorn write of the given number of fsblocks.
+.TP
.BI "uuid [" uuid " | " generate " | " rewrite " | " restore ]
Set the filesystem universally unique identifier (UUID).
The filesystem UUID can be used by
^ permalink raw reply related [flat|nested] 33+ messages in thread
* Re: [PATCH 2/7] xfs_db: create an untorn_max subcommand
2025-07-01 18:07 ` [PATCH 2/7] xfs_db: create an untorn_max subcommand Darrick J. Wong
@ 2025-07-09 15:39 ` John Garry
2025-07-09 16:35 ` Darrick J. Wong
0 siblings, 1 reply; 33+ messages in thread
From: John Garry @ 2025-07-09 15:39 UTC (permalink / raw)
To: Darrick J. Wong, aalbersh; +Cc: catherine.hoang, linux-xfs
> };
Generally it looks ok, just some small comments.
If you are not too concerned with any comment, then feel free to add the
following:
Reviewed-by: John Garry <john.g.garry@oracle.com>
>
> +STATIC void
> +untorn_cow_limits(
> + struct xfs_mount *mp,
> + unsigned int logres,
> + unsigned int desired_max)
> +{
> + const unsigned int efi = xfs_efi_log_space(1);
> + const unsigned int efd = xfs_efd_log_space(1);
> + const unsigned int rui = xfs_rui_log_space(1);
> + const unsigned int rud = xfs_rud_log_space();
> + const unsigned int cui = xfs_cui_log_space(1);
> + const unsigned int cud = xfs_cud_log_space();
> + const unsigned int bui = xfs_bui_log_space(1);
> + const unsigned int bud = xfs_bud_log_space();
> +
> + /*
> + * Maximum overhead to complete an untorn write ioend in software:
> + * remove data fork extent + remove cow fork extent + map extent into
> + * data fork.
> + *
> + * tx0: Creates a BUI and a CUI and that's all it needs.
> + *
> + * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and
> + * enough space to relog the CUI (== CUI + CUD).
> + *
> + * tx2: Roll again to finish the RUI. Need space for the RUD and space
> + * to relog the CUI.
> + *
> + * tx3: Roll again, need space for the CUD and possibly a new EFI.
> + *
> + * tx4: Roll again, need space for an EFD.
> + *
> + * If the extent referenced by the pair of BUI/CUI items is not the one
> + * being currently processed, then we need to reserve space to relog
> + * both items.
> + */
> + const unsigned int tx0 = bui + cui;
> + const unsigned int tx1 = bud + rui + cui + cud;
> + const unsigned int tx2 = rud + cui + cud;
> + const unsigned int tx3 = cud + efi;
> + const unsigned int tx4 = efd;
> + const unsigned int relog = bui + bud + cui + cud;
> +
> + const unsigned int per_intent = max(max3(tx0, tx1, tx2),
> + max3(tx3, tx4, relog));
> +
> + /* Overhead to finish one step of each intent item type */
> + const unsigned int f1 = libxfs_calc_finish_efi_reservation(mp, 1);
> + const unsigned int f2 = libxfs_calc_finish_rui_reservation(mp, 1);
> + const unsigned int f3 = libxfs_calc_finish_cui_reservation(mp, 1);
> + const unsigned int f4 = libxfs_calc_finish_bui_reservation(mp, 1);
> +
> + /* We only finish one item per transaction in a chain */
> + const unsigned int step_size = max(f4, max3(f1, f2, f3));
This all looks to match xfs_calc_atomic_write_ioend_geometry(). I assume
that there is a good reason why that code cannot be reused.
> +
> + if (desired_max) {
> + dbprintf(
> + "desired_max: %u\nstep_size: %u\nper_intent: %u\nlogres: %u\n",
> + desired_max, step_size, per_intent,
> + (desired_max * per_intent) + step_size);
> + } else if (logres) {
> + dbprintf(
> + "logres: %u\nstep_size: %u\nper_intent: %u\nmax_awu: %u\n",
> + logres, step_size, per_intent,
> + logres >= step_size ? (logres - step_size) / per_intent : 0);
> + }
> +}
> +
> +static void
> +untorn_max_help(void)
> +{
> + dbprintf(_(
> +"\n"
> +" The 'untorn_max' command computes either the log reservation needed to\n"
> +" complete an untorn write of a given block count; or the maximum number of\n"
> +" blocks that can be completed given a specific log reservation.\n"
> +"\n"
> + ));
> +}
> +
> +static int
> +untorn_max_f(
> + int argc,
> + char **argv)
> +{
> + unsigned int logres = 0;
> + unsigned int desired_max = 0;
> + int c;
> +
> + while ((c = getopt(argc, argv, "l:b:")) != EOF) {
> + switch (c) {
> + case 'l':
> + logres = atoi(optarg);
> + break;
> + case 'b':
> + desired_max = atoi(optarg);
> + break;
> + default:
> + untorn_max_help();
> + return 0;
> + }
> + }
From untorn_cow_limits(), it seems that it's best not to give both 'l' and
'b', as we only ever print one value. As such, would it be better to set
argmax = 1 (or whatever is needed to accept only 'l' or 'b')?
> +
> + if (!logres && !desired_max) {
> + dbprintf("untorn_max needs -l or -b option\n");
> + return 0;
similar db command handlers use -1, but I guess that it's not important
here since you just rely on the print message output always
> + }
> +
> + if (xfs_has_reflink(mp))
this check could be put earlier
> + untorn_cow_limits(mp, logres, desired_max);
> + else
> + dbprintf("untorn write emulation not supported\n");
> +
> + return 0;
> +}
> +
> +static const struct cmdinfo untorn_max_cmd = {
it would be nice to use untorn_write_max_cmd
> + .name = "untorn_max",
> + .altname = NULL,
> + .cfunc = untorn_max_f,
> + .argmin = 0,
> + .argmax = -1,
> + .canpush = 0,
> + .args = NULL,
> + .oneline = N_("compute untorn write max"),
> + .help = logres_help,
> +};
> +
> void
> logres_init(void)
> {
> add_command(&logres_cmd);
> + add_command(&untorn_max_cmd);
> }
> diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
> index 2a9322560584b0..d4531fc0e380a3 100644
> --- a/man/man8/xfs_db.8
> +++ b/man/man8/xfs_db.8
> @@ -1366,6 +1366,16 @@ .SH COMMANDS
> .IR name .
> The file being targetted will not be put on the iunlink list.
> .TP
> +.BI "untorn_max [\-b " blockcount "|\-l " logres "]"
> +If
> +.B -l
> +is specified, compute the maximum (in fsblocks) untorn write that we can
> +emulate with copy on write given a log reservation size (in bytes).
> +If
> +.B -b
> +is specified, compute the log reservation size that would be needed to
> +emulate an untorn write of the given number of fsblocks.
> +.TP
> .BI "uuid [" uuid " | " generate " | " rewrite " | " restore ]
> Set the filesystem universally unique identifier (UUID).
> The filesystem UUID can be used by
>
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 2/7] xfs_db: create an untorn_max subcommand
2025-07-09 15:39 ` John Garry
@ 2025-07-09 16:35 ` Darrick J. Wong
0 siblings, 0 replies; 33+ messages in thread
From: Darrick J. Wong @ 2025-07-09 16:35 UTC (permalink / raw)
To: John Garry; +Cc: aalbersh, catherine.hoang, linux-xfs
On Wed, Jul 09, 2025 at 04:39:12PM +0100, John Garry wrote:
> > };
>
> Generally it looks ok, just some small comments.
>
> If you are not too concerned with any comment, then feel free to add the
> following:
>
> Reviewed-by: John Garry <john.g.garry@oracle.com>
Thanks! Replies below.
> > +STATIC void
> > +untorn_cow_limits(
> > + struct xfs_mount *mp,
> > + unsigned int logres,
> > + unsigned int desired_max)
> > +{
> > + const unsigned int efi = xfs_efi_log_space(1);
> > + const unsigned int efd = xfs_efd_log_space(1);
> > + const unsigned int rui = xfs_rui_log_space(1);
> > + const unsigned int rud = xfs_rud_log_space();
> > + const unsigned int cui = xfs_cui_log_space(1);
> > + const unsigned int cud = xfs_cud_log_space();
> > + const unsigned int bui = xfs_bui_log_space(1);
> > + const unsigned int bud = xfs_bud_log_space();
> > +
> > + /*
> > + * Maximum overhead to complete an untorn write ioend in software:
> > + * remove data fork extent + remove cow fork extent + map extent into
> > + * data fork.
> > + *
> > + * tx0: Creates a BUI and a CUI and that's all it needs.
> > + *
> > + * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and
> > + * enough space to relog the CUI (== CUI + CUD).
> > + *
> > + * tx2: Roll again to finish the RUI. Need space for the RUD and space
> > + * to relog the CUI.
> > + *
> > + * tx3: Roll again, need space for the CUD and possibly a new EFI.
> > + *
> > + * tx4: Roll again, need space for an EFD.
> > + *
> > + * If the extent referenced by the pair of BUI/CUI items is not the one
> > + * being currently processed, then we need to reserve space to relog
> > + * both items.
> > + */
> > + const unsigned int tx0 = bui + cui;
> > + const unsigned int tx1 = bud + rui + cui + cud;
> > + const unsigned int tx2 = rud + cui + cud;
> > + const unsigned int tx3 = cud + efi;
> > + const unsigned int tx4 = efd;
> > + const unsigned int relog = bui + bud + cui + cud;
> > +
> > + const unsigned int per_intent = max(max3(tx0, tx1, tx2),
> > + max3(tx3, tx4, relog));
> > +
> > + /* Overhead to finish one step of each intent item type */
> > + const unsigned int f1 = libxfs_calc_finish_efi_reservation(mp, 1);
> > + const unsigned int f2 = libxfs_calc_finish_rui_reservation(mp, 1);
> > + const unsigned int f3 = libxfs_calc_finish_cui_reservation(mp, 1);
> > + const unsigned int f4 = libxfs_calc_finish_bui_reservation(mp, 1);
> > +
> > + /* We only finish one item per transaction in a chain */
> > + const unsigned int step_size = max(f4, max3(f1, f2, f3));
>
> This all looks to match xfs_calc_atomic_write_ioend_geometry(). I assume
> that there is a good reason why that code cannot be reused.
Hrmm, that /would/ be a good refactoring opportunity. Oh, ugh:
STATIC unsigned int
xfs_calc_atomic_write_ioend_geometry(
struct xfs_mount *mp,
unsigned int *step_size)
Ok, I'd have to get that into 6.17... :(
> > +
> > + if (desired_max) {
> > + dbprintf(
> > + "desired_max: %u\nstep_size: %u\nper_intent: %u\nlogres: %u\n",
> > + desired_max, step_size, per_intent,
> > + (desired_max * per_intent) + step_size);
> > + } else if (logres) {
> > + dbprintf(
> > + "logres: %u\nstep_size: %u\nper_intent: %u\nmax_awu: %u\n",
> > + logres, step_size, per_intent,
> > + logres >= step_size ? (logres - step_size) / per_intent : 0);
> > + }
> > +}
> > +
> > +static void
> > +untorn_max_help(void)
> > +{
> > + dbprintf(_(
> > +"\n"
> > +" The 'untorn_max' command computes either the log reservation needed to\n"
> > +" complete an untorn write of a given block count; or the maximum number of\n"
> > +" blocks that can be completed given a specific log reservation.\n"
> > +"\n"
> > + ));
> > +}
> > +
> > +static int
> > +untorn_max_f(
> > + int argc,
> > + char **argv)
> > +{
> > + unsigned int logres = 0;
> > + unsigned int desired_max = 0;
> > + int c;
> > +
> > + while ((c = getopt(argc, argv, "l:b:")) != EOF) {
> > + switch (c) {
> > + case 'l':
> > + logres = atoi(optarg);
> > + break;
> > + case 'b':
> > + desired_max = atoi(optarg);
> > + break;
> > + default:
> > + untorn_max_help();
> > + return 0;
> > + }
> > + }
>
> From untorn_cow_limits(), it seems that it's best not to give both 'l' and 'b',
> as we only ever print one value. As such, would it be better to set argmax = 1
> (or whatever is needed to accept only 'l' or 'b')?
>
> > +
> > + if (!logres && !desired_max) {
> > + dbprintf("untorn_max needs -l or -b option\n");
> > + return 0;
>
> similar db command handlers use -1, but I guess that it's not important here
> since you just rely on the print message output always
I think you'd have to set argmax = 2 to pick up the parameter, right?
And then you'd still allow "untorn_max -l -b" which would immediately
fail, obviously. But this works just as well.
> > + }
> > +
> > + if (xfs_has_reflink(mp))
>
> this check could be put earlier
Sure, but what would be gained? All we've done so far is parsed the CLI
options.
> > + untorn_cow_limits(mp, logres, desired_max);
> > + else
> > + dbprintf("untorn write emulation not supported\n");
> > +
> > + return 0;
> > +}
> > +
> > +static const struct cmdinfo untorn_max_cmd = {
>
> it would be nice to use untorn_write_max_cmd
<nod> I'll s/untorn_max/untorn_write_max/ in this file.
--D
> > + .name = "untorn_max",
> > + .altname = NULL,
> > + .cfunc = untorn_max_f,
> > + .argmin = 0,
> > + .argmax = -1,
> > + .canpush = 0,
> > + .args = NULL,
> > + .oneline = N_("compute untorn write max"),
> > + .help = logres_help,
> > +};
> > +
> > void
> > logres_init(void)
> > {
> > add_command(&logres_cmd);
> > + add_command(&untorn_max_cmd);
> > }
> > diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
> > index 2a9322560584b0..d4531fc0e380a3 100644
> > --- a/man/man8/xfs_db.8
> > +++ b/man/man8/xfs_db.8
> > @@ -1366,6 +1366,16 @@ .SH COMMANDS
> > .IR name .
> > The file being targetted will not be put on the iunlink list.
> > .TP
> > +.BI "untorn_max [\-b " blockcount "|\-l " logres "]"
> > +If
> > +.B -l
> > +is specified, compute the maximum (in fsblocks) untorn write that we can
> > +emulate with copy on write given a log reservation size (in bytes).
> > +If
> > +.B -b
> > +is specified, compute the log reservation size that would be needed to
> > +emulate an untorn write of the given number of fsblocks.
> > +.TP
> > .BI "uuid [" uuid " | " generate " | " rewrite " | " restore ]
> > Set the filesystem universally unique identifier (UUID).
> > The filesystem UUID can be used by
> >
>
>
^ permalink raw reply [flat|nested] 33+ messages in thread
* [PATCH 3/7] xfs_io: dump new atomic_write_unit_max_opt statx field
2025-07-01 18:05 ` [PATCHSET 2/3] xfsprogs: atomic writes Darrick J. Wong
2025-07-01 18:07 ` [PATCH 1/7] libfrog: move statx.h from io/ to libfrog/ Darrick J. Wong
2025-07-01 18:07 ` [PATCH 2/7] xfs_db: create an untorn_max subcommand Darrick J. Wong
@ 2025-07-01 18:07 ` Darrick J. Wong
2025-07-02 8:23 ` John Garry
2025-07-01 18:07 ` [PATCH 4/7] mkfs: don't complain about overly large auto-detected log stripe units Darrick J. Wong
` (3 subsequent siblings)
6 siblings, 1 reply; 33+ messages in thread
From: Darrick J. Wong @ 2025-07-01 18:07 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: catherine.hoang, john.g.garry, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Dump the new atomic writes statx field that's being submitted for 6.16.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
libfrog/statx.h | 6 +++++-
io/stat.c | 1 +
m4/package_libcdev.m4 | 2 +-
3 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/libfrog/statx.h b/libfrog/statx.h
index b76dfae21e7092..e11e2d8f49fa5f 100644
--- a/libfrog/statx.h
+++ b/libfrog/statx.h
@@ -143,7 +143,11 @@ struct statx {
__u32 stx_dio_read_offset_align;
/* 0xb8 */
- __u64 __spare3[9]; /* Spare space for future expansion */
+ /* Optimised max atomic write unit in bytes */
+ __u32 stx_atomic_write_unit_max_opt;
+ __u32 __spare2[1];
+ /* 0xc0 */
+ __u64 __spare3[8]; /* Spare space for future expansion */
/* 0x100 */
};
diff --git a/io/stat.c b/io/stat.c
index 46475df343470c..c257037aa8eec3 100644
--- a/io/stat.c
+++ b/io/stat.c
@@ -396,6 +396,7 @@ dump_raw_statx(struct statx *stx)
printf("stat.atomic_write_unit_max = %u\n", stx->stx_atomic_write_unit_max);
printf("stat.atomic_write_segments_max = %u\n", stx->stx_atomic_write_segments_max);
printf("stat.dio_read_offset_align = %u\n", stx->stx_dio_read_offset_align);
+ printf("stat.atomic_write_unit_max_opt = %u\n", stx->stx_atomic_write_unit_max_opt);
return 0;
}
diff --git a/m4/package_libcdev.m4 b/m4/package_libcdev.m4
index 61353d0aa9d536..b77ac1a7580a80 100644
--- a/m4/package_libcdev.m4
+++ b/m4/package_libcdev.m4
@@ -126,7 +126,7 @@ AC_DEFUN([AC_NEED_INTERNAL_FSCRYPT_POLICY_V2],
AC_DEFUN([AC_NEED_INTERNAL_STATX],
[ AC_CHECK_TYPE(struct statx,
[
- AC_CHECK_MEMBER(struct statx.stx_dio_read_offset_align,
+ AC_CHECK_MEMBER(struct statx.stx_atomic_write_unit_max_opt,
,
need_internal_statx=yes,
[#include <linux/stat.h>]
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [PATCH 4/7] mkfs: don't complain about overly large auto-detected log stripe units
2025-07-01 18:05 ` [PATCHSET 2/3] xfsprogs: atomic writes Darrick J. Wong
` (2 preceding siblings ...)
2025-07-01 18:07 ` [PATCH 3/7] xfs_io: dump new atomic_write_unit_max_opt statx field Darrick J. Wong
@ 2025-07-01 18:07 ` Darrick J. Wong
2025-07-08 14:38 ` John Garry
2025-07-01 18:08 ` [PATCH 5/7] mkfs: autodetect log stripe unit for external log devices Darrick J. Wong
` (2 subsequent siblings)
6 siblings, 1 reply; 33+ messages in thread
From: Darrick J. Wong @ 2025-07-01 18:07 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: linux-xfs, catherine.hoang, john.g.garry, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
If mkfs declines to apply what it thinks is an overly large data device
stripe unit to the log device, it should only log a message about that
if the lsunit parameter was actually supplied by the caller. It should
not do that when the lsunit was autodetected from the block devices.
The cli parameters are zero-initialized in main and always have been.
Cc: <linux-xfs@vger.kernel.org> # v4.15.0
Fixes: 2f44b1b0e5adc4 ("mkfs: rework stripe calculations")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
mkfs/xfs_mkfs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index 812241c49a5494..8b946f3ef817da 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -3629,7 +3629,7 @@ _("log stripe unit (%d) must be a multiple of the block size (%d)\n"),
if (cfg->sb_feat.log_version == 2 &&
cfg->lsunit * cfg->blocksize > 256 * 1024) {
/* Warn only if specified on commandline */
- if (cli->lsu || cli->lsunit != -1) {
+ if (cli->lsu || cli->lsunit) {
fprintf(stderr,
_("log stripe unit (%d bytes) is too large (maximum is 256KiB)\n"
"log stripe unit adjusted to 32KiB\n"),
^ permalink raw reply related [flat|nested] 33+ messages in thread
* Re: [PATCH 4/7] mkfs: don't complain about overly large auto-detected log stripe units
2025-07-01 18:07 ` [PATCH 4/7] mkfs: don't complain about overly large auto-detected log stripe units Darrick J. Wong
@ 2025-07-08 14:38 ` John Garry
2025-07-08 15:05 ` Darrick J. Wong
0 siblings, 1 reply; 33+ messages in thread
From: John Garry @ 2025-07-08 14:38 UTC (permalink / raw)
To: Darrick J. Wong, aalbersh; +Cc: linux-xfs, catherine.hoang
On 01/07/2025 19:07, Darrick J. Wong wrote:
> From: Darrick J. Wong<djwong@kernel.org>
>
> If mkfs declines to apply what it thinks is an overly large data device
> stripe unit to the log device, it should only log a message about that
> if the lsunit parameter was actually supplied by the caller. It should
> not do that when the lsunit was autodetected from the block devices.
>
> The cli parameters are zero-initialized in main and always have been.
>
> Cc:<linux-xfs@vger.kernel.org> # v4.15.0
> Fixes: 2f44b1b0e5adc4 ("mkfs: rework stripe calculations")
> Signed-off-by: "Darrick J. Wong"<djwong@kernel.org>
Makes sense, so FWIW:
John Garry <john.g.garry@oracle.com>
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 4/7] mkfs: don't complain about overly large auto-detected log stripe units
2025-07-08 14:38 ` John Garry
@ 2025-07-08 15:05 ` Darrick J. Wong
2025-07-08 15:07 ` John Garry
0 siblings, 1 reply; 33+ messages in thread
From: Darrick J. Wong @ 2025-07-08 15:05 UTC (permalink / raw)
To: John Garry; +Cc: aalbersh, linux-xfs, catherine.hoang
On Tue, Jul 08, 2025 at 03:38:10PM +0100, John Garry wrote:
> On 01/07/2025 19:07, Darrick J. Wong wrote:
> > From: Darrick J. Wong<djwong@kernel.org>
> >
> > If mkfs declines to apply what it thinks is an overly large data device
> > stripe unit to the log device, it should only log a message about that
> > if the lsunit parameter was actually supplied by the caller. It should
> > not do that when the lsunit was autodetected from the block devices.
> >
> > The cli parameters are zero-initialized in main and always have been.
> >
> > Cc:<linux-xfs@vger.kernel.org> # v4.15.0
> > Fixes: 2f44b1b0e5adc4 ("mkfs: rework stripe calculations")
> > Signed-off-by: "Darrick J. Wong"<djwong@kernel.org>
>
> Makes sense, so FWIW:
>
> John Garry <john.g.garry@oracle.com>
Um.... is this a Reviewed-by: ?
--D
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 4/7] mkfs: don't complain about overly large auto-detected log stripe units
2025-07-08 15:05 ` Darrick J. Wong
@ 2025-07-08 15:07 ` John Garry
0 siblings, 0 replies; 33+ messages in thread
From: John Garry @ 2025-07-08 15:07 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: aalbersh, linux-xfs, catherine.hoang
On 08/07/2025 16:05, Darrick J. Wong wrote:
> On Tue, Jul 08, 2025 at 03:38:10PM +0100, John Garry wrote:
>> On 01/07/2025 19:07, Darrick J. Wong wrote:
>>> From: Darrick J. Wong<djwong@kernel.org>
>>>
>>> If mkfs declines to apply what it thinks is an overly large data device
>>> stripe unit to the log device, it should only log a message about that
>>> if the lsunit parameter was actually supplied by the caller. It should
>>> not do that when the lsunit was autodetected from the block devices.
>>>
>>> The cli parameters are zero-initialized in main and always have been.
>>>
>>> Cc:<linux-xfs@vger.kernel.org> # v4.15.0
>>> Fixes: 2f44b1b0e5adc4 ("mkfs: rework stripe calculations")
>>> Signed-off-by: "Darrick J. Wong"<djwong@kernel.org>
>>
>> Makes sense, so FWIW:
>>
>> John Garry <john.g.garry@oracle.com>
>
> Um.... is this a Reviewed-by: ?
oops, yes:
Reviewed-by: John Garry <john.g.garry@oracle.com>
^ permalink raw reply [flat|nested] 33+ messages in thread
* [PATCH 5/7] mkfs: autodetect log stripe unit for external log devices
2025-07-01 18:05 ` [PATCHSET 2/3] xfsprogs: atomic writes Darrick J. Wong
` (3 preceding siblings ...)
2025-07-01 18:07 ` [PATCH 4/7] mkfs: don't complain about overly large auto-detected log stripe units Darrick J. Wong
@ 2025-07-01 18:08 ` Darrick J. Wong
2025-07-09 15:57 ` John Garry
2025-07-01 18:08 ` [PATCH 6/7] mkfs: try to align AG size based on atomic write capabilities Darrick J. Wong
2025-07-01 18:08 ` [PATCH 7/7] mkfs: allow users to configure the desired maximum atomic write size Darrick J. Wong
6 siblings, 1 reply; 33+ messages in thread
From: Darrick J. Wong @ 2025-07-01 18:08 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: catherine.hoang, john.g.garry, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
If we're using an external log device and the caller doesn't give us a
lsunit, use the block device geometry (if present) to set it.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
mkfs/xfs_mkfs.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index 8b946f3ef817da..6c8cc715d3476b 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -3624,6 +3624,10 @@ _("log stripe unit (%d) must be a multiple of the block size (%d)\n"),
cfg->loginternal && cfg->dsunit) {
/* lsunit and dsunit now in fs blocks */
cfg->lsunit = cfg->dsunit;
+ } else if (cfg->sb_feat.log_version == 2 &&
+ !cfg->loginternal) {
+ /* use the external log device properties */
+ cfg->lsunit = DTOBT(ft->log.sunit, cfg->blocklog);
}
if (cfg->sb_feat.log_version == 2 &&
^ permalink raw reply related [flat|nested] 33+ messages in thread
* Re: [PATCH 5/7] mkfs: autodetect log stripe unit for external log devices
2025-07-01 18:08 ` [PATCH 5/7] mkfs: autodetect log stripe unit for external log devices Darrick J. Wong
@ 2025-07-09 15:57 ` John Garry
2025-07-09 16:45 ` Darrick J. Wong
0 siblings, 1 reply; 33+ messages in thread
From: John Garry @ 2025-07-09 15:57 UTC (permalink / raw)
To: Darrick J. Wong, aalbersh; +Cc: catherine.hoang, linux-xfs
On 01/07/2025 19:08, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> If we're using an external log device and the caller doesn't give us a
> lsunit, use the block device geometry (if present) to set it.
this seems fine, but I am not intimately familiar with the code. So, FWIW:
Reviewed-by: John Garry <john.g.garry@oracle.com>
There is a small question below, though.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
> mkfs/xfs_mkfs.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
>
> diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
> index 8b946f3ef817da..6c8cc715d3476b 100644
> --- a/mkfs/xfs_mkfs.c
> +++ b/mkfs/xfs_mkfs.c
nit: maybe the comment on method of calculation (not shown, but begins
"check the log sunit is modulo ...") could be updated
> @@ -3624,6 +3624,10 @@ _("log stripe unit (%d) must be a multiple of the block size (%d)\n"),
> cfg->loginternal && cfg->dsunit) {
> /* lsunit and dsunit now in fs blocks */
> cfg->lsunit = cfg->dsunit;
> + } else if (cfg->sb_feat.log_version == 2 &&
> + !cfg->loginternal) {
> + /* use the external log device properties */
> + cfg->lsunit = DTOBT(ft->log.sunit, cfg->blocklog);
> }
>
> if (cfg->sb_feat.log_version == 2 &&
>
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 5/7] mkfs: autodetect log stripe unit for external log devices
2025-07-09 15:57 ` John Garry
@ 2025-07-09 16:45 ` Darrick J. Wong
0 siblings, 0 replies; 33+ messages in thread
From: Darrick J. Wong @ 2025-07-09 16:45 UTC (permalink / raw)
To: John Garry; +Cc: aalbersh, catherine.hoang, linux-xfs
On Wed, Jul 09, 2025 at 04:57:49PM +0100, John Garry wrote:
> On 01/07/2025 19:08, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> >
> > If we're using an external log device and the caller doesn't give us a
> > lsunit, use the block device geometry (if present) to set it.
>
> this seems fine, but I am not intimately familiar with the code. So, FWIW:
>
> Reviewed-by: John Garry <john.g.garry@oracle.com>
>
> There is a small question below, though.
>
> >
> > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > ---
> > mkfs/xfs_mkfs.c | 4 ++++
> > 1 file changed, 4 insertions(+)
> >
> >
> > diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
> > index 8b946f3ef817da..6c8cc715d3476b 100644
> > --- a/mkfs/xfs_mkfs.c
> > +++ b/mkfs/xfs_mkfs.c
>
> nit: maybe the comment on method of calculation (not shown, but begins
> "check the log sunit is modulo ...") could be updated
Ok.
/*
* check that log sunit is modulo fsblksize; default it to dsunit for
* an internal log; or the log device stripe unit if it's external.
*/
Thanks for the review.
--D
> > @@ -3624,6 +3624,10 @@ _("log stripe unit (%d) must be a multiple of the block size (%d)\n"),
> > cfg->loginternal && cfg->dsunit) {
> > /* lsunit and dsunit now in fs blocks */
> > cfg->lsunit = cfg->dsunit;
> > + } else if (cfg->sb_feat.log_version == 2 &&
> > + !cfg->loginternal) {
> > + /* use the external log device properties */
> > + cfg->lsunit = DTOBT(ft->log.sunit, cfg->blocklog);
> > }
> > if (cfg->sb_feat.log_version == 2 &&
> >
>
>
^ permalink raw reply [flat|nested] 33+ messages in thread
* [PATCH 6/7] mkfs: try to align AG size based on atomic write capabilities
2025-07-01 18:05 ` [PATCHSET 2/3] xfsprogs: atomic writes Darrick J. Wong
` (4 preceding siblings ...)
2025-07-01 18:08 ` [PATCH 5/7] mkfs: autodetect log stripe unit for external log devices Darrick J. Wong
@ 2025-07-01 18:08 ` Darrick J. Wong
2025-07-02 9:03 ` John Garry
2025-07-01 18:08 ` [PATCH 7/7] mkfs: allow users to configure the desired maximum atomic write size Darrick J. Wong
6 siblings, 1 reply; 33+ messages in thread
From: Darrick J. Wong @ 2025-07-01 18:08 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: catherine.hoang, john.g.garry, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Try to align the AG size to the maximum hardware atomic write unit so
that we can give users maximum flexibility in choosing an RWF_ATOMIC
write size.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
libxfs/topology.h | 6 ++++--
libxfs/topology.c | 36 ++++++++++++++++++++++++++++++++++++
mkfs/xfs_mkfs.c | 48 +++++++++++++++++++++++++++++++++++++++++++-----
3 files changed, 83 insertions(+), 7 deletions(-)
diff --git a/libxfs/topology.h b/libxfs/topology.h
index 207a8a7f150556..f0ca65f3576e92 100644
--- a/libxfs/topology.h
+++ b/libxfs/topology.h
@@ -13,8 +13,10 @@
struct device_topology {
int logical_sector_size; /* logical sector size */
int physical_sector_size; /* physical sector size */
- int sunit; /* stripe unit */
- int swidth; /* stripe width */
+ int sunit; /* stripe unit */
+ int swidth; /* stripe width */
+ int awu_min; /* min atomic write unit in bbcounts */
+ int awu_max; /* max atomic write unit in bbcounts */
};
struct fs_topology {
diff --git a/libxfs/topology.c b/libxfs/topology.c
index 96ee74b61b30f5..7764687beac000 100644
--- a/libxfs/topology.c
+++ b/libxfs/topology.c
@@ -4,11 +4,18 @@
* All Rights Reserved.
*/
+#ifdef OVERRIDE_SYSTEM_STATX
+#define statx sys_statx
+#endif
+#include <fcntl.h>
+#include <sys/stat.h>
+
#include "libxfs_priv.h"
#include "libxcmd.h"
#include <blkid/blkid.h>
#include "xfs_multidisk.h"
#include "libfrog/platform.h"
+#include "libfrog/statx.h"
#define TERABYTES(count, blog) ((uint64_t)(count) << (40 - (blog)))
#define GIGABYTES(count, blog) ((uint64_t)(count) << (30 - (blog)))
@@ -278,6 +285,34 @@ blkid_get_topology(
device);
}
+static void
+get_hw_atomic_writes_topology(
+ struct libxfs_dev *dev,
+ struct device_topology *dt)
+{
+ struct statx sx;
+ int fd;
+ int ret;
+
+ fd = open(dev->name, O_RDONLY);
+ if (fd < 0)
+ return;
+
+ ret = statx(fd, "", AT_EMPTY_PATH, STATX_WRITE_ATOMIC, &sx);
+ if (ret)
+ goto out_close;
+
+ if (!(sx.stx_mask & STATX_WRITE_ATOMIC))
+ goto out_close;
+
+ dt->awu_min = sx.stx_atomic_write_unit_min >> 9;
+ dt->awu_max = max(sx.stx_atomic_write_unit_max_opt,
+ sx.stx_atomic_write_unit_max) >> 9;
+
+out_close:
+ close(fd);
+}
+
static void
get_device_topology(
struct libxfs_dev *dev,
@@ -316,6 +351,7 @@ get_device_topology(
}
} else {
blkid_get_topology(dev->name, dt, force_overwrite);
+ get_hw_atomic_writes_topology(dev, dt);
}
ASSERT(dt->logical_sector_size);
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index 6c8cc715d3476b..7d3e9dd567b7b2 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -3379,6 +3379,32 @@ _("illegal CoW extent size hint %lld, must be less than %u and a multiple of %u.
}
}
+static void
+validate_device_awu(
+ struct mkfs_params *cfg,
+ struct device_topology *dt)
+{
+ /* Ignore hw atomic write capability if it can't do even 1 fsblock */
+ if (BBTOB(dt->awu_min) > cfg->blocksize ||
+ BBTOB(dt->awu_max) < cfg->blocksize) {
+ dt->awu_min = 0;
+ dt->awu_max = 0;
+ }
+}
+
+static void
+validate_hw_atomic_writes(
+ struct mkfs_params *cfg,
+ struct cli_params *cli,
+ struct fs_topology *ft)
+{
+ validate_device_awu(cfg, &ft->data);
+ if (cli->xi->log.name)
+ validate_device_awu(cfg, &ft->log);
+ if (cli->xi->rt.name)
+ validate_device_awu(cfg, &ft->rt);
+}
+
/* Complain if this filesystem is not a supported configuration. */
static void
validate_supported(
@@ -4051,10 +4077,20 @@ _("agsize (%s) not a multiple of fs blk size (%d)\n"),
*/
static void
align_ag_geometry(
- struct mkfs_params *cfg)
+ struct mkfs_params *cfg,
+ struct fs_topology *ft)
{
- uint64_t tmp_agsize;
- int dsunit = cfg->dsunit;
+ uint64_t tmp_agsize;
+ int dsunit = cfg->dsunit;
+
+ /*
+ * We've already validated (or discarded) the hardware atomic write
+ * geometry. Try to align the agsize to the maximum atomic write unit
+ * to give users maximum flexibility in choosing atomic write sizes.
+ */
+ if (ft->data.awu_max > 0)
+ dsunit = max(DTOBT(ft->data.awu_max, cfg->blocklog),
+ dsunit);
if (!dsunit)
goto validate;
@@ -4110,7 +4146,8 @@ _("agsize rounded to %lld, sunit = %d\n"),
(long long)cfg->agsize, dsunit);
}
- if ((cfg->agsize % cfg->dswidth) == 0 &&
+ if (cfg->dswidth > 0 &&
+ (cfg->agsize % cfg->dswidth) == 0 &&
cfg->dswidth != cfg->dsunit &&
cfg->agcount > 1) {
@@ -5874,6 +5911,7 @@ main(
cfg.rtblocks = calc_dev_size(cli.rtsize, &cfg, &ropts, R_SIZE, "rt");
validate_rtextsize(&cfg, &cli, &ft);
+ validate_hw_atomic_writes(&cfg, &cli, &ft);
/*
* Open and validate the device configurations
@@ -5892,7 +5930,7 @@ main(
* aligns to device geometry correctly.
*/
calculate_initial_ag_geometry(&cfg, &cli, &xi);
- align_ag_geometry(&cfg);
+ align_ag_geometry(&cfg, &ft);
if (cfg.sb_feat.zoned)
calculate_zone_geometry(&cfg, &cli, &xi, &zt);
else
^ permalink raw reply related [flat|nested] 33+ messages in thread
* Re: [PATCH 6/7] mkfs: try to align AG size based on atomic write capabilities
2025-07-01 18:08 ` [PATCH 6/7] mkfs: try to align AG size based on atomic write capabilities Darrick J. Wong
@ 2025-07-02 9:03 ` John Garry
2025-07-02 19:00 ` Darrick J. Wong
0 siblings, 1 reply; 33+ messages in thread
From: John Garry @ 2025-07-02 9:03 UTC (permalink / raw)
To: Darrick J. Wong, aalbersh; +Cc: catherine.hoang, linux-xfs
On 01/07/2025 19:08, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Try to align the AG size to the maximum hardware atomic write unit so
> that we can give users maximum flexibility in choosing an RWF_ATOMIC
> write size.
Regardless of comments below, FWIW:
Reviewed-by: John Garry <john.g.garry@oracle.com>
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> ---
> libxfs/topology.h | 6 ++++--
> libxfs/topology.c | 36 ++++++++++++++++++++++++++++++++++++
> mkfs/xfs_mkfs.c | 48 +++++++++++++++++++++++++++++++++++++++++++-----
> 3 files changed, 83 insertions(+), 7 deletions(-)
>
>
> diff --git a/libxfs/topology.h b/libxfs/topology.h
> index 207a8a7f150556..f0ca65f3576e92 100644
> --- a/libxfs/topology.h
> +++ b/libxfs/topology.h
> @@ -13,8 +13,10 @@
> struct device_topology {
> int logical_sector_size; /* logical sector size */
> int physical_sector_size; /* physical sector size */
> - int sunit; /* stripe unit */
> - int swidth; /* stripe width */
> + int sunit; /* stripe unit */
> + int swidth; /* stripe width */
> + int awu_min; /* min atomic write unit in bbcounts */
awu_min is not really used, but, like the kernel code does, I suppose
it is useful to store it
> + int awu_max; /* max atomic write unit in bbcounts */
> };
>
> struct fs_topology {
> diff --git a/libxfs/topology.c b/libxfs/topology.c
> index 96ee74b61b30f5..7764687beac000 100644
> --- a/libxfs/topology.c
> +++ b/libxfs/topology.c
> @@ -4,11 +4,18 @@
> * All Rights Reserved.
> */
>
> +#ifdef OVERRIDE_SYSTEM_STATX
> +#define statx sys_statx
> +#endif
> +#include <fcntl.h>
> +#include <sys/stat.h>
> +
> #include "libxfs_priv.h"
> #include "libxcmd.h"
> #include <blkid/blkid.h>
> #include "xfs_multidisk.h"
> #include "libfrog/platform.h"
> +#include "libfrog/statx.h"
>
> #define TERABYTES(count, blog) ((uint64_t)(count) << (40 - (blog)))
> #define GIGABYTES(count, blog) ((uint64_t)(count) << (30 - (blog)))
> @@ -278,6 +285,34 @@ blkid_get_topology(
> device);
> }
>
> +static void
> +get_hw_atomic_writes_topology(
> + struct libxfs_dev *dev,
> + struct device_topology *dt)
> +{
> + struct statx sx;
> + int fd;
> + int ret;
> +
> + fd = open(dev->name, O_RDONLY);
> + if (fd < 0)
> + return;
> +
> + ret = statx(fd, "", AT_EMPTY_PATH, STATX_WRITE_ATOMIC, &sx);
> + if (ret)
> + goto out_close;
> +
> + if (!(sx.stx_mask & STATX_WRITE_ATOMIC))
> + goto out_close;
> +
> + dt->awu_min = sx.stx_atomic_write_unit_min >> 9;
> + dt->awu_max = max(sx.stx_atomic_write_unit_max_opt,
> + sx.stx_atomic_write_unit_max) >> 9;
for a bdev, stx_atomic_write_unit_max_opt should be zero
However, I suppose some bdev could have hybrid atomic write support,
just like xfs, so what you are doing looks good
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 6/7] mkfs: try to align AG size based on atomic write capabilities
2025-07-02 9:03 ` John Garry
@ 2025-07-02 19:00 ` Darrick J. Wong
0 siblings, 0 replies; 33+ messages in thread
From: Darrick J. Wong @ 2025-07-02 19:00 UTC (permalink / raw)
To: John Garry; +Cc: aalbersh, catherine.hoang, linux-xfs
On Wed, Jul 02, 2025 at 10:03:54AM +0100, John Garry wrote:
> On 01/07/2025 19:08, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> >
> > Try to align the AG size to the maximum hardware atomic write unit so
> > that we can give users maximum flexibility in choosing an RWF_ATOMIC
> > write size.
>
>
> Regardless of comments below, FWIW:
>
> Reviewed-by: John Garry <john.g.garry@oracle.com>
>
>
> >
> > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > ---
> > libxfs/topology.h | 6 ++++--
> > libxfs/topology.c | 36 ++++++++++++++++++++++++++++++++++++
> > mkfs/xfs_mkfs.c | 48 +++++++++++++++++++++++++++++++++++++++++++-----
> > 3 files changed, 83 insertions(+), 7 deletions(-)
> >
> >
> > diff --git a/libxfs/topology.h b/libxfs/topology.h
> > index 207a8a7f150556..f0ca65f3576e92 100644
> > --- a/libxfs/topology.h
> > +++ b/libxfs/topology.h
> > @@ -13,8 +13,10 @@
> > struct device_topology {
> > int logical_sector_size; /* logical sector size */
> > int physical_sector_size; /* physical sector size */
> > - int sunit; /* stripe unit */
> > - int swidth; /* stripe width */
> > + int sunit; /* stripe unit */
> > + int swidth; /* stripe width */
> > + int awu_min; /* min atomic write unit in bbcounts */
>
> awu_min is not really used, but, like the kernel code does, I suppose it is
> useful to store it
>
> > + int awu_max; /* max atomic write unit in bbcounts */
> > };
> > struct fs_topology {
> > diff --git a/libxfs/topology.c b/libxfs/topology.c
> > index 96ee74b61b30f5..7764687beac000 100644
> > --- a/libxfs/topology.c
> > +++ b/libxfs/topology.c
> > @@ -4,11 +4,18 @@
> > * All Rights Reserved.
> > */
> > +#ifdef OVERRIDE_SYSTEM_STATX
> > +#define statx sys_statx
> > +#endif
> > +#include <fcntl.h>
> > +#include <sys/stat.h>
> > +
> > #include "libxfs_priv.h"
> > #include "libxcmd.h"
> > #include <blkid/blkid.h>
> > #include "xfs_multidisk.h"
> > #include "libfrog/platform.h"
> > +#include "libfrog/statx.h"
> > #define TERABYTES(count, blog) ((uint64_t)(count) << (40 - (blog)))
> > #define GIGABYTES(count, blog) ((uint64_t)(count) << (30 - (blog)))
> > @@ -278,6 +285,34 @@ blkid_get_topology(
> > device);
> > }
> > +static void
> > +get_hw_atomic_writes_topology(
> > + struct libxfs_dev *dev,
> > + struct device_topology *dt)
> > +{
> > + struct statx sx;
> > + int fd;
> > + int ret;
> > +
> > + fd = open(dev->name, O_RDONLY);
> > + if (fd < 0)
> > + return;
> > +
> > + ret = statx(fd, "", AT_EMPTY_PATH, STATX_WRITE_ATOMIC, &sx);
> > + if (ret)
> > + goto out_close;
> > +
> > + if (!(sx.stx_mask & STATX_WRITE_ATOMIC))
> > + goto out_close;
> > +
> > + dt->awu_min = sx.stx_atomic_write_unit_min >> 9;
> > + dt->awu_max = max(sx.stx_atomic_write_unit_max_opt,
> > + sx.stx_atomic_write_unit_max) >> 9;
>
> for a bdev, stx_atomic_write_unit_max_opt should be zero
>
> However, I suppose some bdev could have hybrid atomic write support, just
> like xfs, so what you are doing looks good
Yeah, it's unlikely ever to happen but if (say) you had a loop device
backed by an xfs file then maybe it'd be useful to pass through both atomic
write maxima.
Thanks for the review.
--D
^ permalink raw reply [flat|nested] 33+ messages in thread
* [PATCH 7/7] mkfs: allow users to configure the desired maximum atomic write size
2025-07-01 18:05 ` [PATCHSET 2/3] xfsprogs: atomic writes Darrick J. Wong
` (5 preceding siblings ...)
2025-07-01 18:08 ` [PATCH 6/7] mkfs: try to align AG size based on atomic write capabilities Darrick J. Wong
@ 2025-07-01 18:08 ` Darrick J. Wong
2025-07-02 8:50 ` John Garry
6 siblings, 1 reply; 33+ messages in thread
From: Darrick J. Wong @ 2025-07-01 18:08 UTC (permalink / raw)
To: djwong, aalbersh; +Cc: catherine.hoang, john.g.garry, linux-xfs
From: Darrick J. Wong <djwong@kernel.org>
Allow callers of mkfs.xfs to specify a desired maximum atomic write
size. This value will cause the log size to be adjusted to support
software atomic writes, and the AG size to be aligned to support
hardware atomic writes.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
include/bitops.h | 12 +++
libxfs/libxfs_api_defs.h | 1
man/man8/mkfs.xfs.8.in | 7 ++
mkfs/xfs_mkfs.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 213 insertions(+), 1 deletion(-)
diff --git a/include/bitops.h b/include/bitops.h
index 1f1adceccf5d2b..d0c55827044e54 100644
--- a/include/bitops.h
+++ b/include/bitops.h
@@ -113,4 +113,16 @@ static inline int lowbit64(uint64_t v)
return n - 1;
}
+/**
+ * __rounddown_pow_of_two() - round down to nearest power of two
+ * @n: value to round down
+ */
+static inline __attribute__((const))
+unsigned long __rounddown_pow_of_two(unsigned long n)
+{
+ return 1UL << (fls_long(n) - 1);
+}
+
+#define rounddown_pow_of_two(n) __rounddown_pow_of_two(n)
+
#endif
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 4bd02c57b496e6..fe00e19bada9d8 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -107,6 +107,7 @@
#define xfs_buftarg_drain libxfs_buftarg_drain
#define xfs_bunmapi libxfs_bunmapi
#define xfs_bwrite libxfs_bwrite
+#define xfs_calc_atomic_write_log_geometry libxfs_calc_atomic_write_log_geometry
#define xfs_calc_dquots_per_chunk libxfs_calc_dquots_per_chunk
#define xfs_calc_finish_bui_reservation libxfs_calc_finish_bui_reservation
#define xfs_calc_finish_cui_reservation libxfs_calc_finish_cui_reservation
diff --git a/man/man8/mkfs.xfs.8.in b/man/man8/mkfs.xfs.8.in
index bc80493187f6f9..5f59d4b2da6e02 100644
--- a/man/man8/mkfs.xfs.8.in
+++ b/man/man8/mkfs.xfs.8.in
@@ -742,6 +742,13 @@ .SH OPTIONS
directories, symbolic links, and realtime metadata files.
This feature is disabled by default.
This feature is only available for filesystems formatted with -m crc=1.
+.TP
+.BI max_atomic_write[= value]
+When enabled, application programs can use the RWF_ATOMIC write flag to
+persist changes of up to this size without tearing.
+The default is chosen to allow a reasonable amount of scalability.
+This value must also be passed via mount option.
+This feature is only available for filesystems formatted with reflink.
.RE
.PP
.PD 0
diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c
index 7d3e9dd567b7b2..fc769df22357a6 100644
--- a/mkfs/xfs_mkfs.c
+++ b/mkfs/xfs_mkfs.c
@@ -94,6 +94,7 @@ enum {
I_SPINODES,
I_NREXT64,
I_EXCHANGE,
+ I_MAX_ATOMIC_WRITE,
I_MAX_OPTS,
};
@@ -489,6 +490,7 @@ static struct opt_params iopts = {
[I_SPINODES] = "sparse",
[I_NREXT64] = "nrext64",
[I_EXCHANGE] = "exchange",
+ [I_MAX_ATOMIC_WRITE] = "max_atomic_write",
[I_MAX_OPTS] = NULL,
},
.subopt_params = {
@@ -550,6 +552,13 @@ static struct opt_params iopts = {
.maxval = 1,
.defaultval = 1,
},
+ { .index = I_MAX_ATOMIC_WRITE,
+ .conflicts = { { NULL, LAST_CONFLICT } },
+ .convert = true,
+ .minval = 1,
+ .maxval = 1ULL << 30, /* 1GiB */
+ .defaultval = SUBOPT_NEEDS_VAL,
+ },
},
};
@@ -1069,6 +1078,7 @@ struct cli_params {
char *rtsize;
char *rtstart;
uint64_t rtreserved;
+ char *max_atomic_write;
/* parameters where 0 is a valid CLI value */
int dsunit;
@@ -1157,6 +1167,8 @@ struct mkfs_params {
struct sb_feat_args sb_feat;
uint64_t rtstart;
uint64_t rtreserved;
+
+ uint64_t max_atomic_write;
};
/*
@@ -1197,7 +1209,7 @@ usage( void )
/* force overwrite */ [-f]\n\
/* inode size */ [-i perblock=n|size=num,maxpct=n,attr=0|1|2,\n\
projid32bit=0|1,sparse=0|1,nrext64=0|1,\n\
- exchange=0|1]\n\
+ exchange=0|1,max_atomic_write=n]\n\
/* no discard */ [-K]\n\
/* log subvol */ [-l agnum=n,internal,size=num,logdev=xxx,version=n\n\
sunit=value|su=num,sectsize=num,lazy-count=0|1,\n\
@@ -1927,6 +1939,9 @@ inode_opts_parser(
case I_EXCHANGE:
cli->sb_feat.exchrange = getnum(value, opts, subopt);
break;
+ case I_MAX_ATOMIC_WRITE:
+ cli->max_atomic_write = getstr(value, opts, subopt);
+ break;
default:
return -EINVAL;
}
@@ -4092,6 +4107,18 @@ align_ag_geometry(
dsunit = max(DTOBT(ft->data.awu_max, cfg->blocklog),
dsunit);
+ /*
+ * If the user gave us a maximum atomic write size that is less than
+ * a whole AG, try to align the AG size to that value.
+ */
+ if (cfg->max_atomic_write > 0) {
+ xfs_extlen_t max_atomic_fsbs =
+ cfg->max_atomic_write >> cfg->blocklog;
+
+ if (max_atomic_fsbs < cfg->agsize)
+ dsunit = max(dsunit, max_atomic_fsbs);
+ }
+
if (!dsunit)
goto validate;
@@ -4971,6 +4998,140 @@ calc_concurrency_logblocks(
return logblocks;
}
+#define MAX_RW_COUNT (INT_MAX & ~(getpagesize() - 1))
+
+/* Maximum atomic write IO size that the kernel allows. */
+static inline xfs_extlen_t calc_atomic_write_max(struct mkfs_params *cfg)
+{
+ return rounddown_pow_of_two(MAX_RW_COUNT >> cfg->blocklog);
+}
+
+static inline unsigned int max_pow_of_two_factor(const unsigned int nr)
+{
+ return 1 << (ffs(nr) - 1);
+}
+
+/*
+ * If the data device advertises atomic write support, limit the size of data
+ * device atomic writes to the greatest power-of-two factor of the AG size so
+ * that every atomic write unit aligns with the start of every AG. This is
+ * required so that the per-AG allocations for an atomic write will always be
+ * aligned compatibly with the alignment requirements of the storage.
+ *
+ * If the data device doesn't advertise atomic writes, then there are no
+ * alignment restrictions and the largest out-of-place write we can do
+ * ourselves is the number of blocks that user files can allocate from any AG.
+ */
+static inline xfs_extlen_t
+calc_perag_awu_max(
+ struct mkfs_params *cfg,
+ struct fs_topology *ft)
+{
+ if (ft->data.awu_min > 0)
+ return max_pow_of_two_factor(cfg->agsize);
+ return cfg->agsize;
+}
+
+/*
+ * Reflink on the realtime device requires rtgroups, and atomic writes require
+ * reflink.
+ *
+ * If the realtime device advertises atomic write support, limit the size of
+ * data device atomic writes to the greatest power-of-two factor of the rtgroup
+ * size so that every atomic write unit aligns with the start of every rtgroup.
+ * This is required so that the per-rtgroup allocations for an atomic write
+ * will always be aligned compatibly with the alignment requirements of the
+ * storage.
+ *
+ * If the rt device doesn't advertise atomic writes, then there are no
+ * alignment restrictions and the largest out-of-place write we can do
+ * ourselves is the number of blocks that user files can allocate from any
+ * rtgroup.
+ */
+static inline xfs_extlen_t
+calc_rtgroup_awu_max(
+ struct mkfs_params *cfg,
+ struct fs_topology *ft)
+{
+ if (ft->rt.awu_min > 0)
+ return max_pow_of_two_factor(cfg->rgsize);
+ return cfg->rgsize;
+}
+
+/*
+ * Validate the maximum atomic out of place write size passed in by the user.
+ */
+static void
+validate_max_atomic_write(
+ struct mkfs_params *cfg,
+ struct cli_params *cli,
+ struct fs_topology *ft,
+ struct xfs_mount *mp)
+{
+ const xfs_extlen_t max_write = calc_atomic_write_max(cfg);
+ xfs_filblks_t max_atomic_fsbcount;
+
+ cfg->max_atomic_write = getnum(cli->max_atomic_write, &iopts,
+ I_MAX_ATOMIC_WRITE);
+ max_atomic_fsbcount = cfg->max_atomic_write >> cfg->blocklog;
+
+ /* generic_atomic_write_valid enforces power of two length */
+ if (!is_power_of_2(cfg->max_atomic_write)) {
+ fprintf(stderr,
+ _("Max atomic write size of %llu bytes is not a power of 2\n"),
+ (unsigned long long)cfg->max_atomic_write);
+ exit(1);
+ }
+
+ if (cfg->max_atomic_write % cfg->blocksize) {
+ fprintf(stderr,
+ _("Max atomic write size of %llu bytes not aligned with fsblock.\n"),
+ (unsigned long long)cfg->max_atomic_write);
+ exit(1);
+ }
+
+ if (max_atomic_fsbcount > max_write) {
+ fprintf(stderr,
+ _("Max atomic write size of %lluk cannot be larger than max write size %lluk.\n"),
+ (unsigned long long)cfg->max_atomic_write >> 10,
+ (unsigned long long)max_write << (cfg->blocklog - 10));
+ exit(1);
+ }
+}
+
+/*
+ * Validate the maximum atomic out of place write size passed in by the user
+ * actually works with the allocation groups sizes.
+ */
+static void
+validate_max_atomic_write_ags(
+ struct mkfs_params *cfg,
+ struct fs_topology *ft,
+ struct xfs_mount *mp)
+{
+ const xfs_extlen_t max_group = max(cfg->agsize, cfg->rgsize);
+ const xfs_extlen_t max_group_write =
+ max(calc_perag_awu_max(cfg, ft), calc_rtgroup_awu_max(cfg, ft));
+ xfs_filblks_t max_atomic_fsbcount =
+ XFS_B_TO_FSBT(mp, cfg->max_atomic_write);
+
+ if (max_atomic_fsbcount > max_group) {
+ fprintf(stderr,
+ _("Max atomic write size of %lluk cannot be larger than allocation group size %lluk.\n"),
+ (unsigned long long)cfg->max_atomic_write >> 10,
+ (unsigned long long)XFS_FSB_TO_B(mp, max_group) >> 10);
+ exit(1);
+ }
+
+ if (max_atomic_fsbcount > max_group_write) {
+ fprintf(stderr,
+ _("Max atomic write size of %lluk cannot be larger than max allocation group write size %lluk.\n"),
+ (unsigned long long)cfg->max_atomic_write >> 10,
+ (unsigned long long)XFS_FSB_TO_B(mp, max_group_write) >> 10);
+ exit(1);
+ }
+}
+
static void
calculate_log_size(
struct mkfs_params *cfg,
@@ -4996,6 +5157,22 @@ calculate_log_size(
libxfs_log_get_max_trans_res(&mount, &res);
max_tx_bytes = res.tr_logres * res.tr_logcount;
}
+ if (cfg->max_atomic_write > 0) {
+ unsigned int dontcare;
+ xfs_extlen_t atomic_min_logblocks =
+ libxfs_calc_atomic_write_log_geometry(&mount,
+ cfg->max_atomic_write >> cfg->blocklog,
+ &dontcare);
+
+ if (!atomic_min_logblocks) {
+ fprintf(stderr,
+ _("atomic write size %lluk is too big for the log to handle.\n"),
+ (unsigned long long)cfg->max_atomic_write >> 10);
+ exit(1);
+ }
+
+ min_logblocks = max(min_logblocks, atomic_min_logblocks);
+ }
libxfs_umount(&mount);
ASSERT(min_logblocks);
@@ -5923,6 +6100,13 @@ main(
validate_rtdev(&cfg, &cli, &zt);
calc_stripe_factors(&cfg, &cli, &ft);
+ /*
+ * Now that we have basic geometry set up, we can validate the CLI
+ * max atomic write parameter.
+ */
+ if (cli.max_atomic_write)
+ validate_max_atomic_write(&cfg, &cli, &ft, mp);
+
/*
* At this point when know exactly what size all the devices are,
* so we can start validating and calculating layout options that are
@@ -5946,6 +6130,14 @@ main(
start_superblock_setup(&cfg, mp, sbp);
initialise_mount(mp, sbp);
+ /*
+ * Now that we have computed the allocation group geometry, we can
+ * continue validating the maximum software atomic write parameter, if
+ * one was given.
+ */
+ if (cfg.max_atomic_write)
+ validate_max_atomic_write_ags(&cfg, &ft, mp);
+
/*
* With the mount set up, we can finally calculate the log size
* constraints and do default size calculations and final validation
^ permalink raw reply related [flat|nested] 33+ messages in thread
* Re: [PATCH 7/7] mkfs: allow users to configure the desired maximum atomic write size
2025-07-01 18:08 ` [PATCH 7/7] mkfs: allow users to configure the desired maximum atomic write size Darrick J. Wong
@ 2025-07-02 8:50 ` John Garry
2025-07-02 19:01 ` Darrick J. Wong
0 siblings, 1 reply; 33+ messages in thread
From: John Garry @ 2025-07-02 8:50 UTC (permalink / raw)
To: Darrick J. Wong, aalbersh; +Cc: catherine.hoang, linux-xfs
On 01/07/2025 19:08, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
>
> Allow callers of mkfs.xfs to specify a desired maximum atomic write
> size. This value will cause the log size to be adjusted to support
> software atomic writes, and the AG size to be aligned to support
> hardware atomic writes.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
thanks, regardless of comments below, FWIW:
Reviewed-by: John Garry <john.g.garry@oracle.com>
> goto validate;
>
> @@ -4971,6 +4998,140 @@ calc_concurrency_logblocks(
> return logblocks;
> }
>
> +#define MAX_RW_COUNT (INT_MAX & ~(getpagesize() - 1))
> +
> +/* Maximum atomic write IO size that the kernel allows. */
FWIW, statx atomic write unit max is a 32b value, so we get a 2GB limit
just from that factor
> +static inline xfs_extlen_t calc_atomic_write_max(struct mkfs_params *cfg)
> +{
> + return rounddown_pow_of_two(MAX_RW_COUNT >> cfg->blocklog);
> +}
> +
> +static inline unsigned int max_pow_of_two_factor(const unsigned int nr)
> +{
> + return 1 << (ffs(nr) - 1);
> +}
> +
> +/*
> + * If the data device advertises atomic write support, limit the size of data
> + * device atomic writes to the greatest power-of-two factor of the AG size so
> + * that every atomic write unit aligns with the start of every AG. This is
> + * required so that the per-AG allocations for an atomic write will always be
> + * aligned compatibly with the alignment requirements of the storage.
> + *
> + * If the data device doesn't advertise atomic writes, then there are no
> + * alignment restrictions and the largest out-of-place write we can do
> + * ourselves is the number of blocks that user files can allocate from any AG.
> + */
> +static inline xfs_extlen_t
> +calc_perag_awu_max(
> + struct mkfs_params *cfg,
> + struct fs_topology *ft)
> +{
> + if (ft->data.awu_min > 0)
> + return max_pow_of_two_factor(cfg->agsize);
> + return cfg->agsize;
out of curiosity, for out-of-place atomic writes, is there anything to
stop the blocks being allocated across multiple AGs?
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH 7/7] mkfs: allow users to configure the desired maximum atomic write size
2025-07-02 8:50 ` John Garry
@ 2025-07-02 19:01 ` Darrick J. Wong
0 siblings, 0 replies; 33+ messages in thread
From: Darrick J. Wong @ 2025-07-02 19:01 UTC (permalink / raw)
To: John Garry; +Cc: aalbersh, catherine.hoang, linux-xfs
On Wed, Jul 02, 2025 at 09:50:04AM +0100, John Garry wrote:
> On 01/07/2025 19:08, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> >
> > Allow callers of mkfs.xfs to specify a desired maximum atomic write
> > size. This value will cause the log size to be adjusted to support
> > software atomic writes, and the AG size to be aligned to support
> > hardware atomic writes.
> >
> > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
>
> thanks, regardless of comments below, FWIW:
>
> Reviewed-by: John Garry <john.g.garry@oracle.com>
>
> > goto validate;
> > @@ -4971,6 +4998,140 @@ calc_concurrency_logblocks(
> > return logblocks;
> > }
> > +#define MAX_RW_COUNT (INT_MAX & ~(getpagesize() - 1))
> > +
> > +/* Maximum atomic write IO size that the kernel allows. */
>
> FWIW, statx atomic write unit max is a 32b value, so we get a 2GB limit just
> from that factor
<nod> But we might as well mirror the kernel's calculations...
> > +static inline xfs_extlen_t calc_atomic_write_max(struct mkfs_params *cfg)
> > +{
> > + return rounddown_pow_of_two(MAX_RW_COUNT >> cfg->blocklog);
> > +}
> > +
> > +static inline unsigned int max_pow_of_two_factor(const unsigned int nr)
> > +{
> > + return 1 << (ffs(nr) - 1);
> > +}
> > +
> > +/*
> > + * If the data device advertises atomic write support, limit the size of data
> > + * device atomic writes to the greatest power-of-two factor of the AG size so
> > + * that every atomic write unit aligns with the start of every AG. This is
> > + * required so that the per-AG allocations for an atomic write will always be
> > + * aligned compatibly with the alignment requirements of the storage.
> > + *
> > + * If the data device doesn't advertise atomic writes, then there are no
> > + * alignment restrictions and the largest out-of-place write we can do
> > + * ourselves is the number of blocks that user files can allocate from any AG.
> > + */
> > +static inline xfs_extlen_t
> > +calc_perag_awu_max(
> > + struct mkfs_params *cfg,
> > + struct fs_topology *ft)
> > +{
> > + if (ft->data.awu_min > 0)
> > + return max_pow_of_two_factor(cfg->agsize);
> > + return cfg->agsize;
>
> out of curiosity, for out-of-place atomic writes, is there anything to stop
> the blocks being allocated across multiple AGs?
Nope. But they'll at least get the software fallback, same as if they
were writing to a severely fragmented filesystem.
--D
^ permalink raw reply [flat|nested] 33+ messages in thread