* [PATCH 01/43] xfs: constify feature checks
From: Christoph Hellwig @ 2024-12-11 8:54 UTC
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
We'll need to call them on a const structure in growfs in a bit.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_rtgroup.c | 2 +-
fs/xfs/scrub/scrub.h | 2 +-
fs/xfs/xfs_mount.h | 10 +++++-----
3 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index a6468e591232..d84d32f1b48f 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -338,7 +338,7 @@ struct xfs_rtginode_ops {
unsigned int fmt_mask; /* all valid data fork formats */
/* Does the fs have this feature? */
- bool (*enabled)(struct xfs_mount *mp);
+ bool (*enabled)(const struct xfs_mount *mp);
/* Create this rtgroup metadata inode and initialize it. */
int (*create)(struct xfs_rtgroup *rtg,
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index a1086f1f06d0..a3f1abc91390 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -96,7 +96,7 @@ struct xchk_meta_ops {
int (*repair_eval)(struct xfs_scrub *sc);
/* Decide if we even have this piece of metadata. */
- bool (*has)(struct xfs_mount *);
+ bool (*has)(const struct xfs_mount *);
/* type describing required/allowed inputs */
enum xchk_type type;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9a1516080e63..fbed172d6770 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -357,7 +357,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_NOUUID (1ULL << 63) /* ignore uuid during mount */
#define __XFS_HAS_FEAT(name, NAME) \
-static inline bool xfs_has_ ## name (struct xfs_mount *mp) \
+static inline bool xfs_has_ ## name (const struct xfs_mount *mp) \
{ \
return mp->m_features & XFS_FEAT_ ## NAME; \
}
@@ -393,25 +393,25 @@ __XFS_HAS_FEAT(large_extent_counts, NREXT64)
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
__XFS_HAS_FEAT(metadir, METADIR)
-static inline bool xfs_has_rtgroups(struct xfs_mount *mp)
+static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
{
/* all metadir file systems also allow rtgroups */
return xfs_has_metadir(mp);
}
-static inline bool xfs_has_rtsb(struct xfs_mount *mp)
+static inline bool xfs_has_rtsb(const struct xfs_mount *mp)
{
/* all rtgroups filesystems with an rt section have an rtsb */
return xfs_has_rtgroups(mp) && xfs_has_realtime(mp);
}
-static inline bool xfs_has_rtrmapbt(struct xfs_mount *mp)
+static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp)
{
return xfs_has_rtgroups(mp) && xfs_has_realtime(mp) &&
xfs_has_rmapbt(mp);
}
-static inline bool xfs_has_rtreflink(struct xfs_mount *mp)
+static inline bool xfs_has_rtreflink(const struct xfs_mount *mp)
{
return xfs_has_metadir(mp) && xfs_has_realtime(mp) &&
xfs_has_reflink(mp);
--
2.45.2
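For context, a minimal standalone sketch of why the const qualifier matters; the structure, feature bit, and function names below are illustrative stand-ins rather than the kernel definitions:

#include <stdbool.h>

/* Illustrative stand-ins, not the kernel definitions. */
struct example_mount {
	unsigned long long	m_features;
};

#define EXAMPLE_FEAT_METADIR	(1ULL << 0)	/* made-up bit value */

static inline bool example_has_metadir(const struct example_mount *mp)
{
	return mp->m_features & EXAMPLE_FEAT_METADIR;
}

/*
 * Growfs-style code that is only handed a const pointer can call the
 * feature check directly; with a non-const parameter the call below
 * would discard the qualifier and draw a compiler warning.
 */
static bool example_wants_metadir(const struct example_mount *mp)
{
	return example_has_metadir(mp);
}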
* Re: [PATCH 01/43] xfs: constify feature checks
From: Darrick J. Wong @ 2024-12-12 20:44 UTC
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:26AM +0100, Christoph Hellwig wrote:
> We'll need to call them on a const structure in growfs in a bit.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Looks good to me,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
* [PATCH 02/43] xfs: factor out a xfs_rt_check_size helper
From: Christoph Hellwig @ 2024-12-11 8:54 UTC
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Add a helper to check that the last block of an RT device is readable,
so that the code can be shared between mount and growfs. This also adds
the mount-time overflow check to growfs and improves the error messages.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_rtalloc.c | 62 ++++++++++++++++++++++----------------------
1 file changed, 31 insertions(+), 31 deletions(-)
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index d8e6d073d64d..bc18b694db75 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1248,6 +1248,34 @@ xfs_grow_last_rtg(
mp->m_sb.sb_rgextents;
}
+/*
+ * Read in the last block of the RT device to make sure it is accessible.
+ */
+static int
+xfs_rt_check_size(
+ struct xfs_mount *mp,
+ xfs_rfsblock_t last_block)
+{
+ xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block);
+ struct xfs_buf *bp;
+ int error;
+
+ if (XFS_BB_TO_FSB(mp, daddr) != last_block) {
+ xfs_warn(mp, "RT device size overflow: %llu != %llu",
+ XFS_BB_TO_FSB(mp, daddr), last_block);
+ return -EFBIG;
+ }
+
+ error = xfs_buf_read_uncached(mp->m_rtdev_targp, daddr,
+ XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+ if (error)
+ xfs_warn(mp, "cannot read last RT device sector (%lld)",
+ last_block);
+ else
+ xfs_buf_relse(bp);
+ return error;
+}
+
/*
* Grow the realtime area of the filesystem.
*/
@@ -1259,7 +1287,6 @@ xfs_growfs_rt(
xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount;
xfs_rgnumber_t new_rgcount = 1;
xfs_rgnumber_t rgno;
- struct xfs_buf *bp;
xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize;
int error;
@@ -1302,15 +1329,10 @@ xfs_growfs_rt(
error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
if (error)
goto out_unlock;
- /*
- * Read in the last block of the device, make sure it exists.
- */
- error = xfs_buf_read_uncached(mp->m_rtdev_targp,
- XFS_FSB_TO_BB(mp, in->newblocks - 1),
- XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+
+ error = xfs_rt_check_size(mp, in->newblocks - 1);
if (error)
goto out_unlock;
- xfs_buf_relse(bp);
/*
* Calculate new parameters. These are the final values to be reached.
@@ -1444,10 +1466,6 @@ int /* error */
xfs_rtmount_init(
struct xfs_mount *mp) /* file system mount structure */
{
- struct xfs_buf *bp; /* buffer for last block of subvolume */
- xfs_daddr_t d; /* address of last block of subvolume */
- int error;
-
if (mp->m_sb.sb_rblocks == 0)
return 0;
if (mp->m_rtdev_targp == NULL) {
@@ -1458,25 +1476,7 @@ xfs_rtmount_init(
mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels);
- /*
- * Check that the realtime section is an ok size.
- */
- d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
- if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
- xfs_warn(mp, "realtime mount -- %llu != %llu",
- (unsigned long long) XFS_BB_TO_FSB(mp, d),
- (unsigned long long) mp->m_sb.sb_rblocks);
- return -EFBIG;
- }
- error = xfs_buf_read_uncached(mp->m_rtdev_targp,
- d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
- if (error) {
- xfs_warn(mp, "realtime device size check failed");
- return error;
- }
- xfs_buf_relse(bp);
- return 0;
+ return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1);
}
static int
--
2.45.2
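As an aside, the overflow check in xfs_rt_check_size() amounts to verifying that the filesystem-block to basic-block conversion round-trips. A standalone sketch with made-up shift values (the real shifts are derived from the superblock block size):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t fsblock_t;	/* filesystem block number */
typedef int64_t bbaddr_t;	/* 512-byte basic block number */

#define BLOCKLOG	12	/* assume 4k filesystem blocks */
#define BBSHIFT		9	/* 512-byte basic blocks */

static bbaddr_t fsb_to_bb(fsblock_t fsb)
{
	return (bbaddr_t)(fsb << (BLOCKLOG - BBSHIFT));
}

static fsblock_t bb_to_fsb(bbaddr_t bb)
{
	return (fsblock_t)bb >> (BLOCKLOG - BBSHIFT);
}

int main(void)
{
	fsblock_t last_block = (fsblock_t)1 << 61;

	/* The conversion must round-trip, or the device is unaddressable. */
	if (bb_to_fsb(fsb_to_bb(last_block)) != last_block)
		printf("RT device size overflow: %llu blocks\n",
		       (unsigned long long)last_block);
	return 0;
}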
* Re: [PATCH 02/43] xfs: factor out a xfs_rt_check_size helper
From: Darrick J. Wong @ 2024-12-12 21:11 UTC
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:27AM +0100, Christoph Hellwig wrote:
> Add a helper to check that the last block of an RT device is readable,
> so that the code can be shared between mount and growfs. This also adds
> the mount-time overflow check to growfs and improves the error messages.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Seems fine to me...
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
* [PATCH 03/43] xfs: add a rtg_blocks helper
From: Christoph Hellwig @ 2024-12-11 8:54 UTC
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Shortcut dereferencing the xg_block_count field in the generic group
structure.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_rtgroup.c | 2 +-
fs/xfs/libxfs/xfs_rtgroup.h | 5 +++++
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index d84d32f1b48f..97aad8967149 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -270,7 +270,7 @@ xfs_rtgroup_get_geometry(
/* Fill out form. */
memset(rgeo, 0, sizeof(*rgeo));
rgeo->rg_number = rtg_rgno(rtg);
- rgeo->rg_length = rtg_group(rtg)->xg_block_count;
+ rgeo->rg_length = rtg_blocks(rtg);
xfs_rtgroup_geom_health(rtg, rgeo);
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index de4eeb381fc9..0e1d9474ab77 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -66,6 +66,11 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg)
return rtg->rtg_group.xg_gno;
}
+static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_group.xg_block_count;
+}
+
static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg)
{
return rtg->rtg_inodes[XFS_RTGI_BITMAP];
--
2.45.2
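For illustration, a self-contained sketch of the accessor pattern the patch adds, using simplified stand-in structures rather than the kernel ones:

#include <stdint.h>

/* Simplified stand-ins for the kernel structures. */
struct group {
	uint32_t	xg_gno;
	uint32_t	xg_block_count;
};

struct rtgroup {
	struct group	rtg_group;
};

/*
 * Shortcut so call sites can write rtg_blocks(rtg) instead of reaching
 * through rtg->rtg_group.xg_block_count.
 */
static inline uint32_t rtg_blocks(const struct rtgroup *rtg)
{
	return rtg->rtg_group.xg_block_count;
}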
* Re: [PATCH 03/43] xfs: add a rtg_blocks helper
From: Darrick J. Wong @ 2024-12-12 21:12 UTC
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:28AM +0100, Christoph Hellwig wrote:
> Shortcut dereferencing the xg_block_count field in the generic group
> structure.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Looks good, though I imagine there are a few more places where you could
use this helper?
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
* Re: [PATCH 03/43] xfs: add a rtg_blocks helper
From: Christoph Hellwig @ 2024-12-13 5:00 UTC
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 01:12:25PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:54:28AM +0100, Christoph Hellwig wrote:
> > Shortcut dereferencing the xg_block_count field in the generic group
> > structure.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Looks good, though I imagine there are a few more places where you could
> use this helper?
While the zoned code uses it a lot, there are surprisingly few uses
in your baseline. But when reassuring myself of that I noticed your
recently added RT-aware failure notifier could use it, so at least one
more.
* Re: [PATCH 03/43] xfs: add a rtg_blocks helper
From: Darrick J. Wong @ 2024-12-15 18:10 UTC
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 06:00:46AM +0100, Christoph Hellwig wrote:
> On Thu, Dec 12, 2024 at 01:12:25PM -0800, Darrick J. Wong wrote:
> > On Wed, Dec 11, 2024 at 09:54:28AM +0100, Christoph Hellwig wrote:
> > > Shortcut dereferencing the xg_block_count field in the generic group
> > > structure.
> > >
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> >
> > Looks good, though I imagine there are a few more places where you could
> > use this helper?
>
> While the zoned code uses it a lot, there are surprisingly few uses
> in your baseline. But when reassuring me I noticed your recently added
> RT-aware failure notifier could use it, so at least one more.
<nod> Well at least it's an easy enough cleanup after all the dust
settles. :)
--D
* [PATCH 04/43] xfs: move xfs_bmapi_reserve_delalloc to xfs_iomap.c
From: Christoph Hellwig @ 2024-12-11 8:54 UTC
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Delalloc reservations are not supported in userspace, and thus it doesn't
make sense to share this helper with xfsprogs. Move it to xfs_iomap.c,
next to its two callers.
Note that the rest of the delalloc handling should probably eventually
also move out of xfs_bmap.c, but that will require a bit more surgery.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_bmap.c | 295 +--------------------------------------
fs/xfs/libxfs/xfs_bmap.h | 5 +-
fs/xfs/xfs_iomap.c | 279 ++++++++++++++++++++++++++++++++++++
3 files changed, 287 insertions(+), 292 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 40ad22fb808b..861945a5fce3 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -34,7 +34,6 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
-#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_health.h"
#include "xfs_bmap_item.h"
@@ -171,18 +170,16 @@ xfs_bmbt_update(
* Compute the worst-case number of indirect blocks that will be used
* for ip's delayed extent of length "len".
*/
-STATIC xfs_filblks_t
+xfs_filblks_t
xfs_bmap_worst_indlen(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_filblks_t len) /* delayed extent length */
+ struct xfs_inode *ip, /* incore inode pointer */
+ xfs_filblks_t len) /* delayed extent length */
{
- int level; /* btree level number */
- int maxrecs; /* maximum record count at this level */
- xfs_mount_t *mp; /* mount structure */
- xfs_filblks_t rval; /* return value */
+ struct xfs_mount *mp = ip->i_mount;
+ int maxrecs = mp->m_bmap_dmxr[0];
+ int level;
+ xfs_filblks_t rval;
- mp = ip->i_mount;
- maxrecs = mp->m_bmap_dmxr[0];
for (level = 0, rval = 0;
level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
level++) {
@@ -2571,146 +2568,6 @@ xfs_bmap_add_extent_unwritten_real(
#undef PREV
}
-/*
- * Convert a hole to a delayed allocation.
- */
-STATIC void
-xfs_bmap_add_extent_hole_delay(
- xfs_inode_t *ip, /* incore inode pointer */
- int whichfork,
- struct xfs_iext_cursor *icur,
- xfs_bmbt_irec_t *new) /* new data to add to file extents */
-{
- struct xfs_ifork *ifp; /* inode fork pointer */
- xfs_bmbt_irec_t left; /* left neighbor extent entry */
- xfs_filblks_t newlen=0; /* new indirect size */
- xfs_filblks_t oldlen=0; /* old indirect size */
- xfs_bmbt_irec_t right; /* right neighbor extent entry */
- uint32_t state = xfs_bmap_fork_to_state(whichfork);
- xfs_filblks_t temp; /* temp for indirect calculations */
-
- ifp = xfs_ifork_ptr(ip, whichfork);
- ASSERT(isnullstartblock(new->br_startblock));
-
- /*
- * Check and set flags if this segment has a left neighbor
- */
- if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
- state |= BMAP_LEFT_VALID;
- if (isnullstartblock(left.br_startblock))
- state |= BMAP_LEFT_DELAY;
- }
-
- /*
- * Check and set flags if the current (right) segment exists.
- * If it doesn't exist, we're converting the hole at end-of-file.
- */
- if (xfs_iext_get_extent(ifp, icur, &right)) {
- state |= BMAP_RIGHT_VALID;
- if (isnullstartblock(right.br_startblock))
- state |= BMAP_RIGHT_DELAY;
- }
-
- /*
- * Set contiguity flags on the left and right neighbors.
- * Don't let extents get too large, even if the pieces are contiguous.
- */
- if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
- left.br_startoff + left.br_blockcount == new->br_startoff &&
- left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
- state |= BMAP_LEFT_CONTIG;
-
- if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
- new->br_startoff + new->br_blockcount == right.br_startoff &&
- new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
- (!(state & BMAP_LEFT_CONTIG) ||
- (left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
- state |= BMAP_RIGHT_CONTIG;
-
- /*
- * Switch out based on the contiguity flags.
- */
- switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
- case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
- /*
- * New allocation is contiguous with delayed allocations
- * on the left and on the right.
- * Merge all three into a single extent record.
- */
- temp = left.br_blockcount + new->br_blockcount +
- right.br_blockcount;
-
- oldlen = startblockval(left.br_startblock) +
- startblockval(new->br_startblock) +
- startblockval(right.br_startblock);
- newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- oldlen);
- left.br_startblock = nullstartblock(newlen);
- left.br_blockcount = temp;
-
- xfs_iext_remove(ip, icur, state);
- xfs_iext_prev(ifp, icur);
- xfs_iext_update_extent(ip, state, icur, &left);
- break;
-
- case BMAP_LEFT_CONTIG:
- /*
- * New allocation is contiguous with a delayed allocation
- * on the left.
- * Merge the new allocation with the left neighbor.
- */
- temp = left.br_blockcount + new->br_blockcount;
-
- oldlen = startblockval(left.br_startblock) +
- startblockval(new->br_startblock);
- newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- oldlen);
- left.br_blockcount = temp;
- left.br_startblock = nullstartblock(newlen);
-
- xfs_iext_prev(ifp, icur);
- xfs_iext_update_extent(ip, state, icur, &left);
- break;
-
- case BMAP_RIGHT_CONTIG:
- /*
- * New allocation is contiguous with a delayed allocation
- * on the right.
- * Merge the new allocation with the right neighbor.
- */
- temp = new->br_blockcount + right.br_blockcount;
- oldlen = startblockval(new->br_startblock) +
- startblockval(right.br_startblock);
- newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- oldlen);
- right.br_startoff = new->br_startoff;
- right.br_startblock = nullstartblock(newlen);
- right.br_blockcount = temp;
- xfs_iext_update_extent(ip, state, icur, &right);
- break;
-
- case 0:
- /*
- * New allocation is not contiguous with another
- * delayed allocation.
- * Insert a new entry.
- */
- oldlen = newlen = 0;
- xfs_iext_insert(ip, icur, new, state);
- break;
- }
- if (oldlen != newlen) {
- ASSERT(oldlen > newlen);
- xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
-
- /*
- * Nothing to do for disk quota accounting here.
- */
- xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
- }
-}
-
/*
* Convert a hole to a real allocation.
*/
@@ -4038,144 +3895,6 @@ xfs_bmapi_read(
return 0;
}
-/*
- * Add a delayed allocation extent to an inode. Blocks are reserved from the
- * global pool and the extent inserted into the inode in-core extent tree.
- *
- * On entry, got refers to the first extent beyond the offset of the extent to
- * allocate or eof is specified if no such extent exists. On return, got refers
- * to the extent record that was inserted to the inode fork.
- *
- * Note that the allocated extent may have been merged with contiguous extents
- * during insertion into the inode fork. Thus, got does not reflect the current
- * state of the inode fork on return. If necessary, the caller can use lastx to
- * look up the updated record in the inode fork.
- */
-int
-xfs_bmapi_reserve_delalloc(
- struct xfs_inode *ip,
- int whichfork,
- xfs_fileoff_t off,
- xfs_filblks_t len,
- xfs_filblks_t prealloc,
- struct xfs_bmbt_irec *got,
- struct xfs_iext_cursor *icur,
- int eof)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
- xfs_extlen_t alen;
- xfs_extlen_t indlen;
- uint64_t fdblocks;
- int error;
- xfs_fileoff_t aoff;
- bool use_cowextszhint =
- whichfork == XFS_COW_FORK && !prealloc;
-
-retry:
- /*
- * Cap the alloc length. Keep track of prealloc so we know whether to
- * tag the inode before we return.
- */
- aoff = off;
- alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
- if (!eof)
- alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
- if (prealloc && alen >= len)
- prealloc = alen - len;
-
- /*
- * If we're targetting the COW fork but aren't creating a speculative
- * posteof preallocation, try to expand the reservation to align with
- * the COW extent size hint if there's sufficient free space.
- *
- * Unlike the data fork, the CoW cancellation functions will free all
- * the reservations at inactivation, so we don't require that every
- * delalloc reservation have a dirty pagecache.
- */
- if (use_cowextszhint) {
- struct xfs_bmbt_irec prev;
- xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
-
- if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
- prev.br_startoff = NULLFILEOFF;
-
- error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
- 1, 0, &aoff, &alen);
- ASSERT(!error);
- }
-
- /*
- * Make a transaction-less quota reservation for delayed allocation
- * blocks. This number gets adjusted later. We return if we haven't
- * allocated blocks already inside this loop.
- */
- error = xfs_quota_reserve_blkres(ip, alen);
- if (error)
- goto out;
-
- /*
- * Split changing sb for alen and indlen since they could be coming
- * from different places.
- */
- indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
- ASSERT(indlen > 0);
-
- fdblocks = indlen;
- if (XFS_IS_REALTIME_INODE(ip)) {
- error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
- if (error)
- goto out_unreserve_quota;
- } else {
- fdblocks += alen;
- }
-
- error = xfs_dec_fdblocks(mp, fdblocks, false);
- if (error)
- goto out_unreserve_frextents;
-
- ip->i_delayed_blks += alen;
- xfs_mod_delalloc(ip, alen, indlen);
-
- got->br_startoff = aoff;
- got->br_startblock = nullstartblock(indlen);
- got->br_blockcount = alen;
- got->br_state = XFS_EXT_NORM;
-
- xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
-
- /*
- * Tag the inode if blocks were preallocated. Note that COW fork
- * preallocation can occur at the start or end of the extent, even when
- * prealloc == 0, so we must also check the aligned offset and length.
- */
- if (whichfork == XFS_DATA_FORK && prealloc)
- xfs_inode_set_eofblocks_tag(ip);
- if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
- xfs_inode_set_cowblocks_tag(ip);
-
- return 0;
-
-out_unreserve_frextents:
- if (XFS_IS_REALTIME_INODE(ip))
- xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
-out_unreserve_quota:
- if (XFS_IS_QUOTA_ON(mp))
- xfs_quota_unreserve_blkres(ip, alen);
-out:
- if (error == -ENOSPC || error == -EDQUOT) {
- trace_xfs_delalloc_enospc(ip, off, len);
-
- if (prealloc || use_cowextszhint) {
- /* retry without any preallocation */
- use_cowextszhint = false;
- prealloc = 0;
- goto retry;
- }
- }
- return error;
-}
-
static int
xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 4b721d935994..4d48087fd3a8 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -219,10 +219,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
bool *done, xfs_fileoff_t stop_fsb);
int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t split_offset);
-int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
- xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
- struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
- int eof);
int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
@@ -233,6 +229,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
int fork);
int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap,
struct xfs_alloc_arg *args);
+xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len);
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5dd0922fe2d1..b3783d7b8ebe 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -30,6 +30,7 @@
#include "xfs_reflink.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
+#include "xfs_icache.h"
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -989,6 +990,284 @@ const struct iomap_ops xfs_dax_write_iomap_ops = {
.iomap_end = xfs_dax_write_iomap_end,
};
+/*
+ * Convert a hole to a delayed allocation.
+ */
+static void
+xfs_bmap_add_extent_hole_delay(
+ struct xfs_inode *ip, /* incore inode pointer */
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *new) /* new data to add to file extents */
+{
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ xfs_bmbt_irec_t left; /* left neighbor extent entry */
+ xfs_filblks_t newlen=0; /* new indirect size */
+ xfs_filblks_t oldlen=0; /* old indirect size */
+ xfs_bmbt_irec_t right; /* right neighbor extent entry */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ xfs_filblks_t temp; /* temp for indirect calculations */
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(isnullstartblock(new->br_startblock));
+
+ /*
+ * Check and set flags if this segment has a left neighbor
+ */
+ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
+ state |= BMAP_LEFT_VALID;
+ if (isnullstartblock(left.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ /*
+ * Check and set flags if the current (right) segment exists.
+ * If it doesn't exist, we're converting the hole at end-of-file.
+ */
+ if (xfs_iext_get_extent(ifp, icur, &right)) {
+ state |= BMAP_RIGHT_VALID;
+ if (isnullstartblock(right.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ /*
+ * Set contiguity flags on the left and right neighbors.
+ * Don't let extents get too large, even if the pieces are contiguous.
+ */
+ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+ left.br_startoff + left.br_blockcount == new->br_startoff &&
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+ new->br_startoff + new->br_blockcount == right.br_startoff &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ (!(state & BMAP_LEFT_CONTIG) ||
+ (left.br_blockcount + new->br_blockcount +
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
+ state |= BMAP_RIGHT_CONTIG;
+
+ /*
+ * Switch out based on the contiguity flags.
+ */
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with delayed allocations
+ * on the left and on the right.
+ * Merge all three into a single extent record.
+ */
+ temp = left.br_blockcount + new->br_blockcount +
+ right.br_blockcount;
+
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ left.br_startblock = nullstartblock(newlen);
+ left.br_blockcount = temp;
+
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+ break;
+
+ case BMAP_LEFT_CONTIG:
+ /*
+ * New allocation is contiguous with a delayed allocation
+ * on the left.
+ * Merge the new allocation with the left neighbor.
+ */
+ temp = left.br_blockcount + new->br_blockcount;
+
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ left.br_blockcount = temp;
+ left.br_startblock = nullstartblock(newlen);
+
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+ break;
+
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with a delayed allocation
+ * on the right.
+ * Merge the new allocation with the right neighbor.
+ */
+ temp = new->br_blockcount + right.br_blockcount;
+ oldlen = startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ right.br_startoff = new->br_startoff;
+ right.br_startblock = nullstartblock(newlen);
+ right.br_blockcount = temp;
+ xfs_iext_update_extent(ip, state, icur, &right);
+ break;
+
+ case 0:
+ /*
+ * New allocation is not contiguous with another
+ * delayed allocation.
+ * Insert a new entry.
+ */
+ oldlen = newlen = 0;
+ xfs_iext_insert(ip, icur, new, state);
+ break;
+ }
+ if (oldlen != newlen) {
+ ASSERT(oldlen > newlen);
+ xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
+
+ /*
+ * Nothing to do for disk quota accounting here.
+ */
+ xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
+ }
+}
+
+/*
+ * Add a delayed allocation extent to an inode. Blocks are reserved from the
+ * global pool and the extent inserted into the inode in-core extent tree.
+ *
+ * On entry, got refers to the first extent beyond the offset of the extent to
+ * allocate or eof is specified if no such extent exists. On return, got refers
+ * to the extent record that was inserted to the inode fork.
+ *
+ * Note that the allocated extent may have been merged with contiguous extents
+ * during insertion into the inode fork. Thus, got does not reflect the current
+ * state of the inode fork on return. If necessary, the caller can use lastx to
+ * look up the updated record in the inode fork.
+ */
+static int
+xfs_bmapi_reserve_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t off,
+ xfs_filblks_t len,
+ xfs_filblks_t prealloc,
+ struct xfs_bmbt_irec *got,
+ struct xfs_iext_cursor *icur,
+ int eof)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ xfs_extlen_t alen;
+ xfs_extlen_t indlen;
+ uint64_t fdblocks;
+ int error;
+ xfs_fileoff_t aoff;
+ bool use_cowextszhint =
+ whichfork == XFS_COW_FORK && !prealloc;
+
+retry:
+ /*
+ * Cap the alloc length. Keep track of prealloc so we know whether to
+ * tag the inode before we return.
+ */
+ aoff = off;
+ alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
+ if (!eof)
+ alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+ if (prealloc && alen >= len)
+ prealloc = alen - len;
+
+ /*
+ * If we're targetting the COW fork but aren't creating a speculative
+ * posteof preallocation, try to expand the reservation to align with
+ * the COW extent size hint if there's sufficient free space.
+ *
+ * Unlike the data fork, the CoW cancellation functions will free all
+ * the reservations at inactivation, so we don't require that every
+ * delalloc reservation have a dirty pagecache.
+ */
+ if (use_cowextszhint) {
+ struct xfs_bmbt_irec prev;
+ xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
+
+ if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
+ prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
+ 1, 0, &aoff, &alen);
+ ASSERT(!error);
+ }
+
+ /*
+ * Make a transaction-less quota reservation for delayed allocation
+ * blocks. This number gets adjusted later. We return if we haven't
+ * allocated blocks already inside this loop.
+ */
+ error = xfs_quota_reserve_blkres(ip, alen);
+ if (error)
+ goto out;
+
+ /*
+ * Split changing sb for alen and indlen since they could be coming
+ * from different places.
+ */
+ indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
+ ASSERT(indlen > 0);
+
+ fdblocks = indlen;
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
+ if (error)
+ goto out_unreserve_quota;
+ } else {
+ fdblocks += alen;
+ }
+
+ error = xfs_dec_fdblocks(mp, fdblocks, false);
+ if (error)
+ goto out_unreserve_frextents;
+
+ ip->i_delayed_blks += alen;
+ xfs_mod_delalloc(ip, alen, indlen);
+
+ got->br_startoff = aoff;
+ got->br_startblock = nullstartblock(indlen);
+ got->br_blockcount = alen;
+ got->br_state = XFS_EXT_NORM;
+
+ xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
+
+ /*
+ * Tag the inode if blocks were preallocated. Note that COW fork
+ * preallocation can occur at the start or end of the extent, even when
+ * prealloc == 0, so we must also check the aligned offset and length.
+ */
+ if (whichfork == XFS_DATA_FORK && prealloc)
+ xfs_inode_set_eofblocks_tag(ip);
+ if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
+ xfs_inode_set_cowblocks_tag(ip);
+
+ return 0;
+
+out_unreserve_frextents:
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
+out_unreserve_quota:
+ if (XFS_IS_QUOTA_ON(mp))
+ xfs_quota_unreserve_blkres(ip, alen);
+out:
+ if (error == -ENOSPC || error == -EDQUOT) {
+ trace_xfs_delalloc_enospc(ip, off, len);
+
+ if (prealloc || use_cowextszhint) {
+ /* retry without any preallocation */
+ use_cowextszhint = false;
+ prealloc = 0;
+ goto retry;
+ }
+ }
+ return error;
+}
+
static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
--
2.45.2
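To make the merge logic in the moved xfs_bmap_add_extent_hole_delay() easier to follow, here is a standalone sketch of just the left-contiguity predicate; the types are simplified stand-ins, and the 21-bit length cap mirroring XFS_MAX_BMBT_EXTLEN is an assumption:

#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-in for struct xfs_bmbt_irec. */
struct irec {
	uint64_t	br_startoff;	/* file offset, in blocks */
	uint64_t	br_blockcount;	/* length, in blocks */
};

/* Assumed cap: a 21-bit on-disk extent length field. */
#define MAX_BMBT_EXTLEN	((1ULL << 21) - 1)

/*
 * Mirrors the BMAP_LEFT_CONTIG test above: the left neighbor must end
 * exactly where the new delalloc extent begins, and the merged extent
 * must not exceed the maximum length.
 */
static bool left_contig(const struct irec *left, const struct irec *new)
{
	return left->br_startoff + left->br_blockcount == new->br_startoff &&
	       left->br_blockcount + new->br_blockcount <= MAX_BMBT_EXTLEN;
}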
* Re: [PATCH 04/43] xfs: move xfs_bmapi_reserve_delalloc to xfs_iomap.c
From: Darrick J. Wong @ 2024-12-12 21:18 UTC
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:29AM +0100, Christoph Hellwig wrote:
> Delalloc reservations are not supported in userspace, and thus it doesn't
> make sense to share this helper with xfsprogs. Move it to xfs_iomap.c,
> next to its two callers.
> 
> Note that the rest of the delalloc handling should probably eventually
> also move out of xfs_bmap.c, but that will require a bit more surgery.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Not opposed since we /could/ move this back if userspace ever (a) grows
a fuse server and (b) decides to use delalloc with it, but is this move
totally necessary?
--D
> + xfs_extlen_t alen;
> + xfs_extlen_t indlen;
> + uint64_t fdblocks;
> + int error;
> + xfs_fileoff_t aoff;
> + bool use_cowextszhint =
> + whichfork == XFS_COW_FORK && !prealloc;
> +
> +retry:
> + /*
> + * Cap the alloc length. Keep track of prealloc so we know whether to
> + * tag the inode before we return.
> + */
> + aoff = off;
> + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
> + if (!eof)
> + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
> + if (prealloc && alen >= len)
> + prealloc = alen - len;
> +
> + /*
> + * If we're targeting the COW fork but aren't creating a speculative
> + * posteof preallocation, try to expand the reservation to align with
> + * the COW extent size hint if there's sufficient free space.
> + *
> + * Unlike the data fork, the CoW cancellation functions will free all
> + * the reservations at inactivation, so we don't require that every
> + * delalloc reservation have a dirty pagecache.
> + */
> + if (use_cowextszhint) {
> + struct xfs_bmbt_irec prev;
> + xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
> +
> + if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
> + prev.br_startoff = NULLFILEOFF;
> +
> + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
> + 1, 0, &aoff, &alen);
> + ASSERT(!error);
> + }
> +
> + /*
> + * Make a transaction-less quota reservation for delayed allocation
> + * blocks. This number gets adjusted later. We return if we haven't
> + * allocated blocks already inside this loop.
> + */
> + error = xfs_quota_reserve_blkres(ip, alen);
> + if (error)
> + goto out;
> +
> + /*
> + * Split changing sb for alen and indlen since they could be coming
> + * from different places.
> + */
> + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
> + ASSERT(indlen > 0);
> +
> + fdblocks = indlen;
> + if (XFS_IS_REALTIME_INODE(ip)) {
> + error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
> + if (error)
> + goto out_unreserve_quota;
> + } else {
> + fdblocks += alen;
> + }
> +
> + error = xfs_dec_fdblocks(mp, fdblocks, false);
> + if (error)
> + goto out_unreserve_frextents;
> +
> + ip->i_delayed_blks += alen;
> + xfs_mod_delalloc(ip, alen, indlen);
> +
> + got->br_startoff = aoff;
> + got->br_startblock = nullstartblock(indlen);
> + got->br_blockcount = alen;
> + got->br_state = XFS_EXT_NORM;
> +
> + xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
> +
> + /*
> + * Tag the inode if blocks were preallocated. Note that COW fork
> + * preallocation can occur at the start or end of the extent, even when
> + * prealloc == 0, so we must also check the aligned offset and length.
> + */
> + if (whichfork == XFS_DATA_FORK && prealloc)
> + xfs_inode_set_eofblocks_tag(ip);
> + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
> + xfs_inode_set_cowblocks_tag(ip);
> +
> + return 0;
> +
> +out_unreserve_frextents:
> + if (XFS_IS_REALTIME_INODE(ip))
> + xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
> +out_unreserve_quota:
> + if (XFS_IS_QUOTA_ON(mp))
> + xfs_quota_unreserve_blkres(ip, alen);
> +out:
> + if (error == -ENOSPC || error == -EDQUOT) {
> + trace_xfs_delalloc_enospc(ip, off, len);
> +
> + if (prealloc || use_cowextszhint) {
> + /* retry without any preallocation */
> + use_cowextszhint = false;
> + prealloc = 0;
> + goto retry;
> + }
> + }
> + return error;
> +}
> +
> static int
> xfs_buffered_write_iomap_begin(
> struct inode *inode,
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread* Re: [PATCH 04/43] xfs: move xfs_bmapi_reserve_delalloc to xfs_iomap.c
2024-12-12 21:18 ` Darrick J. Wong
@ 2024-12-13 5:04 ` Christoph Hellwig
2024-12-15 18:13 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:04 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 01:18:43PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:54:29AM +0100, Christoph Hellwig wrote:
> > Delalloc reservations are not supported in userspace, and thus it doesn't
> > make sense to share this helper with xfsprogs.c. Move it to xfs_iomap.c
> > toward the two callers.
> >
> > Note that the rest of the delalloc handling should probably eventually
> > also move out of xfs_bmap.c, but that will require a bit more surgery.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Not opposed since we /could/ move this back if userspace ever (a) grows
> a fuse server and (b) decides to use delalloc with it, but is this move
> totally necessary?
It's not totally necessary; we could also mark xfs_bmap_worst_indlen and
xfs_bmap_add_extent_hole_delay non-static and be done with it. But then
again I'd rather reduce the unused libxfs sync surface if I can.
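That alternative would just mean keeping xfs_bmapi_reserve_delalloc in
xfs_bmap.c and adding something like this to xfs_bmap.h, next to the
xfs_bmap_worst_indlen prototype the patch already adds (a sketch only,
not something I'm proposing):

	void xfs_bmap_add_extent_hole_delay(struct xfs_inode *ip,
			int whichfork, struct xfs_iext_cursor *icur,
			struct xfs_bmbt_irec *new);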
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 04/43] xfs: move xfs_bmapi_reserve_delalloc to xfs_iomap.c
2024-12-13 5:04 ` Christoph Hellwig
@ 2024-12-15 18:13 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-15 18:13 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 06:04:39AM +0100, Christoph Hellwig wrote:
> On Thu, Dec 12, 2024 at 01:18:43PM -0800, Darrick J. Wong wrote:
> > On Wed, Dec 11, 2024 at 09:54:29AM +0100, Christoph Hellwig wrote:
> > > Delalloc reservations are not supported in userspace, and thus it doesn't
> > > make sense to share this helper with xfsprogs. Move it to xfs_iomap.c
> > > toward the two callers.
> > >
> > > Note that the rest of the delalloc handling should probably eventually
> > > also move out of xfs_bmap.c, but that will require a bit more surgery.
> > >
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> >
> > Not opposed since we /could/ move this back if userspace ever (a) grows
> > a fuse server and (b) decides to use delalloc with it, but is this move
> > totally necessary?
>
> It's not totally necessary; we could also mark xfs_bmap_worst_indlen and
> xfs_bmap_add_extent_hole_delay non-static and be done with it. But then
> again I'd rather reduce the unused libxfs sync surface if I can.
<nod>
Does anyone else have an opinion? Particularly any of the past xfsprogs
maintainers?
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 05/43] xfs: don't take m_sb_lock in xfs_fs_statfs
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (3 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 04/43] xfs: move xfs_bmapi_reserve_delalloc to xfs_iomap.c Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 21:42 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 06/43] xfs: refactor xfs_fs_statfs Christoph Hellwig
` (37 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
The only non-constant value read under m_sb_lock in xfs_fs_statfs is
sb_dblocks, and it could become stale right after dropping the lock
anyway. Remove the thus pointless lock section.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_super.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 0fa7b7cc75c1..bfa8cc927009 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -850,11 +850,9 @@ xfs_fs_statfs(
ifree = percpu_counter_sum(&mp->m_ifree);
fdblocks = percpu_counter_sum(&mp->m_fdblocks);
- spin_lock(&mp->m_sb_lock);
statp->f_bsize = sbp->sb_blocksize;
lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
statp->f_blocks = sbp->sb_dblocks - lsize;
- spin_unlock(&mp->m_sb_lock);
/* make sure statp->f_bfree does not underflow */
statp->f_bfree = max_t(int64_t, 0,
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 05/43] xfs: don't take m_sb_lock in xfs_fs_statfs
2024-12-11 8:54 ` [PATCH 05/43] xfs: don't take m_sb_lock in xfs_fs_statfs Christoph Hellwig
@ 2024-12-12 21:42 ` Darrick J. Wong
2024-12-13 5:06 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 21:42 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:30AM +0100, Christoph Hellwig wrote:
> The only non-constant value read under m_sb_lock in xfs_fs_statfs is
> sb_dblocks, and it could become stale right after dropping the lock
> anyway. Remove the thus pointless lock section.
Is there a stronger reason later for removing the critical section?
Do we lose much by leaving the protection in place?
--D
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_super.c | 2 --
> 1 file changed, 2 deletions(-)
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 0fa7b7cc75c1..bfa8cc927009 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -850,11 +850,9 @@ xfs_fs_statfs(
> ifree = percpu_counter_sum(&mp->m_ifree);
> fdblocks = percpu_counter_sum(&mp->m_fdblocks);
>
> - spin_lock(&mp->m_sb_lock);
> statp->f_bsize = sbp->sb_blocksize;
> lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
> statp->f_blocks = sbp->sb_dblocks - lsize;
> - spin_unlock(&mp->m_sb_lock);
>
> /* make sure statp->f_bfree does not underflow */
> statp->f_bfree = max_t(int64_t, 0,
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 05/43] xfs: don't take m_sb_lock in xfs_fs_statfs
2024-12-12 21:42 ` Darrick J. Wong
@ 2024-12-13 5:06 ` Christoph Hellwig
2024-12-15 18:16 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:06 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 01:42:06PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:54:30AM +0100, Christoph Hellwig wrote:
> > The only non-constant value read under m_sb_lock in xfs_fs_statfs is
> > sb_dblocks, and it could become stale right after dropping the lock
> > anyway. Remove the thus pointless lock section.
>
> Is there a stronger reason later for removing the critical section?
> Do we lose much by leaving the protection in place?
It makes a complete mess of xfs_fs_statfs, and as stated in the
commit message it's not actually useful at all. I also don't
think taking a global lock from a non-privileged operation is
all that good an idea to start with if we can avoid it.
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 05/43] xfs: don't take m_sb_lock in xfs_fs_statfs
2024-12-13 5:06 ` Christoph Hellwig
@ 2024-12-15 18:16 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-15 18:16 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 06:06:15AM +0100, Christoph Hellwig wrote:
> On Thu, Dec 12, 2024 at 01:42:06PM -0800, Darrick J. Wong wrote:
> > On Wed, Dec 11, 2024 at 09:54:30AM +0100, Christoph Hellwig wrote:
> > > The only non-constant value read under m_sb_lock in xfs_fs_statfs is
> > > sb_dblocks, and it could become stale right after dropping the lock
> > > anyway. Remove the thus pointless lock section.
> >
> > Is there a stronger reason later for removing the critical section?
> > Do we lose much by leaving the protection in place?
>
> It makes a complete mess of xfs_fs_statfs, and as stated in the
> commit message it's not actually useful at all. I also don't
> think taking a global lock from a non-privileged operation is
> all that good an idea to start with if we can avoid it.
Ok, I'm convinced. But perhaps you could leave a comment that we don't
care if the accesses are torn, to try to head off the inevitable kcsan/
dept/whatever patches?
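Something like this right above the sb_dblocks read would be enough
(just a sketch of the wording I have in mind):

	/*
	 * Read sb_dblocks without m_sb_lock: it only changes during
	 * growfs, and a stale or torn read here merely yields slightly
	 * outdated statfs output, which we do not care about.
	 */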
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 06/43] xfs: refactor xfs_fs_statfs
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (4 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 05/43] xfs: don't take m_sb_lock in xfs_fs_statfs Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 21:24 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 07/43] xfs: cleanup xfs_vn_getattr Christoph Hellwig
` (36 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Split out helpers for data, rt data and inode related information,
and assing f_bavail once instead of in three places.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_qm_bhv.c | 2 +-
fs/xfs/xfs_super.c | 128 ++++++++++++++++++++++++++------------------
2 files changed, 78 insertions(+), 52 deletions(-)
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 847ba29630e9..6d5de3fa58e8 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -34,7 +34,7 @@ xfs_fill_statvfs_from_dquot(
blkres->hardlimit;
if (limit && statp->f_blocks > limit) {
statp->f_blocks = limit;
- statp->f_bfree = statp->f_bavail =
+ statp->f_bfree =
(statp->f_blocks > blkres->reserved) ?
(statp->f_blocks - blkres->reserved) : 0;
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bfa8cc927009..a74a0cc1f6f6 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -819,20 +819,74 @@ xfs_fs_sync_fs(
return 0;
}
+static xfs_extlen_t
+xfs_internal_log_size(
+ struct xfs_mount *mp)
+{
+ if (!mp->m_sb.sb_logstart)
+ return 0;
+ return mp->m_sb.sb_logblocks;
+}
+
+static void
+xfs_statfs_data(
+ struct xfs_mount *mp,
+ struct kstatfs *st)
+{
+ int64_t fdblocks =
+ percpu_counter_sum(&mp->m_fdblocks);
+
+ /* make sure st->f_bfree does not underflow */
+ st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp));
+ st->f_blocks = mp->m_sb.sb_dblocks - xfs_internal_log_size(mp);
+}
+
+/*
+ * When stat(v)fs is called on a file with the realtime bit set or a directory
+ * with the rtinherit bit, report freespace information for the RT device
+ * instead of the main data device.
+ */
+static void
+xfs_statfs_rt(
+ struct xfs_mount *mp,
+ struct kstatfs *st)
+{
+ int64_t freertx =
+ percpu_counter_sum_positive(&mp->m_frextents);
+
+ st->f_bfree = xfs_rtbxlen_to_blen(mp, freertx);
+ st->f_blocks = mp->m_sb.sb_rblocks;
+}
+
+static void
+xfs_statfs_inodes(
+ struct xfs_mount *mp,
+ struct kstatfs *st)
+{
+ uint64_t icount = percpu_counter_sum(&mp->m_icount);
+ uint64_t ifree = percpu_counter_sum(&mp->m_ifree);
+ uint64_t fakeinos = XFS_FSB_TO_INO(mp, st->f_bfree);
+
+ st->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER);
+ if (M_IGEO(mp)->maxicount)
+ st->f_files = min_t(typeof(st->f_files), st->f_files,
+ M_IGEO(mp)->maxicount);
+
+ /* If sb_icount overshot maxicount, report actual allocation */
+ st->f_files = max_t(typeof(st->f_files), st->f_files,
+ mp->m_sb.sb_icount);
+
+ /* Make sure st->f_ffree does not underflow */
+ st->f_ffree = max_t(int64_t, 0, st->f_files - (icount - ifree));
+}
+
STATIC int
xfs_fs_statfs(
struct dentry *dentry,
- struct kstatfs *statp)
+ struct kstatfs *st)
{
struct xfs_mount *mp = XFS_M(dentry->d_sb);
- xfs_sb_t *sbp = &mp->m_sb;
struct xfs_inode *ip = XFS_I(d_inode(dentry));
- uint64_t fakeinos, id;
- uint64_t icount;
- uint64_t ifree;
- uint64_t fdblocks;
- xfs_extlen_t lsize;
- int64_t ffree;
/*
* Expedite background inodegc but don't wait. We do not want to block
@@ -840,56 +894,28 @@ xfs_fs_statfs(
*/
xfs_inodegc_push(mp);
- statp->f_type = XFS_SUPER_MAGIC;
- statp->f_namelen = MAXNAMELEN - 1;
-
- id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
- statp->f_fsid = u64_to_fsid(id);
-
- icount = percpu_counter_sum(&mp->m_icount);
- ifree = percpu_counter_sum(&mp->m_ifree);
- fdblocks = percpu_counter_sum(&mp->m_fdblocks);
-
- statp->f_bsize = sbp->sb_blocksize;
- lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
- statp->f_blocks = sbp->sb_dblocks - lsize;
-
- /* make sure statp->f_bfree does not underflow */
- statp->f_bfree = max_t(int64_t, 0,
- fdblocks - xfs_fdblocks_unavailable(mp));
- statp->f_bavail = statp->f_bfree;
-
- fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
- statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER);
- if (M_IGEO(mp)->maxicount)
- statp->f_files = min_t(typeof(statp->f_files),
- statp->f_files,
- M_IGEO(mp)->maxicount);
-
- /* If sb_icount overshot maxicount, report actual allocation */
- statp->f_files = max_t(typeof(statp->f_files),
- statp->f_files,
- sbp->sb_icount);
-
- /* make sure statp->f_ffree does not underflow */
- ffree = statp->f_files - (icount - ifree);
- statp->f_ffree = max_t(int64_t, ffree, 0);
+ st->f_type = XFS_SUPER_MAGIC;
+ st->f_namelen = MAXNAMELEN - 1;
+ st->f_bsize = mp->m_sb.sb_blocksize;
+ st->f_fsid = u64_to_fsid(huge_encode_dev(mp->m_ddev_targp->bt_dev));
+
+ xfs_statfs_data(mp, st);
+ xfs_statfs_inodes(mp, st);
if (XFS_IS_REALTIME_MOUNT(mp) &&
- (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
- s64 freertx;
-
- statp->f_blocks = sbp->sb_rblocks;
- freertx = percpu_counter_sum_positive(&mp->m_frextents);
- statp->f_bavail = statp->f_bfree =
- xfs_rtbxlen_to_blen(mp, freertx);
- }
+ (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME)))
+ xfs_statfs_rt(mp, st);
if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
(XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
- xfs_qm_statvfs(ip, statp);
+ xfs_qm_statvfs(ip, st);
+ /*
+ * XFS does not distinguish between blocks available to privileged and
+ * unprivileged users.
+ */
+ st->f_bavail = st->f_bfree;
return 0;
}
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 06/43] xfs: refactor xfs_fs_statfs
2024-12-11 8:54 ` [PATCH 06/43] xfs: refactor xfs_fs_statfs Christoph Hellwig
@ 2024-12-12 21:24 ` Darrick J. Wong
2024-12-13 5:08 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 21:24 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:31AM +0100, Christoph Hellwig wrote:
> Split out helpers for data, rt data and inode related information,
> and assing f_bavail once instead of in three places.
^^^^^^ word choice
("assigning"?)
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_qm_bhv.c | 2 +-
> fs/xfs/xfs_super.c | 128 ++++++++++++++++++++++++++------------------
> 2 files changed, 78 insertions(+), 52 deletions(-)
>
> diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
> index 847ba29630e9..6d5de3fa58e8 100644
> --- a/fs/xfs/xfs_qm_bhv.c
> +++ b/fs/xfs/xfs_qm_bhv.c
> @@ -34,7 +34,7 @@ xfs_fill_statvfs_from_dquot(
> blkres->hardlimit;
> if (limit && statp->f_blocks > limit) {
> statp->f_blocks = limit;
> - statp->f_bfree = statp->f_bavail =
> + statp->f_bfree =
> (statp->f_blocks > blkres->reserved) ?
> (statp->f_blocks - blkres->reserved) : 0;
> }
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index bfa8cc927009..a74a0cc1f6f6 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -819,20 +819,74 @@ xfs_fs_sync_fs(
> return 0;
> }
>
> +static xfs_extlen_t
> +xfs_internal_log_size(
> + struct xfs_mount *mp)
> +{
> + if (!mp->m_sb.sb_logstart)
> + return 0;
> + return mp->m_sb.sb_logblocks;
> +}
> +
> +static void
> +xfs_statfs_data(
> + struct xfs_mount *mp,
> + struct kstatfs *st)
> +{
> + int64_t fdblocks =
> + percpu_counter_sum(&mp->m_fdblocks);
> +
> + /* make sure st->f_bfree does not underflow */
> + st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp));
> + st->f_blocks = mp->m_sb.sb_dblocks - xfs_internal_log_size(mp);
> +}
> +
> +/*
> + * When stat(v)fs is called on a file with the realtime bit set or a directory
> + * with the rtinherit bit, report freespace information for the RT device
> + * instead of the main data device.
> + */
> +static void
> +xfs_statfs_rt(
> + struct xfs_mount *mp,
> + struct kstatfs *st)
> +{
> + int64_t freertx =
> + percpu_counter_sum_positive(&mp->m_frextents);
> +
> + st->f_bfree = xfs_rtbxlen_to_blen(mp, freertx);
> + st->f_blocks = mp->m_sb.sb_rblocks;
> +}
> +
> +static void
> +xfs_statfs_inodes(
> + struct xfs_mount *mp,
> + struct kstatfs *st)
> +{
> + uint64_t icount = percpu_counter_sum(&mp->m_icount);
> + uint64_t ifree = percpu_counter_sum(&mp->m_ifree);
> + uint64_t fakeinos = XFS_FSB_TO_INO(mp, st->f_bfree);
> +
> + st->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER);
> + if (M_IGEO(mp)->maxicount)
> + st->f_files = min_t(typeof(st->f_files), st->f_files,
> + M_IGEO(mp)->maxicount);
> +
> + /* If sb_icount overshot maxicount, report actual allocation */
> + st->f_files = max_t(typeof(st->f_files), st->f_files,
> + mp->m_sb.sb_icount);
> +
> + /* Make sure st->f_ffree does not underflow */
> + st->f_ffree = max_t(int64_t, 0, st->f_files - (icount - ifree));
> +}
> +
> STATIC int
> xfs_fs_statfs(
> struct dentry *dentry,
> - struct kstatfs *statp)
> + struct kstatfs *st)
> {
> struct xfs_mount *mp = XFS_M(dentry->d_sb);
> - xfs_sb_t *sbp = &mp->m_sb;
> struct xfs_inode *ip = XFS_I(d_inode(dentry));
> - uint64_t fakeinos, id;
> - uint64_t icount;
> - uint64_t ifree;
> - uint64_t fdblocks;
> - xfs_extlen_t lsize;
> - int64_t ffree;
>
> /*
> * Expedite background inodegc but don't wait. We do not want to block
> @@ -840,56 +894,28 @@ xfs_fs_statfs(
> */
> xfs_inodegc_push(mp);
>
> - statp->f_type = XFS_SUPER_MAGIC;
> - statp->f_namelen = MAXNAMELEN - 1;
> -
> - id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
> - statp->f_fsid = u64_to_fsid(id);
> -
> - icount = percpu_counter_sum(&mp->m_icount);
> - ifree = percpu_counter_sum(&mp->m_ifree);
> - fdblocks = percpu_counter_sum(&mp->m_fdblocks);
> -
> - statp->f_bsize = sbp->sb_blocksize;
> - lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
> - statp->f_blocks = sbp->sb_dblocks - lsize;
> -
> - /* make sure statp->f_bfree does not underflow */
> - statp->f_bfree = max_t(int64_t, 0,
> - fdblocks - xfs_fdblocks_unavailable(mp));
> - statp->f_bavail = statp->f_bfree;
> -
> - fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
> - statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER);
> - if (M_IGEO(mp)->maxicount)
> - statp->f_files = min_t(typeof(statp->f_files),
> - statp->f_files,
> - M_IGEO(mp)->maxicount);
> -
> - /* If sb_icount overshot maxicount, report actual allocation */
> - statp->f_files = max_t(typeof(statp->f_files),
> - statp->f_files,
> - sbp->sb_icount);
> -
> - /* make sure statp->f_ffree does not underflow */
> - ffree = statp->f_files - (icount - ifree);
> - statp->f_ffree = max_t(int64_t, ffree, 0);
> + st->f_type = XFS_SUPER_MAGIC;
> + st->f_namelen = MAXNAMELEN - 1;
> + st->f_bsize = mp->m_sb.sb_blocksize;
> + st->f_fsid = u64_to_fsid(huge_encode_dev(mp->m_ddev_targp->bt_dev));
> +
Whitespace ^^^ damage here.
> + xfs_statfs_data(mp, st);
> + xfs_statfs_inodes(mp, st);
>
> if (XFS_IS_REALTIME_MOUNT(mp) &&
> - (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
> - s64 freertx;
> -
> - statp->f_blocks = sbp->sb_rblocks;
> - freertx = percpu_counter_sum_positive(&mp->m_frextents);
> - statp->f_bavail = statp->f_bfree =
> - xfs_rtbxlen_to_blen(mp, freertx);
> - }
> + (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME)))
> + xfs_statfs_rt(mp, st);
>
> if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
> ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
> (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
> - xfs_qm_statvfs(ip, statp);
> + xfs_qm_statvfs(ip, st);
Nice cleanup of all of that.
> + /*
> + * XFS does not distinguish between blocks available to privileged and
> + * unprivileged users.
> + */
> + st->f_bavail = st->f_bfree;
Not relevant to this patch, but I noticed that (a) the statfs manpage
now tells me to go look at statvfs, and (b) statvfs advertises an
f_favail field that nobody in the kernel actually sets.
--D
> return 0;
> }
>
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread* Re: [PATCH 06/43] xfs: refactor xfs_fs_statfs
2024-12-12 21:24 ` Darrick J. Wong
@ 2024-12-13 5:08 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:08 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 01:24:00PM -0800, Darrick J. Wong wrote:
> > + * XFS does not distinguish between blocks available to privileged and
> > + * unprivileged users.
> > + */
> > + st->f_bavail = st->f_bfree;
>
> Not relevant to this patch, but I noticed that (a) the statfs manpage
> now tells me to go look at statvfs, and (b) statvfs advertises a
> f_favail field that nobody in the kernel actually sets.
The kernel doesn't implement statvfs and thus doesn't have the field.
This is what glibc does for it:
/* XXX I have no idea how to compute f_favail. Any idea??? */
buf->f_favail = buf->f_ffree;
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 07/43] xfs: cleanup xfs_vn_getattr
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (5 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 06/43] xfs: refactor xfs_fs_statfs Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 21:24 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 08/43] xfs: report the correct dio alignment for COW inodes Christoph Hellwig
` (35 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Split the two bits of optional statx reporting into their own helpers
so that they are self-contained instead of deeply indented in the main
getattr handler.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_iops.c | 47 +++++++++++++++++++++++------------------------
1 file changed, 23 insertions(+), 24 deletions(-)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 207e0dadffc3..6b0228a21617 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -573,17 +573,28 @@ xfs_stat_blksize(
}
static void
-xfs_get_atomic_write_attr(
+xfs_report_dioalign(
struct xfs_inode *ip,
- unsigned int *unit_min,
- unsigned int *unit_max)
+ struct kstat *stat)
{
- if (!xfs_inode_can_atomicwrite(ip)) {
- *unit_min = *unit_max = 0;
- return;
- }
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct block_device *bdev = target->bt_bdev;
- *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
+ stat->result_mask |= STATX_DIOALIGN;
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+}
+
+static void
+xfs_report_atomic_write(
+ struct xfs_inode *ip,
+ struct kstat *stat)
+{
+ unsigned int unit_min = 0, unit_max = 0;
+
+ if (xfs_inode_can_atomicwrite(ip))
+ unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
+ generic_fill_statx_atomic_writes(stat, unit_min, unit_max);
}
STATIC int
@@ -647,22 +658,10 @@ xfs_vn_getattr(
stat->rdev = inode->i_rdev;
break;
case S_IFREG:
- if (request_mask & STATX_DIOALIGN) {
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
- struct block_device *bdev = target->bt_bdev;
-
- stat->result_mask |= STATX_DIOALIGN;
- stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
- stat->dio_offset_align = bdev_logical_block_size(bdev);
- }
- if (request_mask & STATX_WRITE_ATOMIC) {
- unsigned int unit_min, unit_max;
-
- xfs_get_atomic_write_attr(ip, &unit_min,
- &unit_max);
- generic_fill_statx_atomic_writes(stat,
- unit_min, unit_max);
- }
+ if (request_mask & STATX_DIOALIGN)
+ xfs_report_dioalign(ip, stat);
+ if (request_mask & STATX_WRITE_ATOMIC)
+ xfs_report_atomic_write(ip, stat);
fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 07/43] xfs: cleanup xfs_vn_getattr
2024-12-11 8:54 ` [PATCH 07/43] xfs: cleanup xfs_vn_getattr Christoph Hellwig
@ 2024-12-12 21:24 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 21:24 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:32AM +0100, Christoph Hellwig wrote:
> Split the two bits of optional statx reporting into their own helpers
> so that they are self-contained instead of deeply indented in the main
> getattr handler.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Looks ok,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_iops.c | 47 +++++++++++++++++++++++------------------------
> 1 file changed, 23 insertions(+), 24 deletions(-)
>
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 207e0dadffc3..6b0228a21617 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -573,17 +573,28 @@ xfs_stat_blksize(
> }
>
> static void
> -xfs_get_atomic_write_attr(
> +xfs_report_dioalign(
> struct xfs_inode *ip,
> - unsigned int *unit_min,
> - unsigned int *unit_max)
> + struct kstat *stat)
> {
> - if (!xfs_inode_can_atomicwrite(ip)) {
> - *unit_min = *unit_max = 0;
> - return;
> - }
> + struct xfs_buftarg *target = xfs_inode_buftarg(ip);
> + struct block_device *bdev = target->bt_bdev;
>
> - *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
> + stat->result_mask |= STATX_DIOALIGN;
> + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
> + stat->dio_offset_align = bdev_logical_block_size(bdev);
> +}
> +
> +static void
> +xfs_report_atomic_write(
> + struct xfs_inode *ip,
> + struct kstat *stat)
> +{
> + unsigned int unit_min = 0, unit_max = 0;
> +
> + if (xfs_inode_can_atomicwrite(ip))
> + unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
> + generic_fill_statx_atomic_writes(stat, unit_min, unit_max);
> }
>
> STATIC int
> @@ -647,22 +658,10 @@ xfs_vn_getattr(
> stat->rdev = inode->i_rdev;
> break;
> case S_IFREG:
> - if (request_mask & STATX_DIOALIGN) {
> - struct xfs_buftarg *target = xfs_inode_buftarg(ip);
> - struct block_device *bdev = target->bt_bdev;
> -
> - stat->result_mask |= STATX_DIOALIGN;
> - stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
> - stat->dio_offset_align = bdev_logical_block_size(bdev);
> - }
> - if (request_mask & STATX_WRITE_ATOMIC) {
> - unsigned int unit_min, unit_max;
> -
> - xfs_get_atomic_write_attr(ip, &unit_min,
> - &unit_max);
> - generic_fill_statx_atomic_writes(stat,
> - unit_min, unit_max);
> - }
> + if (request_mask & STATX_DIOALIGN)
> + xfs_report_dioalign(ip, stat);
> + if (request_mask & STATX_WRITE_ATOMIC)
> + xfs_report_atomic_write(ip, stat);
> fallthrough;
> default:
> stat->blksize = xfs_stat_blksize(ip);
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 08/43] xfs: report the correct dio alignment for COW inodes
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (6 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 07/43] xfs: cleanup xfs_vn_getattr Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 21:29 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 09/43] xfs: generalize the freespace and reserved blocks handling Christoph Hellwig
` (34 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
For I/O to reflinked blocks we always need to write an entire new
file system block, and the code enforces the file system block alignment
for the entire file if it has any reflinked blocks.
Unfortunately the reported dio alignment can only report a single value
for reads and writes, so unless we want to trigger these read-modify
write cycles all the time, we need to increase both limits.
Without this zoned xfs triggers the warnings about failed page cache
invalidation in kiocb_invalidate_post_direct_write all the time when
running generic/551 when running on a 512 byte sector device, and
eventually fails the test due to miscompares.
Hopefully we can add a separate read alignment to statx eventually.
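For illustration (not part of the change itself), userspace picks up
the larger write alignment via statx(2); a minimal sketch, assuming
libc headers that expose STATX_DIOALIGN and the stx_dio_* fields:

	#define _GNU_SOURCE
	#include <fcntl.h>	/* AT_FDCWD */
	#include <stdio.h>
	#include <sys/stat.h>	/* statx(), struct statx */

	int main(int argc, char **argv)
	{
		struct statx stx;

		if (argc < 2 ||
		    statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx))
			return 1;
		if (stx.stx_mask & STATX_DIOALIGN)
			printf("dio_mem_align %u, dio_offset_align %u\n",
					stx.stx_dio_mem_align,
					stx.stx_dio_offset_align);
		return 0;
	}

On a reflinked file this now reports the file system block size as
dio_offset_align instead of the device logical block size.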
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_ioctl.c | 6 +++++-
fs/xfs/xfs_iops.c | 15 ++++++++++++++-
2 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 726282e74d54..de8ba5345e17 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1213,7 +1213,11 @@ xfs_file_ioctl(
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
struct dioattr da;
- da.d_mem = da.d_miniosz = target->bt_logical_sectorsize;
+ da.d_mem = target->bt_logical_sectorsize;
+ if (xfs_is_cow_inode(ip))
+ da.d_miniosz = mp->m_sb.sb_blocksize;
+ else
+ da.d_miniosz = target->bt_logical_sectorsize;
da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
if (copy_to_user(arg, &da, sizeof(da)))
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 6b0228a21617..990df072ba35 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -582,7 +582,20 @@ xfs_report_dioalign(
stat->result_mask |= STATX_DIOALIGN;
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
- stat->dio_offset_align = bdev_logical_block_size(bdev);
+
+ /*
+ * On COW inodes we are forced to always rewrite an entire file system
+ * block.
+ *
+ * Because applications assume they can do sector sized direct writes
+ * on XFS we provide an emulation by doing a read-modify-write cycle
+ * through the cache, but that is highly inefficient. Thus report the
+ * natively supported size here.
+ */
+ if (xfs_is_cow_inode(ip))
+ stat->dio_offset_align = ip->i_mount->m_sb.sb_blocksize;
+ else
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
}
static void
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 08/43] xfs: report the correct dio alignment for COW inodes
2024-12-11 8:54 ` [PATCH 08/43] xfs: report the correct dio alignment for COW inodes Christoph Hellwig
@ 2024-12-12 21:29 ` Darrick J. Wong
2024-12-13 5:09 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 21:29 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:33AM +0100, Christoph Hellwig wrote:
> For I/O to reflinked blocks we always need to write an entire new
> file system block, and the code enforces the file system block alignment
> for the entire file if it has any reflinked blocks.
>
> Unfortunately the reported dio alignment can only report a single value
> for reads and writes, so unless we want to trigger these read-modify
> write cycles all the time, we need to increase both limits.
>
> Without this, zoned xfs triggers the warnings about failed page cache
> invalidation in kiocb_invalidate_post_direct_write all the time when
> running generic/551 on a 512 byte sector device, and
> eventually fails the test due to miscompares.
>
> Hopefully we can add a separate read alignment to statx eventually.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_ioctl.c | 6 +++++-
> fs/xfs/xfs_iops.c | 15 ++++++++++++++-
> 2 files changed, 19 insertions(+), 2 deletions(-)
>
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index 726282e74d54..de8ba5345e17 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -1213,7 +1213,11 @@ xfs_file_ioctl(
> struct xfs_buftarg *target = xfs_inode_buftarg(ip);
> struct dioattr da;
>
> - da.d_mem = da.d_miniosz = target->bt_logical_sectorsize;
> + da.d_mem = target->bt_logical_sectorsize;
> + if (xfs_is_cow_inode(ip))
> + da.d_miniosz = mp->m_sb.sb_blocksize;
> + else
> + da.d_miniosz = target->bt_logical_sectorsize;
> da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
>
> if (copy_to_user(arg, &da, sizeof(da)))
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 6b0228a21617..990df072ba35 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -582,7 +582,20 @@ xfs_report_dioalign(
>
> stat->result_mask |= STATX_DIOALIGN;
> stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
> - stat->dio_offset_align = bdev_logical_block_size(bdev);
> +
> + /*
> + * On COW inodes we are forced to always rewrite an entire file system
> + * block.
That's not quite accurate -- we're always forced to write an entire file
allocation unit so that the rest of the bmap code doesn't have to deal
with a file range that's mapped to multiple different space extents.
For all the existing reflink scenarios the allocation unit is always an
fsblock so this is a trifling difference.
However, once we start adding reflink to the rt device then there comes
the question of needing to handle allocation unit > fsblock, and all
these bits would have to change.
IOWs, I'm saying that this should be:
if (xfs_is_cow_inode(ip))
stat->dio_offset_align = xfs_inode_alloc_unitsize(ip);
else
...
Though ATM this is a distinction that doesn't make a difference.
--D
> + *
> + * Because applications assume they can do sector sized direct writes
> + * on XFS we provide an emulation by doing a read-modify-write cycle
> + * through the cache, but that is highly inefficient. Thus report the
> + * natively supported size here.
> + */
> + if (xfs_is_cow_inode(ip))
> + stat->dio_offset_align = ip->i_mount->m_sb.sb_blocksize;
> + else
> + stat->dio_offset_align = bdev_logical_block_size(bdev);
> }
>
> static void
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 08/43] xfs: report the correct dio alignment for COW inodes
2024-12-12 21:29 ` Darrick J. Wong
@ 2024-12-13 5:09 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:09 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 01:29:53PM -0800, Darrick J. Wong wrote:
> > + /*
> > + * On COW inodes we are forced to always rewrite an entire file system
> > + * block.
>
> That's not quite accurate -- we're always forced to write an entire file
> allocation unit so that the rest of the bmap code doesn't have to deal
> with a file range that's mapped to multiple different space extents.
> For all the existing reflink scenarios the allocation unit is always an
> fsblock so this is a trifling difference.
Which right now is a block until we support horrors such as reflink
on larger rtextsize inodes or the forcealign stuff. But yes, it
could use the helper.
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 09/43] xfs: generalize the freespace and reserved blocks handling
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (7 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 08/43] xfs: report the correct dio alignment for COW inodes Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 21:37 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 10/43] xfs: preserve RT reservations across remounts Christoph Hellwig
` (33 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
The main handling of the incore per-cpu freespace counters is already
done in xfs_mod_freecounter for both the block and RT extent cases,
but the actual counter is passed in and special-cased.
Replace both the percpu counters and the resblks counters with arrays,
so that reserved RT extents can be supported, which will be
needed for garbage collection on zoned devices.
Use helpers to access the freespace counters everywhere instead of
poking through the abstraction by using the percpu_counter helpers
directly. This also switches the flooring of the frextents counter
to 0 in statfs for the rtinherit case to a manual min_t call to match
the handling of the fdblocks counter for normal file systems.
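With the arrays in place, callers name the counter they operate on
explicitly, e.g. (the calling convention used throughout the hunks
below):

	error = xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, false);
	xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta);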
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_ialloc.c | 2 +-
fs/xfs/libxfs/xfs_metafile.c | 2 +-
fs/xfs/libxfs/xfs_sb.c | 7 +--
fs/xfs/scrub/fscounters.c | 13 +++---
fs/xfs/scrub/fscounters_repair.c | 4 +-
fs/xfs/scrub/newbt.c | 2 +-
fs/xfs/xfs_fsops.c | 27 ++++++------
fs/xfs/xfs_fsops.h | 3 +-
fs/xfs/xfs_icache.c | 4 +-
fs/xfs/xfs_ioctl.c | 12 +++---
fs/xfs/xfs_iomap.c | 9 ++--
fs/xfs/xfs_mount.c | 58 ++++++++++++++++---------
fs/xfs/xfs_mount.h | 65 +++++++++++++++++++---------
fs/xfs/xfs_rtalloc.c | 2 +-
fs/xfs/xfs_super.c | 74 ++++++++++++++++++--------------
fs/xfs/xfs_trace.h | 2 +-
16 files changed, 171 insertions(+), 115 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index f3a840a425f5..57513ba19d6a 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1927,7 +1927,7 @@ xfs_dialloc(
* that we can immediately allocate, but then we allow allocation on the
* second pass if we fail to find an AG with free inodes in it.
*/
- if (percpu_counter_read_positive(&mp->m_fdblocks) <
+ if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) <
mp->m_low_space[XFS_LOWSP_1_PCNT]) {
ok_alloc = false;
low_space = true;
diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c
index e151663cc9ef..c84820f5bdc6 100644
--- a/fs/xfs/libxfs/xfs_metafile.c
+++ b/fs/xfs/libxfs/xfs_metafile.c
@@ -77,7 +77,7 @@ xfs_metafile_resv_can_cover(
* There aren't enough blocks left in the inode's reservation, but it
* isn't critical unless there also isn't enough free space.
*/
- return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
+ return xfs_compare_freecounter(ip->i_mount, XC_FREE_BLOCKS,
rhs - ip->i_delayed_blks, 2048) >= 0;
}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 3dc5f5dba162..090f133f4da3 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1266,7 +1266,7 @@ xfs_log_sb(
percpu_counter_sum_positive(&mp->m_ifree),
mp->m_sb.sb_icount);
mp->m_sb.sb_fdblocks =
- percpu_counter_sum_positive(&mp->m_fdblocks);
+ max(0LL, xfs_sum_freecounter(mp, XC_FREE_BLOCKS));
}
/*
@@ -1275,9 +1275,10 @@ xfs_log_sb(
* we handle nearly-lockless reservations, so we must use the _positive
* variant here to avoid writing out nonsense frextents.
*/
- if (xfs_has_rtgroups(mp))
+ if (xfs_has_rtgroups(mp)) {
mp->m_sb.sb_frextents =
- percpu_counter_sum_positive(&mp->m_frextents);
+ xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
+ }
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index ca23cf4db6c5..732658a62a2d 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -350,7 +350,7 @@ xchk_fscount_aggregate_agcounts(
* The global incore space reservation is taken from the incore
* counters, so leave that out of the computation.
*/
- fsc->fdblocks -= mp->m_resblks_avail;
+ fsc->fdblocks -= mp->m_resblks[XC_FREE_BLOCKS].avail;
/*
* Delayed allocation reservations are taken out of the incore counters
@@ -513,8 +513,8 @@ xchk_fscounters(
/* Snapshot the percpu counters. */
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
- fdblocks = percpu_counter_sum(&mp->m_fdblocks);
- frextents = percpu_counter_sum(&mp->m_frextents);
+ fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
+ frextents = xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
/* No negative values, please! */
if (icount < 0 || ifree < 0)
@@ -589,15 +589,16 @@ xchk_fscounters(
try_again = true;
}
- if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
- fsc->fdblocks)) {
+ if (!xchk_fscount_within_range(sc, fdblocks,
+ &mp->m_free[XC_FREE_BLOCKS], fsc->fdblocks)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
else
try_again = true;
}
- if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
+ if (!xchk_fscount_within_range(sc, frextents,
+ &mp->m_free[XC_FREE_RTEXTENTS],
fsc->frextents - fsc->frextents_delayed)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
index cda13447a373..8fb0db78489e 100644
--- a/fs/xfs/scrub/fscounters_repair.c
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -64,7 +64,7 @@ xrep_fscounters(
percpu_counter_set(&mp->m_icount, fsc->icount);
percpu_counter_set(&mp->m_ifree, fsc->ifree);
- percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
+ xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks);
/*
* Online repair is only supported on v5 file systems, which require
@@ -74,7 +74,7 @@ xrep_fscounters(
* track of the delalloc reservations separately, as they are are
* subtracted from m_frextents, but not included in sb_frextents.
*/
- percpu_counter_set(&mp->m_frextents,
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
fsc->frextents - fsc->frextents_delayed);
if (!xfs_has_rtgroups(mp))
mp->m_sb.sb_frextents = fsc->frextents;
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index ac38f5843090..3e46b04f427f 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -62,7 +62,7 @@ xrep_newbt_estimate_slack(
free = sc->sa.pag->pagf_freeblks;
sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
} else {
- free = percpu_counter_sum(&sc->mp->m_fdblocks);
+ free = xfs_sum_freecounter(sc->mp, XC_FREE_BLOCKS);
sz = sc->mp->m_sb.sb_dblocks;
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 455298503d01..bb2e31e338b8 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -366,6 +366,7 @@ xfs_growfs_log(
int
xfs_reserve_blocks(
struct xfs_mount *mp,
+ enum xfs_free_counter ctr,
uint64_t request)
{
int64_t lcounter, delta;
@@ -373,6 +374,8 @@ xfs_reserve_blocks(
int64_t free;
int error = 0;
+ ASSERT(ctr < XC_FREE_NR);
+
/*
* With per-cpu counters, this becomes an interesting problem. we need
* to work out if we are freeing or allocation blocks first, then we can
@@ -391,16 +394,16 @@ xfs_reserve_blocks(
* counters directly since we shouldn't have any problems unreserving
* space.
*/
- if (mp->m_resblks > request) {
- lcounter = mp->m_resblks_avail - request;
+ if (mp->m_resblks[ctr].total > request) {
+ lcounter = mp->m_resblks[ctr].avail - request;
if (lcounter > 0) { /* release unused blocks */
fdblks_delta = lcounter;
- mp->m_resblks_avail -= lcounter;
+ mp->m_resblks[ctr].avail -= lcounter;
}
- mp->m_resblks = request;
+ mp->m_resblks[ctr].total = request;
if (fdblks_delta) {
spin_unlock(&mp->m_sb_lock);
- xfs_add_fdblocks(mp, fdblks_delta);
+ xfs_add_freecounter(mp, ctr, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
@@ -409,7 +412,7 @@ xfs_reserve_blocks(
/*
* If the request is larger than the current reservation, reserve the
- * blocks before we update the reserve counters. Sample m_fdblocks and
+ * blocks before we update the reserve counters. Sample m_free and
* perform a partial reservation if the request exceeds free space.
*
* The code below estimates how many blocks it can request from
@@ -419,10 +422,10 @@ xfs_reserve_blocks(
* space to fill it because mod_fdblocks will refill an undersized
* reserve when it can.
*/
- free = percpu_counter_sum(&mp->m_fdblocks) -
- xfs_fdblocks_unavailable(mp);
- delta = request - mp->m_resblks;
- mp->m_resblks = request;
+ free = xfs_sum_freecounter(mp, ctr) -
+ xfs_freecounter_unavailable(mp, ctr);
+ delta = request - mp->m_resblks[ctr].total;
+ mp->m_resblks[ctr].total = request;
if (delta > 0 && free > 0) {
/*
* We'll either succeed in getting space from the free block
@@ -436,9 +439,9 @@ xfs_reserve_blocks(
*/
fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
- error = xfs_dec_fdblocks(mp, fdblks_delta, 0);
+ error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0);
if (!error)
- xfs_add_fdblocks(mp, fdblks_delta);
+ xfs_add_freecounter(mp, ctr, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
out:
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 3e2f73bcf831..9d23c361ef56 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -8,7 +8,8 @@
int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
-int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
+int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt,
+ uint64_t request);
int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);
int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 7b6c026d01a1..c9ded501e89b 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -2076,7 +2076,7 @@ xfs_inodegc_want_queue_rt_file(
if (!XFS_IS_REALTIME_INODE(ip))
return false;
- if (__percpu_counter_compare(&mp->m_frextents,
+ if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS,
mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;
@@ -2104,7 +2104,7 @@ xfs_inodegc_want_queue_work(
if (items > mp->m_ino_geo.inodes_per_cluster)
return true;
- if (__percpu_counter_compare(&mp->m_fdblocks,
+ if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
mp->m_low_space[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index de8ba5345e17..d3cf62d81f0d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1131,15 +1131,15 @@ xfs_ioctl_getset_resblocks(
error = mnt_want_write_file(filp);
if (error)
return error;
- error = xfs_reserve_blocks(mp, fsop.resblks);
+ error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks);
mnt_drop_write_file(filp);
if (error)
return error;
}
spin_lock(&mp->m_sb_lock);
- fsop.resblks = mp->m_resblks;
- fsop.resblks_avail = mp->m_resblks_avail;
+ fsop.resblks = mp->m_resblks[XC_FREE_BLOCKS].total;
+ fsop.resblks_avail = mp->m_resblks[XC_FREE_BLOCKS].avail;
spin_unlock(&mp->m_sb_lock);
if (copy_to_user(arg, &fsop, sizeof(fsop)))
@@ -1155,9 +1155,9 @@ xfs_ioctl_fs_counts(
struct xfs_fsop_counts out = {
.allocino = percpu_counter_read_positive(&mp->m_icount),
.freeino = percpu_counter_read_positive(&mp->m_ifree),
- .freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
- xfs_fdblocks_unavailable(mp),
- .freertx = percpu_counter_read_positive(&mp->m_frextents),
+ .freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) -
+ xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS),
+ .freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS),
};
if (copy_to_user(uarg, &out, sizeof(out)))
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index b3783d7b8ebe..f3f4b5c328c3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -432,13 +432,14 @@ xfs_quota_calc_throttle(
static int64_t
xfs_iomap_freesp(
- struct percpu_counter *counter,
+ struct xfs_mount *mp,
+ unsigned int idx,
uint64_t low_space[XFS_LOWSP_MAX],
int *shift)
{
int64_t freesp;
- freesp = percpu_counter_read_positive(counter);
+ freesp = xfs_estimate_freecounter(mp, idx);
if (freesp < low_space[XFS_LOWSP_5_PCNT]) {
*shift = 2;
if (freesp < low_space[XFS_LOWSP_4_PCNT])
@@ -537,10 +538,10 @@ xfs_iomap_prealloc_size(
if (unlikely(XFS_IS_REALTIME_INODE(ip)))
freesp = xfs_rtbxlen_to_blen(mp,
- xfs_iomap_freesp(&mp->m_frextents,
+ xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS,
mp->m_low_rtexts, &shift));
else
- freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space,
+ freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space,
&shift);
/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 66b91b582691..4174035b2ac9 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1058,7 +1058,8 @@ xfs_mountfs(
* we were already there on the last unmount. Warn if this occurs.
*/
if (!xfs_is_readonly(mp)) {
- error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
+ error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS,
+ xfs_default_resblks(mp));
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool.");
@@ -1178,7 +1179,7 @@ xfs_unmountfs(
* we only every apply deltas to the superblock and hence the incore
* value does not matter....
*/
- error = xfs_reserve_blocks(mp, 0);
+ error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0);
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
@@ -1225,52 +1226,68 @@ xfs_fs_writable(
return true;
}
+/*
+ * Estimate the amount of free space that is not available to userspace and is
+ * not explicitly reserved from the incore fdblocks. This includes:
+ *
+ * - The minimum number of blocks needed to support splitting a bmap btree
+ * - The blocks currently in use by the freespace btrees because they record
+ * the actual blocks that will fill per-AG metadata space reservations
+ */
+uint64_t
+xfs_freecounter_unavailable(
+ struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ if (ctr == XC_FREE_RTEXTENTS)
+ return 0;
+ return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+}
+
void
xfs_add_freecounter(
struct xfs_mount *mp,
- struct percpu_counter *counter,
+ enum xfs_free_counter ctr,
uint64_t delta)
{
- bool has_resv_pool = (counter == &mp->m_fdblocks);
uint64_t res_used;
/*
* If the reserve pool is depleted, put blocks back into it first.
* Most of the time the pool is full.
*/
- if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) {
- percpu_counter_add(counter, delta);
+ if (likely(mp->m_resblks[ctr].total == mp->m_resblks[ctr].avail)) {
+ percpu_counter_add(&mp->m_free[ctr], delta);
return;
}
spin_lock(&mp->m_sb_lock);
- res_used = mp->m_resblks - mp->m_resblks_avail;
+ res_used = mp->m_resblks[ctr].total - mp->m_resblks[ctr].avail;
if (res_used > delta) {
- mp->m_resblks_avail += delta;
+ mp->m_resblks[ctr].avail += delta;
} else {
delta -= res_used;
- mp->m_resblks_avail = mp->m_resblks;
- percpu_counter_add(counter, delta);
+ mp->m_resblks[ctr].avail = mp->m_resblks[ctr].total;
+ percpu_counter_add(&mp->m_free[ctr], delta);
}
spin_unlock(&mp->m_sb_lock);
}
+
+/* Adjust in-core free blocks or RT extents. */
int
xfs_dec_freecounter(
struct xfs_mount *mp,
- struct percpu_counter *counter,
+ enum xfs_free_counter ctr,
uint64_t delta,
bool rsvd)
{
+ struct percpu_counter *counter = &mp->m_free[ctr];
int64_t lcounter;
uint64_t set_aside = 0;
s32 batch;
- bool has_resv_pool;
- ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
- has_resv_pool = (counter == &mp->m_fdblocks);
- if (rsvd)
- ASSERT(has_resv_pool);
+ ASSERT(ctr < XC_FREE_NR);
/*
* Taking blocks away, need to be more accurate the closer we
@@ -1297,8 +1314,7 @@ xfs_dec_freecounter(
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
- if (has_resv_pool)
- set_aside = xfs_fdblocks_unavailable(mp);
+ set_aside = xfs_freecounter_unavailable(mp, ctr);
percpu_counter_add_batch(counter, -((int64_t)delta), batch);
if (__percpu_counter_compare(counter, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
@@ -1312,12 +1328,12 @@ xfs_dec_freecounter(
*/
spin_lock(&mp->m_sb_lock);
percpu_counter_add(counter, delta);
- if (!has_resv_pool || !rsvd)
+ if (!rsvd)
goto fdblocks_enospc;
- lcounter = (long long)mp->m_resblks_avail - delta;
+ lcounter = (long long)mp->m_resblks[ctr].avail - delta;
if (lcounter >= 0) {
- mp->m_resblks_avail = lcounter;
+ mp->m_resblks[ctr].avail = lcounter;
spin_unlock(&mp->m_sb_lock);
return 0;
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index fbed172d6770..d92bce7bc184 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -105,6 +105,12 @@ struct xfs_groups {
uint64_t blkmask;
};
+enum xfs_free_counter {
+ XC_FREE_BLOCKS, /* free block counter */
+ XC_FREE_RTEXTENTS, /* free rt extent counter */
+ XC_FREE_NR,
+};
+
/*
* The struct xfsmount layout is optimised to separate read-mostly variables
* from variables that are frequently modified. We put the read-mostly variables
@@ -222,8 +228,7 @@ typedef struct xfs_mount {
spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
- struct percpu_counter m_fdblocks; /* free block counter */
- struct percpu_counter m_frextents; /* free rt extent counter */
+ struct percpu_counter m_free[XC_FREE_NR];
/*
* Count of data device blocks reserved for delayed allocations,
@@ -245,9 +250,11 @@ typedef struct xfs_mount {
atomic64_t m_allocbt_blks;
struct xfs_groups m_groups[XG_TYPE_MAX];
- uint64_t m_resblks; /* total reserved blocks */
- uint64_t m_resblks_avail;/* available reserved blocks */
- uint64_t m_resblks_save; /* reserved blks @ remount,ro */
+ struct {
+ uint64_t total; /* total reserved blocks */
+ uint64_t avail; /* available reserved blocks */
+ uint64_t save; /* reserved blks @ remount,ro */
+ } m_resblks[XC_FREE_NR];
struct delayed_work m_reclaim_work; /* background inode reclaim */
struct dentry *m_debugfs; /* debugfs parent */
struct xfs_kobj m_kobj;
@@ -646,45 +653,61 @@ extern void xfs_unmountfs(xfs_mount_t *);
*/
#define XFS_FDBLOCKS_BATCH 1024
+uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp,
+ enum xfs_free_counter ctr);
+
+static inline s64 xfs_sum_freecounter(struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ return percpu_counter_sum(&mp->m_free[ctr]);
+}
+
/*
- * Estimate the amount of free space that is not available to userspace and is
- * not explicitly reserved from the incore fdblocks. This includes:
- *
- * - The minimum number of blocks needed to support splitting a bmap btree
- * - The blocks currently in use by the freespace btrees because they record
- * the actual blocks that will fill per-AG metadata space reservations
+ * This just provides an estimate without the cpu-local updates; use
+ * xfs_sum_freecounter for the exact value.
*/
-static inline uint64_t
-xfs_fdblocks_unavailable(
- struct xfs_mount *mp)
+static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ return percpu_counter_read_positive(&mp->m_free[ctr]);
+}
+
+static inline int xfs_compare_freecounter(struct xfs_mount *mp,
+ enum xfs_free_counter ctr, s64 rhs, s32 batch)
+{
+ return __percpu_counter_compare(&mp->m_free[ctr], rhs, batch);
+}
+
+static inline void xfs_set_freecounter(struct xfs_mount *mp,
+ enum xfs_free_counter ctr, uint64_t val)
{
- return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+ percpu_counter_set(&mp->m_free[ctr], val);
}
-int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta, bool rsvd);
-void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta);
static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta,
bool reserved)
{
- return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+ return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved);
}
static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta)
{
- xfs_add_freecounter(mp, &mp->m_fdblocks, delta);
+ xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta);
}
static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta)
{
- return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false);
+ return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false);
}
static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta)
{
- xfs_add_freecounter(mp, &mp->m_frextents, delta);
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta);
}
extern int xfs_readsb(xfs_mount_t *, int);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index bc18b694db75..8da2498417f5 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1519,7 +1519,7 @@ xfs_rtalloc_reinit_frextents(
spin_lock(&mp->m_sb_lock);
mp->m_sb.sb_frextents = val;
spin_unlock(&mp->m_sb_lock);
- percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents);
return 0;
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index a74a0cc1f6f6..1960ee0aad45 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -834,10 +834,11 @@ xfs_statfs_data(
struct kstatfs *st)
{
int64_t fdblocks =
- percpu_counter_sum(&mp->m_fdblocks);
+ xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
/* make sure st->f_bfree does not underflow */
- st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp));
+ st->f_bfree = max(0LL,
+ fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS));
st->f_blocks = mp->m_sb.sb_dblocks - xfs_internal_log_size(mp);
}
@@ -852,9 +853,9 @@ xfs_statfs_rt(
struct kstatfs *st)
{
int64_t freertx =
- percpu_counter_sum_positive(&mp->m_frextents);
+ xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
- st->f_bfree = xfs_rtbxlen_to_blen(mp, freertx);
+ st->f_bfree = xfs_rtbxlen_to_blen(mp, max(0LL, freertx));
st->f_blocks = mp->m_sb.sb_rblocks;
}
@@ -920,24 +921,32 @@ xfs_fs_statfs(
}
STATIC void
-xfs_save_resvblks(struct xfs_mount *mp)
+xfs_save_resvblks(
+ struct xfs_mount *mp)
{
- mp->m_resblks_save = mp->m_resblks;
- xfs_reserve_blocks(mp, 0);
+ enum xfs_free_counter i;
+
+ for (i = 0; i < XC_FREE_NR; i++) {
+ mp->m_resblks[i].save = mp->m_resblks[i].total;
+ xfs_reserve_blocks(mp, i, 0);
+ }
}
STATIC void
-xfs_restore_resvblks(struct xfs_mount *mp)
+xfs_restore_resvblks(
+ struct xfs_mount *mp)
{
- uint64_t resblks;
+ uint64_t resblks;
+ enum xfs_free_counter i;
- if (mp->m_resblks_save) {
- resblks = mp->m_resblks_save;
- mp->m_resblks_save = 0;
- } else
- resblks = xfs_default_resblks(mp);
-
- xfs_reserve_blocks(mp, resblks);
+ for (i = 0; i < XC_FREE_NR; i++) {
+ if (mp->m_resblks[i].save) {
+ resblks = mp->m_resblks[i].save;
+ mp->m_resblks[i].save = 0;
+ } else
+ resblks = xfs_default_resblks(mp);
+ xfs_reserve_blocks(mp, i, resblks);
+ }
}
/*
@@ -1063,7 +1072,8 @@ static int
xfs_init_percpu_counters(
struct xfs_mount *mp)
{
- int error;
+ int error;
+ enum xfs_free_counter i;
error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
if (error)
@@ -1073,30 +1083,28 @@ xfs_init_percpu_counters(
if (error)
goto free_icount;
- error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
- if (error)
- goto free_ifree;
-
error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
if (error)
- goto free_fdblocks;
+ goto free_ifree;
error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL);
if (error)
goto free_delalloc;
- error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
- if (error)
- goto free_delalloc_rt;
+ for (i = 0; i < XC_FREE_NR; i++) {
+ error = percpu_counter_init(&mp->m_free[i], 0, GFP_KERNEL);
+ if (error)
+ goto free_freecounters;
+ }
return 0;
-free_delalloc_rt:
+free_freecounters:
+	while (--i >= 0)
+ percpu_counter_destroy(&mp->m_free[i]);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
free_delalloc:
percpu_counter_destroy(&mp->m_delalloc_blks);
-free_fdblocks:
- percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
percpu_counter_destroy(&mp->m_ifree);
free_icount:
@@ -1110,24 +1118,26 @@ xfs_reinit_percpu_counters(
{
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
- percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
- percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+ xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks);
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents);
}
static void
xfs_destroy_percpu_counters(
struct xfs_mount *mp)
{
+ enum xfs_free_counter i;
+
+ for (i = 0; i < XC_FREE_NR; i++)
+ percpu_counter_destroy(&mp->m_free[i]);
percpu_counter_destroy(&mp->m_icount);
percpu_counter_destroy(&mp->m_ifree);
- percpu_counter_destroy(&mp->m_fdblocks);
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_rtextents) == 0);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
- percpu_counter_destroy(&mp->m_frextents);
}
static int
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 4fe689410eb6..15dec76fec10 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -5621,7 +5621,7 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
__entry->dev = mp->m_super->s_dev;
__entry->ino = ip->i_ino;
- __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks);
+ __entry->freeblks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
__entry->reserved = ip->i_delayed_blks;
__entry->asked = ip->i_meta_resv_asked;
__entry->used = ip->i_nblocks;
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 09/43] xfs: generalize the freespace and reserved blocks handling
2024-12-11 8:54 ` [PATCH 09/43] xfs: generalize the freespace and reserved blocks handling Christoph Hellwig
@ 2024-12-12 21:37 ` Darrick J. Wong
2024-12-13 5:11 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 21:37 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:34AM +0100, Christoph Hellwig wrote:
> The main handling of the incore per-cpu freespace counters is already
> handled in xfs_mod_freecounter for both the block and RT extent cases,
> but the actual counter is passed in and special-cased.
>
> Replace both the percpu counters and the resblks counters with arrays,
> so that support for reserved RT extents can be added, which will be
> needed for garbage collection on zoned devices.
>
> Use helpers to access the freespace counters everywhere instead of
> poking through the abstraction by using the percpu_counter helpers
> directly. This also switches the flooring of the frextents counter
> to 0 in statfs for the rtinherit case to a manual max call to match
> the handling of the fdblocks counter for normal file systems.
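(For anyone skimming the conversion below: the pattern is mechanical,
e.g., taken straight from the patch:

	percpu_counter_read_positive(&mp->m_fdblocks)
		becomes xfs_estimate_freecounter(mp, XC_FREE_BLOCKS)
	percpu_counter_sum(&mp->m_fdblocks)
		becomes xfs_sum_freecounter(mp, XC_FREE_BLOCKS)
	percpu_counter_set(&mp->m_frextents, val)
		becomes xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, val)
)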
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/libxfs/xfs_ialloc.c | 2 +-
> fs/xfs/libxfs/xfs_metafile.c | 2 +-
> fs/xfs/libxfs/xfs_sb.c | 7 +--
> fs/xfs/scrub/fscounters.c | 13 +++---
> fs/xfs/scrub/fscounters_repair.c | 4 +-
> fs/xfs/scrub/newbt.c | 2 +-
> fs/xfs/xfs_fsops.c | 27 ++++++------
> fs/xfs/xfs_fsops.h | 3 +-
> fs/xfs/xfs_icache.c | 4 +-
> fs/xfs/xfs_ioctl.c | 12 +++---
> fs/xfs/xfs_iomap.c | 9 ++--
> fs/xfs/xfs_mount.c | 58 ++++++++++++++++---------
> fs/xfs/xfs_mount.h | 65 +++++++++++++++++++---------
> fs/xfs/xfs_rtalloc.c | 2 +-
> fs/xfs/xfs_super.c | 74 ++++++++++++++++++--------------
> fs/xfs/xfs_trace.h | 2 +-
> 16 files changed, 171 insertions(+), 115 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
> index f3a840a425f5..57513ba19d6a 100644
> --- a/fs/xfs/libxfs/xfs_ialloc.c
> +++ b/fs/xfs/libxfs/xfs_ialloc.c
> @@ -1927,7 +1927,7 @@ xfs_dialloc(
> * that we can immediately allocate, but then we allow allocation on the
> * second pass if we fail to find an AG with free inodes in it.
> */
> - if (percpu_counter_read_positive(&mp->m_fdblocks) <
> + if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) <
> mp->m_low_space[XFS_LOWSP_1_PCNT]) {
> ok_alloc = false;
> low_space = true;
> diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c
> index e151663cc9ef..c84820f5bdc6 100644
> --- a/fs/xfs/libxfs/xfs_metafile.c
> +++ b/fs/xfs/libxfs/xfs_metafile.c
> @@ -77,7 +77,7 @@ xfs_metafile_resv_can_cover(
> * There aren't enough blocks left in the inode's reservation, but it
> * isn't critical unless there also isn't enough free space.
> */
> - return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
> + return xfs_compare_freecounter(ip->i_mount, XC_FREE_BLOCKS,
> rhs - ip->i_delayed_blks, 2048) >= 0;
> }
>
> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
> index 3dc5f5dba162..090f133f4da3 100644
> --- a/fs/xfs/libxfs/xfs_sb.c
> +++ b/fs/xfs/libxfs/xfs_sb.c
> @@ -1266,7 +1266,7 @@ xfs_log_sb(
> percpu_counter_sum_positive(&mp->m_ifree),
> mp->m_sb.sb_icount);
> mp->m_sb.sb_fdblocks =
> - percpu_counter_sum_positive(&mp->m_fdblocks);
> + max(0LL, xfs_sum_freecounter(mp, XC_FREE_BLOCKS));
> }
>
> /*
> @@ -1275,9 +1275,10 @@ xfs_log_sb(
> * we handle nearly-lockless reservations, so we must use the _positive
> * variant here to avoid writing out nonsense frextents.
> */
> - if (xfs_has_rtgroups(mp))
> + if (xfs_has_rtgroups(mp)) {
> mp->m_sb.sb_frextents =
> - percpu_counter_sum_positive(&mp->m_frextents);
> + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
Curious. xfs_sum_freecounter returns percpu_counter_sum, not its
_positive variant. This seems like a bug? Or at least an omitted
max(0LL, ...) call?
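i.e. something like this, untested, mirroring the max(0LL, ...) flooring
used for sb_fdblocks just above:

	mp->m_sb.sb_frextents = max(0LL,
			xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));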
> + }
>
> xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
> xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
> diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
> index ca23cf4db6c5..732658a62a2d 100644
> --- a/fs/xfs/scrub/fscounters.c
> +++ b/fs/xfs/scrub/fscounters.c
> @@ -350,7 +350,7 @@ xchk_fscount_aggregate_agcounts(
> * The global incore space reservation is taken from the incore
> * counters, so leave that out of the computation.
> */
> - fsc->fdblocks -= mp->m_resblks_avail;
> + fsc->fdblocks -= mp->m_resblks[XC_FREE_BLOCKS].avail;
>
> /*
> * Delayed allocation reservations are taken out of the incore counters
> @@ -513,8 +513,8 @@ xchk_fscounters(
> /* Snapshot the percpu counters. */
> icount = percpu_counter_sum(&mp->m_icount);
> ifree = percpu_counter_sum(&mp->m_ifree);
> - fdblocks = percpu_counter_sum(&mp->m_fdblocks);
> - frextents = percpu_counter_sum(&mp->m_frextents);
> + fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
> + frextents = xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
>
> /* No negative values, please! */
> if (icount < 0 || ifree < 0)
> @@ -589,15 +589,16 @@ xchk_fscounters(
> try_again = true;
> }
>
> - if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
> - fsc->fdblocks)) {
> + if (!xchk_fscount_within_range(sc, fdblocks,
> + &mp->m_free[XC_FREE_BLOCKS], fsc->fdblocks)) {
> if (fsc->frozen)
> xchk_set_corrupt(sc);
> else
> try_again = true;
> }
>
> - if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
> + if (!xchk_fscount_within_range(sc, frextents,
> + &mp->m_free[XC_FREE_RTEXTENTS],
> fsc->frextents - fsc->frextents_delayed)) {
> if (fsc->frozen)
> xchk_set_corrupt(sc);
> diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
> index cda13447a373..8fb0db78489e 100644
> --- a/fs/xfs/scrub/fscounters_repair.c
> +++ b/fs/xfs/scrub/fscounters_repair.c
> @@ -64,7 +64,7 @@ xrep_fscounters(
>
> percpu_counter_set(&mp->m_icount, fsc->icount);
> percpu_counter_set(&mp->m_ifree, fsc->ifree);
> - percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
> + xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks);
>
> /*
> * Online repair is only supported on v5 file systems, which require
> @@ -74,7 +74,7 @@ xrep_fscounters(
> * track of the delalloc reservations separately, as they are are
> * subtracted from m_frextents, but not included in sb_frextents.
> */
> - percpu_counter_set(&mp->m_frextents,
> + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
> fsc->frextents - fsc->frextents_delayed);
> if (!xfs_has_rtgroups(mp))
> mp->m_sb.sb_frextents = fsc->frextents;
> diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
> index ac38f5843090..3e46b04f427f 100644
> --- a/fs/xfs/scrub/newbt.c
> +++ b/fs/xfs/scrub/newbt.c
> @@ -62,7 +62,7 @@ xrep_newbt_estimate_slack(
> free = sc->sa.pag->pagf_freeblks;
> sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
> } else {
> - free = percpu_counter_sum(&sc->mp->m_fdblocks);
> + free = xfs_sum_freecounter(sc->mp, XC_FREE_BLOCKS);
> sz = sc->mp->m_sb.sb_dblocks;
> }
>
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index 455298503d01..bb2e31e338b8 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -366,6 +366,7 @@ xfs_growfs_log(
> int
> xfs_reserve_blocks(
> struct xfs_mount *mp,
> + enum xfs_free_counter ctr,
> uint64_t request)
> {
> int64_t lcounter, delta;
> @@ -373,6 +374,8 @@ xfs_reserve_blocks(
> int64_t free;
> int error = 0;
>
> + ASSERT(ctr < XC_FREE_NR);
> +
> /*
> * With per-cpu counters, this becomes an interesting problem. we need
> * to work out if we are freeing or allocation blocks first, then we can
> @@ -391,16 +394,16 @@ xfs_reserve_blocks(
> * counters directly since we shouldn't have any problems unreserving
> * space.
> */
> - if (mp->m_resblks > request) {
> - lcounter = mp->m_resblks_avail - request;
> + if (mp->m_resblks[ctr].total > request) {
> + lcounter = mp->m_resblks[ctr].avail - request;
> if (lcounter > 0) { /* release unused blocks */
> fdblks_delta = lcounter;
> - mp->m_resblks_avail -= lcounter;
> + mp->m_resblks[ctr].avail -= lcounter;
> }
> - mp->m_resblks = request;
> + mp->m_resblks[ctr].total = request;
> if (fdblks_delta) {
> spin_unlock(&mp->m_sb_lock);
> - xfs_add_fdblocks(mp, fdblks_delta);
> + xfs_add_freecounter(mp, ctr, fdblks_delta);
> spin_lock(&mp->m_sb_lock);
> }
>
> @@ -409,7 +412,7 @@ xfs_reserve_blocks(
>
> /*
> * If the request is larger than the current reservation, reserve the
> - * blocks before we update the reserve counters. Sample m_fdblocks and
> + * blocks before we update the reserve counters. Sample m_free and
> * perform a partial reservation if the request exceeds free space.
> *
> * The code below estimates how many blocks it can request from
> @@ -419,10 +422,10 @@ xfs_reserve_blocks(
> * space to fill it because mod_fdblocks will refill an undersized
> * reserve when it can.
> */
> - free = percpu_counter_sum(&mp->m_fdblocks) -
> - xfs_fdblocks_unavailable(mp);
> - delta = request - mp->m_resblks;
> - mp->m_resblks = request;
> + free = xfs_sum_freecounter(mp, ctr) -
> + xfs_freecounter_unavailable(mp, ctr);
> + delta = request - mp->m_resblks[ctr].total;
> + mp->m_resblks[ctr].total = request;
> if (delta > 0 && free > 0) {
> /*
> * We'll either succeed in getting space from the free block
> @@ -436,9 +439,9 @@ xfs_reserve_blocks(
> */
> fdblks_delta = min(free, delta);
> spin_unlock(&mp->m_sb_lock);
> - error = xfs_dec_fdblocks(mp, fdblks_delta, 0);
> + error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0);
> if (!error)
> - xfs_add_fdblocks(mp, fdblks_delta);
> + xfs_add_freecounter(mp, ctr, fdblks_delta);
> spin_lock(&mp->m_sb_lock);
> }
> out:
> diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
> index 3e2f73bcf831..9d23c361ef56 100644
> --- a/fs/xfs/xfs_fsops.h
> +++ b/fs/xfs/xfs_fsops.h
> @@ -8,7 +8,8 @@
>
> int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
> int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
> -int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
> +int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt,
> + uint64_t request);
> int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);
>
> int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index 7b6c026d01a1..c9ded501e89b 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -2076,7 +2076,7 @@ xfs_inodegc_want_queue_rt_file(
> if (!XFS_IS_REALTIME_INODE(ip))
> return false;
>
> - if (__percpu_counter_compare(&mp->m_frextents,
> + if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS,
> mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
> XFS_FDBLOCKS_BATCH) < 0)
> return true;
> @@ -2104,7 +2104,7 @@ xfs_inodegc_want_queue_work(
> if (items > mp->m_ino_geo.inodes_per_cluster)
> return true;
>
> - if (__percpu_counter_compare(&mp->m_fdblocks,
> + if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
> mp->m_low_space[XFS_LOWSP_5_PCNT],
> XFS_FDBLOCKS_BATCH) < 0)
> return true;
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index de8ba5345e17..d3cf62d81f0d 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -1131,15 +1131,15 @@ xfs_ioctl_getset_resblocks(
> error = mnt_want_write_file(filp);
> if (error)
> return error;
> - error = xfs_reserve_blocks(mp, fsop.resblks);
> + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks);
> mnt_drop_write_file(filp);
> if (error)
> return error;
> }
>
> spin_lock(&mp->m_sb_lock);
> - fsop.resblks = mp->m_resblks;
> - fsop.resblks_avail = mp->m_resblks_avail;
> + fsop.resblks = mp->m_resblks[XC_FREE_BLOCKS].total;
> + fsop.resblks_avail = mp->m_resblks[XC_FREE_BLOCKS].avail;
> spin_unlock(&mp->m_sb_lock);
>
> if (copy_to_user(arg, &fsop, sizeof(fsop)))
> @@ -1155,9 +1155,9 @@ xfs_ioctl_fs_counts(
> struct xfs_fsop_counts out = {
> .allocino = percpu_counter_read_positive(&mp->m_icount),
> .freeino = percpu_counter_read_positive(&mp->m_ifree),
> - .freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
> - xfs_fdblocks_unavailable(mp),
> - .freertx = percpu_counter_read_positive(&mp->m_frextents),
> + .freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) -
> + xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS),
> + .freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS),
> };
>
> if (copy_to_user(uarg, &out, sizeof(out)))
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index b3783d7b8ebe..f3f4b5c328c3 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -432,13 +432,14 @@ xfs_quota_calc_throttle(
>
> static int64_t
> xfs_iomap_freesp(
> - struct percpu_counter *counter,
> + struct xfs_mount *mp,
> + unsigned int idx,
> uint64_t low_space[XFS_LOWSP_MAX],
> int *shift)
> {
> int64_t freesp;
>
> - freesp = percpu_counter_read_positive(counter);
> + freesp = xfs_estimate_freecounter(mp, idx);
> if (freesp < low_space[XFS_LOWSP_5_PCNT]) {
> *shift = 2;
> if (freesp < low_space[XFS_LOWSP_4_PCNT])
> @@ -537,10 +538,10 @@ xfs_iomap_prealloc_size(
>
> if (unlikely(XFS_IS_REALTIME_INODE(ip)))
> freesp = xfs_rtbxlen_to_blen(mp,
> - xfs_iomap_freesp(&mp->m_frextents,
> + xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS,
> mp->m_low_rtexts, &shift));
> else
> - freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space,
> + freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space,
> &shift);
>
> /*
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 66b91b582691..4174035b2ac9 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -1058,7 +1058,8 @@ xfs_mountfs(
> * we were already there on the last unmount. Warn if this occurs.
> */
> if (!xfs_is_readonly(mp)) {
> - error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
> + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS,
> + xfs_default_resblks(mp));
> if (error)
> xfs_warn(mp,
> "Unable to allocate reserve blocks. Continuing without reserve pool.");
> @@ -1178,7 +1179,7 @@ xfs_unmountfs(
> * we only ever apply deltas to the superblock and hence the incore
> * value does not matter....
> */
> - error = xfs_reserve_blocks(mp, 0);
> + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0);
> if (error)
> xfs_warn(mp, "Unable to free reserved block pool. "
> "Freespace may not be correct on next mount.");
> @@ -1225,52 +1226,68 @@ xfs_fs_writable(
> return true;
> }
>
> +/*
> + * Estimate the amount of free space that is not available to userspace and is
> + * not explicitly reserved from the incore fdblocks. This includes:
> + *
> + * - The minimum number of blocks needed to support splitting a bmap btree
> + * - The blocks currently in use by the freespace btrees because they record
> + * the actual blocks that will fill per-AG metadata space reservations
> + */
> +uint64_t
> +xfs_freecounter_unavailable(
> + struct xfs_mount *mp,
> + enum xfs_free_counter ctr)
> +{
> + if (ctr == XC_FREE_RTEXTENTS)
> + return 0;
> + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
> +}
> +
> void
> xfs_add_freecounter(
> struct xfs_mount *mp,
> - struct percpu_counter *counter,
> + enum xfs_free_counter ctr,
> uint64_t delta)
> {
> - bool has_resv_pool = (counter == &mp->m_fdblocks);
> uint64_t res_used;
>
> /*
> * If the reserve pool is depleted, put blocks back into it first.
> * Most of the time the pool is full.
> */
> - if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) {
> - percpu_counter_add(counter, delta);
> + if (likely(mp->m_resblks[ctr].total == mp->m_resblks[ctr].avail)) {
> + percpu_counter_add(&mp->m_free[ctr], delta);
> return;
> }
>
> spin_lock(&mp->m_sb_lock);
> - res_used = mp->m_resblks - mp->m_resblks_avail;
> + res_used = mp->m_resblks[ctr].total - mp->m_resblks[ctr].avail;
> if (res_used > delta) {
> - mp->m_resblks_avail += delta;
> + mp->m_resblks[ctr].avail += delta;
> } else {
> delta -= res_used;
> - mp->m_resblks_avail = mp->m_resblks;
> - percpu_counter_add(counter, delta);
> + mp->m_resblks[ctr].avail = mp->m_resblks[ctr].total;
> + percpu_counter_add(&mp->m_free[ctr], delta);
> }
> spin_unlock(&mp->m_sb_lock);
> }
>
> +
> +/* Adjust in-core free blocks or RT extents. */
> int
> xfs_dec_freecounter(
> struct xfs_mount *mp,
> - struct percpu_counter *counter,
> + enum xfs_free_counter ctr,
> uint64_t delta,
> bool rsvd)
> {
> + struct percpu_counter *counter = &mp->m_free[ctr];
> int64_t lcounter;
> uint64_t set_aside = 0;
> s32 batch;
> - bool has_resv_pool;
>
> - ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
> - has_resv_pool = (counter == &mp->m_fdblocks);
> - if (rsvd)
> - ASSERT(has_resv_pool);
> + ASSERT(ctr < XC_FREE_NR);
>
> /*
> * Taking blocks away, need to be more accurate the closer we
> @@ -1297,8 +1314,7 @@ xfs_dec_freecounter(
> * problems (i.e. transaction abort, pagecache discards, etc.) than
> * slightly premature -ENOSPC.
> */
> - if (has_resv_pool)
> - set_aside = xfs_fdblocks_unavailable(mp);
> + set_aside = xfs_freecounter_unavailable(mp, ctr);
Nit: I think you can get rid of the set_aside = 0; above?
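(i.e. the declaration would then just be

	uint64_t		set_aside;

since it is now assigned unconditionally before use.)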
> percpu_counter_add_batch(counter, -((int64_t)delta), batch);
> if (__percpu_counter_compare(counter, set_aside,
> XFS_FDBLOCKS_BATCH) >= 0) {
> @@ -1312,12 +1328,12 @@ xfs_dec_freecounter(
> */
> spin_lock(&mp->m_sb_lock);
> percpu_counter_add(counter, delta);
> - if (!has_resv_pool || !rsvd)
> + if (!rsvd)
> goto fdblocks_enospc;
>
> - lcounter = (long long)mp->m_resblks_avail - delta;
> + lcounter = (long long)mp->m_resblks[ctr].avail - delta;
> if (lcounter >= 0) {
> - mp->m_resblks_avail = lcounter;
> + mp->m_resblks[ctr].avail = lcounter;
> spin_unlock(&mp->m_sb_lock);
> return 0;
> }
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index fbed172d6770..d92bce7bc184 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -105,6 +105,12 @@ struct xfs_groups {
> uint64_t blkmask;
> };
>
> +enum xfs_free_counter {
> + XC_FREE_BLOCKS, /* free block counter */
> + XC_FREE_RTEXTENTS, /* free rt extent counter */
> + XC_FREE_NR,
> +};
> +
> /*
> * The struct xfsmount layout is optimised to separate read-mostly variables
> * from variables that are frequently modified. We put the read-mostly variables
> @@ -222,8 +228,7 @@ typedef struct xfs_mount {
> spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
> struct percpu_counter m_icount; /* allocated inodes counter */
> struct percpu_counter m_ifree; /* free inodes counter */
> - struct percpu_counter m_fdblocks; /* free block counter */
> - struct percpu_counter m_frextents; /* free rt extent counter */
> + struct percpu_counter m_free[XC_FREE_NR];
>
> /*
> * Count of data device blocks reserved for delayed allocations,
> @@ -245,9 +250,11 @@ typedef struct xfs_mount {
> atomic64_t m_allocbt_blks;
>
> struct xfs_groups m_groups[XG_TYPE_MAX];
> - uint64_t m_resblks; /* total reserved blocks */
> - uint64_t m_resblks_avail;/* available reserved blocks */
> - uint64_t m_resblks_save; /* reserved blks @ remount,ro */
> + struct {
> + uint64_t total; /* total reserved blocks */
> + uint64_t avail; /* available reserved blocks */
> + uint64_t save; /* reserved blks @ remount,ro */
> + } m_resblks[XC_FREE_NR];
> struct delayed_work m_reclaim_work; /* background inode reclaim */
> struct dentry *m_debugfs; /* debugfs parent */
> struct xfs_kobj m_kobj;
> @@ -646,45 +653,61 @@ extern void xfs_unmountfs(xfs_mount_t *);
> */
> #define XFS_FDBLOCKS_BATCH 1024
>
> +uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp,
> + enum xfs_free_counter ctr);
> +
> +static inline s64 xfs_sum_freecounter(struct xfs_mount *mp,
> + enum xfs_free_counter ctr)
> +{
> + return percpu_counter_sum(&mp->m_free[ctr]);
> +}
> +
> /*
> - * Estimate the amount of free space that is not available to userspace and is
> - * not explicitly reserved from the incore fdblocks. This includes:
> - *
> - * - The minimum number of blocks needed to support splitting a bmap btree
> - * - The blocks currently in use by the freespace btrees because they record
> - * the actual blocks that will fill per-AG metadata space reservations
> + * This just provides an estimate without the cpu-local updates; use
> + * xfs_sum_freecounter for the exact value.
> */
> -static inline uint64_t
> -xfs_fdblocks_unavailable(
> - struct xfs_mount *mp)
> +static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp,
> + enum xfs_free_counter ctr)
> +{
> + return percpu_counter_read_positive(&mp->m_free[ctr]);
> +}
> +
> +static inline int xfs_compare_freecounter(struct xfs_mount *mp,
> + enum xfs_free_counter ctr, s64 rhs, s32 batch)
> +{
> + return __percpu_counter_compare(&mp->m_free[ctr], rhs, batch);
> +}
> +
> +static inline void xfs_set_freecounter(struct xfs_mount *mp,
> + enum xfs_free_counter ctr, uint64_t val)
> {
> - return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
> + percpu_counter_set(&mp->m_free[ctr], val);
> }
>
> -int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
> +int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
> uint64_t delta, bool rsvd);
> -void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
> +void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
> uint64_t delta);
>
> static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta,
> bool reserved)
> {
> - return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved);
> + return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved);
> }
>
> static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta)
> {
> - xfs_add_freecounter(mp, &mp->m_fdblocks, delta);
> + xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta);
> }
>
> static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta)
> {
> - return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false);
> + return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false);
> }
>
> static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta)
> {
> - xfs_add_freecounter(mp, &mp->m_frextents, delta);
> + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta);
> }
>
> extern int xfs_readsb(xfs_mount_t *, int);
> diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
> index bc18b694db75..8da2498417f5 100644
> --- a/fs/xfs/xfs_rtalloc.c
> +++ b/fs/xfs/xfs_rtalloc.c
> @@ -1519,7 +1519,7 @@ xfs_rtalloc_reinit_frextents(
> spin_lock(&mp->m_sb_lock);
> mp->m_sb.sb_frextents = val;
> spin_unlock(&mp->m_sb_lock);
> - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
> + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents);
> return 0;
> }
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index a74a0cc1f6f6..1960ee0aad45 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -834,10 +834,11 @@ xfs_statfs_data(
> struct kstatfs *st)
> {
> int64_t fdblocks =
> - percpu_counter_sum(&mp->m_fdblocks);
> + xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
>
> /* make sure st->f_bfree does not underflow */
> - st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp));
> + st->f_bfree = max(0LL,
> + fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS));
> st->f_blocks = mp->m_sb.sb_dblocks - xfs_internal_log_size(mp);
> }
>
> @@ -852,9 +853,9 @@ xfs_statfs_rt(
> struct kstatfs *st)
> {
> int64_t freertx =
> - percpu_counter_sum_positive(&mp->m_frextents);
> + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
Same question as I had for xfs_log_sb() about _positive.
--D
>
> - st->f_bfree = xfs_rtbxlen_to_blen(mp, freertx);
> + st->f_bfree = xfs_rtbxlen_to_blen(mp, max(0LL, freertx));
> st->f_blocks = mp->m_sb.sb_rblocks;
> }
>
> @@ -920,24 +921,32 @@ xfs_fs_statfs(
> }
>
> STATIC void
> -xfs_save_resvblks(struct xfs_mount *mp)
> +xfs_save_resvblks(
> + struct xfs_mount *mp)
> {
> - mp->m_resblks_save = mp->m_resblks;
> - xfs_reserve_blocks(mp, 0);
> + enum xfs_free_counter i;
> +
> + for (i = 0; i < XC_FREE_NR; i++) {
> + mp->m_resblks[i].save = mp->m_resblks[i].total;
> + xfs_reserve_blocks(mp, i, 0);
> + }
> }
>
> STATIC void
> -xfs_restore_resvblks(struct xfs_mount *mp)
> +xfs_restore_resvblks(
> + struct xfs_mount *mp)
> {
> - uint64_t resblks;
> + uint64_t resblks;
> + enum xfs_free_counter i;
>
> - if (mp->m_resblks_save) {
> - resblks = mp->m_resblks_save;
> - mp->m_resblks_save = 0;
> - } else
> - resblks = xfs_default_resblks(mp);
> -
> - xfs_reserve_blocks(mp, resblks);
> + for (i = 0; i < XC_FREE_NR; i++) {
> + if (mp->m_resblks[i].save) {
> + resblks = mp->m_resblks[i].save;
> + mp->m_resblks[i].save = 0;
> + } else
> + resblks = xfs_default_resblks(mp);
> + xfs_reserve_blocks(mp, i, resblks);
> + }
> }
>
> /*
> @@ -1063,7 +1072,8 @@ static int
> xfs_init_percpu_counters(
> struct xfs_mount *mp)
> {
> - int error;
> + int error;
> + enum xfs_free_counter i;
>
> error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
> if (error)
> @@ -1073,30 +1083,28 @@ xfs_init_percpu_counters(
> if (error)
> goto free_icount;
>
> - error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
> - if (error)
> - goto free_ifree;
> -
> error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
> if (error)
> - goto free_fdblocks;
> + goto free_ifree;
>
> error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL);
> if (error)
> goto free_delalloc;
>
> - error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
> - if (error)
> - goto free_delalloc_rt;
> + for (i = 0; i < XC_FREE_NR; i++) {
> + error = percpu_counter_init(&mp->m_free[i], 0, GFP_KERNEL);
> + if (error)
> + goto free_freecounters;
> + }
>
> return 0;
>
> -free_delalloc_rt:
> +free_freecounters:
> +	while (--i >= 0)
> + percpu_counter_destroy(&mp->m_free[i]);
> percpu_counter_destroy(&mp->m_delalloc_rtextents);
> free_delalloc:
> percpu_counter_destroy(&mp->m_delalloc_blks);
> -free_fdblocks:
> - percpu_counter_destroy(&mp->m_fdblocks);
> free_ifree:
> percpu_counter_destroy(&mp->m_ifree);
> free_icount:
> @@ -1110,24 +1118,26 @@ xfs_reinit_percpu_counters(
> {
> percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
> percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
> - percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
> - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
> + xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks);
> + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents);
> }
>
> static void
> xfs_destroy_percpu_counters(
> struct xfs_mount *mp)
> {
> + enum xfs_free_counter i;
> +
> + for (i = 0; i < XC_FREE_NR; i++)
> + percpu_counter_destroy(&mp->m_free[i]);
> percpu_counter_destroy(&mp->m_icount);
> percpu_counter_destroy(&mp->m_ifree);
> - percpu_counter_destroy(&mp->m_fdblocks);
> ASSERT(xfs_is_shutdown(mp) ||
> percpu_counter_sum(&mp->m_delalloc_rtextents) == 0);
> percpu_counter_destroy(&mp->m_delalloc_rtextents);
> ASSERT(xfs_is_shutdown(mp) ||
> percpu_counter_sum(&mp->m_delalloc_blks) == 0);
> percpu_counter_destroy(&mp->m_delalloc_blks);
> - percpu_counter_destroy(&mp->m_frextents);
> }
>
> static int
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 4fe689410eb6..15dec76fec10 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -5621,7 +5621,7 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
>
> __entry->dev = mp->m_super->s_dev;
> __entry->ino = ip->i_ino;
> - __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks);
> + __entry->freeblks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
> __entry->reserved = ip->i_delayed_blks;
> __entry->asked = ip->i_meta_resv_asked;
> __entry->used = ip->i_nblocks;
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread* Re: [PATCH 09/43] xfs: generalize the freespace and reserved blocks handling
2024-12-12 21:37 ` Darrick J. Wong
@ 2024-12-13 5:11 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:11 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 01:37:18PM -0800, Darrick J. Wong wrote:
> > mp->m_sb.sb_frextents =
> > - percpu_counter_sum_positive(&mp->m_frextents);
> > + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
>
> Curious. xfs_sum_freecounter returns percpu_counter_sum, not its
> _positive variant. This seems like a bug? Or at least an omitted
> max(0LL, ...) call?
Good question. This code is pretty old and it's probably time to do
a full audit of the _positive thingies, including checking if the
existing callers make sense and what the right levels of abstraction
are.
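If we grow a helper for the callers that really want the clamped value,
it would presumably just be a trivial wrapper along these lines (name
purely hypothetical):

static inline s64 xfs_sum_freecounter_positive(struct xfs_mount *mp,
		enum xfs_free_counter ctr)
{
	/* like xfs_sum_freecounter, but never returns a negative sum */
	return percpu_counter_sum_positive(&mp->m_free[ctr]);
}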
> > @@ -1297,8 +1314,7 @@ xfs_dec_freecounter(
> > * problems (i.e. transaction abort, pagecache discards, etc.) than
> > * slightly premature -ENOSPC.
> > */
> > - if (has_resv_pool)
> > - set_aside = xfs_fdblocks_unavailable(mp);
> > + set_aside = xfs_freecounter_unavailable(mp, ctr);
>
> Nit: I think you can get rid of the set_aside = 0; above?
Yes.
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 10/43] xfs: preserve RT reservations across remounts
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (8 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 09/43] xfs: generalize the freespace and reserved blocks handling Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 21:38 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 11/43] xfs: skip always_cow inodes in xfs_reflink_trim_around_shared Christoph Hellwig
` (32 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
From: Hans Holmberg <hans.holmberg@wdc.com>
Introduce a reservation setting for rt devices so that zoned GC
reservations are preserved over remount ro/rw cycles.
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_mount.c | 22 +++++++++++++++-------
fs/xfs/xfs_mount.h | 3 ++-
fs/xfs/xfs_super.c | 2 +-
3 files changed, 18 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 4174035b2ac9..db910ecc1ed4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -465,10 +465,15 @@ xfs_mount_reset_sbqflags(
}
uint64_t
-xfs_default_resblks(xfs_mount_t *mp)
+xfs_default_resblks(
+ struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
{
uint64_t resblks;
+ if (ctr == XC_FREE_RTEXTENTS)
+ return 0;
+
/*
* We default to 5% or 8192 fsbs of space reserved, whichever is
* smaller. This is intended to cover concurrent allocation
@@ -683,6 +688,7 @@ xfs_mountfs(
uint quotamount = 0;
uint quotaflags = 0;
int error = 0;
+ int i;
xfs_sb_mount_common(mp, sbp);
@@ -1051,18 +1057,20 @@ xfs_mountfs(
* privileged transactions. This is needed so that transaction
* space required for critical operations can dip into this pool
* when at ENOSPC. This is needed for operations like create with
- * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
- * are not allowed to use this reserved space.
+ * attr, unwritten extent conversion at ENOSPC, garbage collection
+ * etc. Data allocations are not allowed to use this reserved space.
*
* This may drive us straight to ENOSPC on mount, but that implies
* we were already there on the last unmount. Warn if this occurs.
*/
if (!xfs_is_readonly(mp)) {
- error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS,
- xfs_default_resblks(mp));
- if (error)
- xfs_warn(mp,
+ for (i = 0; i < XC_FREE_NR; i++) {
+ error = xfs_reserve_blocks(mp, i,
+ xfs_default_resblks(mp, i));
+ if (error)
+ xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool.");
+ }
/* Reserve AG blocks for future btree expansion. */
error = xfs_fs_reserve_ag_blocks(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d92bce7bc184..73bc053fdd17 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -640,7 +640,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
}
extern void xfs_uuid_table_free(void);
-extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
+uint64_t xfs_default_resblks(struct xfs_mount *mp,
+ enum xfs_free_counter ctr);
extern int xfs_mountfs(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1960ee0aad45..f57c27940467 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -944,7 +944,7 @@ xfs_restore_resvblks(
resblks = mp->m_resblks[i].save;
mp->m_resblks[i].save = 0;
} else
- resblks = xfs_default_resblks(mp);
+ resblks = xfs_default_resblks(mp, i);
xfs_reserve_blocks(mp, i, resblks);
}
}
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 10/43] xfs: preserve RT reservations across remounts
2024-12-11 8:54 ` [PATCH 10/43] xfs: preserve RT reservations across remounts Christoph Hellwig
@ 2024-12-12 21:38 ` Darrick J. Wong
2024-12-13 9:15 ` Hans Holmberg
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 21:38 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:35AM +0100, Christoph Hellwig wrote:
> From: Hans Holmberg <hans.holmberg@wdc.com>
>
> Introduce a reservation setting for rt devices so that zoned GC
> reservations are preserved over remount ro/rw cycles.
>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_mount.c | 22 +++++++++++++++-------
> fs/xfs/xfs_mount.h | 3 ++-
> fs/xfs/xfs_super.c | 2 +-
> 3 files changed, 18 insertions(+), 9 deletions(-)
>
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 4174035b2ac9..db910ecc1ed4 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -465,10 +465,15 @@ xfs_mount_reset_sbqflags(
> }
>
> uint64_t
> -xfs_default_resblks(xfs_mount_t *mp)
> +xfs_default_resblks(
> + struct xfs_mount *mp,
> + enum xfs_free_counter ctr)
> {
> uint64_t resblks;
>
> + if (ctr == XC_FREE_RTEXTENTS)
> + return 0;
> +
> /*
> * We default to 5% or 8192 fsbs of space reserved, whichever is
> * smaller. This is intended to cover concurrent allocation
> @@ -683,6 +688,7 @@ xfs_mountfs(
> uint quotamount = 0;
> uint quotaflags = 0;
> int error = 0;
> + int i;
>
> xfs_sb_mount_common(mp, sbp);
>
> @@ -1051,18 +1057,20 @@ xfs_mountfs(
> * privileged transactions. This is needed so that transaction
> * space required for critical operations can dip into this pool
> * when at ENOSPC. This is needed for operations like create with
> - * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
> - * are not allowed to use this reserved space.
> + * attr, unwritten extent conversion at ENOSPC, garbage collection
> + * etc. Data allocations are not allowed to use this reserved space.
> *
> * This may drive us straight to ENOSPC on mount, but that implies
> * we were already there on the last unmount. Warn if this occurs.
> */
> if (!xfs_is_readonly(mp)) {
> - error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS,
> - xfs_default_resblks(mp));
> - if (error)
> - xfs_warn(mp,
> + for (i = 0; i < XC_FREE_NR; i++) {
> + error = xfs_reserve_blocks(mp, i,
> + xfs_default_resblks(mp, i));
> + if (error)
> + xfs_warn(mp,
> "Unable to allocate reserve blocks. Continuing without reserve pool.");
Should we be able to log *which* reserve block pool is out?
Otherwise looks good to me.
--D
> + }
>
> /* Reserve AG blocks for future btree expansion. */
> error = xfs_fs_reserve_ag_blocks(mp);
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index d92bce7bc184..73bc053fdd17 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -640,7 +640,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
> }
>
> extern void xfs_uuid_table_free(void);
> -extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
> +uint64_t xfs_default_resblks(struct xfs_mount *mp,
> + enum xfs_free_counter ctr);
> extern int xfs_mountfs(xfs_mount_t *mp);
> extern void xfs_unmountfs(xfs_mount_t *);
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 1960ee0aad45..f57c27940467 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -944,7 +944,7 @@ xfs_restore_resvblks(
> resblks = mp->m_resblks[i].save;
> mp->m_resblks[i].save = 0;
> } else
> - resblks = xfs_default_resblks(mp);
> + resblks = xfs_default_resblks(mp, i);
> xfs_reserve_blocks(mp, i, resblks);
> }
> }
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread* Re: [PATCH 10/43] xfs: preserve RT reservations across remounts
2024-12-12 21:38 ` Darrick J. Wong
@ 2024-12-13 9:15 ` Hans Holmberg
2024-12-15 18:42 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Hans Holmberg @ 2024-12-13 9:15 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 10:38 PM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Wed, Dec 11, 2024 at 09:54:35AM +0100, Christoph Hellwig wrote:
> > From: Hans Holmberg <hans.holmberg@wdc.com>
> >
> > Introduce a reservation setting for rt devices so that zoned GC
> > reservations are preserved over remount ro/rw cycles.
> >
> > Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> > fs/xfs/xfs_mount.c | 22 +++++++++++++++-------
> > fs/xfs/xfs_mount.h | 3 ++-
> > fs/xfs/xfs_super.c | 2 +-
> > 3 files changed, 18 insertions(+), 9 deletions(-)
> >
> > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > index 4174035b2ac9..db910ecc1ed4 100644
> > --- a/fs/xfs/xfs_mount.c
> > +++ b/fs/xfs/xfs_mount.c
> > @@ -465,10 +465,15 @@ xfs_mount_reset_sbqflags(
> > }
> >
> > uint64_t
> > -xfs_default_resblks(xfs_mount_t *mp)
> > +xfs_default_resblks(
> > + struct xfs_mount *mp,
> > + enum xfs_free_counter ctr)
> > {
> > uint64_t resblks;
> >
> > + if (ctr == XC_FREE_RTEXTENTS)
> > + return 0;
> > +
> > /*
> > * We default to 5% or 8192 fsbs of space reserved, whichever is
> > * smaller. This is intended to cover concurrent allocation
> > @@ -683,6 +688,7 @@ xfs_mountfs(
> > uint quotamount = 0;
> > uint quotaflags = 0;
> > int error = 0;
> > + int i;
> >
> > xfs_sb_mount_common(mp, sbp);
> >
> > @@ -1051,18 +1057,20 @@ xfs_mountfs(
> > * privileged transactions. This is needed so that transaction
> > * space required for critical operations can dip into this pool
> > * when at ENOSPC. This is needed for operations like create with
> > - * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
> > - * are not allowed to use this reserved space.
> > + * attr, unwritten extent conversion at ENOSPC, garbage collection
> > + * etc. Data allocations are not allowed to use this reserved space.
> > *
> > * This may drive us straight to ENOSPC on mount, but that implies
> > * we were already there on the last unmount. Warn if this occurs.
> > */
> > if (!xfs_is_readonly(mp)) {
> > - error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS,
> > - xfs_default_resblks(mp));
> > - if (error)
> > - xfs_warn(mp,
> > + for (i = 0; i < XC_FREE_NR; i++) {
> > + error = xfs_reserve_blocks(mp, i,
> > + xfs_default_resblks(mp, i));
> > + if (error)
> > + xfs_warn(mp,
> > "Unable to allocate reserve blocks. Continuing without reserve pool.");
>
> Should we be able to log *which* reserve block pool is out?
Yep, that should be useful I think. We could do something like this:
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 20d564b3b564..6ef69d025f9a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -674,6 +674,10 @@ xfs_rtbtree_compute_maxlevels(
mp->m_rtbtree_maxlevels = levels;
}
+static const char * const xfs_free_pool_name[XC_FREE_NR] = {
+ "free blocks", "free rt extents", "available rt extents"
+};
+
/*
* This function does the following on an initial mount of a file system:
* - reads the superblock from disk and init the mount struct
@@ -1081,7 +1085,8 @@ xfs_mountfs(
xfs_default_resblks(mp, i));
if (error)
xfs_warn(mp,
- "Unable to allocate reserve blocks. Continuing without reserve pool.");
+"Unable to allocate reserve blocks. Continuing without reserve pool for %s.",
+ xfs_free_pool_name[i]);
}
/* Reserve AG blocks for future btree expansion. */
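Possibly plus a build-time check that the table stays in sync with the
enum as more counters get added, e.g. (untested):

static_assert(ARRAY_SIZE(xfs_free_pool_name) == XC_FREE_NR);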
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 10/43] xfs: preserve RT reservations across remounts
2024-12-13 9:15 ` Hans Holmberg
@ 2024-12-15 18:42 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-15 18:42 UTC (permalink / raw)
To: Hans Holmberg
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 10:15:25AM +0100, Hans Holmberg wrote:
> On Thu, Dec 12, 2024 at 10:38 PM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Wed, Dec 11, 2024 at 09:54:35AM +0100, Christoph Hellwig wrote:
> > > From: Hans Holmberg <hans.holmberg@wdc.com>
> > >
> > > Introduce a reservation setting for rt devices so that zoned GC
> > > reservations are preserved over remount ro/rw cycles.
> > >
> > > Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > > ---
> > > fs/xfs/xfs_mount.c | 22 +++++++++++++++-------
> > > fs/xfs/xfs_mount.h | 3 ++-
> > > fs/xfs/xfs_super.c | 2 +-
> > > 3 files changed, 18 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > > index 4174035b2ac9..db910ecc1ed4 100644
> > > --- a/fs/xfs/xfs_mount.c
> > > +++ b/fs/xfs/xfs_mount.c
> > > @@ -465,10 +465,15 @@ xfs_mount_reset_sbqflags(
> > > }
> > >
> > > uint64_t
> > > -xfs_default_resblks(xfs_mount_t *mp)
> > > +xfs_default_resblks(
> > > + struct xfs_mount *mp,
> > > + enum xfs_free_counter ctr)
> > > {
> > > uint64_t resblks;
> > >
> > > + if (ctr == XC_FREE_RTEXTENTS)
> > > + return 0;
> > > +
> > > /*
> > > * We default to 5% or 8192 fsbs of space reserved, whichever is
> > > * smaller. This is intended to cover concurrent allocation
> > > @@ -683,6 +688,7 @@ xfs_mountfs(
> > > uint quotamount = 0;
> > > uint quotaflags = 0;
> > > int error = 0;
> > > + int i;
> > >
> > > xfs_sb_mount_common(mp, sbp);
> > >
> > > @@ -1051,18 +1057,20 @@ xfs_mountfs(
> > > * privileged transactions. This is needed so that transaction
> > > * space required for critical operations can dip into this pool
> > > * when at ENOSPC. This is needed for operations like create with
> > > - * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
> > > - * are not allowed to use this reserved space.
> > > + * attr, unwritten extent conversion at ENOSPC, garbage collection
> > > + * etc. Data allocations are not allowed to use this reserved space.
> > > *
> > > * This may drive us straight to ENOSPC on mount, but that implies
> > > * we were already there on the last unmount. Warn if this occurs.
> > > */
> > > if (!xfs_is_readonly(mp)) {
> > > - error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS,
> > > - xfs_default_resblks(mp));
> > > - if (error)
> > > - xfs_warn(mp,
> > > + for (i = 0; i < XC_FREE_NR; i++) {
> > > + error = xfs_reserve_blocks(mp, i,
> > > + xfs_default_resblks(mp, i));
> > > + if (error)
> > > + xfs_warn(mp,
> > > "Unable to allocate reserve blocks. Continuing without reserve pool.");
> >
> > Should we be able to log *which* reserve block pool is out?
>
> Yep, that should be useful I think. We could do something like this:
Yeah, that looks good to me.
--D
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 20d564b3b564..6ef69d025f9a 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -674,6 +674,10 @@ xfs_rtbtree_compute_maxlevels(
> mp->m_rtbtree_maxlevels = levels;
> }
>
> +static const char * const xfs_free_pool_name[XC_FREE_NR] = {
> + "free blocks", "free rt extents", "available rt extents"
> +};
> +
> /*
> * This function does the following on an initial mount of a file system:
> * - reads the superblock from disk and init the mount struct
> @@ -1081,7 +1085,8 @@ xfs_mountfs(
> xfs_default_resblks(mp, i));
> if (error)
> xfs_warn(mp,
> - "Unable to allocate reserve blocks. Continuing without reserve pool.");
> +"Unable to allocate reserve blocks. Continuing without reserve pool for %s.",
> + xfs_free_pool_name[i]);
> }
>
> /* Reserve AG blocks for future btree expansion. */
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 11/43] xfs: skip always_cow inodes in xfs_reflink_trim_around_shared
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (9 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 10/43] xfs: preserve RT reservations across remounts Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 21:38 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 12/43] xfs: refine the unaligned check for always COW inodes in xfs_file_dio_write Christoph Hellwig
` (31 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
xfs_reflink_trim_around_shared tries to find shared blocks in the
refcount btree. Always_cow inodes don't have that tree, so don't
bother.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_reflink.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 59f7fc16eb80..3e778e077d09 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -235,7 +235,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
+ if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
*shared = false;
return 0;
}
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 11/43] xfs: skip always_cow inodes in xfs_reflink_trim_around_shared
2024-12-11 8:54 ` [PATCH 11/43] xfs: skip always_cow inodes in xfs_reflink_trim_around_shared Christoph Hellwig
@ 2024-12-12 21:38 ` Darrick J. Wong
2024-12-13 5:12 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 21:38 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:36AM +0100, Christoph Hellwig wrote:
> xfs_reflink_trim_around_shared tries to find shared blocks in the
> refcount btree. Always_cow inodes don't have that tree, so don't
> bother.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Is this a bug fix?
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_reflink.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 59f7fc16eb80..3e778e077d09 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -235,7 +235,7 @@ xfs_reflink_trim_around_shared(
> int error = 0;
>
> /* Holes, unwritten, and delalloc extents cannot be shared */
> - if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
> + if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
> *shared = false;
> return 0;
> }
> --
> 2.45.2
>
>
^ permalink raw reply	[flat|nested] 143+ messages in thread
* Re: [PATCH 11/43] xfs: skip always_cow inodes in xfs_reflink_trim_around_shared
2024-12-12 21:38 ` Darrick J. Wong
@ 2024-12-13 5:12 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:12 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 01:38:57PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:54:36AM +0100, Christoph Hellwig wrote:
> > xfs_reflink_trim_around_shared tries to find shared blocks in the
> > refcount btree. Always_cow inodes don't have that tree, so don't
> > bother.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Is this a bug fix?
For the existing always_cow code it is a minor optimization. For
the zoned code that can do COW without the rtreflink code it avoids
triggering a NULL pointer dereference.
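For context, the helpers involved look roughly like this (paraphrased
from fs/xfs/xfs_inode.h, not part of this patch):

static inline bool xfs_is_reflink_inode(const struct xfs_inode *ip)
{
	/* blocks of this inode may be shared, tracked in the refcount
	 * btree */
	return ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
}

static inline bool xfs_is_cow_inode(const struct xfs_inode *ip)
{
	/* additionally true for always_cow inodes, which COW every
	 * write without any refcount btree backing it */
	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
}

So the old check sent always_cow inodes into the refcount btree lookup
even when the reflink feature (and thus the btree) isn't there.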
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 12/43] xfs: refine the unaligned check for always COW inodes in xfs_file_dio_write
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (10 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 11/43] xfs: skip always_cow inodes in xfs_reflink_trim_around_shared Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 21:44 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 13/43] xfs: support XFS_BMAPI_REMAP in xfs_bmap_del_extent_delay Christoph Hellwig
` (30 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
For always COW inodes we also must check the alignment of each individual
iovec segment, as they could end up with different I/Os due to the way
bio_iov_iter_get_pages works, and we'd then overwrite an already written
block.
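As an illustration (block size and iovec layout made up for the
example):

/*
 * 4k fsblock size, direct write at ki_pos == 0 with count == 4096:
 *
 *	iov[0] = { .iov_base = buf0, .iov_len = 2048 };
 *	iov[1] = { .iov_base = buf1, .iov_len = 2048 };
 *
 * (ki_pos | count) & m_blockmask is 0, so the old check treated this
 * as a block aligned write.  But iov_iter_alignment() also ORs in
 * every segment base and length and thus reports 2048 here:
 * bio_iov_iter_get_pages can split the two segments into separate
 * sub-block bios, and the second bio would rewrite the fsblock that
 * the first one already wrote.
 */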
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_file.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2b6d4c71994d..6bcfd4c34a37 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -721,7 +721,16 @@ xfs_file_dio_write(
/* direct I/O must be aligned to device logical sector size */
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
- if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+
+ /*
+ * For always COW inodes we also must check the alignment of each
+ * individual iovec segment, as they could end up with different
+ * I/Os due to the way bio_iov_iter_get_pages works, and we'd
+ * then overwrite an already written block.
+ */
+ if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
+ (xfs_is_always_cow_inode(ip) &&
+ (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
return xfs_file_dio_write_unaligned(ip, iocb, from);
return xfs_file_dio_write_aligned(ip, iocb, from);
}
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 12/43] xfs: refine the unaligned check for always COW inodes in xfs_file_dio_write
2024-12-11 8:54 ` [PATCH 12/43] xfs: refine the unaligned check for always COW inodes in xfs_file_dio_write Christoph Hellwig
@ 2024-12-12 21:44 ` Darrick J. Wong
2024-12-13 5:14 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 21:44 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:37AM +0100, Christoph Hellwig wrote:
> For always COW inodes we also must check the alignment of each individual
> iovec segment, as they could end up with different I/Os due to the way
> bio_iov_iter_get_pages works, and we'd then overwrite an already written
> block.
I'm not sure why an alwayscow inode now needs to require fsblock-aligned
segments, seeing as it's been running mostly fine for years.
Is this a bug fix? Or prep for rtzone files, which are (presumably)
always written out of place?
--D
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_file.c | 11 ++++++++++-
> 1 file changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 2b6d4c71994d..6bcfd4c34a37 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -721,7 +721,16 @@ xfs_file_dio_write(
> /* direct I/O must be aligned to device logical sector size */
> if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
> return -EINVAL;
> - if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
> +
> + /*
> + * For always COW inodes we also must check the alignment of each
> + * individual iovec segment, as they could end up with different
> + * I/Os due to the way bio_iov_iter_get_pages works, and we'd
> + * then overwrite an already written block.
> + */
> + if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
> + (xfs_is_always_cow_inode(ip) &&
> + (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
> return xfs_file_dio_write_unaligned(ip, iocb, from);
> return xfs_file_dio_write_aligned(ip, iocb, from);
> }
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 12/43] xfs: refine the unaligned check for always COW inodes in xfs_file_dio_write
2024-12-12 21:44 ` Darrick J. Wong
@ 2024-12-13 5:14 ` Christoph Hellwig
2024-12-13 23:14 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:14 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 01:44:42PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:54:37AM +0100, Christoph Hellwig wrote:
> > For always COW inodes we also must check the alignment of each individual
> > iovec segment, as they could end up with different I/Os due to the way
> > bio_iov_iter_get_pages works, and we'd then overwrite an already written
> > block.
>
> I'm not sure why an alwayscow inode now needs to require fsblock-aligned
> segments, seeing as it's been running mostly fine for years.
Because the storage you test always_cow on doesn't actually force
always_cow on you :) I.e. these segmented iovecs can end up overwriting
a block. That's ok if you're on a conventional device, but it will
error out on a sequential write required zoned device.
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 12/43] xfs: refine the unaligned check for always COW inodes in xfs_file_dio_write
2024-12-13 5:14 ` Christoph Hellwig
@ 2024-12-13 23:14 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 23:14 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 06:14:00AM +0100, Christoph Hellwig wrote:
> On Thu, Dec 12, 2024 at 01:44:42PM -0800, Darrick J. Wong wrote:
> > On Wed, Dec 11, 2024 at 09:54:37AM +0100, Christoph Hellwig wrote:
> > > For always COW inodes we also must check the alignment of each individual
> > > iovec segment, as they could end up with different I/Os due to the way
> > > bio_iov_iter_get_pages works, and we'd then overwrite an already written
> > > block.
> >
> > I'm not sure why an alwayscow inode now needs to require fsblock-aligned
> > segments, seeing as it's been running mostly fine for years.
>
> Because the storage you test always_cow on doesn't actually force
> always_cow on you :) I.e. these segmented iovecs can end up overwriting
> a block. That's ok if you're on a conventional device, but it will
> error out on a sequential write required zoned device.
Fine with me then :)
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 13/43] xfs: support XFS_BMAPI_REMAP in xfs_bmap_del_extent_delay
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (11 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 12/43] xfs: refine the unaligned check for always COW inodes in xfs_file_dio_write Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 21:47 ` [PATCH 13/43] xfs: support XFS_BMAPI_REMAP in xfs_bmap_del_extent_delay Darrick J. Wong
2024-12-11 8:54 ` [PATCH 14/43] xfs: add a xfs_rtrmap_first_unwritten_rgbno helper Christoph Hellwig
` (29 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
The zone allocator wants to be able to remove a delalloc mapping in the
COW fork while keeping the block reservation. To support that pass the
blags argument down to xfs_bmap_del_extent_delay and support the
XFS_BMAPI_REMAP flag to keep the reservation.
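In other words: without the flag, punching a delalloc extent returns the
indirect reservation delta (da_old - da_new) to fdblocks and the
extent's own blocks to fdblocks (or, for RT inodes, to frextents); with
XFS_BMAPI_REMAP only the indirect delta is returned and the extent's
blocks stay reserved for the caller to remap them.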
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_bmap.c | 10 +++++++---
fs/xfs/libxfs/xfs_bmap.h | 2 +-
fs/xfs/xfs_bmap_util.c | 2 +-
fs/xfs/xfs_reflink.c | 2 +-
4 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 861945a5fce3..512f1ceca47f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4666,7 +4666,8 @@ xfs_bmap_del_extent_delay(
int whichfork,
struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *got,
- struct xfs_bmbt_irec *del)
+ struct xfs_bmbt_irec *del,
+ uint32_t bflags) /* bmapi flags */
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -4786,7 +4787,9 @@ xfs_bmap_del_extent_delay(
da_diff = da_old - da_new;
fdblocks = da_diff;
- if (isrt)
+ if (bflags & XFS_BMAPI_REMAP)
+ ;
+ else if (isrt)
xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
else
fdblocks += del->br_blockcount;
@@ -5388,7 +5391,8 @@ __xfs_bunmapi(
delete:
if (wasdel) {
- xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
+ &del, flags);
} else {
error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
&del, &tmp_logflags, whichfork,
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 4d48087fd3a8..b4d9c6e0f3f9 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -204,7 +204,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extnum_t nexts, int *done);
void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
- struct xfs_bmbt_irec *del);
+ struct xfs_bmbt_irec *del, uint32_t bflags);
void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 0836fea2d6d8..c623688e457c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -467,7 +467,7 @@ xfs_bmap_punch_delalloc_range(
continue;
}
- xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del, 0);
if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 3e778e077d09..b7dba5ad2f34 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -651,7 +651,7 @@ xfs_reflink_cancel_cow_blocks(
if (isnullstartblock(del.br_startblock)) {
xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got,
- &del);
+ &del, 0);
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 13/43] xfs: support XFS_BMAPI_REMAP in xfs_bmap_del_extent_delay
2024-12-11 8:54 ` [PATCH 13/43] xfs: support XFS_BMAPI_REMAP in xfs_bmap_del_extent_delay Christoph Hellwig
@ 2024-12-12 21:47 ` Darrick J. Wong
2024-12-13 5:14 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 21:47 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:38AM +0100, Christoph Hellwig wrote:
> The zone allocator wants to be able to remove a delalloc mapping in the
> COW fork while keeping the block reservation. To support that pass the
> blags argument down to xfs_bmap_del_extent_delay and support the
bflags
> XFS_BMAPI_REMAP flag to keep the reservation.
Is REMAP the only bmapi flag that will be valid here?
--D
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/libxfs/xfs_bmap.c | 10 +++++++---
> fs/xfs/libxfs/xfs_bmap.h | 2 +-
> fs/xfs/xfs_bmap_util.c | 2 +-
> fs/xfs/xfs_reflink.c | 2 +-
> 4 files changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 861945a5fce3..512f1ceca47f 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -4666,7 +4666,8 @@ xfs_bmap_del_extent_delay(
> int whichfork,
> struct xfs_iext_cursor *icur,
> struct xfs_bmbt_irec *got,
> - struct xfs_bmbt_irec *del)
> + struct xfs_bmbt_irec *del,
> + uint32_t bflags) /* bmapi flags */
> {
> struct xfs_mount *mp = ip->i_mount;
> struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
> @@ -4786,7 +4787,9 @@ xfs_bmap_del_extent_delay(
> da_diff = da_old - da_new;
> fdblocks = da_diff;
>
> - if (isrt)
> + if (bflags & XFS_BMAPI_REMAP)
> + ;
> + else if (isrt)
> xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
> else
> fdblocks += del->br_blockcount;
> @@ -5388,7 +5391,8 @@ __xfs_bunmapi(
>
> delete:
> if (wasdel) {
> - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
> + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
> + &del, flags);
> } else {
> error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
> &del, &tmp_logflags, whichfork,
> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> index 4d48087fd3a8..b4d9c6e0f3f9 100644
> --- a/fs/xfs/libxfs/xfs_bmap.h
> +++ b/fs/xfs/libxfs/xfs_bmap.h
> @@ -204,7 +204,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
> xfs_extnum_t nexts, int *done);
> void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
> struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
> - struct xfs_bmbt_irec *del);
> + struct xfs_bmbt_irec *del, uint32_t bflags);
> void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
> struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
> struct xfs_bmbt_irec *del);
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 0836fea2d6d8..c623688e457c 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -467,7 +467,7 @@ xfs_bmap_punch_delalloc_range(
> continue;
> }
>
> - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
> + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del, 0);
> if (!xfs_iext_get_extent(ifp, &icur, &got))
> break;
> }
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 3e778e077d09..b7dba5ad2f34 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -651,7 +651,7 @@ xfs_reflink_cancel_cow_blocks(
>
> if (isnullstartblock(del.br_startblock)) {
> xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got,
> - &del);
> + &del, 0);
> } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
> ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
>
> --
> 2.45.2
>
>
^ permalink raw reply	[flat|nested] 143+ messages in thread
* Re: [PATCH 13/43] xfs: support XFS_BMAPI_REMAP in xfs_bmap_del_extent_delay
2024-12-12 21:47 ` [PATCH 13/43] xfs: support XFS_BMAPI_REMAP in xfs_bmap_del_extent_delay Darrick J. Wong
@ 2024-12-13 5:14 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:14 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 01:47:20PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:54:38AM +0100, Christoph Hellwig wrote:
> > The zone allocator wants to be able to remove a delalloc mapping in the
> > COW fork while keeping the block reservation. To support that pass the
> > blags argument down to xfs_bmap_del_extent_delay and support the
>
> bflags
>
> > XFS_BMAPI_REMAP flag to keep the reservation.
>
> Is REMAP the only bmapi flag that will be valid here?
Yes. I'll add an assert.
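Presumably something like this at the top of xfs_bmap_del_extent_delay
(sketch only):

	ASSERT(!(bflags & ~XFS_BMAPI_REMAP));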
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 14/43] xfs: add a xfs_rtrmap_first_unwritten_rgbno helper
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (12 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 13/43] xfs: support XFS_BMAPI_REMAP in xfs_bmap_del_extent_delay Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 21:48 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 15/43] xfs: define the zoned on-disk format Christoph Hellwig
` (28 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Add a helper to find the last offset mapped in the rtrmap. This will be
used by the zoned code to find out where to start writing again on
conventional devices without hardware zone support.
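A minimal caller sketch (the local variable and its use are
hypothetical, per the use case described above):

	xfs_rgblock_t	write_pointer;

	/*
	 * A conventional rt device has no hardware write pointer to
	 * query, so reconstruct one from the rmap: 0 for an empty
	 * group, otherwise one past the highest mapped rgbno.
	 */
	write_pointer = xfs_rtrmap_first_unwritten_rgbno(rtg);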
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_rtrmap_btree.c | 16 ++++++++++++++++
fs/xfs/libxfs/xfs_rtrmap_btree.h | 2 ++
2 files changed, 18 insertions(+)
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index 04b9c76380ad..b2bb0dd53b00 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -1033,3 +1033,19 @@ xfs_rtrmapbt_init_rtsb(
xfs_btree_del_cursor(cur, error);
return error;
}
+
+xfs_rgblock_t
+xfs_rtrmap_first_unwritten_rgbno(
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot;
+ union xfs_btree_key key = {};
+ struct xfs_btree_cur *cur;
+
+ if (block->bb_numrecs == 0)
+ return 0;
+ cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
+ xfs_btree_get_keys(cur, block, &key);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock) + 1;
+}
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
index 6a2d432b55ad..d5cca8fcf4a3 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
@@ -207,4 +207,6 @@ struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg,
int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
struct xfs_buftarg *btp, xfs_rgnumber_t rgno);
+xfs_rgblock_t xfs_rtrmap_first_unwritten_rgbno(struct xfs_rtgroup *rtg);
+
#endif /* __XFS_RTRMAP_BTREE_H__ */
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 14/43] xfs: add a xfs_rtrmap_first_unwritten_rgbno helper
2024-12-11 8:54 ` [PATCH 14/43] xfs: add a xfs_rtrmap_first_unwritten_rgbno helper Christoph Hellwig
@ 2024-12-12 21:48 ` Darrick J. Wong
2024-12-13 5:16 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 21:48 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:39AM +0100, Christoph Hellwig wrote:
> Add a helper to find the last offset mapped in the rtrmap. This will be
> used by the zoned code to find out where to start writing again on
> conventional devices without hardware zone support.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/libxfs/xfs_rtrmap_btree.c | 16 ++++++++++++++++
> fs/xfs/libxfs/xfs_rtrmap_btree.h | 2 ++
> 2 files changed, 18 insertions(+)
>
> diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
> index 04b9c76380ad..b2bb0dd53b00 100644
> --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
> +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
> @@ -1033,3 +1033,19 @@ xfs_rtrmapbt_init_rtsb(
> xfs_btree_del_cursor(cur, error);
> return error;
> }
> +
> +xfs_rgblock_t
> +xfs_rtrmap_first_unwritten_rgbno(
> + struct xfs_rtgroup *rtg)
Might want to leave a comment here saying that this only applies to
zoned realtime devices because they are written start to end, not
randomly. Otherwise this looks ok to me, having peered into the future
to see how it got used. :)
--D
> +{
> + struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot;
> + union xfs_btree_key key = {};
> + struct xfs_btree_cur *cur;
> +
> + if (block->bb_numrecs == 0)
> + return 0;
> + cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
> + xfs_btree_get_keys(cur, block, &key);
> + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
> + return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock) + 1;
> +}
> diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
> index 6a2d432b55ad..d5cca8fcf4a3 100644
> --- a/fs/xfs/libxfs/xfs_rtrmap_btree.h
> +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
> @@ -207,4 +207,6 @@ struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg,
> int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
> struct xfs_buftarg *btp, xfs_rgnumber_t rgno);
>
> +xfs_rgblock_t xfs_rtrmap_first_unwritten_rgbno(struct xfs_rtgroup *rtg);
> +
> #endif /* __XFS_RTRMAP_BTREE_H__ */
> --
> 2.45.2
>
>
^ permalink raw reply	[flat|nested] 143+ messages in thread
* Re: [PATCH 14/43] xfs: add a xfs_rtrmap_first_unwritten_rgbno helper
2024-12-12 21:48 ` Darrick J. Wong
@ 2024-12-13 5:16 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:16 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 01:48:51PM -0800, Darrick J. Wong wrote:
> > }
> > +
> > +xfs_rgblock_t
> > +xfs_rtrmap_first_unwritten_rgbno(
> > + struct xfs_rtgroup *rtg)
>
> Might want to leave a comment here saying that this only applies to
> zoned realtime devices because they are written start to end, not
> randomly. Otherwise this looks ok to me, having peered into the future
> to see how it got used. :)
Yes. Or rename it and make it return the highest tracked rgbno (and
NULLRGBLOCK when nothing is mapped), so that all the meaning assigned to
that stays in the caller, which might be less confusing.
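Roughly (signature as suggested above, the exact semantics are an
assumption):

	/* Return the highest rgbno tracked in the rtrmap, or
	 * NULLRGBLOCK if the rmap is empty; the zoned caller then
	 * derives "first unwritten" as highest + 1 (or 0) itself. */
	xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg);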
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 15/43] xfs: define the zoned on-disk format
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (13 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 14/43] xfs: add a xfs_rtrmap_first_unwritten_rgbno helper Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 22:02 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 16/43] xfs: allow internal RT devices for zoned mode Christoph Hellwig
` (27 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Zoned file systems reuse the basic RT group enabled XFS file system
structure to support a mode where each RT group is always written from
start to end and then reset for reuse (after moving out any remaining
data). There are a few minor but important changes, which are indicated
by a new incompat flag:
1) there are no bitmap and summary inodes, and thus the sb_rbmblocks
superblock field must be cleared to zero
2) there is a new superblock field that specifies the start of an
internal RT section. This allows supporting SMR HDDs that have randomly
writable space at the beginning which is used for the XFS data device
(which really is the metadata device for this configuration), directly
followed by a RT device on the same block device. While something
similar could be achieved using dm-linear, just having a single device
directly consumed by XFS makes handling the file system a lot easier.
3) Another superblock field that tracks the amount of reserved space (or
overprovisioning) that is never used for user capacity, but allows GC
to run more smoothly.
4) an overlay of the cowextsizse field for the rtrmap inode so that we
can persistently track the total amount of bytes currently used in
a RT group. There is no data structure other than the rmap that
tracks used space in an RT group, and this counter is used to decide
when a RT group has been entirely emptied, and to select one that
is relatively empty if garbage collection needs to be performed.
While this counter could be tracked entirely in memory and rebuilt
from the rmap at mount time, that would lead to very long mount times
with the large number of RT groups implied by the number of hardware
zones especially on SMR hard drives with 256MB zone sizes.
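To make 2) concrete, the single-device layout implied by sb_rtstart is
(illustrative):

	+----------------------------+---------------------------------+
	| 0 .. sb_rtstart - 1        | sb_rtstart .. end of device     |
	| data ("metadata") device,  | internal RT section,            |
	| randomly writable          | sequential write required zones |
	+----------------------------+---------------------------------+

For scale on 4): with 256MB zones a 20TB SMR drive has roughly 80,000
zones and thus RT groups. A hypothetical GC victim scan enabled by the
persistent counter (xfs_rtgroup_next and xfs_zone_reset are assumptions
here; rtg_rmap() and i_used_blocks are from this patch):

	struct xfs_rtgroup	*victim = NULL, *rtg = NULL;

	while ((rtg = xfs_rtgroup_next(mp, rtg))) {
		uint32_t	used = rtg_rmap(rtg)->i_used_blocks;

		if (used == 0)
			xfs_zone_reset(rtg);	/* fully emptied, reuse */
		else if (!victim ||
			 used < rtg_rmap(victim)->i_used_blocks)
			victim = rtg;		/* emptiest group, GC it */
	}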
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_format.h | 15 ++++++++++---
fs/xfs/libxfs/xfs_inode_buf.c | 18 +++++++++++-----
fs/xfs/libxfs/xfs_inode_util.c | 1 +
fs/xfs/libxfs/xfs_log_format.h | 7 ++++++-
fs/xfs/libxfs/xfs_ondisk.h | 6 ++++--
fs/xfs/libxfs/xfs_rtbitmap.c | 11 ++++++++++
fs/xfs/libxfs/xfs_rtgroup.c | 37 ++++++++++++++++++++-------------
fs/xfs/libxfs/xfs_sb.c | 35 ++++++++++++++++++++++++++++---
fs/xfs/scrub/agheader.c | 2 ++
fs/xfs/scrub/inode.c | 7 +++++++
fs/xfs/scrub/inode_repair.c | 4 +++-
fs/xfs/scrub/scrub.c | 2 ++
fs/xfs/xfs_fsmap.c | 6 +++++-
fs/xfs/xfs_inode.c | 3 ++-
fs/xfs/xfs_inode.h | 12 ++++++++++-
fs/xfs/xfs_inode_item.c | 1 +
fs/xfs/xfs_inode_item_recover.c | 1 +
fs/xfs/xfs_iomap.c | 1 +
fs/xfs/xfs_message.c | 4 ++++
fs/xfs/xfs_message.h | 1 +
fs/xfs/xfs_mount.h | 13 +++++++++++-
fs/xfs/xfs_rtalloc.c | 2 ++
fs/xfs/xfs_super.c | 11 +++++++++-
23 files changed, 165 insertions(+), 35 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index b1007fb661ba..12979496f30a 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -178,9 +178,10 @@ typedef struct xfs_sb {
xfs_rgnumber_t sb_rgcount; /* number of realtime groups */
xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */
-
uint8_t sb_rgblklog; /* rt group number shift */
uint8_t sb_pad[7]; /* zeroes */
+ xfs_fsblock_t sb_rtstart; /* start of internal RT section (FSB) */
+ xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -270,9 +271,10 @@ struct xfs_dsb {
__be64 sb_metadirino; /* metadata directory tree root */
__be32 sb_rgcount; /* # of realtime groups */
__be32 sb_rgextents; /* size of rtgroup in rtx */
-
__u8 sb_rgblklog; /* rt group number shift */
__u8 sb_pad[7]; /* zeroes */
+ __be64 sb_rtstart; /* start of internal RT section (FSB) */
+ __be64 sb_rtreserved; /* reserved (zoned) RT blocks */
/*
* The size of this structure must be padded to 64 bit alignment.
@@ -395,6 +397,8 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */
#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */
+#define XFS_SB_FEAT_INCOMPAT_ZONED (1U << 31)/* zoned RT allocator */
+
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE | \
XFS_SB_FEAT_INCOMPAT_SPINODES | \
@@ -952,7 +956,12 @@ struct xfs_dinode {
__be64 di_changecount; /* number of attribute changes */
__be64 di_lsn; /* flush sequence */
__be64 di_flags2; /* more random flags */
- __be32 di_cowextsize; /* basic cow extent size for file */
+ union {
+ /* basic cow extent size for (regular) file */
+ __be32 di_cowextsize;
+ /* used blocks in RTG for (zoned) rtrmap inode */
+ __be32 di_used_blocks;
+ };
__u8 di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index f24fa628fecf..cd38be2a69be 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -252,7 +252,10 @@ xfs_inode_from_disk(
be64_to_cpu(from->di_changecount));
ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
ip->i_diflags2 = be64_to_cpu(from->di_flags2);
+ /* also covers the di_used_blocks union arm: */
ip->i_cowextsize = be32_to_cpu(from->di_cowextsize);
+ BUILD_BUG_ON(sizeof(from->di_cowextsize) !=
+ sizeof(from->di_used_blocks));
}
error = xfs_iformat_data_fork(ip, from);
@@ -349,6 +352,7 @@ xfs_inode_to_disk(
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
to->di_flags2 = cpu_to_be64(ip->i_diflags2);
+ /* also covers the di_used_blocks union arm: */
to->di_cowextsize = cpu_to_be32(ip->i_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
to->di_lsn = cpu_to_be64(lsn);
@@ -752,11 +756,15 @@ xfs_dinode_verify(
!xfs_has_rtreflink(mp))
return __this_address;
- /* COW extent size hint validation */
- fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
- mode, flags, flags2);
- if (fa)
- return fa;
+ if (!xfs_has_zoned(mp) ||
+ dip->di_metatype != cpu_to_be16(XFS_METAFILE_RTRMAP)) {
+ /* COW extent size hint validation */
+ fa = xfs_inode_validate_cowextsize(mp,
+ be32_to_cpu(dip->di_cowextsize),
+ mode, flags, flags2);
+ if (fa)
+ return fa;
+ }
/* bigtime iflag can only happen on bigtime filesystems */
if (xfs_dinode_has_bigtime(dip) &&
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index deb0b7c00a1f..48fe49a5f050 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -322,6 +322,7 @@ xfs_inode_init(
if (xfs_has_v3inodes(mp)) {
inode_set_iversion(inode, 1);
+ /* also covers the di_used_blocks union arm: */
ip->i_cowextsize = 0;
times |= XFS_ICHGTIME_CREATE;
}
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index ec7157eaba5f..8f6fd195ce6e 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -481,7 +481,12 @@ struct xfs_log_dinode {
xfs_lsn_t di_lsn;
uint64_t di_flags2; /* more random flags */
- uint32_t di_cowextsize; /* basic cow extent size for file */
+ union {
+ /* basic cow extent size for (regular) file */
+ uint32_t di_cowextsize;
+ /* used blocks in RTG for (zoned) rtrmap inode */
+ uint32_t di_used_blocks;
+ };
uint8_t di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index a85ecddaa48e..5ed44fdf7491 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -233,8 +233,8 @@ xfs_check_ondisk_structs(void)
16299260424LL);
/* superblock field checks we got from xfs/122 */
- XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288);
- XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304);
XFS_CHECK_SB_OFFSET(sb_magicnum, 0);
XFS_CHECK_SB_OFFSET(sb_blocksize, 4);
XFS_CHECK_SB_OFFSET(sb_dblocks, 8);
@@ -295,6 +295,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_SB_OFFSET(sb_rgextents, 276);
XFS_CHECK_SB_OFFSET(sb_rgblklog, 280);
XFS_CHECK_SB_OFFSET(sb_pad, 281);
+ XFS_CHECK_SB_OFFSET(sb_rtstart, 288);
+ XFS_CHECK_SB_OFFSET(sb_rtreserved, 296);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 770adf60dd73..5057536e586c 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1123,6 +1123,7 @@ xfs_rtfree_blocks(
xfs_extlen_t mod;
int error;
+ ASSERT(!xfs_has_zoned(mp));
ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
mod = xfs_blen_to_rtxoff(mp, rtlen);
@@ -1174,6 +1175,9 @@ xfs_rtalloc_query_range(
end = min(end, rtg->rtg_extents - 1);
+ if (xfs_has_zoned(mp))
+ return -EINVAL;
+
/* Iterate the bitmap, looking for discrepancies. */
while (start <= end) {
struct xfs_rtalloc_rec rec;
@@ -1268,6 +1272,8 @@ xfs_rtbitmap_blockcount_len(
struct xfs_mount *mp,
xfs_rtbxlen_t rtextents)
{
+ if (xfs_has_zoned(mp))
+ return 0;
return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
}
@@ -1308,6 +1314,11 @@ xfs_rtsummary_blockcount(
xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp);
unsigned long long rsumwords;
+ if (xfs_has_zoned(mp)) {
+ *rsumlevels = 0;
+ return 0;
+ }
+
*rsumlevels = xfs_compute_rextslog(rextents) + 1;
rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
return howmany_64(rsumwords, mp->m_blockwsize);
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index 97aad8967149..9186c58e83d5 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -194,15 +194,17 @@ xfs_rtgroup_lock(
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
!(rtglock_flags & XFS_RTGLOCK_BITMAP));
- if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
- /*
- * Lock both realtime free space metadata inodes for a freespace
- * update.
- */
- xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
- xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
- } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
- xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ if (!xfs_has_zoned(rtg_mount(rtg))) {
+ if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ /*
+ * Lock both realtime free space metadata inodes for a
+ * freespace update.
+ */
+ xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
+ xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
+ } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ }
}
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
@@ -228,11 +230,13 @@ xfs_rtgroup_unlock(
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL);
- if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
- xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
- xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
- } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
- xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ if (!xfs_has_zoned(rtg_mount(rtg))) {
+ if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
+ xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
+ } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ }
}
}
@@ -249,7 +253,8 @@ xfs_rtgroup_trans_join(
ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));
- if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ if (!xfs_has_zoned(rtg_mount(rtg)) &&
+ (rtglock_flags & XFS_RTGLOCK_BITMAP)) {
xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL);
}
@@ -354,6 +359,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
.sick = XFS_SICK_RG_BITMAP,
.fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
(1U << XFS_DINODE_FMT_BTREE),
+ .enabled = xfs_has_nonzoned,
.create = xfs_rtbitmap_create,
},
[XFS_RTGI_SUMMARY] = {
@@ -362,6 +368,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
.sick = XFS_SICK_RG_SUMMARY,
.fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
(1U << XFS_DINODE_FMT_BTREE),
+ .enabled = xfs_has_nonzoned,
.create = xfs_rtsummary_create,
},
[XFS_RTGI_RMAP] = {
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 090f133f4da3..0bbe0b87bf04 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -30,6 +30,7 @@
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
+#include "xfs_rtbitmap.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -185,6 +186,8 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_PARENT;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
features |= XFS_FEAT_METADIR;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)
+ features |= XFS_FEAT_ZONED;
return features;
}
@@ -266,6 +269,9 @@ static uint64_t
xfs_expected_rbmblocks(
struct xfs_sb *sbp)
{
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED))
+ return 0;
return howmany_64(xfs_extents_per_rbm(sbp),
NBBY * xfs_rtbmblock_size(sbp));
}
@@ -275,9 +281,19 @@ bool
xfs_validate_rt_geometry(
struct xfs_sb *sbp)
{
- if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
- sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
- return false;
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) {
+ if (sbp->sb_rextsize != 1)
+ return false;
+ if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks)
+ return false;
+ if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks)
+ return false;
+ } else {
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
+ return false;
+ }
if (sbp->sb_rblocks == 0) {
if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
@@ -835,6 +851,14 @@ __xfs_sb_from_disk(
to->sb_rgcount = 1;
to->sb_rgextents = 0;
}
+
+ if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
+ to->sb_rtstart = be64_to_cpu(from->sb_rtstart);
+ to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved);
+ } else {
+ to->sb_rtstart = 0;
+ to->sb_rtreserved = 0;
+ }
}
void
@@ -1001,6 +1025,11 @@ xfs_sb_to_disk(
to->sb_rbmino = cpu_to_be64(0);
to->sb_rsumino = cpu_to_be64(0);
}
+
+ if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
+ to->sb_rtstart = cpu_to_be64(from->sb_rtstart);
+ to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved);
+ }
}
/*
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 190d56f81344..68de763b2543 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -64,6 +64,8 @@ STATIC size_t
xchk_superblock_ondisk_size(
struct xfs_mount *mp)
{
+ if (xfs_has_zoned(mp))
+ return offsetofend(struct xfs_dsb, sb_rtreserved);
if (xfs_has_metadir(mp))
return offsetofend(struct xfs_dsb, sb_pad);
if (xfs_has_metauuid(mp))
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index db6edd5a5fe5..bb3f475b6353 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -273,6 +273,13 @@ xchk_inode_cowextsize(
xfs_failaddr_t fa;
uint32_t value = be32_to_cpu(dip->di_cowextsize);
+ /*
+ * The used block counter for rtrmap is checked and repaired elsewhere.
+ */
+ if (xfs_has_zoned(sc->mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))
+ return;
+
fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2);
if (fa)
xchk_ino_set_corrupt(sc, ino);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 2f641b6d663e..c8d17dd4fc32 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -710,7 +710,9 @@ xrep_dinode_extsize_hints(
XFS_DIFLAG_EXTSZINHERIT);
}
- if (dip->di_version < 3)
+ if (dip->di_version < 3 ||
+ (xfs_has_zoned(sc->mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)))
return;
fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 7567dd5cad14..1a05c27ba471 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -387,12 +387,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_RTGROUP,
+ .has = xfs_has_nonzoned,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_RTGROUP,
+ .has = xfs_has_nonzoned,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
.repair = xrep_rtsummary,
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 1dbd2d75f7ae..917d4d0e51b3 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -1138,7 +1138,11 @@ xfs_getfsmap(
handlers[1].fn = xfs_getfsmap_logdev;
}
#ifdef CONFIG_XFS_RT
- if (mp->m_rtdev_targp) {
+ /*
+ * For zoned file systems there is no rtbitmap, so only support fsmap
+ * if the callers is privileged enough to use the full rmap version.
+ */
+ if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) {
handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
if (use_rmap)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c95fe1b1de4e..4624d40c664c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3071,5 +3071,6 @@ bool
xfs_is_always_cow_inode(
const struct xfs_inode *ip)
{
- return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
+ return xfs_is_zoned_inode(ip) ||
+ (ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount));
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c08093a65352..10cd8f0c4697 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -69,8 +69,13 @@ typedef struct xfs_inode {
xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */
prid_t i_projid; /* owner's project id */
xfs_extlen_t i_extsize; /* basic/minimum extent size */
- /* cowextsize is only used for v3 inodes, flushiter for v1/2 */
+ /*
+ * i_used_blocks is used for zoned rtrmap inodes,
+ * i_cowextsize is used for other v3 inodes,
+ * i_flushiter for v1/2 inodes
+ */
union {
+ uint32_t i_used_blocks; /* used blocks in RTG */
xfs_extlen_t i_cowextsize; /* basic cow extent size */
uint16_t i_flushiter; /* incremented on flush */
};
@@ -309,6 +314,11 @@ static inline bool xfs_is_internal_inode(const struct xfs_inode *ip)
xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
}
+static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip)
+{
+ return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip);
+}
+
bool xfs_is_always_cow_inode(const struct xfs_inode *ip);
static inline bool xfs_is_cow_inode(const struct xfs_inode *ip)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 70283c6419fd..2f1122e3ab12 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -596,6 +596,7 @@ xfs_inode_to_log_dinode(
to->di_changecount = inode_peek_iversion(inode);
to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime);
to->di_flags2 = ip->i_diflags2;
+ /* also covers the di_used_blocks union arm: */
to->di_cowextsize = ip->i_cowextsize;
to->di_ino = ip->i_ino;
to->di_lsn = lsn;
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 4e583bfc5ca8..7b8b8610de35 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -203,6 +203,7 @@ xfs_log_dinode_to_disk(
to->di_crtime = xfs_log_dinode_to_disk_ts(from,
from->di_crtime);
to->di_flags2 = cpu_to_be64(from->di_flags2);
+ /* also covers the di_used_blocks union arm: */
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(lsn);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f3f4b5c328c3..aa1db0dc1d98 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1216,6 +1216,7 @@ xfs_bmapi_reserve_delalloc(
fdblocks = indlen;
if (XFS_IS_REALTIME_INODE(ip)) {
+ ASSERT(!xfs_is_zoned_inode(ip));
error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
if (error)
goto out_unreserve_quota;
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 6ed485ff2756..15d410d16bb2 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -173,6 +173,10 @@ xfs_warn_experimental(
.opstate = XFS_OPSTATE_WARNED_METADIR,
.name = "metadata directory tree",
},
+ [XFS_EXPERIMENTAL_ZONED] = {
+ .opstate = XFS_OPSTATE_WARNED_ZONED,
+ .name = "zoned RT device",
+ },
};
ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX);
BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX);
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 7fb36ced9df7..a92a4d09c8e9 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -99,6 +99,7 @@ enum xfs_experimental_feat {
XFS_EXPERIMENTAL_EXCHRANGE,
XFS_EXPERIMENTAL_PPTR,
XFS_EXPERIMENTAL_METADIR,
+ XFS_EXPERIMENTAL_ZONED,
XFS_EXPERIMENTAL_MAX,
};
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 73bc053fdd17..72c5389ff78b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -343,6 +343,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */
#define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */
+#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */
/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
@@ -399,6 +400,7 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
__XFS_HAS_FEAT(metadir, METADIR)
+__XFS_HAS_FEAT(zoned, ZONED)
static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
{
@@ -409,7 +411,9 @@ static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
static inline bool xfs_has_rtsb(const struct xfs_mount *mp)
{
/* all rtgroups filesystems with an rt section have an rtsb */
- return xfs_has_rtgroups(mp) && xfs_has_realtime(mp);
+ return xfs_has_rtgroups(mp) &&
+ xfs_has_realtime(mp) &&
+ !xfs_has_zoned(mp);
}
static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp)
@@ -424,6 +428,11 @@ static inline bool xfs_has_rtreflink(const struct xfs_mount *mp)
xfs_has_reflink(mp);
}
+static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
+{
+ return !xfs_has_zoned(mp);
+}
+
/*
* Some features are always on for v5 file systems, allow the compiler to
* eliminiate dead code when building without v4 support.
@@ -527,6 +536,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_METADIR 17
/* Filesystem should use qflags to determine quotaon status */
#define XFS_OPSTATE_RESUMING_QUOTAON 18
+/* Kernel has logged a warning about zoned RT device being used on this fs. */
+#define XFS_OPSTATE_WARNED_ZONED 19
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 8da2498417f5..e457a2c2d561 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2097,6 +2097,8 @@ xfs_bmap_rtalloc(
ap->datatype & XFS_ALLOC_INITIAL_USER_DATA;
int error;
+ ASSERT(!xfs_has_zoned(ap->tp->t_mountp));
+
retry:
error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);
if (error)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f57c27940467..92dd44965943 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1784,8 +1784,17 @@ xfs_fs_fill_super(
mp->m_features &= ~XFS_FEAT_DISCARD;
}
- if (xfs_has_metadir(mp))
+ if (xfs_has_zoned(mp)) {
+ if (!xfs_has_metadir(mp)) {
+ xfs_alert(mp,
+ "metadir feature required for zoned realtime devices.");
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+ xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED);
+ } else if (xfs_has_metadir(mp)) {
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
+ }
if (xfs_has_reflink(mp)) {
if (xfs_has_realtime(mp) &&
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 15/43] xfs: define the zoned on-disk format
2024-12-11 8:54 ` [PATCH 15/43] xfs: define the zoned on-disk format Christoph Hellwig
@ 2024-12-12 22:02 ` Darrick J. Wong
2024-12-13 5:22 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 22:02 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:40AM +0100, Christoph Hellwig wrote:
> Zoned file systems reuse the basic RT group enabled XFS file system
> structure to support a mode where each RT group is always written from
> start to end and then reset for reuse (after moving out any remaining
> data). There are a few minor but important changes, which are indicated
> by a new incompat flag:
>
> 1) there are no bitmap and summary inodes, and thus the sb_rbmblocks
> superblock field must be cleared to zero
zoned rt requires rt rmap and reflink, and hence metadir. There is no
such field as sb_rbmblocks anymore.
"...and thus there must not be any /rtgroups/0.{bitmap,summary} files."
> 2) there is a new superblock field that specifies the start of an
> internal RT section. This allows supporting SMR HDDs that have randomly
> writable space at the beginning which is used for the XFS data device
> (which really is the metadata device for this configuration), directly
> followed by a RT device on the same block device. While something
> similar could be achieved using dm-linear, just having a single device
> directly consumed by XFS makes handling the file system a lot easier.
>
> 3) Another superblock field that tracks the amount of reserved space (or
> overprovisioning) that is never used for user capacity, but allows GC
> to run more smoothly.
>
> 4) an overlay of the cowextsizse field for the rtrmap inode so that we
cowextsize
> can persistently track the total amount of bytes currently used in
Isn't this the total number of *fsblocks* currently used?
> a RT group. There is no data structure other than the rmap that
> tracks used space in an RT group, and this counter is used to decide
> when a RT group has been entirely emptied, and to select one that
> is relatively empty if garbage collection needs to be performed.
> While this counter could be tracked entirely in memory and rebuilt
> from the rmap at mount time, that would lead to very long mount times
> with the large number of RT groups implied by the number of hardware
> zones especially on SMR hard drives with 256MB zone sizes.
Heh, I guess I should go down to my lab and plug in this smr disk and
see how many zones it reports...
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/libxfs/xfs_format.h | 15 ++++++++++---
> fs/xfs/libxfs/xfs_inode_buf.c | 18 +++++++++++-----
> fs/xfs/libxfs/xfs_inode_util.c | 1 +
> fs/xfs/libxfs/xfs_log_format.h | 7 ++++++-
> fs/xfs/libxfs/xfs_ondisk.h | 6 ++++--
> fs/xfs/libxfs/xfs_rtbitmap.c | 11 ++++++++++
> fs/xfs/libxfs/xfs_rtgroup.c | 37 ++++++++++++++++++++-------------
> fs/xfs/libxfs/xfs_sb.c | 35 ++++++++++++++++++++++++++++---
> fs/xfs/scrub/agheader.c | 2 ++
> fs/xfs/scrub/inode.c | 7 +++++++
> fs/xfs/scrub/inode_repair.c | 4 +++-
> fs/xfs/scrub/scrub.c | 2 ++
> fs/xfs/xfs_fsmap.c | 6 +++++-
> fs/xfs/xfs_inode.c | 3 ++-
> fs/xfs/xfs_inode.h | 12 ++++++++++-
> fs/xfs/xfs_inode_item.c | 1 +
> fs/xfs/xfs_inode_item_recover.c | 1 +
> fs/xfs/xfs_iomap.c | 1 +
> fs/xfs/xfs_message.c | 4 ++++
> fs/xfs/xfs_message.h | 1 +
> fs/xfs/xfs_mount.h | 13 +++++++++++-
> fs/xfs/xfs_rtalloc.c | 2 ++
> fs/xfs/xfs_super.c | 11 +++++++++-
> 23 files changed, 165 insertions(+), 35 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index b1007fb661ba..12979496f30a 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -178,9 +178,10 @@ typedef struct xfs_sb {
>
> xfs_rgnumber_t sb_rgcount; /* number of realtime groups */
> xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */
> -
> uint8_t sb_rgblklog; /* rt group number shift */
> uint8_t sb_pad[7]; /* zeroes */
> + xfs_fsblock_t sb_rtstart; /* start of internal RT section (FSB) */
> + xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */
>
> /* must be padded to 64 bit alignment */
> } xfs_sb_t;
> @@ -270,9 +271,10 @@ struct xfs_dsb {
> __be64 sb_metadirino; /* metadata directory tree root */
> __be32 sb_rgcount; /* # of realtime groups */
> __be32 sb_rgextents; /* size of rtgroup in rtx */
> -
> __u8 sb_rgblklog; /* rt group number shift */
> __u8 sb_pad[7]; /* zeroes */
> + __be64 sb_rtstart; /* start of internal RT section (FSB) */
> + __be64 sb_rtreserved; /* reserved (zoned) RT blocks */
>
> /*
> * The size of this structure must be padded to 64 bit alignment.
> @@ -395,6 +397,8 @@ xfs_sb_has_ro_compat_feature(
> #define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */
> #define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
> #define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */
> +#define XFS_SB_FEAT_INCOMPAT_ZONED (1U << 31)/* zoned RT allocator */
> +
> #define XFS_SB_FEAT_INCOMPAT_ALL \
> (XFS_SB_FEAT_INCOMPAT_FTYPE | \
> XFS_SB_FEAT_INCOMPAT_SPINODES | \
> @@ -952,7 +956,12 @@ struct xfs_dinode {
> __be64 di_changecount; /* number of attribute changes */
> __be64 di_lsn; /* flush sequence */
> __be64 di_flags2; /* more random flags */
> - __be32 di_cowextsize; /* basic cow extent size for file */
> + union {
> + /* basic cow extent size for (regular) file */
> + __be32 di_cowextsize;
> + /* used blocks in RTG for (zoned) rtrmap inode */
> + __be32 di_used_blocks;
> + };
> __u8 di_pad2[12]; /* more padding for future expansion */
>
> /* fields only written to during inode creation */
> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
> index f24fa628fecf..cd38be2a69be 100644
> --- a/fs/xfs/libxfs/xfs_inode_buf.c
> +++ b/fs/xfs/libxfs/xfs_inode_buf.c
> @@ -252,7 +252,10 @@ xfs_inode_from_disk(
> be64_to_cpu(from->di_changecount));
> ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
> ip->i_diflags2 = be64_to_cpu(from->di_flags2);
> + /* also covers the di_used_blocks union arm: */
> ip->i_cowextsize = be32_to_cpu(from->di_cowextsize);
> + BUILD_BUG_ON(sizeof(from->di_cowextsize) !=
> + sizeof(from->di_used_blocks));
> }
>
> error = xfs_iformat_data_fork(ip, from);
> @@ -349,6 +352,7 @@ xfs_inode_to_disk(
> to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
> to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
> to->di_flags2 = cpu_to_be64(ip->i_diflags2);
> + /* also covers the di_used_blocks union arm: */
> to->di_cowextsize = cpu_to_be32(ip->i_cowextsize);
> to->di_ino = cpu_to_be64(ip->i_ino);
> to->di_lsn = cpu_to_be64(lsn);
> @@ -752,11 +756,15 @@ xfs_dinode_verify(
> !xfs_has_rtreflink(mp))
> return __this_address;
>
> - /* COW extent size hint validation */
> - fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
> - mode, flags, flags2);
> - if (fa)
> - return fa;
> + if (!xfs_has_zoned(mp) ||
> + dip->di_metatype != cpu_to_be16(XFS_METAFILE_RTRMAP)) {
> + /* COW extent size hint validation */
> + fa = xfs_inode_validate_cowextsize(mp,
> + be32_to_cpu(dip->di_cowextsize),
> + mode, flags, flags2);
I think there's *some* validation you could do, such as checking that
i_cowextsize <= the number of blocks in the rtgroup.
I almost wonder if you should add that kind of logic to
xfs_inode_validate_cowextsize but that might be one incoherence too
many. OTOH it would probably reduce the number of changes in the fsck
code.
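Something like this minimal bound check, say (untested sketch, relying
on zoned file systems having sb_rextsize == 1 so that sb_rgextents
counts blocks):

	if (xfs_has_zoned(mp) &&
	    dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP) &&
	    be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents)
		return __this_address;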
--D
> + if (fa)
> + return fa;
> + }
>
> /* bigtime iflag can only happen on bigtime filesystems */
> if (xfs_dinode_has_bigtime(dip) &&
> diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
> index deb0b7c00a1f..48fe49a5f050 100644
> --- a/fs/xfs/libxfs/xfs_inode_util.c
> +++ b/fs/xfs/libxfs/xfs_inode_util.c
> @@ -322,6 +322,7 @@ xfs_inode_init(
>
> if (xfs_has_v3inodes(mp)) {
> inode_set_iversion(inode, 1);
> + /* also covers the di_used_blocks union arm: */
> ip->i_cowextsize = 0;
> times |= XFS_ICHGTIME_CREATE;
> }
> diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
> index ec7157eaba5f..8f6fd195ce6e 100644
> --- a/fs/xfs/libxfs/xfs_log_format.h
> +++ b/fs/xfs/libxfs/xfs_log_format.h
> @@ -481,7 +481,12 @@ struct xfs_log_dinode {
> xfs_lsn_t di_lsn;
>
> uint64_t di_flags2; /* more random flags */
> - uint32_t di_cowextsize; /* basic cow extent size for file */
> + union {
> + /* basic cow extent size for (regular) file */
> + uint32_t di_cowextsize;
> + /* used blocks in RTG for (zoned) rtrmap inode */
> + uint32_t di_used_blocks;
> + };
> uint8_t di_pad2[12]; /* more padding for future expansion */
>
> /* fields only written to during inode creation */
> diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
> index a85ecddaa48e..5ed44fdf7491 100644
> --- a/fs/xfs/libxfs/xfs_ondisk.h
> +++ b/fs/xfs/libxfs/xfs_ondisk.h
> @@ -233,8 +233,8 @@ xfs_check_ondisk_structs(void)
> 16299260424LL);
>
> /* superblock field checks we got from xfs/122 */
> - XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288);
> - XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288);
> + XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304);
> + XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304);
> XFS_CHECK_SB_OFFSET(sb_magicnum, 0);
> XFS_CHECK_SB_OFFSET(sb_blocksize, 4);
> XFS_CHECK_SB_OFFSET(sb_dblocks, 8);
> @@ -295,6 +295,8 @@ xfs_check_ondisk_structs(void)
> XFS_CHECK_SB_OFFSET(sb_rgextents, 276);
> XFS_CHECK_SB_OFFSET(sb_rgblklog, 280);
> XFS_CHECK_SB_OFFSET(sb_pad, 281);
> + XFS_CHECK_SB_OFFSET(sb_rtstart, 288);
> + XFS_CHECK_SB_OFFSET(sb_rtreserved, 296);
> }
>
> #endif /* __XFS_ONDISK_H */
> diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
> index 770adf60dd73..5057536e586c 100644
> --- a/fs/xfs/libxfs/xfs_rtbitmap.c
> +++ b/fs/xfs/libxfs/xfs_rtbitmap.c
> @@ -1123,6 +1123,7 @@ xfs_rtfree_blocks(
> xfs_extlen_t mod;
> int error;
>
> + ASSERT(!xfs_has_zoned(mp));
> ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
>
> mod = xfs_blen_to_rtxoff(mp, rtlen);
> @@ -1174,6 +1175,9 @@ xfs_rtalloc_query_range(
>
> end = min(end, rtg->rtg_extents - 1);
>
> + if (xfs_has_zoned(mp))
> + return -EINVAL;
> +
> /* Iterate the bitmap, looking for discrepancies. */
> while (start <= end) {
> struct xfs_rtalloc_rec rec;
> @@ -1268,6 +1272,8 @@ xfs_rtbitmap_blockcount_len(
> struct xfs_mount *mp,
> xfs_rtbxlen_t rtextents)
> {
> + if (xfs_has_zoned(mp))
> + return 0;
> return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
> }
>
> @@ -1308,6 +1314,11 @@ xfs_rtsummary_blockcount(
> xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp);
> unsigned long long rsumwords;
>
> + if (xfs_has_zoned(mp)) {
> + *rsumlevels = 0;
> + return 0;
> + }
> +
> *rsumlevels = xfs_compute_rextslog(rextents) + 1;
> rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
> return howmany_64(rsumwords, mp->m_blockwsize);
> diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
> index 97aad8967149..9186c58e83d5 100644
> --- a/fs/xfs/libxfs/xfs_rtgroup.c
> +++ b/fs/xfs/libxfs/xfs_rtgroup.c
> @@ -194,15 +194,17 @@ xfs_rtgroup_lock(
> ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
> !(rtglock_flags & XFS_RTGLOCK_BITMAP));
>
> - if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
> - /*
> - * Lock both realtime free space metadata inodes for a freespace
> - * update.
> - */
> - xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
> - xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
> - } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
> - xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
> + if (!xfs_has_zoned(rtg_mount(rtg))) {
> + if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
> + /*
> + * Lock both realtime free space metadata inodes for a
> + * freespace update.
> + */
> + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
> + xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
> + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
> + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
> + }
> }
>
> if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
> @@ -228,11 +230,13 @@ xfs_rtgroup_unlock(
> if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
> xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL);
>
> - if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
> - xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
> - xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
> - } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
> - xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
> + if (!xfs_has_zoned(rtg_mount(rtg))) {
> + if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
> + xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
> + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
> + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
> + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
> + }
> }
> }
>
> @@ -249,7 +253,8 @@ xfs_rtgroup_trans_join(
> ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
> ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));
>
> - if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
> + if (!xfs_has_zoned(rtg_mount(rtg)) &&
> + (rtglock_flags & XFS_RTGLOCK_BITMAP)) {
> xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL);
> xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL);
> }
> @@ -354,6 +359,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
> .sick = XFS_SICK_RG_BITMAP,
> .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
> (1U << XFS_DINODE_FMT_BTREE),
> + .enabled = xfs_has_nonzoned,
> .create = xfs_rtbitmap_create,
> },
> [XFS_RTGI_SUMMARY] = {
> @@ -362,6 +368,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
> .sick = XFS_SICK_RG_SUMMARY,
> .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
> (1U << XFS_DINODE_FMT_BTREE),
> + .enabled = xfs_has_nonzoned,
> .create = xfs_rtsummary_create,
> },
> [XFS_RTGI_RMAP] = {
> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
> index 090f133f4da3..0bbe0b87bf04 100644
> --- a/fs/xfs/libxfs/xfs_sb.c
> +++ b/fs/xfs/libxfs/xfs_sb.c
> @@ -30,6 +30,7 @@
> #include "xfs_rtgroup.h"
> #include "xfs_rtrmap_btree.h"
> #include "xfs_rtrefcount_btree.h"
> +#include "xfs_rtbitmap.h"
>
> /*
> * Physical superblock buffer manipulations. Shared with libxfs in userspace.
> @@ -185,6 +186,8 @@ xfs_sb_version_to_features(
> features |= XFS_FEAT_PARENT;
> if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
> features |= XFS_FEAT_METADIR;
> + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)
> + features |= XFS_FEAT_ZONED;
>
> return features;
> }
> @@ -266,6 +269,9 @@ static uint64_t
> xfs_expected_rbmblocks(
> struct xfs_sb *sbp)
> {
> + if (xfs_sb_is_v5(sbp) &&
> + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED))
> + return 0;
> return howmany_64(xfs_extents_per_rbm(sbp),
> NBBY * xfs_rtbmblock_size(sbp));
> }
> @@ -275,9 +281,19 @@ bool
> xfs_validate_rt_geometry(
> struct xfs_sb *sbp)
> {
> - if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
> - sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
> - return false;
> + if (xfs_sb_is_v5(sbp) &&
> + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) {
> + if (sbp->sb_rextsize != 1)
> + return false;
> + if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks)
> + return false;
> + if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks)
> + return false;
> + } else {
> + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
> + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
> + return false;
> + }
>
> if (sbp->sb_rblocks == 0) {
> if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
> @@ -835,6 +851,14 @@ __xfs_sb_from_disk(
> to->sb_rgcount = 1;
> to->sb_rgextents = 0;
> }
> +
> + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
> + to->sb_rtstart = be64_to_cpu(from->sb_rtstart);
> + to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved);
> + } else {
> + to->sb_rtstart = 0;
> + to->sb_rtreserved = 0;
> + }
> }
>
> void
> @@ -1001,6 +1025,11 @@ xfs_sb_to_disk(
> to->sb_rbmino = cpu_to_be64(0);
> to->sb_rsumino = cpu_to_be64(0);
> }
> +
> + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
> + to->sb_rtstart = cpu_to_be64(from->sb_rtstart);
> + to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved);
> + }
> }
>
> /*
> diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
> index 190d56f81344..68de763b2543 100644
> --- a/fs/xfs/scrub/agheader.c
> +++ b/fs/xfs/scrub/agheader.c
> @@ -64,6 +64,8 @@ STATIC size_t
> xchk_superblock_ondisk_size(
> struct xfs_mount *mp)
> {
> + if (xfs_has_zoned(mp))
> + return offsetofend(struct xfs_dsb, sb_rtreserved);
> if (xfs_has_metadir(mp))
> return offsetofend(struct xfs_dsb, sb_pad);
> if (xfs_has_metauuid(mp))
> diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
> index db6edd5a5fe5..bb3f475b6353 100644
> --- a/fs/xfs/scrub/inode.c
> +++ b/fs/xfs/scrub/inode.c
> @@ -273,6 +273,13 @@ xchk_inode_cowextsize(
> xfs_failaddr_t fa;
> uint32_t value = be32_to_cpu(dip->di_cowextsize);
>
> + /*
> + * The used block counter for rtrmap is checked and repaired elsewhere.
> + */
> + if (xfs_has_zoned(sc->mp) &&
> + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))
> + return;
> +
> fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2);
> if (fa)
> xchk_ino_set_corrupt(sc, ino);
> diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
> index 2f641b6d663e..c8d17dd4fc32 100644
> --- a/fs/xfs/scrub/inode_repair.c
> +++ b/fs/xfs/scrub/inode_repair.c
> @@ -710,7 +710,9 @@ xrep_dinode_extsize_hints(
> XFS_DIFLAG_EXTSZINHERIT);
> }
>
> - if (dip->di_version < 3)
> + if (dip->di_version < 3 ||
> + (xfs_has_zoned(sc->mp) &&
> + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)))
> return;
>
> fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
> diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
> index 7567dd5cad14..1a05c27ba471 100644
> --- a/fs/xfs/scrub/scrub.c
> +++ b/fs/xfs/scrub/scrub.c
> @@ -387,12 +387,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
> },
> [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
> .type = ST_RTGROUP,
> + .has = xfs_has_nonzoned,
> .setup = xchk_setup_rtbitmap,
> .scrub = xchk_rtbitmap,
> .repair = xrep_rtbitmap,
> },
> [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
> .type = ST_RTGROUP,
> + .has = xfs_has_nonzoned,
> .setup = xchk_setup_rtsummary,
> .scrub = xchk_rtsummary,
> .repair = xrep_rtsummary,
> diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
> index 1dbd2d75f7ae..917d4d0e51b3 100644
> --- a/fs/xfs/xfs_fsmap.c
> +++ b/fs/xfs/xfs_fsmap.c
> @@ -1138,7 +1138,11 @@ xfs_getfsmap(
> handlers[1].fn = xfs_getfsmap_logdev;
> }
> #ifdef CONFIG_XFS_RT
> - if (mp->m_rtdev_targp) {
> + /*
> + * For zoned file systems there is no rtbitmap, so only support fsmap
> +	 * if the caller is privileged enough to use the full rmap version.
> + */
> + if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) {
> handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
> handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
> if (use_rmap)
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index c95fe1b1de4e..4624d40c664c 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -3071,5 +3071,6 @@ bool
> xfs_is_always_cow_inode(
> const struct xfs_inode *ip)
> {
> - return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
> + return xfs_is_zoned_inode(ip) ||
> + (ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount));
> }
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index c08093a65352..10cd8f0c4697 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -69,8 +69,13 @@ typedef struct xfs_inode {
> xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */
> prid_t i_projid; /* owner's project id */
> xfs_extlen_t i_extsize; /* basic/minimum extent size */
> - /* cowextsize is only used for v3 inodes, flushiter for v1/2 */
> + /*
> + * i_used_blocks is used for zoned rtrmap inodes,
> + * i_cowextsize is used for other v3 inodes,
> + * i_flushiter for v1/2 inodes
> + */
> union {
> + uint32_t i_used_blocks; /* used blocks in RTG */
> xfs_extlen_t i_cowextsize; /* basic cow extent size */
> uint16_t i_flushiter; /* incremented on flush */
> };
> @@ -309,6 +314,11 @@ static inline bool xfs_is_internal_inode(const struct xfs_inode *ip)
> xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
> }
>
> +static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip)
> +{
> + return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip);
> +}
> +
> bool xfs_is_always_cow_inode(const struct xfs_inode *ip);
>
> static inline bool xfs_is_cow_inode(const struct xfs_inode *ip)
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index 70283c6419fd..2f1122e3ab12 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -596,6 +596,7 @@ xfs_inode_to_log_dinode(
> to->di_changecount = inode_peek_iversion(inode);
> to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime);
> to->di_flags2 = ip->i_diflags2;
> + /* also covers the di_used_blocks union arm: */
> to->di_cowextsize = ip->i_cowextsize;
> to->di_ino = ip->i_ino;
> to->di_lsn = lsn;
> diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
> index 4e583bfc5ca8..7b8b8610de35 100644
> --- a/fs/xfs/xfs_inode_item_recover.c
> +++ b/fs/xfs/xfs_inode_item_recover.c
> @@ -203,6 +203,7 @@ xfs_log_dinode_to_disk(
> to->di_crtime = xfs_log_dinode_to_disk_ts(from,
> from->di_crtime);
> to->di_flags2 = cpu_to_be64(from->di_flags2);
> + /* also covers the di_used_blocks union arm: */
> to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
> to->di_ino = cpu_to_be64(from->di_ino);
> to->di_lsn = cpu_to_be64(lsn);
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index f3f4b5c328c3..aa1db0dc1d98 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1216,6 +1216,7 @@ xfs_bmapi_reserve_delalloc(
>
> fdblocks = indlen;
> if (XFS_IS_REALTIME_INODE(ip)) {
> + ASSERT(!xfs_is_zoned_inode(ip));
> error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
> if (error)
> goto out_unreserve_quota;
> diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
> index 6ed485ff2756..15d410d16bb2 100644
> --- a/fs/xfs/xfs_message.c
> +++ b/fs/xfs/xfs_message.c
> @@ -173,6 +173,10 @@ xfs_warn_experimental(
> .opstate = XFS_OPSTATE_WARNED_METADIR,
> .name = "metadata directory tree",
> },
> + [XFS_EXPERIMENTAL_ZONED] = {
> + .opstate = XFS_OPSTATE_WARNED_ZONED,
> + .name = "zoned RT device",
> + },
> };
> ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX);
> BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX);
> diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
> index 7fb36ced9df7..a92a4d09c8e9 100644
> --- a/fs/xfs/xfs_message.h
> +++ b/fs/xfs/xfs_message.h
> @@ -99,6 +99,7 @@ enum xfs_experimental_feat {
> XFS_EXPERIMENTAL_EXCHRANGE,
> XFS_EXPERIMENTAL_PPTR,
> XFS_EXPERIMENTAL_METADIR,
> + XFS_EXPERIMENTAL_ZONED,
>
> XFS_EXPERIMENTAL_MAX,
> };
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 73bc053fdd17..72c5389ff78b 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -343,6 +343,7 @@ typedef struct xfs_mount {
> #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
> #define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */
> #define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */
> +#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */
>
> /* Mount features */
> #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
> @@ -399,6 +400,7 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
> __XFS_HAS_FEAT(large_extent_counts, NREXT64)
> __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
> __XFS_HAS_FEAT(metadir, METADIR)
> +__XFS_HAS_FEAT(zoned, ZONED)
>
> static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
> {
> @@ -409,7 +411,9 @@ static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
> static inline bool xfs_has_rtsb(const struct xfs_mount *mp)
> {
> /* all rtgroups filesystems with an rt section have an rtsb */
> - return xfs_has_rtgroups(mp) && xfs_has_realtime(mp);
> + return xfs_has_rtgroups(mp) &&
> + xfs_has_realtime(mp) &&
> + !xfs_has_zoned(mp);
> }
>
> static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp)
> @@ -424,6 +428,11 @@ static inline bool xfs_has_rtreflink(const struct xfs_mount *mp)
> xfs_has_reflink(mp);
> }
>
> +static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
> +{
> + return !xfs_has_zoned(mp);
> +}
> +
> /*
> * Some features are always on for v5 file systems, allow the compiler to
> * eliminiate dead code when building without v4 support.
> @@ -527,6 +536,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
> #define XFS_OPSTATE_WARNED_METADIR 17
> /* Filesystem should use qflags to determine quotaon status */
> #define XFS_OPSTATE_RESUMING_QUOTAON 18
> +/* Kernel has logged a warning about zoned RT device being used on this fs. */
> +#define XFS_OPSTATE_WARNED_ZONED 19
>
> #define __XFS_IS_OPSTATE(name, NAME) \
> static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
> diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
> index 8da2498417f5..e457a2c2d561 100644
> --- a/fs/xfs/xfs_rtalloc.c
> +++ b/fs/xfs/xfs_rtalloc.c
> @@ -2097,6 +2097,8 @@ xfs_bmap_rtalloc(
> ap->datatype & XFS_ALLOC_INITIAL_USER_DATA;
> int error;
>
> + ASSERT(!xfs_has_zoned(ap->tp->t_mountp));
> +
> retry:
> error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);
> if (error)
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index f57c27940467..92dd44965943 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1784,8 +1784,17 @@ xfs_fs_fill_super(
> mp->m_features &= ~XFS_FEAT_DISCARD;
> }
>
> - if (xfs_has_metadir(mp))
> + if (xfs_has_zoned(mp)) {
> + if (!xfs_has_metadir(mp)) {
> + xfs_alert(mp,
> + "metadir feature required for zoned realtime devices.");
> + error = -EINVAL;
> + goto out_filestream_unmount;
> + }
> + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED);
> + } else if (xfs_has_metadir(mp)) {
> xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
> + }
>
> if (xfs_has_reflink(mp)) {
> if (xfs_has_realtime(mp) &&
> --
> 2.45.2
>
>
^ permalink raw reply	[flat|nested] 143+ messages in thread
* Re: [PATCH 15/43] xfs: define the zoned on-disk format
2024-12-12 22:02 ` Darrick J. Wong
@ 2024-12-13 5:22 ` Christoph Hellwig
2024-12-13 17:09 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:22 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 02:02:20PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:54:40AM +0100, Christoph Hellwig wrote:
> > Zone file systems reuse the basic RT group enabled XFS file system
> > structure to support a mode where each RT group is always written from
> > start to end and then reset for reuse (after moving out any remaining
> > data). There are a few minor but important changes, which are indicated
> > by a new incompat flag:
> >
> > 1) there are no bitmap and summary inodes, and thus the sb_rbmblocks
> > superblock field must be cleared to zero
>
> zoned rt requires rt rmap and reflink, and hence metadir. There is no
> such field as sb_rbmblocks anymore.
It doesn't actually require reflink - in fact it is currently incompatible
with reflink due to GC not understanding refcounts (it does depend on
your reflink code as it's reusing a few bits from that just to make it
confusing).
And sb_rbmblocks is actually still set for metadir file systems.
> > 4) an overlay of the cowextsizse field for the rtrmap inode so that we
>
> cowextsize
>
> > can persistently track the total amount of bytes currently used in
>
> Isn't this the total number of *fsblocks* currently used?
or rtblocks? :) But yes, it's not byte granularity obviously, no idea
why I wrote that.
> Heh, I guess I should go down to my lab and plug in this smr disk and
> see how many zones it reports...
It will be capacity in bytes / 256MB unless you found a really, really
weird beast.
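(For example, a 20TB drive works out to 20,000,000,000,000 / 268,435,456,
i.e. roughly 75 thousand 256MB zones.)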
> > - fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
> > - mode, flags, flags2);
> > - if (fa)
> > - return fa;
> > + if (!xfs_has_zoned(mp) ||
> > + dip->di_metatype != cpu_to_be16(XFS_METAFILE_RTRMAP)) {
> > + /* COW extent size hint validation */
> > + fa = xfs_inode_validate_cowextsize(mp,
> > + be32_to_cpu(dip->di_cowextsize),
> > + mode, flags, flags2);
>
> I think there's *some* validation you could do, such as checking that
> i_cowextsize <= the number of blocks in the rtgroup.
So we do a fair amount of validation in xfs_zone_validate based on the
hardware zone state.  I tried to add more here but it failed because
getting at the rtgroups wasn't easily possible.  But yes, I think a
simple rgsize check should be possible at least.
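Something like this completely untested sketch, assuming the rtgroup
geometry in mp->m_groups is actually usable from the dinode verifier:

	/* di_used_blocks can't exceed the rtgroup size: */
	if (xfs_has_zoned(mp) &&
	    dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP) &&
	    be32_to_cpu(dip->di_cowextsize) >
	    mp->m_groups[XG_TYPE_RTG].blocks)
		return __this_address;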
> I almost wonder if you should add that kind of logic to
> xfs_inode_validate_cowextsize but that might be one incoherence too
> many. OTOH it would probably reduce the number of changes in the fsck
> code.
I'll take a look, but having a cowextsize helper that validates a field
overlay with an entirely different meaning sounds a bit confusing.
^ permalink raw reply	[flat|nested] 143+ messages in thread
* Re: [PATCH 15/43] xfs: define the zoned on-disk format
2024-12-13 5:22 ` Christoph Hellwig
@ 2024-12-13 17:09 ` Darrick J. Wong
2024-12-15 5:20 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 17:09 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 06:22:10AM +0100, Christoph Hellwig wrote:
> On Thu, Dec 12, 2024 at 02:02:20PM -0800, Darrick J. Wong wrote:
> > On Wed, Dec 11, 2024 at 09:54:40AM +0100, Christoph Hellwig wrote:
> > > Zone file systems reuse the basic RT group enabled XFS file system
> > > structure to support a mode where each RT group is always written from
> > > start to end and then reset for reuse (after moving out any remaining
> > > data). There are a few minor but important changes, which are indicated
> > > by a new incompat flag:
> > >
> > > 1) there are no bitmap and summary inodes, and thus the sb_rbmblocks
> > > superblock field must be cleared to zero
> >
> > zoned rt requires rt rmap and reflink, and hence metadir. There is no
> > such field as sb_rbmblocks anymore.
>
> It doesn't actually require reflink - in fact it is currently incompatible
> with reflink due to GC not understanding refcounts (it does depend on
> your reflink code as it's reusing a few bits from that just to make it
> confusing).
>
> And sb_rbmblocks is actually still set for metadir file systems.
Oops, I misread that as sb_rbmino. Yes, sb_rbmblocks must be zero now.
> > > 4) an overlay of the cowextsizse field for the rtrmap inode so that we
> >
> > cowextsize
> >
> > > can persistently track the total amount of bytes currently used in
> >
> > Isn't this the total number of *fsblocks* currently used?
>
> or rtblocks? :) But yes, it's not byte granularity obviously, no idea
> why I wrote that.
>
> > Heh, I guess I should go down to my lab and plug in this smr disk and
> > see how many zones it reports...
>
> It will be capacity in bytes / 256MB unless you found a really, really
> weird beast.
I bet someone will get tempted to make bigger zones for their 120TB hard
disk.
> > > - fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
> > > - mode, flags, flags2);
> > > - if (fa)
> > > - return fa;
> > > + if (!xfs_has_zoned(mp) ||
> > > + dip->di_metatype != cpu_to_be16(XFS_METAFILE_RTRMAP)) {
> > > + /* COW extent size hint validation */
> > > + fa = xfs_inode_validate_cowextsize(mp,
> > > + be32_to_cpu(dip->di_cowextsize),
> > > + mode, flags, flags2);
> >
> > I think there's *some* validation you could do, such as checking that
> > i_cowextsize <= the number of blocks in the rtgroup.
>
> So we do a fair amount of validation in xfs_zone_validate based on the
> hardware zone state. I tried to add more here but it failed because
> we getting at the rtgroups wasn't easily possible. But yes, I think
> a simple rgsize check should be possible at least.
>
> > I almost wonder if you should add that kind of logic to
> > xfs_inode_validate_cowextsize but that might be one incoherence too
> > many. OTOH it would probably reduce the number of changes in the fsck
> > code.
>
> I'll take a look, but having a cowextsize helper that validated a field
> overlay with an entirely different meaning sounds a bit confusing.
Yeah, like I said, it might be one incoherence too far. :)
--D
^ permalink raw reply [flat|nested] 143+ messages in thread* Re: [PATCH 15/43] xfs: define the zoned on-disk format
2024-12-13 17:09 ` Darrick J. Wong
@ 2024-12-15 5:20 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 5:20 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 09:09:17AM -0800, Darrick J. Wong wrote:
> > It will be capacity in bytes / 256MB unless you found a really, really
> > weird beast.
>
> I bet someone will get tempted to make bigger zones for their 120TB hard
> disk.
Larger zones would make our life a lot easier, but there's no sign of
HDD zone sizes changing soon.
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 16/43] xfs: allow internal RT devices for zoned mode
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (14 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 15/43] xfs: define the zoned on-disk format Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 22:06 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 17/43] xfs: don't allow growfs of the data device with internal RT device Christoph Hellwig
` (26 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Allow creating an RT subvolume on the same device as the main data
device. This is mostly used for SMR HDDs where the conventional zones
are used for the data device and the sequential write required zones
for the zoned RT section.
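The internal RT section simply starts at sb_rtstart blocks into the
device, i.e. (as implemented by xfs_rtb_to_daddr below) an RT block
maps to the device address:

	daddr = XFS_FSB_TO_BB(mp, sb_rtstart + rgno * g->blocks +
				  (rtbno & g->blkmask))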
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_group.h | 6 ++++--
fs/xfs/libxfs/xfs_rtgroup.h | 8 +++++---
fs/xfs/libxfs/xfs_sb.c | 1 +
fs/xfs/xfs_file.c | 2 +-
fs/xfs/xfs_mount.h | 7 +++++++
fs/xfs/xfs_rtalloc.c | 3 ++-
fs/xfs/xfs_super.c | 12 ++++++++++--
7 files changed, 30 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h
index 242b05627c7a..a70096113384 100644
--- a/fs/xfs/libxfs/xfs_group.h
+++ b/fs/xfs/libxfs/xfs_group.h
@@ -107,9 +107,11 @@ xfs_gbno_to_daddr(
xfs_agblock_t gbno)
{
struct xfs_mount *mp = xg->xg_mount;
- uint32_t blocks = mp->m_groups[xg->xg_type].blocks;
+ struct xfs_groups *g = &mp->m_groups[xg->xg_type];
+ xfs_fsblock_t fsbno;
- return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno);
+ fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;
+ return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno);
}
static inline uint32_t
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 0e1d9474ab77..d4c15c706b17 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -230,7 +230,8 @@ xfs_rtb_to_daddr(
xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks;
- return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask));
+ return XFS_FSB_TO_BB(mp,
+ g->start_fsb + start_bno + (rtbno & g->blkmask));
}
static inline xfs_rtblock_t
@@ -238,10 +239,11 @@ xfs_daddr_to_rtb(
struct xfs_mount *mp,
xfs_daddr_t daddr)
{
- xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr);
+ struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+ xfs_rfsblock_t bno;
+ bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb;
if (xfs_has_rtgroups(mp)) {
- struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
xfs_rgnumber_t rgno;
uint32_t rgbno;
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 0bbe0b87bf04..20b8318d4a59 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1175,6 +1175,7 @@ xfs_sb_mount_rextsize(
rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize;
rgs->blklog = mp->m_sb.sb_rgblklog;
rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog);
+ rgs->start_fsb = mp->m_sb.sb_rtstart;
} else {
rgs->blocks = 0;
rgs->blklog = 0;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 6bcfd4c34a37..27301229011b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -150,7 +150,7 @@ xfs_file_fsync(
* ensure newly written file data make it to disk before logging the new
* inode size in case of an extending write.
*/
- if (XFS_IS_REALTIME_INODE(ip))
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
else if (mp->m_logdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 72c5389ff78b..3d92678d2c3b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,13 @@ struct xfs_groups {
* rtgroup, so this mask must be 64-bit.
*/
uint64_t blkmask;
+
+ /*
+ * Start of the first group in the device. This is used to support a
+ * RT device following the data device on the same block device for
+ * SMR hard drives.
+ */
+ xfs_fsblock_t start_fsb;
};
enum xfs_free_counter {
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index e457a2c2d561..7ef62e7a91c1 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1266,7 +1266,8 @@ xfs_rt_check_size(
return -EFBIG;
}
- error = xfs_buf_read_uncached(mp->m_rtdev_targp, daddr,
+ error = xfs_buf_read_uncached(mp->m_rtdev_targp,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr,
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
if (error)
xfs_warn(mp, "cannot read last RT device sector (%lld)",
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 92dd44965943..18430e975c53 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -533,7 +533,15 @@ xfs_setup_devices(
if (error)
return error;
}
- if (mp->m_rtdev_targp) {
+
+ if (mp->m_sb.sb_rtstart) {
+ if (mp->m_rtdev_targp) {
+ xfs_warn(mp,
+ "can't use internal and external rtdev at the same time");
+ return -EINVAL;
+ }
+ mp->m_rtdev_targp = mp->m_ddev_targp;
+ } else if (mp->m_rtname) {
error = xfs_setsize_buftarg(mp->m_rtdev_targp,
mp->m_sb.sb_sectsize);
if (error)
@@ -757,7 +765,7 @@ xfs_mount_free(
{
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_logdev_targp);
- if (mp->m_rtdev_targp)
+ if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_rtdev_targp);
if (mp->m_ddev_targp)
xfs_free_buftarg(mp->m_ddev_targp);
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 16/43] xfs: allow internal RT devices for zoned mode
2024-12-11 8:54 ` [PATCH 16/43] xfs: allow internal RT devices for zoned mode Christoph Hellwig
@ 2024-12-12 22:06 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 22:06 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:41AM +0100, Christoph Hellwig wrote:
> Allow creating an RT subvolume on the same device as the main data
> device. This is mostly used for SMR HDDs where the conventional zones
> are used for the data device and the sequential write required zones
> for the zoned RT section.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
Looks fine to me,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> fs/xfs/libxfs/xfs_group.h | 6 ++++--
> fs/xfs/libxfs/xfs_rtgroup.h | 8 +++++---
> fs/xfs/libxfs/xfs_sb.c | 1 +
> fs/xfs/xfs_file.c | 2 +-
> fs/xfs/xfs_mount.h | 7 +++++++
> fs/xfs/xfs_rtalloc.c | 3 ++-
> fs/xfs/xfs_super.c | 12 ++++++++++--
> 7 files changed, 30 insertions(+), 9 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h
> index 242b05627c7a..a70096113384 100644
> --- a/fs/xfs/libxfs/xfs_group.h
> +++ b/fs/xfs/libxfs/xfs_group.h
> @@ -107,9 +107,11 @@ xfs_gbno_to_daddr(
> xfs_agblock_t gbno)
> {
> struct xfs_mount *mp = xg->xg_mount;
> - uint32_t blocks = mp->m_groups[xg->xg_type].blocks;
> + struct xfs_groups *g = &mp->m_groups[xg->xg_type];
> + xfs_fsblock_t fsbno;
>
> - return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno);
> + fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;
> + return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno);
> }
>
> static inline uint32_t
> diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
> index 0e1d9474ab77..d4c15c706b17 100644
> --- a/fs/xfs/libxfs/xfs_rtgroup.h
> +++ b/fs/xfs/libxfs/xfs_rtgroup.h
> @@ -230,7 +230,8 @@ xfs_rtb_to_daddr(
> xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
> uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks;
>
> - return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask));
> + return XFS_FSB_TO_BB(mp,
> + g->start_fsb + start_bno + (rtbno & g->blkmask));
> }
>
> static inline xfs_rtblock_t
> @@ -238,10 +239,11 @@ xfs_daddr_to_rtb(
> struct xfs_mount *mp,
> xfs_daddr_t daddr)
> {
> - xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr);
> + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
> + xfs_rfsblock_t bno;
>
> + bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb;
> if (xfs_has_rtgroups(mp)) {
> - struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
> xfs_rgnumber_t rgno;
> uint32_t rgbno;
>
> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
> index 0bbe0b87bf04..20b8318d4a59 100644
> --- a/fs/xfs/libxfs/xfs_sb.c
> +++ b/fs/xfs/libxfs/xfs_sb.c
> @@ -1175,6 +1175,7 @@ xfs_sb_mount_rextsize(
> rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize;
> rgs->blklog = mp->m_sb.sb_rgblklog;
> rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog);
> + rgs->start_fsb = mp->m_sb.sb_rtstart;
> } else {
> rgs->blocks = 0;
> rgs->blklog = 0;
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 6bcfd4c34a37..27301229011b 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -150,7 +150,7 @@ xfs_file_fsync(
> * ensure newly written file data make it to disk before logging the new
> * inode size in case of an extending write.
> */
> - if (XFS_IS_REALTIME_INODE(ip))
> + if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
> error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
> else if (mp->m_logdev_targp != mp->m_ddev_targp)
> error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 72c5389ff78b..3d92678d2c3b 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -103,6 +103,13 @@ struct xfs_groups {
> * rtgroup, so this mask must be 64-bit.
> */
> uint64_t blkmask;
> +
> + /*
> + * Start of the first group in the device. This is used to support a
> + * RT device following the data device on the same block device for
> + * SMR hard drives.
> + */
> + xfs_fsblock_t start_fsb;
> };
>
> enum xfs_free_counter {
> diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
> index e457a2c2d561..7ef62e7a91c1 100644
> --- a/fs/xfs/xfs_rtalloc.c
> +++ b/fs/xfs/xfs_rtalloc.c
> @@ -1266,7 +1266,8 @@ xfs_rt_check_size(
> return -EFBIG;
> }
>
> - error = xfs_buf_read_uncached(mp->m_rtdev_targp, daddr,
> + error = xfs_buf_read_uncached(mp->m_rtdev_targp,
> + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr,
> XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
> if (error)
> xfs_warn(mp, "cannot read last RT device sector (%lld)",
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 92dd44965943..18430e975c53 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -533,7 +533,15 @@ xfs_setup_devices(
> if (error)
> return error;
> }
> - if (mp->m_rtdev_targp) {
> +
> + if (mp->m_sb.sb_rtstart) {
> + if (mp->m_rtdev_targp) {
> + xfs_warn(mp,
> + "can't use internal and external rtdev at the same time");
> + return -EINVAL;
> + }
> + mp->m_rtdev_targp = mp->m_ddev_targp;
> + } else if (mp->m_rtname) {
> error = xfs_setsize_buftarg(mp->m_rtdev_targp,
> mp->m_sb.sb_sectsize);
> if (error)
> @@ -757,7 +765,7 @@ xfs_mount_free(
> {
> if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
> xfs_free_buftarg(mp->m_logdev_targp);
> - if (mp->m_rtdev_targp)
> + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp)
> xfs_free_buftarg(mp->m_rtdev_targp);
> if (mp->m_ddev_targp)
> xfs_free_buftarg(mp->m_ddev_targp);
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 17/43] xfs: don't allow growfs of the data device with internal RT device
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (15 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 16/43] xfs: allow internal RT devices for zoned mode Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 22:07 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 18/43] xfs: export zoned geometry via XFS_FSOP_GEOM Christoph Hellwig
` (25 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Because the RT blocks follow right after.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_fsops.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index bb2e31e338b8..3c04fee284e2 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -307,6 +307,10 @@ xfs_growfs_data(
if (!mutex_trylock(&mp->m_growlock))
return -EWOULDBLOCK;
+ /* we can't grow the data section when an internal RT section exists */
+ if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart)
+ return -EINVAL;
+
/* update imaxpct separately to the physical grow of the filesystem */
if (in->imaxpct != mp->m_sb.sb_imax_pct) {
error = xfs_growfs_imaxpct(mp, in->imaxpct);
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 17/43] xfs: don't allow growfs of the data device with internal RT device
2024-12-11 8:54 ` [PATCH 17/43] xfs: don't allow growfs of the data device with internal RT device Christoph Hellwig
@ 2024-12-12 22:07 ` Darrick J. Wong
2024-12-13 5:22 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 22:07 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:42AM +0100, Christoph Hellwig wrote:
> Because the RT blocks follow right after.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Aha, I was wondering about that. Does this belong in the previous
patch?
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_fsops.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index bb2e31e338b8..3c04fee284e2 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -307,6 +307,10 @@ xfs_growfs_data(
> if (!mutex_trylock(&mp->m_growlock))
> return -EWOULDBLOCK;
>
> + /* we can't grow the data section when an internal RT section exists */
> + if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart)
> + return -EINVAL;
> +
> /* update imaxpct separately to the physical grow of the filesystem */
> if (in->imaxpct != mp->m_sb.sb_imax_pct) {
> error = xfs_growfs_imaxpct(mp, in->imaxpct);
> --
> 2.45.2
>
>
^ permalink raw reply	[flat|nested] 143+ messages in thread
* Re: [PATCH 17/43] xfs: don't allow growfs of the data device with internal RT device
2024-12-12 22:07 ` Darrick J. Wong
@ 2024-12-13 5:22 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:22 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 02:07:27PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:54:42AM +0100, Christoph Hellwig wrote:
> > Because the RT blocks follow right after.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Aha, I was wondering about that. Does this belong in the previous
> patch?
Sure.
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 18/43] xfs: export zoned geometry via XFS_FSOP_GEOM
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (16 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 17/43] xfs: don't allow growfs of the data device with internal RT device Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 22:09 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 19/43] xfs: disable sb_frextents for zoned file systems Christoph Hellwig
` (24 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Export the zoned geometry information so that userspace can query it.
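Userspace can pick the new fields up through the existing geometry
ioctl; a minimal sketch (error handling elided, fd is an open file
descriptor on the file system):

	struct xfs_fsop_geom geo = { };

	ioctl(fd, XFS_IOC_FSGEOMETRY, &geo);
	if (geo.flags & XFS_FSOP_GEOM_FLAGS_ZONED)
		printf("rtstart %llu, rtreserved %llu\n",
		       (unsigned long long)geo.rtstart,
		       (unsigned long long)geo.rtreserved);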
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_fs.h | 5 ++++-
fs/xfs/libxfs/xfs_sb.c | 6 ++++++
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 2c3171262b44..5e66fb2b2cc7 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -189,7 +189,9 @@ struct xfs_fsop_geom {
uint32_t checked; /* o: checked fs & rt metadata */
__u32 rgextents; /* rt extents in a realtime group */
__u32 rgcount; /* number of realtime groups */
- __u64 reserved[16]; /* reserved space */
+ __u64 rtstart; /* start of internal rt section */
+ __u64 rtreserved; /* RT (zoned) reserved blocks */
+ __u64 reserved[14]; /* reserved space */
};
#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */
@@ -247,6 +249,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */
#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */
#define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */
+#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */
/*
* Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 20b8318d4a59..6fc21c0a332b 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1541,6 +1541,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE;
if (xfs_has_metadir(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR;
+ if (xfs_has_zoned(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
@@ -1561,6 +1563,10 @@ xfs_fs_geometry(
geo->rgcount = sbp->sb_rgcount;
geo->rgextents = sbp->sb_rgextents;
}
+ if (xfs_has_zoned(mp)) {
+ geo->rtstart = XFS_FSB_TO_BB(mp, sbp->sb_rtstart);
+ geo->rtreserved = sbp->sb_rtreserved;
+ }
}
/* Read a secondary superblock. */
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 18/43] xfs: export zoned geometry via XFS_FSOP_GEOM
2024-12-11 8:54 ` [PATCH 18/43] xfs: export zoned geometry via XFS_FSOP_GEOM Christoph Hellwig
@ 2024-12-12 22:09 ` Darrick J. Wong
2024-12-13 5:23 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 22:09 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:43AM +0100, Christoph Hellwig wrote:
> Export the zoned geometry information so that userspace can query it.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/libxfs/xfs_fs.h | 5 ++++-
> fs/xfs/libxfs/xfs_sb.c | 6 ++++++
> 2 files changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
> index 2c3171262b44..5e66fb2b2cc7 100644
> --- a/fs/xfs/libxfs/xfs_fs.h
> +++ b/fs/xfs/libxfs/xfs_fs.h
> @@ -189,7 +189,9 @@ struct xfs_fsop_geom {
> uint32_t checked; /* o: checked fs & rt metadata */
> __u32 rgextents; /* rt extents in a realtime group */
> __u32 rgcount; /* number of realtime groups */
> - __u64 reserved[16]; /* reserved space */
> + __u64 rtstart; /* start of internal rt section */
> + __u64 rtreserved; /* RT (zoned) reserved blocks */
> + __u64 reserved[14]; /* reserved space */
> };
>
> #define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */
> @@ -247,6 +249,7 @@ typedef struct xfs_fsop_resblks {
> #define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */
> #define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */
> #define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */
> +#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */
>
> /*
> * Minimum and maximum sizes need for growth checks.
> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
> index 20b8318d4a59..6fc21c0a332b 100644
> --- a/fs/xfs/libxfs/xfs_sb.c
> +++ b/fs/xfs/libxfs/xfs_sb.c
> @@ -1541,6 +1541,8 @@ xfs_fs_geometry(
> geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE;
> if (xfs_has_metadir(mp))
> geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR;
> + if (xfs_has_zoned(mp))
> + geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED;
> geo->rtsectsize = sbp->sb_blocksize;
> geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
>
> @@ -1561,6 +1563,10 @@ xfs_fs_geometry(
> geo->rgcount = sbp->sb_rgcount;
> geo->rgextents = sbp->sb_rgextents;
> }
> + if (xfs_has_zoned(mp)) {
> + geo->rtstart = XFS_FSB_TO_BB(mp, sbp->sb_rtstart);
Not sure why this is reported in units of 512b, everything else set by
xfs_fs_geometry is in units of fsblocks.
--D
> + geo->rtreserved = sbp->sb_rtreserved;
> + }
> }
>
> /* Read a secondary superblock. */
> --
> 2.45.2
>
>
^ permalink raw reply	[flat|nested] 143+ messages in thread
* Re: [PATCH 18/43] xfs: export zoned geometry via XFS_FSOP_GEOM
2024-12-12 22:09 ` Darrick J. Wong
@ 2024-12-13 5:23 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:23 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 02:09:13PM -0800, Darrick J. Wong wrote:
> > + if (xfs_has_zoned(mp)) {
> > + geo->rtstart = XFS_FSB_TO_BB(mp, sbp->sb_rtstart);
>
> Not sure why this is reported in units of 512b, everything else set by
> xfs_fs_geometry is in units of fsblocks.
Because I didn't update it when switching the sb field to FSBs per
a pre-review request :)  That being said, the sectors actually work
pretty well for the users in xfsprogs, so this will create more code.
But I guess that's worth it to be consistent.
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 19/43] xfs: disable sb_frextents for zoned file systems
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (17 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 18/43] xfs: export zoned geometry via XFS_FSOP_GEOM Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 22:26 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 20/43] xfs: disable FITRIM for zoned RT devices Christoph Hellwig
` (23 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Zoned file systems not only don't use the global frextents counter, but
for them the in-memory percpu counter also includes reservations taken
before even allocating delalloc extent records, so it will never match
the per-zone used information. Disable all updates and verification of
the sb counter for zoned file systems as it isn't useful for them.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_sb.c | 2 +-
fs/xfs/scrub/fscounters.c | 11 +++++++++--
fs/xfs/scrub/fscounters_repair.c | 10 ++++++----
fs/xfs/xfs_mount.c | 2 +-
fs/xfs/xfs_super.c | 4 +++-
5 files changed, 20 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 6fc21c0a332b..ee56fc22fd06 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1305,7 +1305,7 @@ xfs_log_sb(
* we handle nearly-lockless reservations, so we must use the _positive
* variant here to avoid writing out nonsense frextents.
*/
- if (xfs_has_rtgroups(mp)) {
+ if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) {
mp->m_sb.sb_frextents =
xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
}
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 732658a62a2d..5f5f67947440 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -413,7 +413,13 @@ xchk_fscount_count_frextents(
fsc->frextents = 0;
fsc->frextents_delayed = 0;
- if (!xfs_has_realtime(mp))
+
+ /*
+ * Don't bother verifying and repairing the fs counters for zoned file
+ * systems as they don't track an on-disk frextents count, and the
+ * in-memory percpu counter also includes reservations.
+ */
+ if (!xfs_has_realtime(mp) || xfs_has_zoned(mp))
return 0;
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
@@ -597,7 +603,8 @@ xchk_fscounters(
try_again = true;
}
- if (!xchk_fscount_within_range(sc, frextents,
+ if (!xfs_has_zoned(mp) &&
+ !xchk_fscount_within_range(sc, frextents,
&mp->m_free[XC_FREE_RTEXTENTS],
fsc->frextents - fsc->frextents_delayed)) {
if (fsc->frozen)
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
index 8fb0db78489e..f0d2b04644e4 100644
--- a/fs/xfs/scrub/fscounters_repair.c
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -74,10 +74,12 @@ xrep_fscounters(
* track of the delalloc reservations separately, as they are are
* subtracted from m_frextents, but not included in sb_frextents.
*/
- xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
- fsc->frextents - fsc->frextents_delayed);
- if (!xfs_has_rtgroups(mp))
- mp->m_sb.sb_frextents = fsc->frextents;
+ if (!xfs_has_zoned(mp)) {
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+ fsc->frextents - fsc->frextents_delayed);
+ if (!xfs_has_rtgroups(mp))
+ mp->m_sb.sb_frextents = fsc->frextents;
+ }
return 0;
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index db910ecc1ed4..72fa28263e14 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -551,7 +551,7 @@ xfs_check_summary_counts(
* If we're mounting the rt volume after recovering the log, recompute
* frextents from the rtbitmap file to fix the inconsistency.
*/
- if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+ if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) {
error = xfs_rtalloc_reinit_frextents(mp);
if (error)
return error;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 18430e975c53..d0b7e0d02366 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1127,7 +1127,9 @@ xfs_reinit_percpu_counters(
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks);
- xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents);
+ if (!xfs_has_zoned(mp))
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+ mp->m_sb.sb_frextents);
}
static void
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 19/43] xfs: disable sb_frextents for zoned file systems
2024-12-11 8:54 ` [PATCH 19/43] xfs: disable sb_frextents for zoned file systems Christoph Hellwig
@ 2024-12-12 22:26 ` Darrick J. Wong
2024-12-13 5:29 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 22:26 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:44AM +0100, Christoph Hellwig wrote:
> Zoned file systems not only don't use the global frextents counter, but
> for them the in-memory percpu counter also includes reservations taken
> before even allocating delalloc extent records, so it will never match
> the per-zone used information. Disable all updates and verification of
> the sb counter for zoned file systems as it isn't useful for them.
How is XC_FREE_RTEXTENTS initialized at mount time, then?
/me peeks ahead.
Oh, it and XC_FREE_RTAVAILABLE are set in xfs_mount_zones from values
that are computed by querying the hardware zone write pointers (or their
software equivalents if the rt device isn't zoned). So the two incore
free rt space counters are completely detached from the ondisk
superblock counters.
What should be the value of sb_frextents, then? Zero? Please specify
that in the definition of xfs_dsb and update the verifiers to reject
nonzero values.
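Something like this (untested) next to the other zoned checks in
xfs_validate_rt_geometry, maybe:

	if (xfs_sb_is_v5(sbp) &&
	    (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) &&
	    sbp->sb_frextents != 0)
		return false;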
--D
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/libxfs/xfs_sb.c | 2 +-
> fs/xfs/scrub/fscounters.c | 11 +++++++++--
> fs/xfs/scrub/fscounters_repair.c | 10 ++++++----
> fs/xfs/xfs_mount.c | 2 +-
> fs/xfs/xfs_super.c | 4 +++-
> 5 files changed, 20 insertions(+), 9 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
> index 6fc21c0a332b..ee56fc22fd06 100644
> --- a/fs/xfs/libxfs/xfs_sb.c
> +++ b/fs/xfs/libxfs/xfs_sb.c
> @@ -1305,7 +1305,7 @@ xfs_log_sb(
> * we handle nearly-lockless reservations, so we must use the _positive
> * variant here to avoid writing out nonsense frextents.
> */
> - if (xfs_has_rtgroups(mp)) {
> + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) {
> mp->m_sb.sb_frextents =
> xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
> }
> diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
> index 732658a62a2d..5f5f67947440 100644
> --- a/fs/xfs/scrub/fscounters.c
> +++ b/fs/xfs/scrub/fscounters.c
> @@ -413,7 +413,13 @@ xchk_fscount_count_frextents(
>
> fsc->frextents = 0;
> fsc->frextents_delayed = 0;
> - if (!xfs_has_realtime(mp))
> +
> + /*
> + * Don't bother verifying and repairing the fs counters for zoned file
> + * systems as they don't track an on-disk frextents count, and the
> + * in-memory percpu counter also includes reservations.
> + */
> + if (!xfs_has_realtime(mp) || xfs_has_zoned(mp))
> return 0;
>
> while ((rtg = xfs_rtgroup_next(mp, rtg))) {
> @@ -597,7 +603,8 @@ xchk_fscounters(
> try_again = true;
> }
>
> - if (!xchk_fscount_within_range(sc, frextents,
> + if (!xfs_has_zoned(mp) &&
> + !xchk_fscount_within_range(sc, frextents,
> &mp->m_free[XC_FREE_RTEXTENTS],
> fsc->frextents - fsc->frextents_delayed)) {
> if (fsc->frozen)
> diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
> index 8fb0db78489e..f0d2b04644e4 100644
> --- a/fs/xfs/scrub/fscounters_repair.c
> +++ b/fs/xfs/scrub/fscounters_repair.c
> @@ -74,10 +74,12 @@ xrep_fscounters(
> * track of the delalloc reservations separately, as they are are
> * subtracted from m_frextents, but not included in sb_frextents.
> */
> - xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
> - fsc->frextents - fsc->frextents_delayed);
> - if (!xfs_has_rtgroups(mp))
> - mp->m_sb.sb_frextents = fsc->frextents;
> + if (!xfs_has_zoned(mp)) {
> + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
> + fsc->frextents - fsc->frextents_delayed);
> + if (!xfs_has_rtgroups(mp))
> + mp->m_sb.sb_frextents = fsc->frextents;
> + }
>
> return 0;
> }
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index db910ecc1ed4..72fa28263e14 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -551,7 +551,7 @@ xfs_check_summary_counts(
> * If we're mounting the rt volume after recovering the log, recompute
> * frextents from the rtbitmap file to fix the inconsistency.
> */
> - if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
> + if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) {
> error = xfs_rtalloc_reinit_frextents(mp);
> if (error)
> return error;
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 18430e975c53..d0b7e0d02366 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1127,7 +1127,9 @@ xfs_reinit_percpu_counters(
> percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
> percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
> xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks);
> - xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents);
> + if (!xfs_has_zoned(mp))
> + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
> + mp->m_sb.sb_frextents);
> }
>
> static void
> --
> 2.45.2
>
>
^ permalink raw reply	[flat|nested] 143+ messages in thread
* Re: [PATCH 19/43] xfs: disable sb_frextents for zoned file systems
2024-12-12 22:26 ` Darrick J. Wong
@ 2024-12-13 5:29 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:29 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 02:26:09PM -0800, Darrick J. Wong wrote:
> What should be the value of sb_frextents, then? Zero? Please specify
> that in the definition of xfs_dsb and update the verifiers to reject
> nonzero values.
Right now it's undefined. But forcing it to zero makes sense.
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 20/43] xfs: disable FITRIM for zoned RT devices
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (18 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 19/43] xfs: disable sb_frextents for zoned file systems Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 22:13 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 21/43] xfs: don't call xfs_can_free_eofblocks from ->release for zoned inodes Christoph Hellwig
` (22 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
The zoned allocator unconditionally issues zone resets or discards after
emptying an entire zone, so supporting FITRIM for a zoned RT device is
not useful.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_discard.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index c4bd145f5ec1..4447c835a373 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -844,7 +844,8 @@ xfs_ioc_trim(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (mp->m_rtdev_targp &&
+
+ if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
rt_bdev = mp->m_rtdev_targp->bt_bdev;
if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread
* Re: [PATCH 20/43] xfs: disable FITRIM for zoned RT devices
2024-12-11 8:54 ` [PATCH 20/43] xfs: disable FITRIM for zoned RT devices Christoph Hellwig
@ 2024-12-12 22:13 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 22:13 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:45AM +0100, Christoph Hellwig wrote:
> The zoned allocator unconditionally issues zone resets or discards after
> emptying an entire zone, so supporting FITRIM for a zoned RT device is
> not useful.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Makes sense,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_discard.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
> index c4bd145f5ec1..4447c835a373 100644
> --- a/fs/xfs/xfs_discard.c
> +++ b/fs/xfs/xfs_discard.c
> @@ -844,7 +844,8 @@ xfs_ioc_trim(
>
> if (!capable(CAP_SYS_ADMIN))
> return -EPERM;
> - if (mp->m_rtdev_targp &&
> +
> + if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
> bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
> rt_bdev = mp->m_rtdev_targp->bt_bdev;
> if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 21/43] xfs: don't call xfs_can_free_eofblocks from ->release for zoned inodes
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (19 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 20/43] xfs: disable FITRIM for zoned RT devices Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 22:15 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 22/43] xfs: skip zoned RT inodes in xfs_inodegc_want_queue_rt_file Christoph Hellwig
` (21 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Zoned file systems require out of place writes and thus can't support
post-EOF speculative preallocations. Avoid the pointless ilock critical
section to find out that none can be freed.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_file.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 27301229011b..827f7819df6a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1356,15 +1356,22 @@ xfs_file_release(
* blocks. This avoids open/read/close workloads from removing EOF
* blocks that other writers depend upon to reduce fragmentation.
*
+ * Inodes on the zoned RT device never have preallocations, so skip
+ * taking the locks below.
+ */
+ if (!inode->i_nlink ||
+ !(file->f_mode & FMODE_WRITE) ||
+ (ip->i_diflags & XFS_DIFLAG_APPEND) ||
+ xfs_is_zoned_inode(ip))
+ return 0;
+
+ /*
* If we can't get the iolock just skip truncating the blocks past EOF
* because we could deadlock with the mmap_lock otherwise. We'll get
* another chance to drop them once the last reference to the inode is
* dropped, so we'll never leak blocks permanently.
*/
- if (inode->i_nlink &&
- (file->f_mode & FMODE_WRITE) &&
- !(ip->i_diflags & XFS_DIFLAG_APPEND) &&
- !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
+ if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
if (xfs_can_free_eofblocks(ip) &&
!xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread
* Re: [PATCH 21/43] xfs: don't call xfs_can_free_eofblocks from ->release for zoned inodes
2024-12-11 8:54 ` [PATCH 21/43] xfs: don't call xfs_can_free_eofblocks from ->release for zoned inodes Christoph Hellwig
@ 2024-12-12 22:15 ` Darrick J. Wong
2024-12-13 5:28 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 22:15 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:46AM +0100, Christoph Hellwig wrote:
> Zoned file systems require out of place writes and thus can't support
> post-EOF speculative preallocations. Avoid the pointless ilock critical
> section to find out that none can be freed.
I wonder if this is true of alwayscow inodes in general, not just zoned
inodes?
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Anyway that makes sense to me, so
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_file.c | 15 +++++++++++----
> 1 file changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 27301229011b..827f7819df6a 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1356,15 +1356,22 @@ xfs_file_release(
> * blocks. This avoids open/read/close workloads from removing EOF
> * blocks that other writers depend upon to reduce fragmentation.
> *
> + * Inodes on the zoned RT device never have preallocations, so skip
> + * taking the locks below.
> + */
> + if (!inode->i_nlink ||
> + !(file->f_mode & FMODE_WRITE) ||
> + (ip->i_diflags & XFS_DIFLAG_APPEND) ||
> + xfs_is_zoned_inode(ip))
> + return 0;
> +
> + /*
> * If we can't get the iolock just skip truncating the blocks past EOF
> * because we could deadlock with the mmap_lock otherwise. We'll get
> * another chance to drop them once the last reference to the inode is
> * dropped, so we'll never leak blocks permanently.
> */
> - if (inode->i_nlink &&
> - (file->f_mode & FMODE_WRITE) &&
> - !(ip->i_diflags & XFS_DIFLAG_APPEND) &&
> - !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
> + if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
> xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
> if (xfs_can_free_eofblocks(ip) &&
> !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 21/43] xfs: don't call xfs_can_free_eofblocks from ->release for zoned inodes
2024-12-12 22:15 ` Darrick J. Wong
@ 2024-12-13 5:28 ` Christoph Hellwig
2024-12-13 17:13 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 5:28 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 12, 2024 at 02:15:23PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:54:46AM +0100, Christoph Hellwig wrote:
> > Zoned file systems require out of place writes and thus can't support
> > post-EOF speculative preallocations. Avoid the pointless ilock critical
> > section to find out that none can be freed.
>
> I wonder if this is true of alwayscow inodes in general, not just zoned
> inodes?
Maybe I'm missing something, but AFAICS always_cow still generates
preallocations in xfs_buffered_write_iomap_begin. It probably shouldn't.
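Something like the following would probably be the way to avoid that
(sketch only; the helper name is made up, xfs_is_always_cow_inode() is
the existing predicate and xfs_is_zoned_inode() is the one added by this
series):
	/* hypothetical helper: which inodes get post-EOF speculative prealloc? */
	static inline bool xfs_want_speculative_prealloc(struct xfs_inode *ip)
	{
		/* zoned inodes always write out of place, nothing to prealloc */
		if (xfs_is_zoned_inode(ip))
			return false;
		/* always-COW inodes write out of place as well */
		if (xfs_is_always_cow_inode(ip))
			return false;
		return true;
	}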
Btw, the always_cow code was intended as the common support code for
zoned and atomic msync style atomic writes, which always require hard
out of place writes. It turns out it doesn't actually do that right
now (see the bounce buffering patch reviewed earlier), which makes it
a bit of an oddball. I'd personally love to kill it once the zoned
code lands, as just running the zoned mode on a regular device actually
gives you a good way to test always out of place write semantics,
which ended up diverging a bit from the earlier version after it hit
the hard reality of hardware actually enforcing out of place writes.
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 21/43] xfs: don't call xfs_can_free_eofblocks from ->release for zoned inodes
2024-12-13 5:28 ` Christoph Hellwig
@ 2024-12-13 17:13 ` Darrick J. Wong
2024-12-13 17:18 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 17:13 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 06:28:41AM +0100, Christoph Hellwig wrote:
> On Thu, Dec 12, 2024 at 02:15:23PM -0800, Darrick J. Wong wrote:
> > On Wed, Dec 11, 2024 at 09:54:46AM +0100, Christoph Hellwig wrote:
> > > Zoned file systems require out of place writes and thus can't support
> > > post-EOF speculative preallocations. Avoid the pointless ilock critical
> > > section to find out that none can be freed.
> >
> > I wonder if this is true of alwayscow inodes in general, not just zoned
> > inodes?
>
> Maybe I'm missing something, but AFAICS always_cow still generates
> preallocations in xfs_buffered_write_iomap_begin. It probably shouldn't.
For non-zoned alwayscow I think it's trying to generate preallocations
in the cow fork to reduce fragmentation of the bmbt since we don't have
to write in the linear order.
Unless... you're talking about preallocations in the data fork?
> Btw, the always_cow code as intended as the common support code for
> zoned and atomic msync style atomic writes, which always require hard
> out of place writes. It turns out it doesn't actually do that right
> now (see the bounce buffering patch reviewed earlier), which makes it
> a bit of an oddball. I'd personally love to kill it once the zoned
> code lands, as just running the zoned mode on a regular device actually
> gives you a good way to test always out of place write semantics,
> which ended up diverging a bit from the earlier version after it hit
> the hard reality of hardware actually enforcing out of place writes.
Which patch is the bounce buffering patch?
/me hands himself another cup of coffee
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 21/43] xfs: don't call xfs_can_free_eofblocks from ->release for zoned inodes
2024-12-13 17:13 ` Darrick J. Wong
@ 2024-12-13 17:18 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-13 17:18 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 09:13:53AM -0800, Darrick J. Wong wrote:
> > > I wonder if this is true of alwayscow inodes in general, not just zoned
> > > inodes?
> >
> > Maybe I'm missing something, but AFAICS always_cow still generates
> > preallocations in xfs_buffered_write_iomap_begin. It probably shouldn't.
>
> For non-zoned alwayscow I think it's trying to generate preallocations
> in the cow fork to reduce fragmentation of the bmbt since we don't have
> to write in the linear order.
Ah yes, and xfs_can_free_eofblocks only deals with the data fork.
>
> > Btw, the always_cow code as intended as the common support code for
> > zoned and atomic msync style atomic writes, which always require hard
> > out of place writes. It turns out it doesn't actually do that right
> > now (see the bounce buffering patch reviewed earlier), which makes it
> > a bit of an oddball. I'd personally love to kill it once the zoned
> > code lands, as just running the zoned mode on a regular device actually
> > gives you a good way to test always out of place write semantics,
> > which ended up diverging a bit from the earlier version after it hit
> > the hard reality of hardware actually enforcing out of place writes.
>
> Which patch is the bounce buffering patch?
[PATCH 12/43] xfs: refine the unaligned check for always COW inodes in xfs_file_dio_write
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 22/43] xfs: skip zoned RT inodes in xfs_inodegc_want_queue_rt_file
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (20 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 21/43] xfs: don't call xfs_can_free_eofblocks from ->release for zoned inodes Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-12 22:15 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 23/43] xfs: parse and validate hardware zone information Christoph Hellwig
` (20 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
The zoned allocator never performs speculative preallocations, so don't
bother queueing up zoned inodes here.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_icache.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index c9ded501e89b..2f53ca7e12d4 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -2073,7 +2073,7 @@ xfs_inodegc_want_queue_rt_file(
{
struct xfs_mount *mp = ip->i_mount;
- if (!XFS_IS_REALTIME_INODE(ip))
+ if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp))
return false;
if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS,
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread
* Re: [PATCH 22/43] xfs: skip zoned RT inodes in xfs_inodegc_want_queue_rt_file
2024-12-11 8:54 ` [PATCH 22/43] xfs: skip zoned RT inodes in xfs_inodegc_want_queue_rt_file Christoph Hellwig
@ 2024-12-12 22:15 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-12 22:15 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:47AM +0100, Christoph Hellwig wrote:
> The zoned allocator never performs speculative preallocations, so don't
> bother queueing up zoned inodes here.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Looks good,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_icache.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index c9ded501e89b..2f53ca7e12d4 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -2073,7 +2073,7 @@ xfs_inodegc_want_queue_rt_file(
> {
> struct xfs_mount *mp = ip->i_mount;
>
> - if (!XFS_IS_REALTIME_INODE(ip))
> + if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp))
> return false;
>
> if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS,
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 23/43] xfs: parse and validate hardware zone information
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (21 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 22/43] xfs: skip zoned RT inodes in xfs_inodegc_want_queue_rt_file Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-13 17:31 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 24/43] xfs: add the zoned space allocator Christoph Hellwig
` (19 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Add support to validate and parse reported hardware zone state.
Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/Makefile | 1 +
fs/xfs/libxfs/xfs_zones.c | 169 ++++++++++++++++++++++++++++++++++++++
fs/xfs/libxfs/xfs_zones.h | 33 ++++++++
3 files changed, 203 insertions(+)
create mode 100644 fs/xfs/libxfs/xfs_zones.c
create mode 100644 fs/xfs/libxfs/xfs_zones.h
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7afa51e41427..ea8e66c1e969 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \
xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
xfs_rtbitmap.o \
xfs_rtgroup.o \
+ xfs_zones.o \
)
# highlevel code
diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c
new file mode 100644
index 000000000000..e170d7c13533
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_zones.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2024 Christoph Hellwig.
+ * Copyright (c) 2024, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtgroup.h"
+#include "xfs_zones.h"
+
+static int
+xfs_zone_validate_empty(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (rtg_rmap(rtg)->i_used_blocks > 0) {
+ xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).",
+ rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
+ return -EIO;
+ }
+ *write_pointer = 0;
+ return 0;
+}
+
+static int
+xfs_zone_validate_wp(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp);
+
+ if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
+ xfs_warn(mp, "zone %u has too large used counter (0x%x).",
+ rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
+ return -EIO;
+ }
+
+ if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) {
+ xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.",
+ rtg_rgno(rtg), wp_fsb);
+ return -EFSCORRUPTED;
+ }
+
+ *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb);
+ if (*write_pointer >= rtg->rtg_extents) {
+ xfs_warn(mp, "zone %u has invalid write pointer (0x%x).",
+ rtg_rgno(rtg), *write_pointer);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+static int
+xfs_zone_validate_full(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
+ xfs_warn(mp, "zone %u has too large used counter (0x%x).",
+ rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
+ return -EIO;
+ }
+ *write_pointer = rtg->rtg_extents;
+
+ return 0;
+}
+
+static int
+xfs_zone_validate_seq(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_EMPTY:
+ return xfs_zone_validate_empty(zone, rtg, write_pointer);
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ return xfs_zone_validate_wp(zone, rtg, write_pointer);
+ case BLK_ZONE_COND_FULL:
+ return xfs_zone_validate_full(zone, rtg, write_pointer);
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return -EIO;
+ default:
+ xfs_warn(mp, "zone %u has unknown zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return -EIO;
+ }
+}
+
+static int
+xfs_zone_validate_conv(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_NOT_WP:
+ return 0;
+ default:
+ xfs_warn(mp,
+"conventional zone %u has unsupported zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return -EIO;
+ }
+}
+
+int
+xfs_zone_validate(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+
+ /*
+ * Check that the zone capacity matches the rtgroup size stored in the
+ * superblock. Note that all zones including the last one must have a
+ * uniform capacity.
+ */
+ if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) {
+ xfs_warn(mp,
+"zone %u capacity (0x%llx) does not match RT group size (0x%x).",
+ rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity),
+ g->blocks);
+ return -EIO;
+ }
+
+ if (XFS_BB_TO_FSB(mp, zone->len) != 1 << g->blklog) {
+ xfs_warn(mp,
+"zone %u length (0x%llx) does match geometry (0x%x).",
+ rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
+ 1 << g->blklog);
+ }
+
+ switch (zone->type) {
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ return xfs_zone_validate_conv(zone, rtg);
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ return xfs_zone_validate_seq(zone, rtg, write_pointer);
+ default:
+ xfs_warn(mp, "zoned %u has unsupported type 0x%x.",
+ rtg_rgno(rtg), zone->type);
+ return -EFSCORRUPTED;
+ }
+}
diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h
new file mode 100644
index 000000000000..4d3e53585654
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_zones.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LIBXFS_ZONES_H
+#define _LIBXFS_ZONES_H
+
+/*
+ * In order to guarantee forward progress for GC we need to reserve at least
+ * two zones: one that will be used for moving data into and one spare zone
+ * making sure that we have enough space to relocate a nearly-full zone.
+ * To allow for slightly sloppy accounting for when we need to reserve the
+ * second zone, we actually reserve three as that is easier than doing fully
+ * accurate bookkeeping.
+ */
+#define XFS_GC_ZONES 3U
+
+/*
+ * In addition we need two zones for user writes, one open zone for writing
+ * and one to still have available blocks without resetting the open zone
+ * when data in the open zone has been freed.
+ */
+#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1)
+#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1)
+
+/*
+ * Always keep one zone out of the general open zone pool to allow for GC to
+ * happen while other writers are waiting for free space.
+ */
+#define XFS_OPEN_GC_ZONES 1U
+#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
+
+int xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer);
+
+#endif /* _LIBXFS_ZONES_H */
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread
* Re: [PATCH 23/43] xfs: parse and validate hardware zone information
2024-12-11 8:54 ` [PATCH 23/43] xfs: parse and validate hardware zone information Christoph Hellwig
@ 2024-12-13 17:31 ` Darrick J. Wong
2024-12-15 5:24 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 17:31 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:48AM +0100, Christoph Hellwig wrote:
> Add support to validate and parse reported hardware zone state.
>
> Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/Makefile | 1 +
> fs/xfs/libxfs/xfs_zones.c | 169 ++++++++++++++++++++++++++++++++++++++
> fs/xfs/libxfs/xfs_zones.h | 33 ++++++++
> 3 files changed, 203 insertions(+)
> create mode 100644 fs/xfs/libxfs/xfs_zones.c
> create mode 100644 fs/xfs/libxfs/xfs_zones.h
>
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 7afa51e41427..ea8e66c1e969 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \
> xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
> xfs_rtbitmap.o \
> xfs_rtgroup.o \
> + xfs_zones.o \
> )
>
> # highlevel code
> diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c
> new file mode 100644
> index 000000000000..e170d7c13533
> --- /dev/null
> +++ b/fs/xfs/libxfs/xfs_zones.c
> @@ -0,0 +1,169 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2023-2024 Christoph Hellwig.
> + * Copyright (c) 2024, Western Digital Corporation or its affiliates.
> + */
> +#include "xfs.h"
> +#include "xfs_fs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_inode.h"
> +#include "xfs_rtgroup.h"
> +#include "xfs_zones.h"
> +
> +static int
> +xfs_zone_validate_empty(
> + struct blk_zone *zone,
> + struct xfs_rtgroup *rtg,
> + xfs_rgblock_t *write_pointer)
> +{
> + struct xfs_mount *mp = rtg_mount(rtg);
> +
> + if (rtg_rmap(rtg)->i_used_blocks > 0) {
> + xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).",
> + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
> + return -EIO;
Why do some of these validation failures return EIO vs. EFSCORRUPTED?
Is "EIO" used for "filesystem metadata out of sync with storage device"
whereas "EFSCORRUPTED" is used for "filesystem metadata inconsistent
with itself"?
Do the _validate_{empty,full} functions need to validate zone->wp is
zero/rtg_extents, respectively?
--D
> + }
> + *write_pointer = 0;
> + return 0;
> +}
> +
> +static int
> +xfs_zone_validate_wp(
> + struct blk_zone *zone,
> + struct xfs_rtgroup *rtg,
> + xfs_rgblock_t *write_pointer)
> +{
> + struct xfs_mount *mp = rtg_mount(rtg);
> + xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp);
> +
> + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
> + xfs_warn(mp, "zone %u has too large used counter (0x%x).",
> + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
> + return -EIO;
> + }
> +
> + if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) {
> + xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.",
> + rtg_rgno(rtg), wp_fsb);
> + return -EFSCORRUPTED;
> + }
> +
> + *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb);
> + if (*write_pointer >= rtg->rtg_extents) {
> + xfs_warn(mp, "zone %u has invalid write pointer (0x%x).",
> + rtg_rgno(rtg), *write_pointer);
> + return -EFSCORRUPTED;
> + }
> + return 0;
> +}
> +
> +static int
> +xfs_zone_validate_full(
> + struct blk_zone *zone,
> + struct xfs_rtgroup *rtg,
> + xfs_rgblock_t *write_pointer)
> +{
> + struct xfs_mount *mp = rtg_mount(rtg);
> +
> + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
> + xfs_warn(mp, "zone %u has too large used counter (0x%x).",
> + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
> + return -EIO;
> + }
> + *write_pointer = rtg->rtg_extents;
> +
> + return 0;
> +}
> +
> +static int
> +xfs_zone_validate_seq(
> + struct blk_zone *zone,
> + struct xfs_rtgroup *rtg,
> + xfs_rgblock_t *write_pointer)
> +{
> + struct xfs_mount *mp = rtg_mount(rtg);
> +
> + switch (zone->cond) {
> + case BLK_ZONE_COND_EMPTY:
> + return xfs_zone_validate_empty(zone, rtg, write_pointer);
> + case BLK_ZONE_COND_IMP_OPEN:
> + case BLK_ZONE_COND_EXP_OPEN:
> + case BLK_ZONE_COND_CLOSED:
> + return xfs_zone_validate_wp(zone, rtg, write_pointer);
> + case BLK_ZONE_COND_FULL:
> + return xfs_zone_validate_full(zone, rtg, write_pointer);
> + case BLK_ZONE_COND_NOT_WP:
> + case BLK_ZONE_COND_OFFLINE:
> + case BLK_ZONE_COND_READONLY:
> + xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.",
> + rtg_rgno(rtg), zone->cond);
> + return -EIO;
> + default:
> + xfs_warn(mp, "zone %u has unknown zone condition 0x%x.",
> + rtg_rgno(rtg), zone->cond);
> + return -EIO;
> + }
> +}
> +
> +static int
> +xfs_zone_validate_conv(
> + struct blk_zone *zone,
> + struct xfs_rtgroup *rtg)
> +{
> + struct xfs_mount *mp = rtg_mount(rtg);
> +
> + switch (zone->cond) {
> + case BLK_ZONE_COND_NOT_WP:
> + return 0;
> + default:
> + xfs_warn(mp,
> +"conventional zone %u has unsupported zone condition 0x%x.",
> + rtg_rgno(rtg), zone->cond);
> + return -EIO;
> + }
> +}
> +
> +int
> +xfs_zone_validate(
> + struct blk_zone *zone,
> + struct xfs_rtgroup *rtg,
> + xfs_rgblock_t *write_pointer)
> +{
> + struct xfs_mount *mp = rtg_mount(rtg);
> + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
> +
> + /*
> + * Check that the zone capacity matches the rtgroup size stored in the
> + * superblock. Note that all zones including the last one must have a
> + * uniform capacity.
> + */
> + if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) {
> + xfs_warn(mp,
> +"zone %u capacity (0x%llx) does not match RT group size (0x%x).",
> + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity),
> + g->blocks);
> + return -EIO;
> + }
> +
> + if (XFS_BB_TO_FSB(mp, zone->len) != 1 << g->blklog) {
> + xfs_warn(mp,
> +"zone %u length (0x%llx) does match geometry (0x%x).",
> + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
> + 1 << g->blklog);
> + }
> +
> + switch (zone->type) {
> + case BLK_ZONE_TYPE_CONVENTIONAL:
> + return xfs_zone_validate_conv(zone, rtg);
> + case BLK_ZONE_TYPE_SEQWRITE_REQ:
> + return xfs_zone_validate_seq(zone, rtg, write_pointer);
> + default:
> + xfs_warn(mp, "zoned %u has unsupported type 0x%x.",
> + rtg_rgno(rtg), zone->type);
> + return -EFSCORRUPTED;
> + }
> +}
> diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h
> new file mode 100644
> index 000000000000..4d3e53585654
> --- /dev/null
> +++ b/fs/xfs/libxfs/xfs_zones.h
> @@ -0,0 +1,33 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LIBXFS_ZONES_H
> +#define _LIBXFS_ZONES_H
> +
> +/*
> + * In order to guarantee forward progress for GC we need to reserve at least
> + * two zones: one that will be used for moving data into and one spare zone
> + * making sure that we have enough space to relocate a nearly-full zone.
> + * To allow for slightly sloppy accounting for when we need to reserve the
> + * second zone, we actually reserve three as that is easier than doing fully
> + * accurate bookkeeping.
> + */
> +#define XFS_GC_ZONES 3U
> +
> +/*
> + * In addition we need two zones for user writes, one open zone for writing
> + * and one to still have available blocks without resetting the open zone
> + * when data in the open zone has been freed.
> + */
> +#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1)
> +#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1)
> +
> +/*
> + * Always keep one zone out of the general open zone pool to allow for GC to
> + * happen while other writers are waiting for free space.
> + */
> +#define XFS_OPEN_GC_ZONES 1U
> +#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
> +
> +int xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
> + xfs_rgblock_t *write_pointer);
> +
> +#endif /* _LIBXFS_ZONES_H */
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 23/43] xfs: parse and validate hardware zone information
2024-12-13 17:31 ` Darrick J. Wong
@ 2024-12-15 5:24 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 5:24 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 09:31:32AM -0800, Darrick J. Wong wrote:
> > + xfs_rgblock_t *write_pointer)
> > +{
> > + struct xfs_mount *mp = rtg_mount(rtg);
> > +
> > + if (rtg_rmap(rtg)->i_used_blocks > 0) {
> > + xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).",
> > + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
> > + return -EIO;
>
> Why do some of these validation failures return EIO vs. EFSCORRUPTED?
> Is "EIO" used for "filesystem metadata out of sync with storage device"
> whereas "EFSCORRUPTED" is used for "filesystem metadata inconsistent
> with itself"?
If there was a rule I forgot about it :) This should be changed to
return the same error everywhere, and that should probably be
EFSCORRUPTED, or maybe the whole code should be changed to return a
bool.
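Something like this for the bool version, e.g. for the empty case (the
callers would then map a false return to -EFSCORRUPTED in one place):
	static bool
	xfs_zone_validate_empty(
		struct blk_zone		*zone,
		struct xfs_rtgroup	*rtg,
		xfs_rgblock_t		*write_pointer)
	{
		if (rtg_rmap(rtg)->i_used_blocks > 0) {
			xfs_warn(rtg_mount(rtg),
	"empty zone %u has non-zero used counter (0x%x).",
				rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
			return false;
		}
		*write_pointer = 0;
		return true;
	}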
> Do the _validate_{empty,full} functions need to validate zone->wp is
> zero/rtg_extents, respectively?
zone->wp is not defined for them in the hardware specs, so the only
thing we'd validate is what the block layer / drivers put into it.
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 24/43] xfs: add the zoned space allocator
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (22 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 23/43] xfs: parse and validate hardware zone information Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-13 18:33 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 25/43] xfs: add support for zoned space reservations Christoph Hellwig
` (18 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
For zoned RT devices space is always allocated at the write pointer, that
is, right after the last written block, and is only recorded on I/O
completion.
The actual allocation algorithm is very simple: it just involves picking
a good zone - preferably the one used for the last write to the inode.
As the number of zones that can be written at the same time is usually
limited by the hardware, selecting a zone is done as late as possible,
from the iomap dio and buffered writeback bio submission helpers, just
before submitting the bio.
Given that the writers already took a reservation before acquiring the
iolock, space will always be readily available if an open zone slot is
available. A new structure is used to track these open zones, and is
pointed to by the xfs_rtgroup. Because zoned file systems don't have
an rsum cache, the space for that pointer can be reused.
Allocations are only recorded at I/O completion time. The scheme used
for that is very similar to the reflink COW end I/O path.
Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/Makefile | 3 +-
fs/xfs/libxfs/xfs_rtgroup.h | 28 +-
fs/xfs/xfs_log.c | 4 +
fs/xfs/xfs_mount.c | 52 +-
fs/xfs/xfs_mount.h | 3 +
fs/xfs/xfs_rtalloc.c | 6 +-
fs/xfs/xfs_trace.c | 2 +
fs/xfs/xfs_trace.h | 96 ++++
fs/xfs/xfs_zone_alloc.c | 971 ++++++++++++++++++++++++++++++++++++
fs/xfs/xfs_zone_alloc.h | 36 ++
fs/xfs/xfs_zone_priv.h | 85 ++++
11 files changed, 1262 insertions(+), 24 deletions(-)
create mode 100644 fs/xfs/xfs_zone_alloc.c
create mode 100644 fs/xfs/xfs_zone_alloc.h
create mode 100644 fs/xfs/xfs_zone_priv.h
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index ea8e66c1e969..28bd2627e9ef 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -137,7 +137,8 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs_quotaops.o
# xfs_rtbitmap is shared with libxfs
-xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
+xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
+ xfs_zone_alloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index d4c15c706b17..85d8d329d417 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -37,15 +37,33 @@ struct xfs_rtgroup {
xfs_rtxnum_t rtg_extents;
/*
- * Cache of rt summary level per bitmap block with the invariant that
- * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0,
- * or 0 if rsum[i][bbno] == 0 for all i.
- *
+ * For bitmap based RT devices this points to a cache of rt summary
+ * level per bitmap block with the invariant that rtg_rsum_cache[bbno]
+ * > the maximum i for which rsum[i][bbno] != 0, or 0 if
+ * rsum[i][bbno] == 0 for all i.
* Reads and writes are serialized by the rsumip inode lock.
+ *
+ * For zoned RT devices this points to the open zone structure for
+ * a group that is open for writers, or is NULL.
*/
- uint8_t *rtg_rsum_cache;
+ union {
+ uint8_t *rtg_rsum_cache;
+ struct xfs_open_zone *rtg_open_zone;
+ };
};
+/*
+ * For zoned RT devices this is set on groups that have no written blocks
+ * and can be picked by the allocator for opening.
+ */
+#define XFS_RTG_FREE XA_MARK_0
+
+/*
+ * For zoned RT devices this is set on groups that are fully written and that
+ * have unused blocks. Used by the garbage collection to pick targets.
+ */
+#define XFS_RTG_RECLAIMABLE XA_MARK_1
+
static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
{
return container_of(xg, struct xfs_rtgroup, rtg_group);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 05daad8a8d34..a3c3ab0f3e15 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -20,6 +20,7 @@
#include "xfs_sysfs.h"
#include "xfs_sb.h"
#include "xfs_health.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_log_ticket_cache;
@@ -3542,6 +3543,9 @@ xlog_force_shutdown(
spin_unlock(&log->l_icloglock);
wake_up_var(&log->l_opstate);
+ if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp))
+ xfs_zoned_wake_all(log->l_mp);
+
return log_error;
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 72fa28263e14..70ecbbaba7fd 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -40,6 +40,7 @@
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
#include "scrub/stats.h"
+#include "xfs_zone_alloc.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@@ -469,22 +470,27 @@ xfs_default_resblks(
struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
- uint64_t resblks;
-
- if (ctr == XC_FREE_RTEXTENTS)
+ switch (ctr) {
+ case XC_FREE_BLOCKS:
+ /*
+ * Default to 5% or 8192 FSBs of space reserved, whichever is
+ * smaller.
+ *
+ * This is intended to cover concurrent allocation transactions
+ * when we initially hit ENOSPC. These each require a 4 block
+ * reservation. Hence by default we cover roughly 2000
+ * concurrent allocation reservations.
+ */
+ return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
+ case XC_FREE_RTEXTENTS:
+ case XC_FREE_RTAVAILABLE:
+ if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
+ return xfs_zoned_default_resblks(mp, ctr);
return 0;
-
- /*
- * We default to 5% or 8192 fsbs of space reserved, whichever is
- * smaller. This is intended to cover concurrent allocation
- * transactions when we initially hit enospc. These each require a 4
- * block reservation. Hence by default we cover roughly 2000 concurrent
- * allocation reservations.
- */
- resblks = mp->m_sb.sb_dblocks;
- do_div(resblks, 20);
- resblks = min_t(uint64_t, resblks, 8192);
- return resblks;
+ default:
+ ASSERT(0);
+ return 0;
+ }
}
/* Ensure the summary counts are correct. */
@@ -1042,6 +1048,12 @@ xfs_mountfs(
if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
xfs_log_clean(mp);
+ if (xfs_has_zoned(mp)) {
+ error = xfs_mount_zones(mp);
+ if (error)
+ goto out_rtunmount;
+ }
+
/*
* Complete the quota initialisation, post-log-replay component.
*/
@@ -1083,6 +1095,8 @@ xfs_mountfs(
out_agresv:
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
+ if (xfs_has_zoned(mp))
+ xfs_unmount_zones(mp);
out_rtunmount:
xfs_rtunmount_inodes(mp);
out_rele_rip:
@@ -1164,6 +1178,8 @@ xfs_unmountfs(
xfs_blockgc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
+ if (xfs_has_zoned(mp))
+ xfs_unmount_zones(mp);
xfs_rtunmount_inodes(mp);
xfs_irele(mp->m_rootip);
if (mp->m_metadirip)
@@ -1247,7 +1263,7 @@ xfs_freecounter_unavailable(
struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
- if (ctr == XC_FREE_RTEXTENTS)
+ if (ctr == XC_FREE_RTEXTENTS || ctr == XC_FREE_RTAVAILABLE)
return 0;
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
}
@@ -1345,7 +1361,9 @@ xfs_dec_freecounter(
spin_unlock(&mp->m_sb_lock);
return 0;
}
- xfs_warn_once(mp,
+
+ if (ctr == XC_FREE_BLOCKS)
+ xfs_warn_once(mp,
"Reserve blocks depleted! Consider increasing reserve pool size.");
fdblocks_enospc:
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 3d92678d2c3b..02a3609a3322 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -115,6 +115,7 @@ struct xfs_groups {
enum xfs_free_counter {
XC_FREE_BLOCKS, /* free block counter */
XC_FREE_RTEXTENTS, /* free rt extent counter */
+ XC_FREE_RTAVAILABLE, /* actually available (zoned) rt extents */
XC_FREE_NR,
};
@@ -211,6 +212,7 @@ typedef struct xfs_mount {
bool m_fail_unmount;
bool m_finobt_nores; /* no per-AG finobt resv. */
bool m_update_sb; /* sb needs update in mount */
+ unsigned int m_max_open_zones;
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
@@ -263,6 +265,7 @@ typedef struct xfs_mount {
uint64_t save; /* reserved blks @ remount,ro */
} m_resblks[XC_FREE_NR];
struct delayed_work m_reclaim_work; /* background inode reclaim */
+ struct xfs_zone_info *m_zone_info; /* zone allocator information */
struct dentry *m_debugfs; /* debugfs parent */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 7ef62e7a91c1..47c94ac74259 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -33,6 +33,7 @@
#include "xfs_trace.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_reflink.h"
+#include "xfs_zone_alloc.h"
/*
* Return whether there are any free extents in the size range given
@@ -663,7 +664,8 @@ xfs_rtunmount_rtg(
for (i = 0; i < XFS_RTGI_MAX; i++)
xfs_rtginode_irele(&rtg->rtg_inodes[i]);
- kvfree(rtg->rtg_rsum_cache);
+ if (!xfs_has_zoned(rtg_mount(rtg)))
+ kvfree(rtg->rtg_rsum_cache);
}
static int
@@ -1614,6 +1616,8 @@ xfs_rtmount_rtg(
}
}
+ if (xfs_has_zoned(mp))
+ return 0;
return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks);
}
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 8f530e69c18a..a60556dbd172 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -49,6 +49,8 @@
#include "xfs_metafile.h"
#include "xfs_metadir.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 15dec76fec10..763dd3d271b9 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -102,6 +102,7 @@ struct xfs_rmap_intent;
struct xfs_refcount_intent;
struct xfs_metadir_update;
struct xfs_rtgroup;
+struct xfs_open_zone;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -265,6 +266,100 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab);
DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
DEFINE_GROUP_REF_EVENT(xfs_group_rele);
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xfs_zone_class,
+ TP_PROTO(struct xfs_rtgroup *rtg),
+ TP_ARGS(rtg),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->used = rtg_rmap(rtg)->i_used_blocks;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used)
+);
+
+#define DEFINE_ZONE_EVENT(name) \
+DEFINE_EVENT(xfs_zone_class, name, \
+ TP_PROTO(struct xfs_rtgroup *rtg), \
+ TP_ARGS(rtg))
+DEFINE_ZONE_EVENT(xfs_zone_full);
+DEFINE_ZONE_EVENT(xfs_zone_activate);
+
+TRACE_EVENT(xfs_zone_free_blocks,
+ TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
+ xfs_extlen_t len),
+ TP_ARGS(rtg, rgbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ __field(xfs_rgblock_t, rgbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->used = rtg_rmap(rtg)->i_used_blocks;
+ __entry->rgbno = rgbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used,
+ __entry->rgbno,
+ __entry->len)
+);
+
+DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
+ TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno,
+ xfs_extlen_t len),
+ TP_ARGS(oz, rgbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ __field(xfs_rgblock_t, written)
+ __field(xfs_rgblock_t, write_pointer)
+ __field(xfs_rgblock_t, rgbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(oz->oz_rtg);
+ __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks;
+ __entry->written = oz->oz_written;
+ __entry->write_pointer = oz->oz_write_pointer;
+ __entry->rgbno = rgbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used,
+ __entry->written,
+ __entry->write_pointer,
+ __entry->rgbno,
+ __entry->len)
+);
+
+#define DEFINE_ZONE_ALLOC_EVENT(name) \
+DEFINE_EVENT(xfs_zone_alloc_class, name, \
+ TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \
+ xfs_extlen_t len), \
+ TP_ARGS(oz, rgbno, len))
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
+#endif /* CONFIG_XFS_RT */
+
TRACE_EVENT(xfs_inodegc_worker,
TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
TP_ARGS(mp, shrinker_hits),
@@ -3982,6 +4077,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
new file mode 100644
index 000000000000..1a746e9cfbf4
--- /dev/null
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -0,0 +1,971 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2024 Christoph Hellwig.
+ * Copyright (c) 2024, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_iomap.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_refcount.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+void
+xfs_open_zone_put(
+ struct xfs_open_zone *oz)
+{
+ if (atomic_dec_and_test(&oz->oz_ref)) {
+ xfs_rtgroup_rele(oz->oz_rtg);
+ kfree(oz);
+ }
+}
+
+static void
+xfs_zone_mark_reclaimable(
+ struct xfs_rtgroup *rtg)
+{
+ xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+}
+
+static void
+xfs_open_zone_mark_full(
+ struct xfs_open_zone *oz)
+{
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+
+ trace_xfs_zone_full(rtg);
+
+ WRITE_ONCE(rtg->rtg_open_zone, NULL);
+
+ /*
+ * GC zones are fully owned by the GC thread, don't free them here.
+ */
+ if (!oz->oz_is_gc) {
+ spin_lock(&zi->zi_zone_list_lock);
+ zi->zi_nr_open_zones--;
+ list_del_init(&oz->oz_entry);
+ spin_unlock(&zi->zi_zone_list_lock);
+
+ xfs_open_zone_put(oz);
+ }
+
+ wake_up_all(&zi->zi_zone_wait);
+ if (rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg))
+ xfs_zone_mark_reclaimable(rtg);
+}
+
+static int
+xfs_zone_record_blocks(
+ struct xfs_trans *tp,
+ xfs_fsblock_t fsbno,
+ xfs_filblks_t len,
+ bool used)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(mp, fsbno);
+ struct xfs_inode *rmapip;
+ struct xfs_open_zone *oz;
+ struct xfs_rtgroup *rtg;
+ int error = 0;
+
+ rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, fsbno));
+ if (WARN_ON_ONCE(!rtg))
+ return -EIO;
+ rmapip = rtg_rmap(rtg);
+
+ xfs_ilock(rmapip, XFS_ILOCK_EXCL);
+
+ /*
+ * There is a reference on the oz until all blocks have been written, and it
+ * is only dropped below with the rmapip ILOCK held. Thus we don't need
+ * to grab an extra reference here.
+ */
+ oz = READ_ONCE(rtg->rtg_open_zone);
+ if (WARN_ON_ONCE(!oz)) {
+ xfs_iunlock(rmapip, XFS_ILOCK_EXCL);
+ error = -EIO;
+ goto out_put;
+ }
+
+ trace_xfs_zone_record_blocks(oz, rgbno, len);
+ xfs_trans_ijoin(tp, rmapip, XFS_ILOCK_EXCL);
+ if (used) {
+ rmapip->i_used_blocks += len;
+ ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
+ } else {
+ xfs_add_frextents(mp, len);
+ }
+
+ oz->oz_written += len;
+ ASSERT(oz->oz_written <= oz->oz_write_pointer);
+ if (oz->oz_written == rtg_blocks(rtg))
+ xfs_open_zone_mark_full(oz);
+ xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
+out_put:
+ xfs_rtgroup_put(rtg);
+ return error;
+}
+
+static int
+xfs_zoned_end_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *new,
+ xfs_fsblock_t old_startblock)
+{
+ struct xfs_bmbt_irec data;
+ int nmaps = 1;
+ int error;
+
+ /* Grab the corresponding mapping in the data fork. */
+ error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
+ &nmaps, 0);
+ if (error)
+ return error;
+
+ /*
+ * Cap the update to the existing extent in the data fork because we can
+ * only overwrite one extent at a time.
+ */
+ ASSERT(new->br_blockcount >= data.br_blockcount);
+ new->br_blockcount = data.br_blockcount;
+
+ /*
+ * If a data write raced with this GC write, keep the existing data in
+ * the data fork, mark our newly written GC extent as reclaimable, then
+ * move on to the next extent.
+ */
+ if (old_startblock != NULLFSBLOCK &&
+ old_startblock != data.br_startblock)
+ goto skip;
+
+ trace_xfs_reflink_cow_remap_from(ip, new);
+ trace_xfs_reflink_cow_remap_to(ip, &data);
+
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error)
+ return error;
+
+ if (data.br_startblock != HOLESTARTBLOCK) {
+ ASSERT(data.br_startblock != DELAYSTARTBLOCK);
+ ASSERT(!isnullstartblock(data.br_startblock));
+
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
+ if (xfs_is_reflink_inode(ip)) {
+ xfs_refcount_decrease_extent(tp, true, &data);
+ } else {
+ error = xfs_free_extent_later(tp, data.br_startblock,
+ data.br_blockcount, NULL,
+ XFS_AG_RESV_NONE,
+ XFS_FREE_EXTENT_REALTIME);
+ if (error)
+ return error;
+ }
+ }
+
+ error = xfs_zone_record_blocks(tp, new->br_startblock,
+ new->br_blockcount, true);
+ if (error)
+ return error;
+
+ /* Map the new blocks into the data fork. */
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
+ return 0;
+
+skip:
+ trace_xfs_reflink_cow_remap_skip(ip, new);
+ return xfs_zone_record_blocks(tp, new->br_startblock,
+ new->br_blockcount, false);
+}
+
+int
+xfs_zoned_end_io(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count,
+ xfs_daddr_t daddr,
+ xfs_fsblock_t old_startblock)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
+ struct xfs_bmbt_irec new = {
+ .br_startoff = XFS_B_TO_FSBT(mp, offset),
+ .br_startblock = xfs_daddr_to_rtb(mp, daddr),
+ .br_state = XFS_EXT_NORM,
+ };
+ unsigned int resblks =
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ struct xfs_trans *tp;
+ int error;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ while (new.br_startoff < end_fsb) {
+ new.br_blockcount = end_fsb - new.br_startoff;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
+ if (error)
+ return error;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_zoned_end_extent(tp, ip, &new, old_startblock);
+ if (error)
+ xfs_trans_cancel(tp);
+ else
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ new.br_startoff += new.br_blockcount;
+ new.br_startblock += new.br_blockcount;
+ if (old_startblock != NULLFSBLOCK)
+ old_startblock += new.br_blockcount;
+ }
+
+ return 0;
+}
+
+/*
+ * "Free" blocks allocated in a zone.
+ *
+ * Just decrement the used blocks counter and report the space as freed.
+ */
+int
+xfs_zone_free_blocks(
+ struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg,
+ xfs_fsblock_t fsbno,
+ xfs_filblks_t len)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *rmapip = rtg_rmap(rtg);
+
+ xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL);
+
+ if (len > rmapip->i_used_blocks) {
+ xfs_err(mp,
+"trying to free more blocks (%lld) than used counter (%u).",
+ len, rmapip->i_used_blocks);
+ ASSERT(len <= rmapip->i_used_blocks);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return -EFSCORRUPTED;
+ }
+
+ trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);
+
+ rmapip->i_used_blocks -= len;
+ if (!READ_ONCE(rtg->rtg_open_zone)) {
+ /*
+ * If the zone is not open, mark it reclaimable when the first
+ * block is freed.
+ */
+ if (rmapip->i_used_blocks + len == rtg_blocks(rtg))
+ xfs_zone_mark_reclaimable(rtg);
+ }
+ xfs_add_frextents(mp, len);
+ xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
+ return 0;
+}
+
+/*
+ * Check if the zone containing the data just before the offset we are
+ * writing to is still open and has space.
+ */
+static struct xfs_open_zone *
+xfs_last_used_zone(
+ struct iomap_ioend *ioend)
+{
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
+ struct xfs_rtgroup *rtg = NULL;
+ struct xfs_open_zone *oz = NULL;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
+ &icur, &got)) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return NULL;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
+ if (!rtg)
+ return NULL;
+
+ xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
+ oz = READ_ONCE(rtg->rtg_open_zone);
+ if (oz && !atomic_inc_not_zero(&oz->oz_ref))
+ oz = NULL;
+ xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
+
+ xfs_rtgroup_rele(rtg);
+ return oz;
+}
+
+static struct xfs_group *
+xfs_find_free_zone(
+ struct xfs_mount *mp,
+ unsigned long start,
+ unsigned long end)
+{
+ XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, start);
+ struct xfs_group *xg;
+
+ xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE)
+ if (atomic_inc_not_zero(&xg->xg_active_ref))
+ return xg;
+
+ return NULL;
+}
+
+static struct xfs_open_zone *
+xfs_init_open_zone(
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t write_pointer,
+ bool is_gc)
+{
+ struct xfs_open_zone *oz;
+
+ oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL);
+ spin_lock_init(&oz->oz_alloc_lock);
+ atomic_set(&oz->oz_ref, 1);
+ oz->oz_rtg = rtg;
+ oz->oz_write_pointer = write_pointer;
+ oz->oz_written = write_pointer;
+ oz->oz_is_gc = is_gc;
+
+ /*
+ * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap
+ * inode, but we don't really want to take that here because we are
+ * under the zone_list_lock. Ensure the pointer is only set for a fully
+ * initialized open zone structure so that a racy lookup finding it is
+ * fine.
+ */
+ WRITE_ONCE(rtg->rtg_open_zone, oz);
+ return oz;
+}
+
+struct xfs_open_zone *
+xfs_open_zone(
+ struct xfs_mount *mp,
+ bool is_gc)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_group *xg;
+
+ lockdep_assert_held(&zi->zi_zone_list_lock);
+
+ xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX);
+ if (!xg)
+ xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor);
+ if (!xg)
+ return NULL;
+ xfs_group_clear_mark(xg, XFS_RTG_FREE);
+ atomic_dec(&zi->zi_nr_free_zones);
+ zi->zi_free_zone_cursor = xg->xg_gno;
+ return xfs_init_open_zone(to_rtg(xg), 0, is_gc);
+}
+
+/*
+ * Activate a free zone.
+ *
+ * This just does the accounting and allows the zone to be found on the open
+ * zones list. Don't bother with an explicit open command, we'll just open it
+ * implicitly with the first write to it.
+ */
+static struct xfs_open_zone *
+xfs_activate_zone(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *oz;
+
+ if (atomic_read(&zi->zi_nr_free_zones) <
+ XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
+ return NULL;
+
+ oz = xfs_open_zone(mp, false);
+ if (!oz)
+ return NULL;
+
+ atomic_inc(&oz->oz_ref);
+ zi->zi_nr_open_zones++;
+ list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
+
+ /* XXX: this is a little verbose, but let's keep it for now */
+ xfs_info(mp, "using zone %u (%u)",
+ rtg_rgno(oz->oz_rtg), zi->zi_nr_open_zones);
+ trace_xfs_zone_activate(oz->oz_rtg);
+ return oz;
+}
+
+static bool
+xfs_try_use_zone(
+ struct xfs_zone_info *zi,
+ struct xfs_open_zone *oz)
+{
+ if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
+ return false;
+ if (!atomic_inc_not_zero(&oz->oz_ref))
+ return false;
+
+ /*
+ * If we couldn't match by inode or life time we just pick the first
+ * zone with enough space above. For that we want the least busy zone
+ * for some definition of "least" busy. For now this simple LRU
+ * algorithm that rotates every zone to the end of the list will do it,
+ * even if it isn't exactly cache friendly.
+ */
+ if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones))
+ list_move_tail(&oz->oz_entry, &zi->zi_open_zones);
+ return true;
+}
+
+static struct xfs_open_zone *
+xfs_select_open_zone_lru(
+ struct xfs_zone_info *zi)
+{
+ struct xfs_open_zone *oz;
+
+ list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
+ if (xfs_try_use_zone(zi, oz))
+ return oz;
+ return NULL;
+}
+
+static struct xfs_open_zone *
+xfs_select_open_zone_mru(
+ struct xfs_zone_info *zi)
+{
+ struct xfs_open_zone *oz;
+
+ list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
+ if (xfs_try_use_zone(zi, oz))
+ return oz;
+ return NULL;
+}
+
+/*
+ * Try to tightly pack inodes that are written back after they were closed,
+ * instead of trying to open new zones for them or spreading them to the
+ * least recently used zone. This optimizes the data layout for workloads
+ * that untar or copy
+ * a lot of small files. Right now this does not separate multiple such
+ * streams.
+ */
+static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
+{
+ return !inode_is_open_for_write(VFS_I(ip)) &&
+ !(ip->i_diflags & XFS_DIFLAG_APPEND);
+}
+
+/*
+ * Pick a new zone for writes.
+ *
+ * If we aren't using up our budget of open zones just open a new one from
+ * the freelist. Else try to find one that matches the expected allocation
+ * length, or at least the minimum required length. If we don't find one
+ * that is good enough we pick one anyway and let the caller finish it to
+ * free up open zone resources.
+ */
+static struct xfs_open_zone *
+xfs_select_zone_nowait(
+ struct xfs_inode *ip,
+ xfs_filblks_t count_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *oz = NULL;
+
+ if (xfs_zoned_pack_tight(ip))
+ oz = xfs_select_open_zone_mru(zi);
+ if (oz)
+ return oz;
+
+ /*
+ * If we are below the open limit try to activate a zone.
+ */
+ if (zi->zi_nr_open_zones < mp->m_max_open_zones - XFS_OPEN_GC_ZONES) {
+ oz = xfs_activate_zone(mp);
+ if (oz)
+ return oz;
+ }
+
+ return xfs_select_open_zone_lru(zi);
+}
+
+static struct xfs_open_zone *
+xfs_select_zone(
+ struct iomap_ioend *ioend)
+{
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ xfs_filblks_t count_fsb = XFS_B_TO_FSB(mp, ioend->io_size);
+ struct xfs_open_zone *oz = NULL;
+ DEFINE_WAIT(wait);
+
+ spin_lock(&zi->zi_zone_list_lock);
+ if (xfs_is_shutdown(mp))
+ goto out_unlock;
+
+ oz = xfs_select_zone_nowait(ip, count_fsb);
+ if (oz)
+ goto out_unlock;
+
+ for (;;) {
+ prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_is_shutdown(mp))
+ break;
+
+ oz = xfs_select_zone_nowait(ip, count_fsb);
+ if (oz)
+ break;
+
+ spin_unlock(&zi->zi_zone_list_lock);
+ schedule();
+ spin_lock(&zi->zi_zone_list_lock);
+ }
+ finish_wait(&zi->zi_zone_wait, &wait);
+
+out_unlock:
+ spin_unlock(&zi->zi_zone_list_lock);
+ return oz;
+}
+
+static unsigned int
+xfs_zone_alloc_blocks(
+ struct iomap_ioend *ioend,
+ struct xfs_open_zone *oz,
+ bool *is_seq)
+{
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_filblks_t count_fsb = XFS_B_TO_FSB(mp, ioend->io_size);
+ xfs_rgblock_t rgbno;
+
+ spin_lock(&oz->oz_alloc_lock);
+ count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
+ (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer);
+ if (!count_fsb) {
+ spin_unlock(&oz->oz_alloc_lock);
+ return 0;
+ }
+ rgbno = oz->oz_write_pointer;
+ oz->oz_write_pointer += count_fsb;
+ spin_unlock(&oz->oz_alloc_lock);
+
+ trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb);
+
+ ioend->io_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
+ *is_seq = bdev_zone_is_seq(ioend->io_bio.bi_bdev, ioend->io_sector);
+ if (!*is_seq)
+ ioend->io_sector += XFS_FSB_TO_BB(mp, rgbno);
+ return XFS_FSB_TO_B(mp, count_fsb);
+}
+
+void
+xfs_mark_rtg_boundary(
+ struct iomap_ioend *ioend)
+{
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+ sector_t sector = ioend->io_bio.bi_iter.bi_sector;
+
+ if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
+ ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
+}
+
+static void
+xfs_submit_zoned_bio(
+ struct iomap_ioend *ioend,
+ bool is_seq)
+{
+ ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
+
+ if (is_seq) {
+ ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
+ ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ } else {
+ xfs_mark_rtg_boundary(ioend);
+ }
+
+ submit_bio(&ioend->io_bio);
+}
+
+void
+xfs_zone_alloc_and_submit(
+ struct iomap_ioend *ioend,
+ struct xfs_open_zone **oz)
+{
+ unsigned int alloc_len;
+ struct iomap_ioend *split;
+ bool is_seq;
+
+ if (xfs_is_shutdown(XFS_I(ioend->io_inode)->i_mount))
+ goto out_error;
+
+ /*
+ * If we don't have a cached zone in this write context, see if the
+ * last extent before the one we are writing points to an active zone.
+ * If so, just continue writing to it.
+ */
+ if (!*oz && ioend->io_offset)
+ *oz = xfs_last_used_zone(ioend);
+ if (!*oz) {
+select_zone:
+ *oz = xfs_select_zone(ioend);
+ if (!*oz)
+ goto out_error;
+ }
+
+ alloc_len = xfs_zone_alloc_blocks(ioend, *oz, &is_seq);
+ if (!alloc_len) {
+ xfs_open_zone_put(*oz);
+ goto select_zone;
+ }
+
+ while ((split = iomap_split_ioend(ioend, is_seq, &alloc_len))) {
+ xfs_submit_zoned_bio(split, is_seq);
+ if (!alloc_len) {
+ xfs_open_zone_put(*oz);
+ goto select_zone;
+ }
+ }
+
+ xfs_submit_zoned_bio(ioend, is_seq);
+ return;
+
+out_error:
+ bio_io_error(&ioend->io_bio);
+}
+
+void
+xfs_zoned_wake_all(
+ struct xfs_mount *mp)
+{
+ if (!(mp->m_super->s_flags & SB_ACTIVE))
+ return; /* can happen during log recovery */
+ spin_lock(&mp->m_zone_info->zi_zone_list_lock);
+ wake_up_all(&mp->m_zone_info->zi_zone_wait);
+ spin_unlock(&mp->m_zone_info->zi_zone_list_lock);
+}
+
+/*
+ * Check if @rgbno in @rtg is a potentially valid block. It might still be
+ * unused, but that information is only found in the rmap.
+ */
+bool
+xfs_zone_rgbno_is_valid(
+ struct xfs_rtgroup *rtg,
+ xfs_rgnumber_t rgbno)
+{
+ lockdep_assert_held(&rtg_rmap(rtg)->i_lock);
+
+ if (rtg->rtg_open_zone)
+ return rgbno < rtg->rtg_open_zone->oz_write_pointer;
+ return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
+ rtg_rgno(rtg), XFS_RTG_FREE);
+}
+
+static void
+xfs_free_open_zones(
+ struct xfs_zone_info *zi)
+{
+ struct xfs_open_zone *oz;
+
+ spin_lock(&zi->zi_zone_list_lock);
+ while ((oz = list_first_entry_or_null(&zi->zi_open_zones,
+ struct xfs_open_zone, oz_entry))) {
+ list_del(&oz->oz_entry);
+ xfs_open_zone_put(oz);
+ }
+ spin_unlock(&zi->zi_zone_list_lock);
+}
+
+struct xfs_init_zones {
+ struct xfs_mount *mp;
+ uint64_t available;
+ uint64_t reclaimable;
+};
+
+static int
+xfs_init_zone(
+ struct xfs_init_zones *iz,
+ struct xfs_rtgroup *rtg,
+ struct blk_zone *zone)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ uint64_t used = rtg_rmap(rtg)->i_used_blocks;
+ xfs_rgblock_t write_pointer;
+ int error;
+
+ if (zone) {
+ error = xfs_zone_validate(zone, rtg, &write_pointer);
+ if (error)
+ return error;
+ }
+
+ /*
+ * For sequential write required zones we retrieved the hardware write
+ * pointer above.
+ *
+ * For conventional zones or conventional devices we don't have that
+ * luxury. Instead query the rmap to find the highest recorded block
+ * and set the write pointer to the block after that. In case of a
+ * power loss this misses blocks where the data I/O had completed but
+ * was not yet recorded in the rmap, and it also rewrites blocks if the
+ * most recently written ones were deleted again before unmount, but this is
+ * the best we can do without hardware support.
+ */
+ if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) {
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ write_pointer = xfs_rtrmap_first_unwritten_rgbno(rtg);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+ }
+
+ if (write_pointer == 0) {
+ /* zone is empty */
+ atomic_inc(&zi->zi_nr_free_zones);
+ xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
+ iz->available += rtg_blocks(rtg);
+ } else if (write_pointer < rtg_blocks(rtg)) {
+ /* zone is open */
+ struct xfs_open_zone *oz;
+
+ atomic_inc(&rtg_group(rtg)->xg_active_ref);
+ oz = xfs_init_open_zone(rtg, write_pointer, false);
+ list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
+ zi->zi_nr_open_zones++;
+
+ iz->available += (rtg_blocks(rtg) - write_pointer);
+ iz->reclaimable += write_pointer - used;
+ } else if (used < rtg_blocks(rtg)) {
+ /* zone fully written, but has freed blocks */
+ xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+ iz->reclaimable += (rtg_blocks(rtg) - used);
+ }
+
+ return 0;
+}
+
+static int
+xfs_get_zone_info_cb(
+ struct blk_zone *zone,
+ unsigned int idx,
+ void *data)
+{
+ struct xfs_init_zones *iz = data;
+ struct xfs_mount *mp = iz->mp;
+ xfs_fsblock_t zsbno = xfs_daddr_to_rtb(mp, zone->start);
+ xfs_rgnumber_t rgno;
+ struct xfs_rtgroup *rtg;
+ int error;
+
+ if (xfs_rtb_to_rgbno(mp, zsbno) != 0) {
+ xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno);
+ return -EFSCORRUPTED;
+ }
+
+ rgno = xfs_rtb_to_rgno(mp, zsbno);
+ rtg = xfs_rtgroup_grab(mp, rgno);
+ if (!rtg) {
+ xfs_warn(mp, "realtime group not found for zone %u.", rgno);
+ return -EFSCORRUPTED;
+ }
+ error = xfs_init_zone(iz, rtg, zone);
+ xfs_rtgroup_rele(rtg);
+ return error;
+}
+
+/*
+ * Calculate the max open zone limit based on the number of backing
+ * zones available.
+ */
+static inline uint32_t
+xfs_max_open_zones(
+ struct xfs_mount *mp)
+{
+ unsigned int max_open, max_open_data_zones;
+ /*
+ * We need two zones for every open data zone: the open one plus
+ * one in reserve, as we don't reclaim open zones. One data zone
+ * and its spare are included in XFS_MIN_ZONES.
+ */
+ max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
+ max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
+
+ /*
+ * Cap the max open limit to 1/4 of the total zone count.
+ */
+ max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
+
+ return max(XFS_MIN_OPEN_ZONES, max_open);
+}
+
+/*
+ * Normally we use the open zone limit that the device reports. If there is
+ * none let the user pick one from the command line.
+ *
+ * If the device doesn't report an open zone limit and there is no override,
+ * allow holding about a quarter of the zones open. In theory we could allow
+ * all to be open, but at that point we run into GC deadlocks because we can't
+ * reclaim open zones.
+ *
+ * When used on conventional SSDs a lower open limit is advisable as we'll
+ * otherwise overwhelm the FTL just as much as a conventional block allocator.
+ *
+ * Note: To debug the open zone management code, force max_open to 1 here.
+ */
+static int
+xfs_calc_open_zones(
+ struct xfs_mount *mp)
+{
+ struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
+ unsigned int bdev_open_zones = bdev_max_open_zones(bdev);
+
+ if (!mp->m_max_open_zones) {
+ if (bdev_open_zones)
+ mp->m_max_open_zones = bdev_open_zones;
+ else
+ mp->m_max_open_zones = xfs_max_open_zones(mp);
+ }
+
+ if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
+ xfs_notice(mp, "need at least %u open zones.",
+ XFS_MIN_OPEN_ZONES);
+ return -EIO;
+ }
+
+ if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
+ mp->m_max_open_zones = bdev_open_zones;
+ xfs_info(mp, "limiting open zones to %u due to hardware limit.\n",
+ bdev_open_zones);
+ }
+
+ if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
+ mp->m_max_open_zones = xfs_max_open_zones(mp);
+ xfs_info(mp,
+"limiting open zones to %u due to total zone count (%u)",
+ mp->m_max_open_zones, mp->m_sb.sb_rgcount);
+ }
+
+ return 0;
+}
+
+static struct xfs_zone_info *
+xfs_alloc_zone_info(void)
+{
+ struct xfs_zone_info *zi;
+
+ zi = kzalloc(sizeof(*zi), GFP_KERNEL);
+ if (!zi)
+ return NULL;
+ INIT_LIST_HEAD(&zi->zi_open_zones);
+ INIT_LIST_HEAD(&zi->zi_reclaim_reservations);
+ spin_lock_init(&zi->zi_reset_list_lock);
+ spin_lock_init(&zi->zi_zone_list_lock);
+ spin_lock_init(&zi->zi_reservation_lock);
+ init_waitqueue_head(&zi->zi_zone_wait);
+ return zi;
+}
+
+int
+xfs_mount_zones(
+ struct xfs_mount *mp)
+{
+ struct xfs_init_zones iz = {
+ .mp = mp,
+ };
+ struct xfs_buftarg *bt = mp->m_rtdev_targp;
+ int error;
+
+ if (!bt) {
+ xfs_notice(mp, "RT device missing.");
+ return -EINVAL;
+ }
+
+ if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
+ xfs_notice(mp, "invalid flag combination.");
+ return -EFSCORRUPTED;
+ }
+ if (mp->m_sb.sb_rextsize != 1) {
+ xfs_notice(mp, "zoned file systems do not support rextsize.");
+ return -EFSCORRUPTED;
+ }
+ if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
+ xfs_notice(mp,
+"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_calc_open_zones(mp);
+ if (error)
+ return error;
+
+ mp->m_zone_info = xfs_alloc_zone_info();
+ if (!mp->m_zone_info)
+ return -ENOMEM;
+
+ xfs_info(mp, "%u zones of %u blocks size (%u max open)",
+ mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
+ mp->m_max_open_zones);
+
+ if (bdev_is_zoned(bt->bt_bdev)) {
+ error = blkdev_report_zones(bt->bt_bdev,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
+ mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz);
+ if (error < 0)
+ goto out_free_open_zones;
+ } else {
+ struct xfs_rtgroup *rtg = NULL;
+
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ error = xfs_init_zone(&iz, rtg, NULL);
+ if (error)
+ goto out_free_open_zones;
+ }
+ }
+
+ xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+ iz.available + iz.reclaimable);
+
+ return 0;
+
+out_free_open_zones:
+ xfs_free_open_zones(mp->m_zone_info);
+ kfree(mp->m_zone_info);
+ return error;
+}
+
+void
+xfs_unmount_zones(
+ struct xfs_mount *mp)
+{
+ xfs_free_open_zones(mp->m_zone_info);
+ kfree(mp->m_zone_info);
+}
diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
new file mode 100644
index 000000000000..37a49f4ce40c
--- /dev/null
+++ b/fs/xfs/xfs_zone_alloc.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_ZONE_ALLOC_H
+#define _XFS_ZONE_ALLOC_H
+
+struct iomap_ioend;
+struct xfs_open_zone;
+
+void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
+ struct xfs_open_zone **oz);
+int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+ xfs_fsblock_t fsbno, xfs_filblks_t len);
+int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count,
+ xfs_daddr_t daddr, xfs_fsblock_t old_startblock);
+void xfs_open_zone_put(struct xfs_open_zone *oz);
+
+void xfs_zoned_wake_all(struct xfs_mount *mp);
+bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno);
+void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
+
+uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
+ enum xfs_free_counter ctr);
+
+#ifdef CONFIG_XFS_RT
+int xfs_mount_zones(struct xfs_mount *mp);
+void xfs_unmount_zones(struct xfs_mount *mp);
+#else
+static inline int xfs_mount_zones(struct xfs_mount *mp)
+{
+ return -EIO;
+}
+static inline void xfs_unmount_zones(struct xfs_mount *mp)
+{
+}
+#endif /* CONFIG_XFS_RT */
+
+#endif /* _XFS_ZONE_ALLOC_H */
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
new file mode 100644
index 000000000000..ae1556871596
--- /dev/null
+++ b/fs/xfs/xfs_zone_priv.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_ZONE_PRIV_H
+#define _XFS_ZONE_PRIV_H
+
+struct xfs_open_zone {
+ /*
+ * Entry in the open zone list and refcount. Protected by
+ * zi_zone_list_lock in struct xfs_zone_info.
+ */
+ struct list_head oz_entry;
+ atomic_t oz_ref;
+
+ /*
+ * Protects oz_write_pointer and oz_written.
+ */
+ spinlock_t oz_alloc_lock;
+
+ /*
+ * oz_write_pointer is the write pointer at which space is handed out
+ * for conventional zones, or simply the count of blocks handed out
+ * so far for sequential write required zones.
+ *
+ * oz_written is the number of blocks for which we've received a
+ * write completion. oz_written must always be <= oz_write_pointer.
+ */
+ xfs_rgblock_t oz_write_pointer;
+ xfs_rgblock_t oz_written;
+
+ /*
+ * Is this open zone used for garbage collection? There can only be a
+ * single open GC zone, which is pointed to by zi_open_gc_zone in
+ * struct xfs_zone_info. Constant over the lifetime of an open zone.
+ */
+ bool oz_is_gc;
+
+ /*
+ * Pointer to the RT group structure for this open zone. Constant over
+ * the lifetime of an open zone.
+ */
+ struct xfs_rtgroup *oz_rtg;
+};
+
+struct xfs_zone_info {
+ /*
+ * List of pending space reservations:
+ */
+ spinlock_t zi_reservation_lock;
+ struct list_head zi_reclaim_reservations;
+
+ /*
+ * Lock for open and free zone information, and wait queue to wait for
+ * free zones or open zone resources to become available:
+ */
+ spinlock_t zi_zone_list_lock;
+ wait_queue_head_t zi_zone_wait;
+
+ /*
+ * List and number of open zones:
+ */
+ struct list_head zi_open_zones;
+ unsigned int zi_nr_open_zones;
+
+ /*
+ * Free zone search cursor and number of free zones:
+ */
+ unsigned long zi_free_zone_cursor;
+ atomic_t zi_nr_free_zones;
+
+ /*
+ * Pointer to the GC thread, and the current open zone used by GC
+ * (if any).
+ */
+ struct task_struct *zi_gc_thread;
+ struct xfs_open_zone *zi_open_gc_zone;
+
+ /*
+ * List of zones that need a reset:
+ */
+ spinlock_t zi_reset_list_lock;
+ struct xfs_group *zi_reset_list;
+};
+
+struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
+
+#endif /* _XFS_ZONE_PRIV_H */
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 24/43] xfs: add the zoned space allocator
2024-12-11 8:54 ` [PATCH 24/43] xfs: add the zoned space allocator Christoph Hellwig
@ 2024-12-13 18:33 ` Darrick J. Wong
2024-12-15 5:27 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 18:33 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:49AM +0100, Christoph Hellwig wrote:
> For zoned RT devices space is always allocated at the write pointer, that
> is, right after the last written block, and is only recorded on I/O
> completion.
>
> The actual allocation algorithm is very simple: it just involves picking
> a good zone - preferably the one used for the last write to the inode.
> As the number of zones that can be written at the same time is usually
> limited by the hardware, selecting a zone is done as late as possible,
> from the iomap dio and buffered writeback bio submission helpers, just
> before submitting the bio.
>
> Given that the writers already took a reservation before acquiring the
> iolock, space will always be readily available if an open zone slot is
> available. A new structure is used to track these open zones, and
> pointed to by the xfs_rtgroup. Because zoned file systems don't have
> a rsum cache the space for that pointer can be reused.
>
> Allocations are only recorded at I/O completion time. The scheme used
> for that is very similar to the reflink COW end I/O path.
>
> Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/Makefile | 3 +-
> fs/xfs/libxfs/xfs_rtgroup.h | 28 +-
> fs/xfs/xfs_log.c | 4 +
> fs/xfs/xfs_mount.c | 52 +-
> fs/xfs/xfs_mount.h | 3 +
> fs/xfs/xfs_rtalloc.c | 6 +-
> fs/xfs/xfs_trace.c | 2 +
> fs/xfs/xfs_trace.h | 96 ++++
> fs/xfs/xfs_zone_alloc.c | 971 ++++++++++++++++++++++++++++++++++++
> fs/xfs/xfs_zone_alloc.h | 36 ++
> fs/xfs/xfs_zone_priv.h | 85 ++++
> 11 files changed, 1262 insertions(+), 24 deletions(-)
> create mode 100644 fs/xfs/xfs_zone_alloc.c
> create mode 100644 fs/xfs/xfs_zone_alloc.h
> create mode 100644 fs/xfs/xfs_zone_priv.h
>
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index ea8e66c1e969..28bd2627e9ef 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -137,7 +137,8 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
> xfs_quotaops.o
>
> # xfs_rtbitmap is shared with libxfs
> -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
> +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
> + xfs_zone_alloc.o
>
> xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
> xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
> diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
> index d4c15c706b17..85d8d329d417 100644
> --- a/fs/xfs/libxfs/xfs_rtgroup.h
> +++ b/fs/xfs/libxfs/xfs_rtgroup.h
> @@ -37,15 +37,33 @@ struct xfs_rtgroup {
> xfs_rtxnum_t rtg_extents;
>
> /*
> - * Cache of rt summary level per bitmap block with the invariant that
> - * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0,
> - * or 0 if rsum[i][bbno] == 0 for all i.
> - *
> + * For bitmap based RT devices this points to a cache of rt summary
> + * level per bitmap block with the invariant that rtg_rsum_cache[bbno]
> + * > the maximum i for which rsum[i][bbno] != 0, or 0 if
> + * rsum[i][bbno] == 0 for all i.
> * Reads and writes are serialized by the rsumip inode lock.
> + *
> + * For zoned RT devices this points to the open zone structure for
> + * a group that is open for writers, or is NULL.
> */
> - uint8_t *rtg_rsum_cache;
> + union {
> + uint8_t *rtg_rsum_cache;
> + struct xfs_open_zone *rtg_open_zone;
> + };
> };
>
> +/*
> + * For zoned RT devices this is set on groups that have no written blocks
> + * and can be picked by the allocator for opening.
> + */
> +#define XFS_RTG_FREE XA_MARK_0
> +
> +/*
> + * For zoned RT devices this is set on groups that are fully written and that
> + * have unused blocks. Used by the garbage collection to pick targets.
> + */
> +#define XFS_RTG_RECLAIMABLE XA_MARK_1
> +
> static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
> {
> return container_of(xg, struct xfs_rtgroup, rtg_group);
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index 05daad8a8d34..a3c3ab0f3e15 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -20,6 +20,7 @@
> #include "xfs_sysfs.h"
> #include "xfs_sb.h"
> #include "xfs_health.h"
> +#include "xfs_zone_alloc.h"
>
> struct kmem_cache *xfs_log_ticket_cache;
>
> @@ -3542,6 +3543,9 @@ xlog_force_shutdown(
> spin_unlock(&log->l_icloglock);
>
> wake_up_var(&log->l_opstate);
> + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp))
> + xfs_zoned_wake_all(log->l_mp);
> +
> return log_error;
> }
>
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 72fa28263e14..70ecbbaba7fd 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -40,6 +40,7 @@
> #include "xfs_rtrmap_btree.h"
> #include "xfs_rtrefcount_btree.h"
> #include "scrub/stats.h"
> +#include "xfs_zone_alloc.h"
>
> static DEFINE_MUTEX(xfs_uuid_table_mutex);
> static int xfs_uuid_table_size;
> @@ -469,22 +470,27 @@ xfs_default_resblks(
> struct xfs_mount *mp,
> enum xfs_free_counter ctr)
> {
> - uint64_t resblks;
> -
> - if (ctr == XC_FREE_RTEXTENTS)
> + switch (ctr) {
> + case XC_FREE_BLOCKS:
> + /*
> + * Default to 5% or 8192 FSBs of space reserved, whichever is
> + * smaller.
> + *
> + * This is intended to cover concurrent allocation transactions
> + * when we initially hit ENOSPC. These each require a 4 block
> + * reservation. Hence by default we cover roughly 2000
> + * concurrent allocation reservations.
> + */
> + return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
> + case XC_FREE_RTEXTENTS:
> + case XC_FREE_RTAVAILABLE:
> + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
> + return xfs_zoned_default_resblks(mp, ctr);
> return 0;
> -
> - /*
> - * We default to 5% or 8192 fsbs of space reserved, whichever is
> - * smaller. This is intended to cover concurrent allocation
> - * transactions when we initially hit enospc. These each require a 4
> - * block reservation. Hence by default we cover roughly 2000 concurrent
> - * allocation reservations.
> - */
> - resblks = mp->m_sb.sb_dblocks;
> - do_div(resblks, 20);
> - resblks = min_t(uint64_t, resblks, 8192);
> - return resblks;
> + default:
> + ASSERT(0);
> + return 0;
> + }
> }
>
> /* Ensure the summary counts are correct. */
> @@ -1042,6 +1048,12 @@ xfs_mountfs(
> if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
> xfs_log_clean(mp);
>
> + if (xfs_has_zoned(mp)) {
> + error = xfs_mount_zones(mp);
> + if (error)
> + goto out_rtunmount;
> + }
> +
> /*
> * Complete the quota initialisation, post-log-replay component.
> */
> @@ -1083,6 +1095,8 @@ xfs_mountfs(
> out_agresv:
> xfs_fs_unreserve_ag_blocks(mp);
> xfs_qm_unmount_quotas(mp);
> + if (xfs_has_zoned(mp))
> + xfs_unmount_zones(mp);
> out_rtunmount:
> xfs_rtunmount_inodes(mp);
> out_rele_rip:
> @@ -1164,6 +1178,8 @@ xfs_unmountfs(
> xfs_blockgc_stop(mp);
> xfs_fs_unreserve_ag_blocks(mp);
> xfs_qm_unmount_quotas(mp);
> + if (xfs_has_zoned(mp))
> + xfs_unmount_zones(mp);
> xfs_rtunmount_inodes(mp);
> xfs_irele(mp->m_rootip);
> if (mp->m_metadirip)
> @@ -1247,7 +1263,7 @@ xfs_freecounter_unavailable(
> struct xfs_mount *mp,
> enum xfs_free_counter ctr)
> {
> - if (ctr == XC_FREE_RTEXTENTS)
> + if (ctr == XC_FREE_RTEXTENTS || ctr == XC_FREE_RTAVAILABLE)
> return 0;
> return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
> }
> @@ -1345,7 +1361,9 @@ xfs_dec_freecounter(
> spin_unlock(&mp->m_sb_lock);
> return 0;
> }
> - xfs_warn_once(mp,
> +
> + if (ctr == XC_FREE_BLOCKS)
> + xfs_warn_once(mp,
> "Reserve blocks depleted! Consider increasing reserve pool size.");
>
> fdblocks_enospc:
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 3d92678d2c3b..02a3609a3322 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -115,6 +115,7 @@ struct xfs_groups {
> enum xfs_free_counter {
> XC_FREE_BLOCKS, /* free block counter */
> XC_FREE_RTEXTENTS, /* free rt extent counter */
> + XC_FREE_RTAVAILABLE, /* actually available (zoned) rt extents */
> XC_FREE_NR,
> };
>
> @@ -211,6 +212,7 @@ typedef struct xfs_mount {
> bool m_fail_unmount;
> bool m_finobt_nores; /* no per-AG finobt resv. */
> bool m_update_sb; /* sb needs update in mount */
> + unsigned int m_max_open_zones;
>
> /*
> * Bitsets of per-fs metadata that have been checked and/or are sick.
> @@ -263,6 +265,7 @@ typedef struct xfs_mount {
> uint64_t save; /* reserved blks @ remount,ro */
> } m_resblks[XC_FREE_NR];
> struct delayed_work m_reclaim_work; /* background inode reclaim */
> + struct xfs_zone_info *m_zone_info; /* zone allocator information */
> struct dentry *m_debugfs; /* debugfs parent */
> struct xfs_kobj m_kobj;
> struct xfs_kobj m_error_kobj;
> diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
> index 7ef62e7a91c1..47c94ac74259 100644
> --- a/fs/xfs/xfs_rtalloc.c
> +++ b/fs/xfs/xfs_rtalloc.c
> @@ -33,6 +33,7 @@
> #include "xfs_trace.h"
> #include "xfs_rtrefcount_btree.h"
> #include "xfs_reflink.h"
> +#include "xfs_zone_alloc.h"
>
> /*
> * Return whether there are any free extents in the size range given
> @@ -663,7 +664,8 @@ xfs_rtunmount_rtg(
>
> for (i = 0; i < XFS_RTGI_MAX; i++)
> xfs_rtginode_irele(&rtg->rtg_inodes[i]);
> - kvfree(rtg->rtg_rsum_cache);
> + if (!xfs_has_zoned(rtg_mount(rtg)))
> + kvfree(rtg->rtg_rsum_cache);
> }
>
> static int
> @@ -1614,6 +1616,8 @@ xfs_rtmount_rtg(
> }
> }
>
> + if (xfs_has_zoned(mp))
> + return 0;
> return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks);
> }
>
> diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
> index 8f530e69c18a..a60556dbd172 100644
> --- a/fs/xfs/xfs_trace.c
> +++ b/fs/xfs/xfs_trace.c
> @@ -49,6 +49,8 @@
> #include "xfs_metafile.h"
> #include "xfs_metadir.h"
> #include "xfs_rtgroup.h"
> +#include "xfs_zone_alloc.h"
> +#include "xfs_zone_priv.h"
>
> /*
> * We include this last to have the helpers above available for the trace
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 15dec76fec10..763dd3d271b9 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -102,6 +102,7 @@ struct xfs_rmap_intent;
> struct xfs_refcount_intent;
> struct xfs_metadir_update;
> struct xfs_rtgroup;
> +struct xfs_open_zone;
>
> #define XFS_ATTR_FILTER_FLAGS \
> { XFS_ATTR_ROOT, "ROOT" }, \
> @@ -265,6 +266,100 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab);
> DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
> DEFINE_GROUP_REF_EVENT(xfs_group_rele);
>
> +#ifdef CONFIG_XFS_RT
> +DECLARE_EVENT_CLASS(xfs_zone_class,
> + TP_PROTO(struct xfs_rtgroup *rtg),
> + TP_ARGS(rtg),
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(xfs_rgnumber_t, rgno)
> + __field(xfs_rgblock_t, used)
> + ),
> + TP_fast_assign(
> + __entry->dev = rtg_mount(rtg)->m_super->s_dev;
> + __entry->rgno = rtg_rgno(rtg);
> + __entry->used = rtg_rmap(rtg)->i_used_blocks;
> + ),
> + TP_printk("dev %d:%d rgno 0x%x used 0x%x",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rgno,
> + __entry->used)
> +);
> +
> +#define DEFINE_ZONE_EVENT(name) \
> +DEFINE_EVENT(xfs_zone_class, name, \
> + TP_PROTO(struct xfs_rtgroup *rtg), \
> + TP_ARGS(rtg))
> +DEFINE_ZONE_EVENT(xfs_zone_full);
> +DEFINE_ZONE_EVENT(xfs_zone_activate);
> +
> +TRACE_EVENT(xfs_zone_free_blocks,
> + TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
> + xfs_extlen_t len),
> + TP_ARGS(rtg, rgbno, len),
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(xfs_rgnumber_t, rgno)
> + __field(xfs_rgblock_t, used)
> + __field(xfs_rgblock_t, rgbno)
> + __field(xfs_extlen_t, len)
> + ),
> + TP_fast_assign(
> + __entry->dev = rtg_mount(rtg)->m_super->s_dev;
> + __entry->rgno = rtg_rgno(rtg);
> + __entry->used = rtg_rmap(rtg)->i_used_blocks;
> + __entry->rgbno = rgbno;
> + __entry->len = len;
> + ),
> + TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rgno,
> + __entry->used,
> + __entry->rgbno,
> + __entry->len)
> +);
> +
> +DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
> + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno,
> + xfs_extlen_t len),
> + TP_ARGS(oz, rgbno, len),
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(xfs_rgnumber_t, rgno)
> + __field(xfs_rgblock_t, used)
> + __field(xfs_rgblock_t, written)
> + __field(xfs_rgblock_t, write_pointer)
> + __field(xfs_rgblock_t, rgbno)
> + __field(xfs_extlen_t, len)
> + ),
> + TP_fast_assign(
> + __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev;
> + __entry->rgno = rtg_rgno(oz->oz_rtg);
> + __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks;
> + __entry->written = oz->oz_written;
> + __entry->write_pointer = oz->oz_write_pointer;
> + __entry->rgbno = rgbno;
> + __entry->len = len;
> + ),
> + TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->rgno,
> + __entry->used,
> + __entry->written,
> + __entry->write_pointer,
> + __entry->rgbno,
> + __entry->len)
> +);
> +
> +#define DEFINE_ZONE_ALLOC_EVENT(name) \
> +DEFINE_EVENT(xfs_zone_alloc_class, name, \
> + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \
> + xfs_extlen_t len), \
> + TP_ARGS(oz, rgbno, len))
> +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
> +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
> +#endif /* CONFIG_XFS_RT */
> +
> TRACE_EVENT(xfs_inodegc_worker,
> TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
> TP_ARGS(mp, shrinker_hits),
> @@ -3982,6 +4077,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
> DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
> DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
> DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
> +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip);
>
> DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
> DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
> diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
> new file mode 100644
> index 000000000000..1a746e9cfbf4
> --- /dev/null
> +++ b/fs/xfs/xfs_zone_alloc.c
> @@ -0,0 +1,971 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2023-2024 Christoph Hellwig.
> + * Copyright (c) 2024, Western Digital Corporation or its affiliates.
> + */
> +#include "xfs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_inode.h"
> +#include "xfs_iomap.h"
> +#include "xfs_trans.h"
> +#include "xfs_alloc.h"
> +#include "xfs_bmap.h"
> +#include "xfs_bmap_btree.h"
> +#include "xfs_trans_space.h"
> +#include "xfs_refcount.h"
> +#include "xfs_rtbitmap.h"
> +#include "xfs_rtrmap_btree.h"
> +#include "xfs_zone_alloc.h"
> +#include "xfs_zone_priv.h"
> +#include "xfs_zones.h"
> +#include "xfs_trace.h"
> +
> +void
> +xfs_open_zone_put(
> + struct xfs_open_zone *oz)
> +{
> + if (atomic_dec_and_test(&oz->oz_ref)) {
> + xfs_rtgroup_rele(oz->oz_rtg);
> + kfree(oz);
> + }
> +}
> +
> +static void
> +xfs_zone_mark_reclaimable(
> + struct xfs_rtgroup *rtg)
> +{
> + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
> +}
> +
> +static void
> +xfs_open_zone_mark_full(
> + struct xfs_open_zone *oz)
> +{
> + struct xfs_rtgroup *rtg = oz->oz_rtg;
> + struct xfs_mount *mp = rtg_mount(rtg);
> + struct xfs_zone_info *zi = mp->m_zone_info;
> +
> + trace_xfs_zone_full(rtg);
> +
> + WRITE_ONCE(rtg->rtg_open_zone, NULL);
> +
> + /*
> + * GC zones are fully owned by the GC thread, don't free them here.
> + */
> + if (!oz->oz_is_gc) {
> + spin_lock(&zi->zi_zone_list_lock);
> + zi->zi_nr_open_zones--;
> + list_del_init(&oz->oz_entry);
> + spin_unlock(&zi->zi_zone_list_lock);
> +
> + xfs_open_zone_put(oz);
> + }
> +
> + wake_up_all(&zi->zi_zone_wait);
> + if (rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg))
> + xfs_zone_mark_reclaimable(rtg);
> +}
> +
> +static int
> +xfs_zone_record_blocks(
> + struct xfs_trans *tp,
> + xfs_fsblock_t fsbno,
> + xfs_filblks_t len,
> + bool used)
> +{
> + struct xfs_mount *mp = tp->t_mountp;
> + xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(mp, fsbno);
> + struct xfs_inode *rmapip;
> + struct xfs_open_zone *oz;
> + struct xfs_rtgroup *rtg;
> + int error = 0;
> +
> + rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, fsbno));
> + if (WARN_ON_ONCE(!rtg))
> + return -EIO;
> + rmapip = rtg_rmap(rtg);
> +
> + xfs_ilock(rmapip, XFS_ILOCK_EXCL);
xfs_rtgroup_lock?
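
(i.e. something like this untested sketch; I'm assuming XFS_RTGLOCK_RMAP
is the right lock class here, since that's what xfs_init_zone uses later
in this patch:

	/* take the rmap inode ILOCK via the rtgroup helper */
	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
)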
> +
> + /*
> + * There is a reference on the oz until all blocks were written, and it
> + * is only dropped below with the rmapip ILOCK held. Thus we don't need
> + * to grab an extra reference here.
> + */
> + oz = READ_ONCE(rtg->rtg_open_zone);
> + if (WARN_ON_ONCE(!oz)) {
I wonder if this should be using XFS_IS_CORRUPT() instead of
WARN_ON_ONCE? I suppose we're in transaction context, so the EIO will
suffice to kill the filesystem, but let's have the fs corruption
error-out logging be consistent.
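
Untested sketch of what I mean; this assumes we'd also switch the return
value to -EFSCORRUPTED to match the usual corruption error-out
convention:

	oz = READ_ONCE(rtg->rtg_open_zone);
	if (XFS_IS_CORRUPT(mp, !oz)) {
		/* XFS_IS_CORRUPT already logged the corruption report */
		xfs_iunlock(rmapip, XFS_ILOCK_EXCL);
		error = -EFSCORRUPTED;
		goto out_put;
	}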
> + xfs_iunlock(rmapip, XFS_ILOCK_EXCL);
> + error = -EIO;
> + goto out_put;
> + }
> +
> + trace_xfs_zone_record_blocks(oz, rgbno, len);
> + xfs_trans_ijoin(tp, rmapip, XFS_ILOCK_EXCL);
xfs_rtgroup_trans_join?
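
(again an untested sketch, assuming the rtgroup lock helper was used for
the ILOCK above:

	/* join the rmap inode to the transaction via the rtgroup helper */
	xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
)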
> + if (used) {
> + rmapip->i_used_blocks += len;
> + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
> + } else {
> + xfs_add_frextents(mp, len);
> + }
> +
> + oz->oz_written += len;
> + ASSERT(oz->oz_written <= oz->oz_write_pointer);
> + if (oz->oz_written == rtg_blocks(rtg))
> + xfs_open_zone_mark_full(oz);
> + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
> +out_put:
> + xfs_rtgroup_put(rtg);
> + return error;
> +}
> +
> +static int
> +xfs_zoned_end_extent(
xfs_zoned_remap_extent?
> + struct xfs_trans *tp,
> + struct xfs_inode *ip,
> + struct xfs_bmbt_irec *new,
> + xfs_fsblock_t old_startblock)
> +{
> + struct xfs_bmbt_irec data;
> + int nmaps = 1;
> + int error;
> +
> + /* Grab the corresponding mapping in the data fork. */
> + error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
> + &nmaps, 0);
> + if (error)
> + return error;
> +
> + /*
> + * Cap the update to the existing extent in the data fork because we can
> + * only overwrite one extent at a time.
> + */
> + ASSERT(new->br_blockcount >= data.br_blockcount);
> + new->br_blockcount = data.br_blockcount;
> +
> + /*
> + * If a data write raced with this GC write, keep the existing data in
> + * the data fork, mark our newly written GC extent as reclaimable, then
> + * move on to the next extent.
> + */
> + if (old_startblock != NULLFSBLOCK &&
> + old_startblock != data.br_startblock)
> + goto skip;
> +
> + trace_xfs_reflink_cow_remap_from(ip, new);
> + trace_xfs_reflink_cow_remap_to(ip, &data);
> +
> + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
> + XFS_IEXT_REFLINK_END_COW_CNT);
> + if (error)
> + return error;
> +
> + if (data.br_startblock != HOLESTARTBLOCK) {
> + ASSERT(data.br_startblock != DELAYSTARTBLOCK);
> + ASSERT(!isnullstartblock(data.br_startblock));
> +
> + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
> + if (xfs_is_reflink_inode(ip)) {
> + xfs_refcount_decrease_extent(tp, true, &data);
> + } else {
> + error = xfs_free_extent_later(tp, data.br_startblock,
> + data.br_blockcount, NULL,
> + XFS_AG_RESV_NONE,
> + XFS_FREE_EXTENT_REALTIME);
> + if (error)
> + return error;
> + }
> + }
> +
> + error = xfs_zone_record_blocks(tp, new->br_startblock,
> + new->br_blockcount, true);
> + if (error)
> + return error;
> +
> + /* Map the new blocks into the data fork. */
> + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
> + return 0;
> +
> +skip:
> + trace_xfs_reflink_cow_remap_skip(ip, new);
> + return xfs_zone_record_blocks(tp, new->br_startblock,
> + new->br_blockcount, false);
> +}
> +
> +int
> +xfs_zoned_end_io(
> + struct xfs_inode *ip,
> + xfs_off_t offset,
> + xfs_off_t count,
> + xfs_daddr_t daddr,
> + xfs_fsblock_t old_startblock)
> +{
> + struct xfs_mount *mp = ip->i_mount;
> + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
> + struct xfs_bmbt_irec new = {
> + .br_startoff = XFS_B_TO_FSBT(mp, offset),
> + .br_startblock = xfs_daddr_to_rtb(mp, daddr),
> + .br_state = XFS_EXT_NORM,
> + };
> + unsigned int resblks =
> + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
> + struct xfs_trans *tp;
> + int error;
Odd indenting here between the variables and the arguments.
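
e.g. an untested re-flow with consistent indentation:

	unsigned int resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
	struct xfs_trans *tp;
	int error;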
> +
> + if (xfs_is_shutdown(mp))
> + return -EIO;
> +
> + while (new.br_startoff < end_fsb) {
> + new.br_blockcount = end_fsb - new.br_startoff;
> +
> + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
> + XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
> + if (error)
> + return error;
> + xfs_ilock(ip, XFS_ILOCK_EXCL);
> + xfs_trans_ijoin(tp, ip, 0);
> +
> + error = xfs_zoned_end_extent(tp, ip, &new, old_startblock);
> + if (error)
> + xfs_trans_cancel(tp);
> + else
> + error = xfs_trans_commit(tp);
> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
> + if (error)
> + return error;
> +
> + new.br_startoff += new.br_blockcount;
> + new.br_startblock += new.br_blockcount;
> + if (old_startblock != NULLFSBLOCK)
> + old_startblock += new.br_blockcount;
> + }
> +
> + return 0;
> +}
> +
> +/*
> + * "Free" blocks allocated in a zone.
> + *
> + * Just decrement the used blocks counter and report the space as freed.
> + */
> +int
> +xfs_zone_free_blocks(
> + struct xfs_trans *tp,
> + struct xfs_rtgroup *rtg,
> + xfs_fsblock_t fsbno,
> + xfs_filblks_t len)
> +{
> + struct xfs_mount *mp = tp->t_mountp;
> + struct xfs_inode *rmapip = rtg_rmap(rtg);
> +
> + xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL);
> +
> + if (len > rmapip->i_used_blocks) {
> + xfs_err(mp,
> +"trying to free more blocks (%lld) than used counter (%u).",
> + len, rmapip->i_used_blocks);
> + ASSERT(len <= rmapip->i_used_blocks);
> + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> + return -EFSCORRUPTED;
> + }
> +
> + trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);
> +
> + rmapip->i_used_blocks -= len;
> + if (!READ_ONCE(rtg->rtg_open_zone)) {
> + /*
> + * If the zone is not open, mark it reclaimable when the first
> + * block is freed.
> + */
> + if (rmapip->i_used_blocks + len == rtg_blocks(rtg))
> + xfs_zone_mark_reclaimable(rtg);
> + }
> + xfs_add_frextents(mp, len);
> + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
> + return 0;
> +}
> +
> +/*
> + * Check if the zone containing the data just before the offset we are
> + * writing to is still open and has space.
> + */
> +static struct xfs_open_zone *
> +xfs_last_used_zone(
> + struct iomap_ioend *ioend)
> +{
> + struct xfs_inode *ip = XFS_I(ioend->io_inode);
> + struct xfs_mount *mp = ip->i_mount;
> + xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
> + struct xfs_rtgroup *rtg = NULL;
> + struct xfs_open_zone *oz = NULL;
> + struct xfs_iext_cursor icur;
> + struct xfs_bmbt_irec got;
> +
> + xfs_ilock(ip, XFS_ILOCK_SHARED);
> + if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
> + &icur, &got)) {
> + xfs_iunlock(ip, XFS_ILOCK_SHARED);
> + return NULL;
> + }
> + xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +
> + rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
> + if (!rtg)
> + return NULL;
> +
> + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
> + oz = READ_ONCE(rtg->rtg_open_zone);
> + if (oz && !atomic_inc_not_zero(&oz->oz_ref))
> + oz = NULL;
> + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
> +
> + xfs_rtgroup_rele(rtg);
> + return oz;
> +}
> +
> +static struct xfs_group *
> +xfs_find_free_zone(
> + struct xfs_mount *mp,
> + unsigned long start,
> + unsigned long end)
> +{
> + XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, start);
> + struct xfs_group *xg;
> +
> + xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE)
> + if (atomic_inc_not_zero(&xg->xg_active_ref))
> + return xg;
> +
> + return NULL;
> +}
> +
> +static struct xfs_open_zone *
> +xfs_init_open_zone(
> + struct xfs_rtgroup *rtg,
> + xfs_rgblock_t write_pointer,
> + bool is_gc)
> +{
> + struct xfs_open_zone *oz;
> +
> + oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL);
> + spin_lock_init(&oz->oz_alloc_lock);
> + atomic_set(&oz->oz_ref, 1);
> + oz->oz_rtg = rtg;
> + oz->oz_write_pointer = write_pointer;
> + oz->oz_written = write_pointer;
> + oz->oz_is_gc = is_gc;
> +
> + /*
> + * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap
> + * inode, but we don't really want to take that here because we are
> + * under the zone_list_lock. Ensure the pointer is only set for a fully
> + * initialized open zone structure so that a racy lookup finding it is
> + * fine.
> + */
> + WRITE_ONCE(rtg->rtg_open_zone, oz);
> + return oz;
> +}
> +
/* Find a completely free zone, open it, and return a reference */ ?
--D
> +struct xfs_open_zone *
> +xfs_open_zone(
> + struct xfs_mount *mp,
> + bool is_gc)
> +{
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + struct xfs_group *xg;
> +
> + lockdep_assert_held(&zi->zi_zone_list_lock);
> +
> + xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX);
> + if (!xg)
> + xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor);
> + if (!xg)
> + return NULL;
> + xfs_group_clear_mark(xg, XFS_RTG_FREE);
> + atomic_dec(&zi->zi_nr_free_zones);
> + zi->zi_free_zone_cursor = xg->xg_gno;
> + return xfs_init_open_zone(to_rtg(xg), 0, is_gc);
> +}
> +
> +/*
> + * Activate a free zone.
> + *
> + * This just does the accounting and allows to find the zone on the open
> + * zones list. Don't bother with an explicit open command, we'll just open it
> + * implicitly with the first write to it.
> + */
> +static struct xfs_open_zone *
> +xfs_activate_zone(
> + struct xfs_mount *mp)
> +{
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + struct xfs_open_zone *oz;
> +
> + if (atomic_read(&zi->zi_nr_free_zones) <
> + XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
> + return NULL;
> +
> + oz = xfs_open_zone(mp, false);
> + if (!oz)
> + return NULL;
> +
> + atomic_inc(&oz->oz_ref);
> + zi->zi_nr_open_zones++;
> + list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
> +
> + /* XXX: this is a little verbose, but let's keep it for now */
> + xfs_info(mp, "using zone %u (%u)",
> + rtg_rgno(oz->oz_rtg), zi->zi_nr_open_zones);
> + trace_xfs_zone_activate(oz->oz_rtg);
> + return oz;
> +}
> +
> +static bool
> +xfs_try_use_zone(
> + struct xfs_zone_info *zi,
> + struct xfs_open_zone *oz)
> +{
> + if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
> + return false;
> + if (!atomic_inc_not_zero(&oz->oz_ref))
> + return false;
> +
> + /*
> + * If we couldn't match by inode or lifetime we just pick the first
> + * zone with enough space above. For that we want the least busy zone
> + * for some definition of "least" busy. For now this simple LRU
> + * algorithm that rotates every zone to the end of the list will do it,
> + * even if it isn't exactly cache friendly.
> + */
> + if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones))
> + list_move_tail(&oz->oz_entry, &zi->zi_open_zones);
> + return true;
> +}
> +
> +static struct xfs_open_zone *
> +xfs_select_open_zone_lru(
> + struct xfs_zone_info *zi)
> +{
> + struct xfs_open_zone *oz;
> +
> + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
> + if (xfs_try_use_zone(zi, oz))
> + return oz;
> + return NULL;
> +}
> +
> +static struct xfs_open_zone *
> +xfs_select_open_zone_mru(
> + struct xfs_zone_info *zi)
> +{
> + struct xfs_open_zone *oz;
> +
> + list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
> + if (xfs_try_use_zone(zi, oz))
> + return oz;
> + return NULL;
> +}
> +
> +/*
> + * Try to tightly pack inodes that are written back after they were closed,
> + * instead of opening new zones for them or spreading them out to the least
> + * recently used zone. This optimizes the data layout for workloads that untar
> + * or copy a lot of small files. Right now this does not separate multiple
> + * such streams.
> + */
> +static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
> +{
> + return !inode_is_open_for_write(VFS_I(ip)) &&
> + !(ip->i_diflags & XFS_DIFLAG_APPEND);
> +}
> +
> +/*
> + * Pick a new zone for writes.
> + *
> + * If we aren't using up our budget of open zones just open a new one from
> + * the freelist. Else try to find one that matches the expected allocation
> + * length, or at least the minimum required length. If we don't find one
> + * that is good enough we pick one anyway and let the caller finish it to
> + * free up open zone resources.
> + */
> +static struct xfs_open_zone *
> +xfs_select_zone_nowait(
> + struct xfs_inode *ip,
> + xfs_filblks_t count_fsb)
> +{
> + struct xfs_mount *mp = ip->i_mount;
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + struct xfs_open_zone *oz = NULL;
> +
> + if (xfs_zoned_pack_tight(ip))
> + oz = xfs_select_open_zone_mru(zi);
> + if (oz)
> + return oz;
> +
> + /*
> + * If we are below the open limit try to activate a zone.
> + */
> + if (zi->zi_nr_open_zones < mp->m_max_open_zones - XFS_OPEN_GC_ZONES) {
> + oz = xfs_activate_zone(mp);
> + if (oz)
> + return oz;
> + }
> +
> + return xfs_select_open_zone_lru(zi);
> +}
> +
> +static struct xfs_open_zone *
> +xfs_select_zone(
> + struct iomap_ioend *ioend)
> +{
> + struct xfs_inode *ip = XFS_I(ioend->io_inode);
> + struct xfs_mount *mp = ip->i_mount;
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + xfs_filblks_t count_fsb = XFS_B_TO_FSB(mp, ioend->io_size);
> + struct xfs_open_zone *oz = NULL;
> + DEFINE_WAIT(wait);
> +
> + spin_lock(&zi->zi_zone_list_lock);
> + if (xfs_is_shutdown(mp))
> + goto out_unlock;
> +
> + oz = xfs_select_zone_nowait(ip, count_fsb);
> + if (oz)
> + goto out_unlock;
> +
> + for (;;) {
> + prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
> + if (xfs_is_shutdown(mp))
> + break;
> +
> + oz = xfs_select_zone_nowait(ip, count_fsb);
> + if (oz)
> + break;
> +
> + spin_unlock(&zi->zi_zone_list_lock);
> + schedule();
> + spin_lock(&zi->zi_zone_list_lock);
> + }
> + finish_wait(&zi->zi_zone_wait, &wait);
> +
> +out_unlock:
> + spin_unlock(&zi->zi_zone_list_lock);
> + return oz;
> +}
> +
> +static unsigned int
> +xfs_zone_alloc_blocks(
> + struct iomap_ioend *ioend,
> + struct xfs_open_zone *oz,
> + bool *is_seq)
> +{
> + struct xfs_rtgroup *rtg = oz->oz_rtg;
> + struct xfs_mount *mp = rtg_mount(rtg);
> + xfs_filblks_t count_fsb = XFS_B_TO_FSB(mp, ioend->io_size);
> + xfs_rgblock_t rgbno;
> +
> + spin_lock(&oz->oz_alloc_lock);
> + count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
> + (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer);
> + if (!count_fsb) {
> + spin_unlock(&oz->oz_alloc_lock);
> + return 0;
> + }
> + rgbno = oz->oz_write_pointer;
> + oz->oz_write_pointer += count_fsb;
> + spin_unlock(&oz->oz_alloc_lock);
> +
> + trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb);
> +
> + ioend->io_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
> + *is_seq = bdev_zone_is_seq(ioend->io_bio.bi_bdev, ioend->io_sector);
> + if (!*is_seq)
> + ioend->io_sector += XFS_FSB_TO_BB(mp, rgbno);
> + return XFS_FSB_TO_B(mp, count_fsb);
> +}
> +
> +void
> +xfs_mark_rtg_boundary(
> + struct iomap_ioend *ioend)
> +{
> + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
> + sector_t sector = ioend->io_bio.bi_iter.bi_sector;
> +
> + if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
> + ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
> +}
> +
> +static void
> +xfs_submit_zoned_bio(
> + struct iomap_ioend *ioend,
> + bool is_seq)
> +{
> + ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
> +
> + if (is_seq) {
> + ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
> + ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
> + } else {
> + xfs_mark_rtg_boundary(ioend);
> + }
> +
> + submit_bio(&ioend->io_bio);
> +}
> +
> +void
> +xfs_zone_alloc_and_submit(
> + struct iomap_ioend *ioend,
> + struct xfs_open_zone **oz)
> +{
> + unsigned int alloc_len;
> + struct iomap_ioend *split;
> + bool is_seq;
> +
> + if (xfs_is_shutdown(XFS_I(ioend->io_inode)->i_mount))
> + goto out_error;
> +
> + /*
> + * If we don't have a cached zone in this write context, see if the
> + * last extent before the one we are writing points to an active zone.
> + * If so, just continue writing to it.
> + */
> + if (!*oz && ioend->io_offset)
> + *oz = xfs_last_used_zone(ioend);
> + if (!*oz) {
> +select_zone:
> + *oz = xfs_select_zone(ioend);
> + if (!*oz)
> + goto out_error;
> + }
> +
> + alloc_len = xfs_zone_alloc_blocks(ioend, *oz, &is_seq);
> + if (!alloc_len) {
> + xfs_open_zone_put(*oz);
> + goto select_zone;
> + }
> +
> + while ((split = iomap_split_ioend(ioend, is_seq, &alloc_len))) {
> + xfs_submit_zoned_bio(split, is_seq);
> + if (!alloc_len) {
> + xfs_open_zone_put(*oz);
> + goto select_zone;
> + }
> + }
> +
> + xfs_submit_zoned_bio(ioend, is_seq);
> + return;
> +
> +out_error:
> + bio_io_error(&ioend->io_bio);
> +}
> +
> +void
> +xfs_zoned_wake_all(
> + struct xfs_mount *mp)
> +{
> + if (!(mp->m_super->s_flags & SB_ACTIVE))
> + return; /* can happen during log recovery */
> + spin_lock(&mp->m_zone_info->zi_zone_list_lock);
> + wake_up_all(&mp->m_zone_info->zi_zone_wait);
> + spin_unlock(&mp->m_zone_info->zi_zone_list_lock);
> +}
> +
> +/*
> + * Check if @rgbno in @rtg is a potentially valid block. It might still be
> + * unused, but that information is only found in the rmap.
> + */
> +bool
> +xfs_zone_rgbno_is_valid(
> + struct xfs_rtgroup *rtg,
> + xfs_rgnumber_t rgbno)
> +{
> + lockdep_assert_held(&rtg_rmap(rtg)->i_lock);
> +
> + if (rtg->rtg_open_zone)
> + return rgbno < rtg->rtg_open_zone->oz_write_pointer;
> + return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
> + rtg_rgno(rtg), XFS_RTG_FREE);
> +}
> +
> +static void
> +xfs_free_open_zones(
> + struct xfs_zone_info *zi)
> +{
> + struct xfs_open_zone *oz;
> +
> + spin_lock(&zi->zi_zone_list_lock);
> + while ((oz = list_first_entry_or_null(&zi->zi_open_zones,
> + struct xfs_open_zone, oz_entry))) {
> + list_del(&oz->oz_entry);
> + xfs_open_zone_put(oz);
> + }
> + spin_unlock(&zi->zi_zone_list_lock);
> +}
> +
> +struct xfs_init_zones {
> + struct xfs_mount *mp;
> + uint64_t available;
> + uint64_t reclaimable;
> +};
> +
> +static int
> +xfs_init_zone(
> + struct xfs_init_zones *iz,
> + struct xfs_rtgroup *rtg,
> + struct blk_zone *zone)
> +{
> + struct xfs_mount *mp = rtg_mount(rtg);
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + uint64_t used = rtg_rmap(rtg)->i_used_blocks;
> + xfs_rgblock_t write_pointer;
> + int error;
> +
> + if (zone) {
> + error = xfs_zone_validate(zone, rtg, &write_pointer);
> + if (error)
> + return error;
> + }
> +
> + /*
> + * For sequential write required zones we retrieved the hardware write
> + * pointer above.
> + *
> + * For conventional zones or conventional devices we don't have that
> + * luxury. Instead query the rmap to find the highest recorded block
> + * and set the write pointer to the block after that. In case of a
> + * power loss this misses blocks where the data I/O had completed but
> + * was not yet recorded in the rmap, and it also rewrites blocks if the
> + * most recently written ones were deleted again before unmount, but this is
> + * the best we can do without hardware support.
> + */
> + if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) {
> + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> + write_pointer = xfs_rtrmap_first_unwritten_rgbno(rtg);
> + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
> + }
> +
> + if (write_pointer == 0) {
> + /* zone is empty */
> + atomic_inc(&zi->zi_nr_free_zones);
> + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
> + iz->available += rtg_blocks(rtg);
> + } else if (write_pointer < rtg_blocks(rtg)) {
> + /* zone is open */
> + struct xfs_open_zone *oz;
> +
> + atomic_inc(&rtg_group(rtg)->xg_active_ref);
> + oz = xfs_init_open_zone(rtg, write_pointer, false);
> + list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
> + zi->zi_nr_open_zones++;
> +
> + iz->available += (rtg_blocks(rtg) - write_pointer);
> + iz->reclaimable += write_pointer - used;
> + } else if (used < rtg_blocks(rtg)) {
> + /* zone fully written, but has freed blocks */
> + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
> + iz->reclaimable += (rtg_blocks(rtg) - used);
> + }
> +
> + return 0;
> +}
> +
> +static int
> +xfs_get_zone_info_cb(
> + struct blk_zone *zone,
> + unsigned int idx,
> + void *data)
> +{
> + struct xfs_init_zones *iz = data;
> + struct xfs_mount *mp = iz->mp;
> + xfs_fsblock_t zsbno = xfs_daddr_to_rtb(mp, zone->start);
> + xfs_rgnumber_t rgno;
> + struct xfs_rtgroup *rtg;
> + int error;
> +
> + if (xfs_rtb_to_rgbno(mp, zsbno) != 0) {
> + xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno);
> + return -EFSCORRUPTED;
> + }
> +
> + rgno = xfs_rtb_to_rgno(mp, zsbno);
> + rtg = xfs_rtgroup_grab(mp, rgno);
> + if (!rtg) {
> + xfs_warn(mp, "realtime group not found for zone %u.", rgno);
> + return -EFSCORRUPTED;
> + }
> + error = xfs_init_zone(iz, rtg, zone);
> + xfs_rtgroup_rele(rtg);
> + return error;
> +}
> +
> +/*
> + * Calculate the max open zone limit based on the of number of
> + * backing zones available
> + */
> +static inline uint32_t
> +xfs_max_open_zones(
> + struct xfs_mount *mp)
> +{
> + unsigned int max_open, max_open_data_zones;
> + /*
> + * We need two zones for every open data zone: the open one plus
> + * one in reserve, as we don't reclaim open zones. One data zone
> + * and its spare are included in XFS_MIN_ZONES.
> + */
> + max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
> + max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
> +
> + /*
> + * Cap the max open limit to 1/4 of the total zone count.
> + */
> + max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
> +
> + return max(XFS_MIN_OPEN_ZONES, max_open);
> +}
> +
> +/*
> + * Normally we use the open zone limit that the device reports. If there is
> + * none let the user pick one from the command line.
> + *
> + * If the device doesn't report an open zone limit and there is no override,
> + * allow holding about a quarter of the zones open. In theory we could allow
> + * all to be open, but at that point we run into GC deadlocks because we can't
> + * reclaim open zones.
> + *
> + * When used on conventional SSDs a lower open limit is advisable as we'll
> + * otherwise overwhelm the FTL just as much as a conventional block allocator.
> + *
> + * Note: To debug the open zone management code, force max_open to 1 here.
> + */
> +static int
> +xfs_calc_open_zones(
> + struct xfs_mount *mp)
> +{
> + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
> + unsigned int bdev_open_zones = bdev_max_open_zones(bdev);
> +
> + if (!mp->m_max_open_zones) {
> + if (bdev_open_zones)
> + mp->m_max_open_zones = bdev_open_zones;
> + else
> + mp->m_max_open_zones = xfs_max_open_zones(mp);
> + }
> +
> + if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
> + xfs_notice(mp, "need at least %u open zones.",
> + XFS_MIN_OPEN_ZONES);
> + return -EIO;
> + }
> +
> + if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
> + mp->m_max_open_zones = bdev_open_zones;
> + xfs_info(mp, "limiting open zones to %u due to hardware limit.\n",
> + bdev_open_zones);
> + }
> +
> + if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
> + mp->m_max_open_zones = xfs_max_open_zones(mp);
> + xfs_info(mp,
> +"limiting open zones to %u due to total zone count (%u)",
> + mp->m_max_open_zones, mp->m_sb.sb_rgcount);
> + }
> +
> + return 0;
> +}
> +
> +static struct xfs_zone_info *
> +xfs_alloc_zone_info(void)
> +{
> + struct xfs_zone_info *zi;
> +
> + zi = kzalloc(sizeof(*zi), GFP_KERNEL);
> + if (!zi)
> + return NULL;
> + INIT_LIST_HEAD(&zi->zi_open_zones);
> + INIT_LIST_HEAD(&zi->zi_reclaim_reservations);
> + spin_lock_init(&zi->zi_reset_list_lock);
> + spin_lock_init(&zi->zi_zone_list_lock);
> + spin_lock_init(&zi->zi_reservation_lock);
> + init_waitqueue_head(&zi->zi_zone_wait);
> + return zi;
> +}
> +
> +int
> +xfs_mount_zones(
> + struct xfs_mount *mp)
> +{
> + struct xfs_init_zones iz = {
> + .mp = mp,
> + };
> + struct xfs_buftarg *bt = mp->m_rtdev_targp;
> + int error;
> +
> + if (!bt) {
> + xfs_notice(mp, "RT device missing.");
> + return -EINVAL;
> + }
> +
> + if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
> + xfs_notice(mp, "invalid flag combination.");
> + return -EFSCORRUPTED;
> + }
> + if (mp->m_sb.sb_rextsize != 1) {
> + xfs_notice(mp, "zoned file systems do not support rextsize.");
> + return -EFSCORRUPTED;
> + }
> + if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
> + xfs_notice(mp,
> +"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES);
> + return -EFSCORRUPTED;
> + }
> +
> + error = xfs_calc_open_zones(mp);
> + if (error)
> + return error;
> +
> + mp->m_zone_info = xfs_alloc_zone_info();
> + if (!mp->m_zone_info)
> + return -ENOMEM;
> +
> + xfs_info(mp, "%u zones of %u blocks size (%u max open)",
> + mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
> + mp->m_max_open_zones);
> +
> + if (bdev_is_zoned(bt->bt_bdev)) {
> + error = blkdev_report_zones(bt->bt_bdev,
> + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
> + mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz);
> + if (error < 0)
> + goto out_free_open_zones;
> + } else {
> + struct xfs_rtgroup *rtg = NULL;
> +
> + while ((rtg = xfs_rtgroup_next(mp, rtg))) {
> + error = xfs_init_zone(&iz, rtg, NULL);
> + if (error)
> + goto out_free_open_zones;
> + }
> + }
> +
> + xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
> + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
> + iz.available + iz.reclaimable);
> +
> + return 0;
> +
> +out_free_open_zones:
> + xfs_free_open_zones(mp->m_zone_info);
> + kfree(mp->m_zone_info);
> + return error;
> +}
> +
> +void
> +xfs_unmount_zones(
> + struct xfs_mount *mp)
> +{
> + xfs_free_open_zones(mp->m_zone_info);
> + kfree(mp->m_zone_info);
> +}
> diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
> new file mode 100644
> index 000000000000..37a49f4ce40c
> --- /dev/null
> +++ b/fs/xfs/xfs_zone_alloc.h
> @@ -0,0 +1,36 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _XFS_ZONE_ALLOC_H
> +#define _XFS_ZONE_ALLOC_H
> +
> +struct iomap_ioend;
> +struct xfs_open_zone;
> +
> +void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
> + struct xfs_open_zone **oz);
> +int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
> + xfs_fsblock_t fsbno, xfs_filblks_t len);
> +int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count,
> + xfs_daddr_t daddr, xfs_fsblock_t old_startblock);
> +void xfs_open_zone_put(struct xfs_open_zone *oz);
> +
> +void xfs_zoned_wake_all(struct xfs_mount *mp);
> +bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno);
> +void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
> +
> +uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
> + enum xfs_free_counter ctr);
> +
> +#ifdef CONFIG_XFS_RT
> +int xfs_mount_zones(struct xfs_mount *mp);
> +void xfs_unmount_zones(struct xfs_mount *mp);
> +#else
> +static inline int xfs_mount_zones(struct xfs_mount *mp)
> +{
> + return -EIO;
> +}
> +static inline void xfs_unmount_zones(struct xfs_mount *mp)
> +{
> +}
> +#endif /* CONFIG_XFS_RT */
> +
> +#endif /* _XFS_ZONE_ALLOC_H */
> diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
> new file mode 100644
> index 000000000000..ae1556871596
> --- /dev/null
> +++ b/fs/xfs/xfs_zone_priv.h
> @@ -0,0 +1,85 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _XFS_ZONE_PRIV_H
> +#define _XFS_ZONE_PRIV_H
> +
> +struct xfs_open_zone {
> + /*
> + * Entry in the open zone list and refcount. Protected by
> + * zi_zone_list_lock in struct xfs_zone_info.
> + */
> + struct list_head oz_entry;
> + atomic_t oz_ref;
> +
> + /*
> + * Protects oz_write_pointer and oz_written.
> + */
> + spinlock_t oz_alloc_lock;
> +
> + /*
> + * oz_write_pointer is the write pointer at which space is handed out
> + * for conventional zones, or simply the count of blocks handed out
> + * so far for sequential write required zones.
> + *
> + * oz_written is the number of blocks for which we've received a
> + * write completion. oz_written must always be <= oz_write_pointer.
> + */
> + xfs_rgblock_t oz_write_pointer;
> + xfs_rgblock_t oz_written;
> +
> + /*
> + * Is this open zone used for garbage collection? There can only be a
> + * single open GC zone, which is pointed to by zi_open_gc_zone in
> + * struct xfs_zone_info. Constant over the lifetime of an open zone.
> + */
> + bool oz_is_gc;
> +
> + /*
> + * Pointer to the RT group structure for this open zone. Constant over
> + * the lifetime of an open zone.
> + */
> + struct xfs_rtgroup *oz_rtg;
> +};
> +
> +struct xfs_zone_info {
> + /*
> + * List of pending space reservations:
> + */
> + spinlock_t zi_reservation_lock;
> + struct list_head zi_reclaim_reservations;
> +
> + /*
> + * Lock for open and free zone information, and wait queue to wait for
> + * free zones or open zone resources to become available:
> + */
> + spinlock_t zi_zone_list_lock;
> + wait_queue_head_t zi_zone_wait;
> +
> + /*
> + * List and number of open zones:
> + */
> + struct list_head zi_open_zones;
> + unsigned int zi_nr_open_zones;
> +
> + /*
> + * Free zone search cursor and number of free zones:
> + */
> + unsigned long zi_free_zone_cursor;
> + atomic_t zi_nr_free_zones;
> +
> + /*
> + * Pointer to the GC thread, and the current open zone used by GC
> + * (if any).
> + */
> + struct task_struct *zi_gc_thread;
> + struct xfs_open_zone *zi_open_gc_zone;
> +
> + /*
> + * List of zones that need a reset:
> + */
> + spinlock_t zi_reset_list_lock;
> + struct xfs_group *zi_reset_list;
> +};
> +
> +struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
> +
> +#endif /* _XFS_ZONE_PRIV_H */
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread* Re: [PATCH 24/43] xfs: add the zoned space allocator
2024-12-13 18:33 ` Darrick J. Wong
@ 2024-12-15 5:27 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 5:27 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 10:33:04AM -0800, Darrick J. Wong wrote:
> > + struct xfs_mount *mp = tp->t_mountp;
> > + xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(mp, fsbno);
> > + struct xfs_inode *rmapip;
> > + struct xfs_open_zone *oz;
> > + struct xfs_rtgroup *rtg;
> > + int error = 0;
> > +
> > + rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, fsbno));
> > + if (WARN_ON_ONCE(!rtg))
> > + return -EIO;
> > + rmapip = rtg_rmap(rtg);
> > +
> > + xfs_ilock(rmapip, XFS_ILOCK_EXCL);
>
> xfs_rtgroup_lock?
Yeah, I'll do an audit for using the proper helpers.
>
> > +
> > + /*
> > + * There is a reference on the oz until all blocks were written, and it
> > + * is only dropped below with the rmapip ILOCK held. Thus we don't need
> > + * to grab an extra reference here.
> > + */
> > + oz = READ_ONCE(rtg->rtg_open_zone);
> > + if (WARN_ON_ONCE(!oz)) {
>
> I wonder if this should be using XFS_IS_CORRUPT() instead of
> WARN_ON_ONCE?
Probably.
> > +}
> > +
> > +static int
> > +xfs_zoned_end_extent(
>
> xfs_zoned_remap_extent?
It doesn't really remap much, right? It maps, so I guess I could
use that.
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 25/43] xfs: add support for zoned space reservations
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (23 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 24/43] xfs: add the zoned space allocator Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-13 21:01 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 26/43] xfs: implement zoned garbage collection Christoph Hellwig
` (17 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
For zoned file systems garbage collection (GC) has to take the iolock
and mmaplock after moving data to a new place to synchronize with
readers. This means waiting for garbage collection with the iolock can
deadlock.
To avoid this, the worst-case number of required blocks has to be
reserved before taking the iolock, which is done using a new RTAVAILABLE
counter that tracks blocks that are free to write into and don't require
garbage collection. The new helpers try to take these available blocks,
and if there aren't enough available they wake GC and wait for it. This
is done using a list of on-stack reservations to ensure fairness.
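As a rough sketch (the actual callers only show up later in the series,
so the locking shown here is illustrative, not the final code), a write
path is expected to bracket the iolock with the new helpers like this:

	struct xfs_zone_alloc_ctx ac = { };
	int error;

	/* reserve the worst case blocks up front, before taking the iolock */
	error = xfs_zoned_space_reserve(ip, count_fsb, 0, &ac);
	if (error)
		return error;

	xfs_ilock(ip, XFS_IOLOCK_EXCL);
	/* ... perform the write, consuming from ac.reserved_blocks ... */
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	/* give back whatever the write did not use */
	xfs_zoned_space_unreserve(ip, &ac);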
Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/Makefile | 3 +-
fs/xfs/libxfs/xfs_bmap.c | 15 ++-
fs/xfs/xfs_zone_alloc.h | 15 +++
fs/xfs/xfs_zone_priv.h | 2 +
fs/xfs/xfs_zone_space_resv.c | 244 +++++++++++++++++++++++++++++++++++
5 files changed, 274 insertions(+), 5 deletions(-)
create mode 100644 fs/xfs/xfs_zone_space_resv.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 28bd2627e9ef..bdedf4bdb1db 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -138,7 +138,8 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
# xfs_rtbitmap is shared with libxfs
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
- xfs_zone_alloc.o
+ xfs_zone_alloc.o \
+ xfs_zone_space_resv.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 512f1ceca47f..625d853f248b 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -40,6 +40,7 @@
#include "xfs_symlink_remote.h"
#include "xfs_inode_util.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_bmap_intent_cache;
@@ -4787,12 +4788,18 @@ xfs_bmap_del_extent_delay(
da_diff = da_old - da_new;
fdblocks = da_diff;
- if (bflags & XFS_BMAPI_REMAP)
+ if (bflags & XFS_BMAPI_REMAP) {
;
- else if (isrt)
- xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
- else
+ } else if (isrt) {
+ xfs_rtxlen_t rtxlen;
+
+ rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_add_available(mp, rtxlen);
+ xfs_add_frextents(mp, rtxlen);
+ } else {
fdblocks += del->br_blockcount;
+ }
xfs_add_fdblocks(mp, fdblocks);
xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
index 37a49f4ce40c..6d0404c2c46c 100644
--- a/fs/xfs/xfs_zone_alloc.h
+++ b/fs/xfs/xfs_zone_alloc.h
@@ -5,6 +5,21 @@
struct iomap_ioend;
struct xfs_open_zone;
+struct xfs_zone_alloc_ctx {
+ struct xfs_open_zone *open_zone;
+ xfs_filblks_t reserved_blocks;
+};
+
+#define XFS_ZR_GREEDY (1U << 0)
+#define XFS_ZR_NOWAIT (1U << 1)
+#define XFS_ZR_RESERVED (1U << 2)
+
+int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
+ unsigned int flags, struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_space_unreserve(struct xfs_inode *ip,
+ struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
+
void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
struct xfs_open_zone **oz);
int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index ae1556871596..f56f3ca8ea00 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -82,4 +82,6 @@ struct xfs_zone_info {
struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
+void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
+
#endif /* _XFS_ZONE_PRIV_H */
diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
new file mode 100644
index 000000000000..5ee525e18759
--- /dev/null
+++ b/fs/xfs/xfs_zone_space_resv.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Christoph Hellwig.
+ * Copyright (c) 2024, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+
+/*
+ * Note: the zoned allocator does not support an rtextsize > 1, so this code
+ * and the allocator itself use file system blocks interchangeably with
+ * realtime extents without doing the otherwise required conversions.
+ */
+
+/*
+ * Per-task space reservation.
+ *
+ * Tasks that need to wait for GC to free up space allocate one of these
+ * on-stack and add it to the per-mount zi_reclaim_reservations list.
+ * The GC thread will then wake the tasks in order when space becomes available.
+ */
+struct xfs_zone_reservation {
+ struct list_head entry;
+ struct task_struct *task;
+ xfs_filblks_t count_fsb;
+};
+
+/*
+ * Calculate the number of reserved blocks.
+ *
+ * XC_FREE_RTEXTENTS counts the user-available capacity up to which the file
+ * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly
+ * available for writes without waiting for GC.
+ *
+ * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
+ * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
+ * is further restricted by at least one zone as well as the optional
+ * persistently reserved blocks. This allows the allocator to run more
+ * smoothly by not always triggering GC.
+ */
+uint64_t
+xfs_zoned_default_resblks(
+ struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ switch (ctr) {
+ case XC_FREE_RTEXTENTS:
+ return (uint64_t)XFS_RESERVED_ZONES *
+ mp->m_groups[XG_TYPE_RTG].blocks +
+ mp->m_sb.sb_rtreserved;
+ case XC_FREE_RTAVAILABLE:
+ return (uint64_t)XFS_GC_ZONES *
+ mp->m_groups[XG_TYPE_RTG].blocks;
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
+void
+xfs_zoned_resv_wake_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation *reservation;
+
+ spin_lock(&zi->zi_reservation_lock);
+ list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
+ wake_up_process(reservation->task);
+ spin_unlock(&zi->zi_reservation_lock);
+}
+
+void
+xfs_zoned_add_available(
+ struct xfs_mount *mp,
+ xfs_filblks_t count_fsb)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation *reservation;
+
+ if (list_empty_careful(&zi->zi_reclaim_reservations)) {
+ xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+ return;
+ }
+
+ spin_lock(&zi->zi_reservation_lock);
+ xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+ count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
+ list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
+ if (reservation->count_fsb > count_fsb)
+ break;
+ wake_up_process(reservation->task);
+ count_fsb -= reservation->count_fsb;
+ }
+ spin_unlock(&zi->zi_reservation_lock);
+}
+
+static int
+xfs_zoned_space_wait_error(
+ struct xfs_mount *mp)
+{
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ return 0;
+}
+
+static int
+xfs_zoned_reserve_available(
+ struct xfs_inode *ip,
+ xfs_filblks_t count_fsb,
+ unsigned int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation reservation = {
+ .task = current,
+ .count_fsb = count_fsb,
+ };
+ int error;
+
+ /*
+ * If there are no waiters, try to directly grab the available blocks
+ * from the percpu counter.
+ *
+ * If the caller wants to dip into the reserved pool, also bypass the
+ * wait list. This relies on the fact that we have a very generously
+ * sized reserved pool that always has enough space. If the reserved
+ * allocations fail, we're in trouble.
+ */
+ if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
+ (flags & XFS_ZR_RESERVED))) {
+ error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error != -ENOSPC)
+ return error;
+ }
+
+ if (flags & XFS_ZR_NOWAIT)
+ return -EAGAIN;
+
+ spin_lock(&zi->zi_reservation_lock);
+ list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
+ while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
+ set_current_state(TASK_KILLABLE);
+
+ error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error != -ENOSPC)
+ break;
+
+ spin_unlock(&zi->zi_reservation_lock);
+ schedule();
+ spin_lock(&zi->zi_reservation_lock);
+ }
+ list_del(&reservation.entry);
+ spin_unlock(&zi->zi_reservation_lock);
+
+ __set_current_state(TASK_RUNNING);
+ return error;
+}
+
+/*
+ * Implement greedy space allocation for short writes by trying to grab all
+ * that is left after locking out other threads from trying to do the same.
+ *
+ * This isn't exactly optimal and can hopefully be replaced by a proper
+ * percpu_counter primitive one day.
+ */
+static int
+xfs_zoned_reserve_extents_greedy(
+ struct xfs_inode *ip,
+ xfs_filblks_t *count_fsb,
+ unsigned int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ s64 len = *count_fsb;
+ int error = -ENOSPC;
+
+ spin_lock(&zi->zi_reservation_lock);
+ len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+ if (len > 0) {
+ *count_fsb = len;
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
+ flags & XFS_ZR_RESERVED);
+ }
+ spin_unlock(&zi->zi_reservation_lock);
+ return error;
+}
+
+int
+xfs_zoned_space_reserve(
+ struct xfs_inode *ip,
+ xfs_filblks_t count_fsb,
+ unsigned int flags,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ ASSERT(ac->reserved_blocks == 0);
+ ASSERT(ac->open_zone == NULL);
+
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
+ error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
+ if (error)
+ return error;
+
+ error = xfs_zoned_reserve_available(ip, count_fsb, flags);
+ if (error) {
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
+ return error;
+ }
+ ac->reserved_blocks = count_fsb;
+ return 0;
+}
+
+void
+xfs_zoned_space_unreserve(
+ struct xfs_inode *ip,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ if (ac->reserved_blocks > 0) {
+ struct xfs_mount *mp = ip->i_mount;
+
+ xfs_zoned_add_available(mp, ac->reserved_blocks);
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
+ }
+ if (ac->open_zone)
+ xfs_open_zone_put(ac->open_zone);
+}
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 25/43] xfs: add support for zoned space reservations
2024-12-11 8:54 ` [PATCH 25/43] xfs: add support for zoned space reservations Christoph Hellwig
@ 2024-12-13 21:01 ` Darrick J. Wong
2024-12-15 5:31 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 21:01 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:50AM +0100, Christoph Hellwig wrote:
> For zoned file systems garbage collection (GC) has to take the iolock
> and mmaplock after moving data to a new place to synchronize with
> readers. This means waiting for garbage collection with the iolock can
> deadlock.
>
> To avoid this, the worst-case number of required blocks has to be
> reserved before taking the iolock, which is done using a new RTAVAILABLE
> counter that tracks blocks that are free to write into and don't require
> garbage collection. The new helpers try to take these available blocks,
> and if there aren't enough available they wake GC and wait for it. This
> is done using a list of on-stack reservations to ensure fairness.
>
> Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/Makefile | 3 +-
> fs/xfs/libxfs/xfs_bmap.c | 15 ++-
> fs/xfs/xfs_zone_alloc.h | 15 +++
> fs/xfs/xfs_zone_priv.h | 2 +
> fs/xfs/xfs_zone_space_resv.c | 244 +++++++++++++++++++++++++++++++++++
> 5 files changed, 274 insertions(+), 5 deletions(-)
> create mode 100644 fs/xfs/xfs_zone_space_resv.c
>
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 28bd2627e9ef..bdedf4bdb1db 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -138,7 +138,8 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
>
> # xfs_rtbitmap is shared with libxfs
> xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
> - xfs_zone_alloc.o
> + xfs_zone_alloc.o \
> + xfs_zone_space_resv.o
>
> xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
> xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 512f1ceca47f..625d853f248b 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -40,6 +40,7 @@
> #include "xfs_symlink_remote.h"
> #include "xfs_inode_util.h"
> #include "xfs_rtgroup.h"
> +#include "xfs_zone_alloc.h"
>
> struct kmem_cache *xfs_bmap_intent_cache;
>
> @@ -4787,12 +4788,18 @@ xfs_bmap_del_extent_delay(
> da_diff = da_old - da_new;
> fdblocks = da_diff;
>
> - if (bflags & XFS_BMAPI_REMAP)
> + if (bflags & XFS_BMAPI_REMAP) {
> ;
> - else if (isrt)
> - xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
> - else
> + } else if (isrt) {
> + xfs_rtxlen_t rtxlen;
> +
> + rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
> + if (xfs_is_zoned_inode(ip))
> + xfs_zoned_add_available(mp, rtxlen);
> + xfs_add_frextents(mp, rtxlen);
> + } else {
> fdblocks += del->br_blockcount;
> + }
>
> xfs_add_fdblocks(mp, fdblocks);
> xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
> diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
> index 37a49f4ce40c..6d0404c2c46c 100644
> --- a/fs/xfs/xfs_zone_alloc.h
> +++ b/fs/xfs/xfs_zone_alloc.h
> @@ -5,6 +5,21 @@
> struct iomap_ioend;
> struct xfs_open_zone;
>
> +struct xfs_zone_alloc_ctx {
> + struct xfs_open_zone *open_zone;
> + xfs_filblks_t reserved_blocks;
> +};
> +
> +#define XFS_ZR_GREEDY (1U << 0)
> +#define XFS_ZR_NOWAIT (1U << 1)
> +#define XFS_ZR_RESERVED (1U << 2)
What do these flag values mean? Can we put that into comments?
> +int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
> + unsigned int flags, struct xfs_zone_alloc_ctx *ac);
> +void xfs_zoned_space_unreserve(struct xfs_inode *ip,
> + struct xfs_zone_alloc_ctx *ac);
> +void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
> +
> void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
> struct xfs_open_zone **oz);
> int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
> diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
> index ae1556871596..f56f3ca8ea00 100644
> --- a/fs/xfs/xfs_zone_priv.h
> +++ b/fs/xfs/xfs_zone_priv.h
> @@ -82,4 +82,6 @@ struct xfs_zone_info {
>
> struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
>
> +void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
> +
> #endif /* _XFS_ZONE_PRIV_H */
> diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
> new file mode 100644
> index 000000000000..5ee525e18759
> --- /dev/null
> +++ b/fs/xfs/xfs_zone_space_resv.c
> @@ -0,0 +1,244 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2023 Christoph Hellwig.
> + * Copyright (c) 2024, Western Digital Corporation or its affiliates.
> + */
> +#include "xfs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_inode.h"
> +#include "xfs_rtbitmap.h"
> +#include "xfs_zone_alloc.h"
> +#include "xfs_zone_priv.h"
> +#include "xfs_zones.h"
> +
> +/*
> > + * Note: the zoned allocator does not support an rtextsize > 1, so this code
> > + * and the allocator itself use file system blocks interchangeably with
> > + * realtime extents without doing the otherwise required conversions.
> + */
> +
> +/*
> + * Per-task space reservation.
> + *
> + * Tasks that need to wait for GC to free up space allocate one of these
> > + * on-stack and add it to the per-mount zi_reclaim_reservations list.
> + * The GC thread will then wake the tasks in order when space becomes available.
> + */
> +struct xfs_zone_reservation {
> + struct list_head entry;
> + struct task_struct *task;
> + xfs_filblks_t count_fsb;
> +};
> +
> +/*
> + * Calculate the number of reserved blocks.
> + *
> > + * XC_FREE_RTEXTENTS counts the user-available capacity up to which the file
> + * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly
> + * available for writes without waiting for GC.
> + *
> + * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
> + * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
> + * is further restricted by at least one zone as well as the optional
> + * persistently reserved blocks. This allows the allocator to run more
> + * smoothly by not always triggering GC.
Hmm, so _RTAVAILABLE really means _RTNOGC? That makes sense.
> + */
> +uint64_t
> +xfs_zoned_default_resblks(
> + struct xfs_mount *mp,
> + enum xfs_free_counter ctr)
> +{
> + switch (ctr) {
> + case XC_FREE_RTEXTENTS:
> + return (uint64_t)XFS_RESERVED_ZONES *
> + mp->m_groups[XG_TYPE_RTG].blocks +
> + mp->m_sb.sb_rtreserved;
> + case XC_FREE_RTAVAILABLE:
> + return (uint64_t)XFS_GC_ZONES *
> + mp->m_groups[XG_TYPE_RTG].blocks;
> + default:
> + ASSERT(0);
> + return 0;
> + }
> +}
> +
> +void
> +xfs_zoned_resv_wake_all(
> + struct xfs_mount *mp)
> +{
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + struct xfs_zone_reservation *reservation;
> +
> + spin_lock(&zi->zi_reservation_lock);
> + list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
> + wake_up_process(reservation->task);
> + spin_unlock(&zi->zi_reservation_lock);
> +}
> +
> +void
> +xfs_zoned_add_available(
> + struct xfs_mount *mp,
> + xfs_filblks_t count_fsb)
> +{
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + struct xfs_zone_reservation *reservation;
> +
> + if (list_empty_careful(&zi->zi_reclaim_reservations)) {
> + xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
> + return;
> + }
> +
> + spin_lock(&zi->zi_reservation_lock);
> + xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
> + count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
> + list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
> + if (reservation->count_fsb > count_fsb)
> + break;
> + wake_up_process(reservation->task);
> + count_fsb -= reservation->count_fsb;
> +
> + }
> + spin_unlock(&zi->zi_reservation_lock);
> +}
> +
> +static int
> +xfs_zoned_space_wait_error(
> + struct xfs_mount *mp)
> +{
> + if (xfs_is_shutdown(mp))
> + return -EIO;
> + if (fatal_signal_pending(current))
> + return -EINTR;
> + return 0;
> +}
> +
> +static int
> +xfs_zoned_reserve_available(
> + struct xfs_inode *ip,
> + xfs_filblks_t count_fsb,
> + unsigned int flags)
> +{
> + struct xfs_mount *mp = ip->i_mount;
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + struct xfs_zone_reservation reservation = {
> + .task = current,
> + .count_fsb = count_fsb,
> + };
> + int error;
> +
> + /*
> + * If there are no waiters, try to directly grab the available blocks
> + * from the percpu counter.
> + *
> > + * If the caller wants to dip into the reserved pool, also bypass the
> > + * wait list. This relies on the fact that we have a very generously
> > + * sized reserved pool that always has enough space. If the reserved
> > + * allocations fail, we're in trouble.
> + */
> + if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
> + (flags & XFS_ZR_RESERVED))) {
> + error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
> + flags & XFS_ZR_RESERVED);
> + if (error != -ENOSPC)
> + return error;
> + }
> +
> + if (flags & XFS_ZR_NOWAIT)
> + return -EAGAIN;
> +
> + spin_lock(&zi->zi_reservation_lock);
> + list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
> + while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
> + set_current_state(TASK_KILLABLE);
> +
> + error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
> + flags & XFS_ZR_RESERVED);
> + if (error != -ENOSPC)
> + break;
> +
> + spin_unlock(&zi->zi_reservation_lock);
> + schedule();
> + spin_lock(&zi->zi_reservation_lock);
> + }
> + list_del(&reservation.entry);
> + spin_unlock(&zi->zi_reservation_lock);
Hmm. So if I'm understanding correctly, threads wanting to write to a
file try to locklessly reserve space from RTAVAILABLE. If they can't
get space because the zone is nearly full / needs gc / etc then everyone
gets to wait FIFO style in the reclaim_reservations list. They can be
woken up from the wait if either (a) someone gives back reserved space
or (b) the copygc empties out this zone.
Or if the thread isn't willing to wait, we skip the fifo and either fail
up to userspace or just move on to the next zone?
I think I understand the general idea, but I don't quite know when we're
going to use the greedy algorithm. Later I see XFS_ZR_GREEDY gets used
from the buffered write path, but there doesn't seem to be an obvious
reason why?
--D
> +
> + __set_current_state(TASK_RUNNING);
> + return error;
> +}
> +
> +/*
> + * Implement greedy space allocation for short writes by trying to grab all
> + * that is left after locking out other threads from trying to do the same.
> + *
> + * This isn't exactly optimal and can hopefully be replaced by a proper
> + * percpu_counter primitive one day.
> + */
> +static int
> +xfs_zoned_reserve_extents_greedy(
> + struct xfs_inode *ip,
> + xfs_filblks_t *count_fsb,
> + unsigned int flags)
> +{
> + struct xfs_mount *mp = ip->i_mount;
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + s64 len = *count_fsb;
> + int error = -ENOSPC;
> +
> + spin_lock(&zi->zi_reservation_lock);
> + len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
> + if (len > 0) {
> + *count_fsb = len;
> + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
> + flags & XFS_ZR_RESERVED);
> + }
> + spin_unlock(&zi->zi_reservation_lock);
> + return error;
> +}
> +
> +int
> +xfs_zoned_space_reserve(
> + struct xfs_inode *ip,
> + xfs_filblks_t count_fsb,
> + unsigned int flags,
> + struct xfs_zone_alloc_ctx *ac)
> +{
> + struct xfs_mount *mp = ip->i_mount;
> + int error;
> +
> + ASSERT(ac->reserved_blocks == 0);
> + ASSERT(ac->open_zone == NULL);
> +
> + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
> + flags & XFS_ZR_RESERVED);
> + if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
> + error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
> + if (error)
> + return error;
> +
> + error = xfs_zoned_reserve_available(ip, count_fsb, flags);
> + if (error) {
> + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
> + return error;
> + }
> + ac->reserved_blocks = count_fsb;
> + return 0;
> +}
> +
> +void
> +xfs_zoned_space_unreserve(
> + struct xfs_inode *ip,
> + struct xfs_zone_alloc_ctx *ac)
> +{
> + if (ac->reserved_blocks > 0) {
> + struct xfs_mount *mp = ip->i_mount;
> +
> + xfs_zoned_add_available(mp, ac->reserved_blocks);
> + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
> + }
> + if (ac->open_zone)
> + xfs_open_zone_put(ac->open_zone);
> +}
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread* Re: [PATCH 25/43] xfs: add support for zoned space reservations
2024-12-13 21:01 ` Darrick J. Wong
@ 2024-12-15 5:31 ` Christoph Hellwig
2024-12-17 16:59 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 5:31 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 01:01:40PM -0800, Darrick J. Wong wrote:
> > +#define XFS_ZR_GREEDY (1U << 0)
> > +#define XFS_ZR_NOWAIT (1U << 1)
> > +#define XFS_ZR_RESERVED (1U << 2)
>
> What do these flag values mean? Can we put that into comments?
Sure.
> > + * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
> > + * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
> > + * is further restricted by at least one zone as well as the optional
> > + * persistently reserved blocks. This allows the allocator to run more
> > + * smoothly by not always triggering GC.
>
> Hmm, so _RTAVAILABLE really means _RTNOGC? That makes sense.
Yes, it means blocks available without doing further work.
I can't say _RTNOGC is very descriptive either, but I would not mind
a better name if someone came up with a good one :)
> > + spin_unlock(&zi->zi_reservation_lock);
> > + schedule();
> > + spin_lock(&zi->zi_reservation_lock);
> > + }
> > + list_del(&reservation.entry);
> > + spin_unlock(&zi->zi_reservation_lock);
>
> Hmm. So if I'm understanding correctly, threads wanting to write to a
> file try to locklessly reserve space from RTAVAILABLE.
At least if there are no waiters yet, yes.
> If they can't
> get space because the zone is nearly full / needs gc / etc then everyone
> gets to wait FIFO style in the reclaim_reservations list.
Yes (In a way modelled after the log grant waits).
> They can be
> woken up from the wait if either (a) someone gives back reserved space
> or (b) the copygc empties out this zone.
>
> Or if the thread isn't willing to wait, we skip the fifo and either fail
> up to userspace
Yes.
> or just move on to the next zone?
No other zone to move to.
> I think I understand the general idea, but I don't quite know when we're
> going to use the greedy algorithm. Later I see XFS_ZR_GREEDY gets used
> from the buffered write path, but there doesn't seem to be an obvious
> reason why?
Posix/Linux semantics for buffered writes require us to implement
short writes. That is, if a single (p)write(v) syscall for, say, 10MB
only finds 512k of space, it should write those 512k instead of failing
with ENOSPC. The XFS_ZR_GREEDY flag implements that by backing down to
what we can allocate (and the current implementation for that is
a little ugly; I plan to find some time for changes to the core
percpu_counters to improve this after the code is merged).
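To illustrate (just a sketch, the real caller comes later in the
series): the buffered write path reserves greedily and then clamps the
write to whatever it actually got back:

	struct xfs_zone_alloc_ctx ac = { };
	int error;

	error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(mp, count),
			XFS_ZR_GREEDY, &ac);
	if (error)
		return error;

	/* only write as much as we managed to reserve */
	count = min_t(xfs_off_t, count, XFS_FSB_TO_B(mp, ac.reserved_blocks));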
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 25/43] xfs: add support for zoned space reservations
2024-12-15 5:31 ` Christoph Hellwig
@ 2024-12-17 16:59 ` Darrick J. Wong
2024-12-19 5:50 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-17 16:59 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Sun, Dec 15, 2024 at 06:31:35AM +0100, Christoph Hellwig wrote:
> On Fri, Dec 13, 2024 at 01:01:40PM -0800, Darrick J. Wong wrote:
> > > +#define XFS_ZR_GREEDY (1U << 0)
> > > +#define XFS_ZR_NOWAIT (1U << 1)
> > > +#define XFS_ZR_RESERVED (1U << 2)
> >
> > What do these flag values mean? Can we put that into comments?
>
> Sure.
>
> > > + * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
> > > + * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
> > > + * is further restricted by at least one zone as well as the optional
> > > + * persistently reserved blocks. This allows the allocator to run more
> > > + * smoothly by not always triggering GC.
> >
> > Hmm, so _RTAVAILABLE really means _RTNOGC? That makes sense.
>
> Yes, it means blocks available without doing further work.
> I can't say _RTNOGC is very descriptive either, but I would not mind
> a better name if someone came up with a good one :)
Hrmm, they're rt extents that are available "now", or "for cheap"...
XC_FREE_NOW_RTEXTENTS
XC_FREE_RTEXTENTS_IMMED
XC_FREE_RTEXTENTS_CHEAP
Eh, I'm not enthusiastic about any of those. The best I can think of
is:
XC_FREE_RTEXTENTS_NOGC, /* space available without gc */
> > > + spin_unlock(&zi->zi_reservation_lock);
> > > + schedule();
> > > + spin_lock(&zi->zi_reservation_lock);
> > > + }
> > > + list_del(&reservation.entry);
> > > + spin_unlock(&zi->zi_reservation_lock);
> >
> > Hmm. So if I'm understanding correctly, threads wanting to write to a
> > file try to locklessly reserve space from RTAVAILABLE.
>
> At least if there are no waiters yet, yes.
>
> > If they can't
> > get space because the zone is nearly full / needs gc / etc then everyone
> > gets to wait FIFO style in the reclaim_reservations list.
>
> Yes (In a way modelled after the log grant waits).
>
> > They can be
> > woken up from the wait if either (a) someone gives back reserved space
> > or (b) the copygc empties out this zone.
> >
> > Or if the thread isn't willing to wait, we skip the fifo and either fail
> > up to userspace
>
> Yes.
>
> > or just move on to the next zone?
>
> No other zone to move to.
<nod>
> > I think I understand the general idea, but I don't quite know when we're
> > going to use the greedy algorithm. Later I see XFS_ZR_GREEDY gets used
> > from the buffered write path, but there doesn't seem to be an obvious
> > reason why?
>
> Posix/Linux semantics for buffered writes require us to implement
> short writes. That is if a single (p)write(v) syscall for say 10MB
> only find 512k of space it should write those instead of failing
> with ENOSPC. The XFS_ZR_GREEDY implements that by backing down to
> what we can allocate (and the current implementation for that is
> a little ugly, I plan to find some time for changes to the core
> percpu_counters to improve this after the code is merged).
Ah, ok. Can you put that in the comments defining XFS_ZR_GREEDY?
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 25/43] xfs: add support for zoned space reservations
2024-12-17 16:59 ` Darrick J. Wong
@ 2024-12-19 5:50 ` Christoph Hellwig
2024-12-19 16:00 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-19 5:50 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Tue, Dec 17, 2024 at 08:59:55AM -0800, Darrick J. Wong wrote:
> Hrmm, they're rt extents that are available "now", or "for cheap"...
>
> XC_FREE_NOW_RTEXTENTS
>
> XC_FREE_RTEXTENTS_IMMED
>
> XC_FREE_RTEXTENTS_CHEAP
>
> Eh, I'm not enthusiastic about any of those. The best I can think of
> is:
>
> XC_FREE_RTEXTENTS_NOGC, /* space available without gc */
I really hate that, as it encodes the policy of how to get (or not) the
blocks vs what they are.
> > > going to use the greedy algorithm. Later I see XFS_ZR_GREEDY gets used
> > > from the buffered write path, but there doesn't seem to be an obvious
> > > reason why?
> >
> > Posix/Linux semantics for buffered writes require us to implement
> > short writes. That is, if a single (p)write(v) syscall for, say, 10MB
> > only finds 512k of space, it should write those 512k instead of failing
> > with ENOSPC. The XFS_ZR_GREEDY flag implements that by backing down to
> > what we can allocate (and the current implementation for that is
> > a little ugly; I plan to find some time for changes to the core
> > percpu_counters to improve this after the code is merged).
>
> Ah, ok. Can you put that in the comments defining XFS_ZR_GREEDY?
This is what I added earlier this week:
/*
* Grab any available space, even if it is less than what the caller asked for.
*/
#define XFS_ZR_GREEDY (1U << 0)
Do you want the gory details on why we're doing it here as well?
I guess if it can't be left as an exercise to the reader I'd probably
rather place it in the caller.
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 25/43] xfs: add support for zoned space reservations
2024-12-19 5:50 ` Christoph Hellwig
@ 2024-12-19 16:00 ` Darrick J. Wong
2024-12-19 17:36 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-19 16:00 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 19, 2024 at 06:50:59AM +0100, Christoph Hellwig wrote:
> On Tue, Dec 17, 2024 at 08:59:55AM -0800, Darrick J. Wong wrote:
> > Hrmm, they're rt extents that are available "now", or "for cheap"...
> >
> > XC_FREE_NOW_RTEXTENTS
> >
> > XC_FREE_RTEXTENTS_IMMED
> >
> > XC_FREE_RTEXTENTS_CHEAP
> >
> > Eh, I'm not enthusiastic about any of those. The best I can think of
> > is:
> >
> > XC_FREE_RTEXTENTS_NOGC, /* space available without gc */
>
> I really hate that, as it encodes the policy how to get (or not) the
> blocks vs what they are.
Yeah, me too. Want to leave it as XC_FREE_RTAVAILABLE?
> > > > going to use the greedy algorithm. Later I see XFS_ZR_GREEDY gets used
> > > > from the buffered write path, but there doesn't seem to be an obvious
> > > > reason why?
> > >
> > > Posix/Linux semantics for buffered writes require us to implement
> > > short writes. That is if a single (p)write(v) syscall for say 10MB
> > > only find 512k of space it should write those instead of failing
> > > with ENOSPC. The XFS_ZR_GREEDY implements that by backing down to
> > > what we can allocate (and the current implementation for that is
> > > a little ugly, I plan to find some time for changes to the core
> > > percpu_counters to improve this after the code is merged).
> >
> > Ah, ok. Can you put that in the comments defining XFS_ZR_GREEDY?
>
> This is what I added earlier this week:
>
> /*
> * Grab any available space, even if it is less than what the caller asked for.
> */
> #define XFS_ZR_GREEDY (1U << 0)
>
> Do you want the gory details on why we're doing it here as well?
> I guess if it can't be left as an exercise to the reader I'd probably
> rather place it in the caller.
Nah, that's enough to hint at what I should be looking for any time
there's a branch involving XFS_ZR_GREEDY.
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 25/43] xfs: add support for zoned space reservations
2024-12-19 16:00 ` Darrick J. Wong
@ 2024-12-19 17:36 ` Christoph Hellwig
2024-12-19 17:37 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-19 17:36 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 19, 2024 at 08:00:04AM -0800, Darrick J. Wong wrote:
> > I really hate that, as it encodes the policy of how to get (or not) the
> > blocks vs what they are.
>
> Yeah, me too. Want to leave it as XC_FREE_RTAVAILABLE?
That would be my preference (at least based on the so far presented
options)
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 25/43] xfs: add support for zoned space reservations
2024-12-19 17:36 ` Christoph Hellwig
@ 2024-12-19 17:37 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-19 17:37 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Thu, Dec 19, 2024 at 06:36:16PM +0100, Christoph Hellwig wrote:
> On Thu, Dec 19, 2024 at 08:00:04AM -0800, Darrick J. Wong wrote:
> > > I really hate that, as it encodes the policy how to get (or not) the
> > > blocks vs what they are.
> >
> > Yeah, me too. Want to leave it as XC_FREE_RTAVAILABLE?
>
> That would be my preference (at least based on the so far presented
> options)
Fine by me.
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 26/43] xfs: implement zoned garbage collection
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (24 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 25/43] xfs: add support for zoned space reservations Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-13 22:18 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 27/43] xfs: implement buffered writes to zoned RT devices Christoph Hellwig
` (16 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
RT groups on a zoned file system need to be completely empty before their
space can be reused. This means that partially empty groups need to be
emptied entirely to free up space if no entirely free groups are
available.
Add a garbage collection thread that moves all data out of the least used
zone when not enough free zones are available, and which resets all zones
that have been emptied. To empty zones, the rmap is walked to find the
owners and the data is read and then written to the new place.
To automatically defragment files the rmap records are sorted by inode
and logical offset. This means defragmentation of parallel writes into
a single zone happens automatically when performing garbage collection.
Because holding the iolock over the entire GC cycle would inject very
noticeable latency for other accesses to the inodes, the iolock is not
taken while performing I/O. Instead the I/O completion handler checks
that the mapping hasn't changed from the one recorded at the start of
the GC cycle, and doesn't update the mapping if it has changed.
Note: selection of garbage collection victims is extremely simple at the
moment and will probably see additional near-term improvements.
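In outline, one pass of the GC worker then looks something like this
(simplified sketch; xfs_zone_gc_pick_victim() and
xfs_zone_gc_move_extent() are stand-in names for logic described above,
iter lives in the long-lived per-mount GC state, and error handling as
well as the zone reset step are left out):

	while (xfs_zoned_need_gc(mp)) {
		struct xfs_rmap_irec irec;
		struct xfs_inode *ip;

		/* pick the least used reclaimable zone as the victim */
		xfs_zone_gc_iter_init(iter, xfs_zone_gc_pick_victim(mp));

		/* batch up rmap records, sorted by inode and offset */
		if (xfs_zone_gc_query(mp, iter) < 0)
			break;

		/*
		 * Read each extent into a scratch folio, rewrite it into
		 * the open GC zone, and only remap the file if the mapping
		 * is still unchanged at I/O completion time.
		 */
		while (xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
			xfs_zone_gc_move_extent(iter, &irec, ip);
	}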
Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/Makefile | 1 +
fs/xfs/libxfs/xfs_group.h | 15 +-
fs/xfs/xfs_extent_busy.c | 2 +-
fs/xfs/xfs_mount.c | 4 +
fs/xfs/xfs_mount.h | 3 +
fs/xfs/xfs_super.c | 7 +
fs/xfs/xfs_trace.h | 4 +
fs/xfs/xfs_zone_alloc.c | 52 +-
fs/xfs/xfs_zone_alloc.h | 8 +
fs/xfs/xfs_zone_gc.c | 1045 ++++++++++++++++++++++++++++++++++
fs/xfs/xfs_zone_priv.h | 5 +
fs/xfs/xfs_zone_space_resv.c | 7 +
12 files changed, 1146 insertions(+), 7 deletions(-)
create mode 100644 fs/xfs/xfs_zone_gc.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index bdedf4bdb1db..e38838409271 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -139,6 +139,7 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
# xfs_rtbitmap is shared with libxfs
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
xfs_zone_alloc.o \
+ xfs_zone_gc.o \
xfs_zone_space_resv.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h
index a70096113384..430a43e1591e 100644
--- a/fs/xfs/libxfs/xfs_group.h
+++ b/fs/xfs/libxfs/xfs_group.h
@@ -19,10 +19,17 @@ struct xfs_group {
#ifdef __KERNEL__
/* -- kernel only structures below this line -- */
- /*
- * Track freed but not yet committed extents.
- */
- struct xfs_extent_busy_tree *xg_busy_extents;
+ union {
+ /*
+ * Track freed but not yet committed extents.
+ */
+ struct xfs_extent_busy_tree *xg_busy_extents;
+
+ /*
+ * List of groups that need a zone reset for zoned file systems.
+ */
+ struct xfs_group *xg_next_reset;
+ };
/*
* Bitsets of per-ag metadata that have been checked and/or are sick.
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index ea43c9a6e54c..da3161572735 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -671,7 +671,7 @@ xfs_extent_busy_wait_all(
while ((pag = xfs_perag_next(mp, pag)))
xfs_extent_busy_wait_group(pag_group(pag));
- if (xfs_has_rtgroups(mp))
+ if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp))
while ((rtg = xfs_rtgroup_next(mp, rtg)))
xfs_extent_busy_wait_group(rtg_group(rtg));
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 70ecbbaba7fd..20d564b3b564 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1088,6 +1088,8 @@ xfs_mountfs(
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
goto out_agresv;
+
+ xfs_zone_gc_start(mp);
}
return 0;
@@ -1176,6 +1178,8 @@ xfs_unmountfs(
xfs_inodegc_flush(mp);
xfs_blockgc_stop(mp);
+ if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate))
+ xfs_zone_gc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
if (xfs_has_zoned(mp))
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 02a3609a3322..831d9e09fe72 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -548,6 +548,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_RESUMING_QUOTAON 18
/* Kernel has logged a warning about zoned RT device being used on this fs. */
#define XFS_OPSTATE_WARNED_ZONED 19
+/* (Zoned) GC is in progress */
+#define XFS_OPSTATE_IN_GC 20
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -592,6 +594,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
#endif /* CONFIG_XFS_QUOTA */
__XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
__XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
+__XFS_IS_OPSTATE(in_gc, IN_GC)
static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d0b7e0d02366..b289b2ba78b1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -46,6 +46,7 @@
#include "xfs_exchmaps_item.h"
#include "xfs_parent.h"
#include "xfs_rtalloc.h"
+#include "xfs_zone_alloc.h"
#include "scrub/stats.h"
#include "scrub/rcbag_btree.h"
@@ -1947,6 +1948,9 @@ xfs_remount_rw(
/* Re-enable the background inode inactivation worker. */
xfs_inodegc_start(mp);
+ /* Restart zone reclaim */
+ xfs_zone_gc_start(mp);
+
return 0;
}
@@ -1991,6 +1995,9 @@ xfs_remount_ro(
*/
xfs_inodegc_stop(mp);
+ /* Stop zone reclaim */
+ xfs_zone_gc_stop(mp);
+
/* Free the per-AG metadata reservation pool. */
xfs_fs_unreserve_ag_blocks(mp);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 763dd3d271b9..bbaf9b2665c7 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -290,8 +290,12 @@ DECLARE_EVENT_CLASS(xfs_zone_class,
DEFINE_EVENT(xfs_zone_class, name, \
TP_PROTO(struct xfs_rtgroup *rtg), \
TP_ARGS(rtg))
+DEFINE_ZONE_EVENT(xfs_zone_emptied);
DEFINE_ZONE_EVENT(xfs_zone_full);
DEFINE_ZONE_EVENT(xfs_zone_activate);
+DEFINE_ZONE_EVENT(xfs_zone_reset);
+DEFINE_ZONE_EVENT(xfs_zone_reclaim);
+DEFINE_ZONE_EVENT(xfs_gc_zone_activate);
TRACE_EVENT(xfs_zone_free_blocks,
TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 1a746e9cfbf4..291cf39a5989 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -34,11 +34,43 @@ xfs_open_zone_put(
}
}
+static void
+xfs_zone_emptied(
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+
+ trace_xfs_zone_emptied(rtg);
+
+ /*
+ * This can be called from log recovery, where the zone_info structure
+ * hasn't been allocated yet. But we'll look for empty zones when
+ * setting it up, so we don't need to track the empty zone here in that
+ * case.
+ */
+ if (!zi)
+ return;
+
+ xfs_group_clear_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+
+ spin_lock(&zi->zi_reset_list_lock);
+ rtg_group(rtg)->xg_next_reset = zi->zi_reset_list;
+ zi->zi_reset_list = rtg_group(rtg);
+ spin_unlock(&zi->zi_reset_list_lock);
+
+ wake_up_process(zi->zi_gc_thread);
+}
+
static void
xfs_zone_mark_reclaimable(
struct xfs_rtgroup *rtg)
{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
+ if (xfs_zoned_need_gc(mp))
+ wake_up_process(mp->m_zone_info->zi_gc_thread);
}
static void
@@ -278,9 +310,12 @@ xfs_zone_free_blocks(
if (!READ_ONCE(rtg->rtg_open_zone)) {
/*
* If the zone is not open, mark it reclaimable when the first
- * block is freed.
+ * block is freed. As an optimization, kick off a zone reset if
+ * the usage counter hits zero.
*/
- if (rmapip->i_used_blocks + len == rtg_blocks(rtg))
+ if (rmapip->i_used_blocks == 0)
+ xfs_zone_emptied(rtg);
+ else if (rmapip->i_used_blocks + len == rtg_blocks(rtg))
xfs_zone_mark_reclaimable(rtg);
}
xfs_add_frextents(mp, len);
@@ -415,6 +450,8 @@ xfs_activate_zone(
atomic_inc(&oz->oz_ref);
zi->zi_nr_open_zones++;
list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
+ if (xfs_zoned_need_gc(mp))
+ wake_up_process(zi->zi_gc_thread);
/* XXX: this is a little verbose, but let's keep it for now */
xfs_info(mp, "using zone %u (%u)",
@@ -747,6 +784,13 @@ xfs_init_zone(
xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
}
+ if (write_pointer == rtg_blocks(rtg) && used == 0) {
+ error = xfs_zone_reset_sync(rtg);
+ if (error)
+ return error;
+ write_pointer = 0;
+ }
+
if (write_pointer == 0) {
/* zone is empty */
atomic_inc(&zi->zi_nr_free_zones);
@@ -954,6 +998,9 @@ xfs_mount_zones(
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
iz.available + iz.reclaimable);
+ error = xfs_zone_gc_mount(mp);
+ if (error)
+ goto out_free_open_zones;
return 0;
out_free_open_zones:
@@ -966,6 +1013,7 @@ void
xfs_unmount_zones(
struct xfs_mount *mp)
{
+ xfs_zone_gc_unmount(mp);
xfs_free_open_zones(mp->m_zone_info);
kfree(mp->m_zone_info);
}
diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
index 6d0404c2c46c..44fa1594f73e 100644
--- a/fs/xfs/xfs_zone_alloc.h
+++ b/fs/xfs/xfs_zone_alloc.h
@@ -38,6 +38,8 @@ uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
#ifdef CONFIG_XFS_RT
int xfs_mount_zones(struct xfs_mount *mp);
void xfs_unmount_zones(struct xfs_mount *mp);
+void xfs_zone_gc_start(struct xfs_mount *mp);
+void xfs_zone_gc_stop(struct xfs_mount *mp);
#else
static inline int xfs_mount_zones(struct xfs_mount *mp)
{
@@ -46,6 +48,12 @@ static inline int xfs_mount_zones(struct xfs_mount *mp)
static inline void xfs_unmount_zones(struct xfs_mount *mp)
{
}
+static inline void xfs_zone_gc_start(struct xfs_mount *mp)
+{
+}
+static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
+{
+}
#endif /* CONFIG_XFS_RT */
#endif /* _XFS_ZONE_ALLOC_H */
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
new file mode 100644
index 000000000000..085d7001935e
--- /dev/null
+++ b/fs/xfs/xfs_zone_gc.c
@@ -0,0 +1,1045 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2024 Christoph Hellwig.
+ * Copyright (c) 2024, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+/*
+ * Size of each GC scratch pad. This is also the upper bound for each
+ * GC I/O, which helps to keep latency down.
+ */
+#define XFS_GC_CHUNK_SIZE SZ_1M
+
+/*
+ * Scratchpad data to read GCed data into.
+ *
+ * The offset member tracks where the next allocation starts, and freed tracks
+ * the amount of space that is not used anymore.
+ */
+#define XFS_ZONE_GC_NR_SCRATCH 2
+struct xfs_zone_scratch {
+ struct folio *folio;
+ unsigned int offset;
+ unsigned int freed;
+};
+
+/*
+ * Chunk that is read and written for each GC operation.
+ *
+ * Note that for writes to actual zoned devices, the chunk can be split when
+ * reaching the hardware limit.
+ */
+struct xfs_gc_bio {
+ struct xfs_zone_gc_data *data;
+
+ /*
+ * Entry into the reading/writing/resetting list. Only accessed from
+ * the GC thread, so no locking needed.
+ */
+ struct list_head entry;
+
+ /*
+ * State of this gc_bio. Done means the current I/O completed.
+ * Set from the bio end I/O handler, read from the GC thread.
+ */
+ unsigned long state;
+#define XFS_GC_BIO_NEW 0
+#define XFS_GC_BIO_DONE 1
+
+ /*
+ * Pointer to the inode and range of the inode that the GC is performed
+ * for.
+ */
+ struct xfs_inode *ip;
+ loff_t offset;
+ unsigned int len;
+
+ /*
+ * Existing startblock (in the zone to be freed) and newly assigned
+ * daddr in the zone GCed into.
+ */
+ xfs_fsblock_t old_startblock;
+ xfs_daddr_t new_daddr;
+ struct xfs_zone_scratch *scratch;
+
+ /* Are we writing to a sequential write required zone? */
+ bool is_seq;
+
+ /* Bio used for reads and writes, including the bvec used by it */
+ struct bio_vec bv;
+ struct bio bio; /* must be last */
+};
+
+/*
+ * Per-mount GC state.
+ */
+struct xfs_zone_gc_data {
+ struct xfs_mount *mp;
+
+ /* bioset used to allocate the gc_bios */
+ struct bio_set bio_set;
+
+ /*
+ * Scratchpads used, and an index to indicate which one is in use.
+ */
+ struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
+ unsigned int scratch_idx;
+
+ /*
+ * List of bios currently being read, written and reset.
+ * These lists are only accessed by the GC thread itself, and must only
+ * be processed in order.
+ */
+ struct list_head reading;
+ struct list_head writing;
+ struct list_head resetting;
+};
+
+/*
+ * We aim to keep enough zones free in stock to fully use the open zone limit
+ * for data placement purposes.
+ */
+bool
+xfs_zoned_need_gc(
+ struct xfs_mount *mp)
+{
+ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
+ return false;
+ if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
+ mp->m_groups[XG_TYPE_RTG].blocks *
+ (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+ return true;
+ return false;
+}
+
+static struct xfs_zone_gc_data *
+xfs_zone_gc_data_alloc(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_gc_data *data;
+ int i;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return NULL;
+
+ /*
+ * We actually only need a single bio_vec. It would be nice to have
+ * a flag that only allocates the inline bvecs and not the separate
+ * bvec pool.
+ */
+ if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
+ BIOSET_NEED_BVECS))
+ goto out_free_data;
+ for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
+ data->scratch[i].folio =
+ folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
+ if (!data->scratch[i].folio)
+ goto out_free_scratch;
+ }
+ INIT_LIST_HEAD(&data->reading);
+ INIT_LIST_HEAD(&data->writing);
+ INIT_LIST_HEAD(&data->resetting);
+ data->mp = mp;
+ return data;
+
+out_free_scratch:
+ while (--i >= 0)
+ folio_put(data->scratch[i].folio);
+ bioset_exit(&data->bio_set);
+out_free_data:
+ kfree(data);
+ return NULL;
+}
+
+static void
+xfs_zone_gc_data_free(
+ struct xfs_zone_gc_data *data)
+{
+ int i;
+
+ for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
+ folio_put(data->scratch[i].folio);
+ bioset_exit(&data->bio_set);
+ kfree(data);
+}
+
+#define XFS_ZONE_GC_RECS 1024
+
+/* iterator, needs to be reinitialized for each victim zone */
+struct xfs_zone_gc_iter {
+ struct xfs_rtgroup *victim_rtg;
+ unsigned int rec_count;
+ unsigned int rec_idx;
+ xfs_agblock_t next_startblock;
+ struct xfs_rmap_irec recs[XFS_ZONE_GC_RECS];
+};
+
+static void
+xfs_zone_gc_iter_init(
+ struct xfs_zone_gc_iter *iter,
+ struct xfs_rtgroup *victim_rtg)
+
+{
+ iter->next_startblock = 0;
+ iter->rec_count = 0;
+ iter->rec_idx = 0;
+ iter->victim_rtg = victim_rtg;
+}
+
+static int
+xfs_zone_gc_query_cb(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *irec,
+ void *private)
+{
+ struct xfs_zone_gc_iter *iter = private;
+
+ ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
+ ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
+ ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
+
+ iter->recs[iter->rec_count] = *irec;
+ if (++iter->rec_count == XFS_ZONE_GC_RECS) {
+ iter->next_startblock =
+ irec->rm_startblock + irec->rm_blockcount;
+ return 1;
+ }
+ return 0;
+}
+
+static int
+xfs_zone_gc_rmap_rec_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_rmap_irec *reca = a;
+ const struct xfs_rmap_irec *recb = b;
+ int64_t diff;
+
+ diff = reca->rm_owner - recb->rm_owner;
+ if (!diff)
+ diff = reca->rm_offset - recb->rm_offset;
+ return clamp(diff, -1, 1);
+}
+
+static int
+xfs_zone_gc_query(
+ struct xfs_mount *mp,
+ struct xfs_zone_gc_iter *iter)
+{
+ struct xfs_rtgroup *rtg = iter->victim_rtg;
+ struct xfs_rmap_irec ri_low = { };
+ struct xfs_rmap_irec ri_high;
+ struct xfs_btree_cur *cur;
+ struct xfs_trans *tp;
+ int error;
+
+ ASSERT(iter->next_startblock <= rtg_blocks(rtg));
+ if (iter->next_startblock == rtg_blocks(rtg))
+ goto done;
+
+ ASSERT(iter->next_startblock < rtg_blocks(rtg));
+ ri_low.rm_startblock = iter->next_startblock;
+ memset(&ri_high, 0xFF, sizeof(ri_high));
+
+ iter->rec_idx = 0;
+ iter->rec_count = 0;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+ cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+ error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+ xfs_zone_gc_query_cb, iter);
+ xfs_btree_del_cursor(cur, error < 0 ? error : 0);
+ xfs_trans_cancel(tp);
+
+ if (error < 0)
+ return error;
+
+ /*
+ * Sort the rmap records by inode number and increasing offset to
+ * defragment the mappings.
+ *
+ * This could be further enhanced by an even bigger look ahead window,
+ * but that's better left until we have better detection of changes to
+ * inode mapping to avoid the potential of GCing already dead data.
+ */
+ sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
+ xfs_zone_gc_rmap_rec_cmp, NULL);
+
+ if (error == 0) {
+ /*
+ * We finished iterating through the zone.
+ */
+ iter->next_startblock = rtg_blocks(rtg);
+ if (iter->rec_count == 0)
+ goto done;
+ }
+
+ return 0;
+done:
+ xfs_rtgroup_rele(iter->victim_rtg);
+ iter->victim_rtg = NULL;
+ return 0;
+}
+
+static bool
+xfs_zone_gc_iter_next(
+ struct xfs_mount *mp,
+ struct xfs_zone_gc_iter *iter,
+ struct xfs_rmap_irec *chunk_rec,
+ struct xfs_inode **ipp)
+{
+ struct xfs_rmap_irec *irec;
+ int error;
+
+ if (!iter->victim_rtg)
+ return false;
+
+retry:
+ if (iter->rec_idx == iter->rec_count) {
+ error = xfs_zone_gc_query(mp, iter);
+ if (error)
+ goto fail;
+ if (!iter->victim_rtg)
+ return false;
+ }
+
+ irec = &iter->recs[iter->rec_idx];
+ error = xfs_iget(mp, NULL, irec->rm_owner,
+ XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
+ if (error) {
+ /*
+ * If the inode was already deleted, skip over it.
+ */
+ if (error == -ENOENT) {
+ iter->rec_idx++;
+ goto retry;
+ }
+ goto fail;
+ }
+
+ if (!S_ISREG(VFS_I(*ipp)->i_mode)) {
+ iter->rec_idx++;
+ xfs_irele(*ipp);
+ goto retry;
+ }
+
+ *chunk_rec = *irec;
+ return true;
+
+fail:
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ return false;
+}
+
+static void
+xfs_zone_gc_iter_advance(
+ struct xfs_zone_gc_iter *iter,
+ xfs_extlen_t count_fsb)
+{
+ struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx];
+
+ irec->rm_offset += count_fsb;
+ irec->rm_startblock += count_fsb;
+ irec->rm_blockcount -= count_fsb;
+ if (!irec->rm_blockcount)
+ iter->rec_idx++;
+}
+
+/*
+ * Iterate through all zones marked as reclaimable and find a candidate that is
+ * either good enough for instant reclaim, or the one with the least used space.
+ */
+static bool
+xfs_zone_reclaim_pick(
+ struct xfs_mount *mp,
+ struct xfs_zone_gc_iter *iter)
+{
+ XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, 0);
+ struct xfs_rtgroup *victim_rtg = NULL, *rtg;
+ uint32_t victim_used = U32_MAX;
+ bool easy = false;
+
+ if (xfs_is_shutdown(mp))
+ return false;
+
+ if (iter->victim_rtg)
+ return true;
+
+ /*
+ * Don't start new work if we are asked to stop or park.
+ */
+ if (kthread_should_stop() || kthread_should_park())
+ return false;
+
+ if (!xfs_zoned_need_gc(mp))
+ return false;
+
+ rcu_read_lock();
+ xas_for_each_marked(&xas, rtg, ULONG_MAX, XFS_RTG_RECLAIMABLE) {
+ u64 used = rtg_rmap(rtg)->i_used_blocks;
+
+ /* skip zones that are just waiting for a reset */
+ if (used == 0)
+ continue;
+
+ if (used >= victim_used)
+ continue;
+ if (!atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref))
+ continue;
+
+ if (victim_rtg)
+ xfs_rtgroup_rele(victim_rtg);
+ victim_rtg = rtg;
+ victim_used = used;
+
+ /*
+ * Any zone that is less than 1 percent used is fair game for
+ * instant reclaim.
+ */
+ if (used < div_u64(rtg_blocks(rtg), 100)) {
+ easy = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!victim_rtg)
+ return false;
+
+ xfs_info(mp, "reclaiming zone %d, used = %u/%u (%s)",
+ rtg_rgno(victim_rtg), victim_used,
+ rtg_blocks(victim_rtg),
+ easy ? "easy" : "best");
+ trace_xfs_zone_reclaim(victim_rtg);
+ xfs_zone_gc_iter_init(iter, victim_rtg);
+ return true;
+}
+
+static struct xfs_open_zone *
+xfs_steal_open_zone_for_gc(
+ struct xfs_zone_info *zi)
+{
+ struct xfs_open_zone *oz, *found = NULL;
+
+ lockdep_assert_held(&zi->zi_zone_list_lock);
+
+ list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
+ if (!found ||
+ oz->oz_write_pointer < found->oz_write_pointer)
+ found = oz;
+ }
+
+ if (found) {
+ found->oz_is_gc = true;
+ list_del_init(&found->oz_entry);
+ zi->zi_nr_open_zones--;
+ }
+ return found;
+}
+
+static struct xfs_open_zone *
+xfs_select_gc_zone(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *oz = zi->zi_open_gc_zone;
+
+ if (oz && oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) {
+ /*
+ * We need to wait for pending writes to finish.
+ */
+ if (oz->oz_written < rtg_blocks(oz->oz_rtg))
+ return NULL;
+ xfs_open_zone_put(oz);
+ oz = NULL;
+ }
+
+ if (!oz) {
+ /*
+ * If there are no free zones available for GC, pick the open
+ * zone with the least used space to GC into. This should
+ * only happen after an unclean shutdown near ENOSPC while
+ * GC was ongoing.
+ */
+ spin_lock(&zi->zi_zone_list_lock);
+ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE))
+ oz = xfs_steal_open_zone_for_gc(zi);
+ else
+ oz = xfs_open_zone(mp, true);
+ spin_unlock(&zi->zi_zone_list_lock);
+
+ if (oz)
+ trace_xfs_gc_zone_activate(oz->oz_rtg);
+ zi->zi_open_gc_zone = oz;
+ }
+
+ return oz;
+}
+
+static unsigned int
+xfs_zone_gc_scratch_available(
+ struct xfs_zone_gc_data *data)
+{
+ return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
+}
+
+static bool
+xfs_zone_gc_space_available(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_open_zone *oz;
+
+ oz = xfs_select_gc_zone(data->mp);
+ if (!oz)
+ return false;
+ return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
+ xfs_zone_gc_scratch_available(data);
+}
+
+static void
+xfs_zone_gc_end_io(
+ struct bio *bio)
+{
+ struct xfs_gc_bio *chunk =
+ container_of(bio, struct xfs_gc_bio, bio);
+ struct xfs_zone_gc_data *data = chunk->data;
+
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
+ wake_up_process(data->mp->m_zone_info->zi_gc_thread);
+}
+
+static bool
+xfs_zone_gc_allocate(
+ struct xfs_zone_gc_data *data,
+ xfs_extlen_t *count_fsb,
+ xfs_daddr_t *daddr,
+ bool *is_seq)
+{
+ struct xfs_mount *mp = data->mp;
+ struct xfs_open_zone *oz;
+
+ oz = xfs_select_gc_zone(mp);
+ if (!oz)
+ return false;
+
+ *count_fsb = min(*count_fsb,
+ XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
+
+ /*
+ * Directly allocate GC blocks from the reserved pool.
+ *
+	 * If we'd take them from the normal pool we could be stealing blocks from
+	 * a regular writer, which would then have to wait for GC and deadlock.
+ */
+ spin_lock(&mp->m_sb_lock);
+ *count_fsb = min(*count_fsb,
+ rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
+ *count_fsb = min3(*count_fsb,
+ mp->m_resblks[XC_FREE_RTEXTENTS].avail,
+ mp->m_resblks[XC_FREE_RTAVAILABLE].avail);
+ mp->m_resblks[XC_FREE_RTEXTENTS].avail -= *count_fsb;
+ mp->m_resblks[XC_FREE_RTAVAILABLE].avail -= *count_fsb;
+ spin_unlock(&mp->m_sb_lock);
+
+ if (!*count_fsb)
+ return false;
+
+ *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
+ *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
+ if (!*is_seq)
+ *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
+ oz->oz_write_pointer += *count_fsb;
+ return true;
+}
+
+static bool
+xfs_zone_gc_start_chunk(
+ struct xfs_zone_gc_data *data,
+ struct xfs_zone_gc_iter *iter)
+{
+ struct xfs_mount *mp = data->mp;
+ struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
+ struct xfs_rmap_irec irec;
+ struct xfs_gc_bio *chunk;
+ struct xfs_inode *ip;
+ struct bio *bio;
+ xfs_daddr_t daddr;
+ bool is_seq;
+
+ if (xfs_is_shutdown(mp))
+ return false;
+
+ if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
+ return false;
+ if (!xfs_zone_gc_allocate(data, &irec.rm_blockcount, &daddr, &is_seq)) {
+ xfs_irele(ip);
+ return false;
+ }
+
+ bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
+
+ chunk = container_of(bio, struct xfs_gc_bio, bio);
+ chunk->ip = ip;
+ chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
+ chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ chunk->old_startblock =
+ xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
+ chunk->new_daddr = daddr;
+ chunk->is_seq = is_seq;
+ chunk->scratch = &data->scratch[data->scratch_idx];
+ chunk->data = data;
+
+ bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
+ bio->bi_end_io = xfs_zone_gc_end_io;
+ bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
+ chunk->scratch->offset);
+ chunk->scratch->offset += chunk->len;
+ if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
+ data->scratch_idx =
+ (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
+ }
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&chunk->entry, &data->reading);
+ xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
+
+ submit_bio(bio);
+ return true;
+}
+
+static void
+xfs_zone_gc_free_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ list_del(&chunk->entry);
+ xfs_irele(chunk->ip);
+ bio_put(&chunk->bio);
+}
+
+static void
+xfs_gc_submit_write(
+ struct xfs_zone_gc_data *data,
+ struct xfs_gc_bio *chunk)
+{
+ if (chunk->is_seq) {
+ chunk->bio.bi_opf &= ~REQ_OP_WRITE;
+ chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ }
+ chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
+ chunk->bio.bi_end_io = xfs_zone_gc_end_io;
+ submit_bio(&chunk->bio);
+}
+
+static struct xfs_gc_bio *
+xfs_gc_split_write(
+ struct xfs_zone_gc_data *data,
+ struct xfs_gc_bio *chunk)
+{
+ struct queue_limits *lim =
+ &bdev_get_queue(chunk->bio.bi_bdev)->limits;
+ struct xfs_gc_bio *split_chunk;
+ int split_sectors;
+ unsigned int split_len;
+ struct bio *split;
+ unsigned int nsegs;
+
+ if (!chunk->is_seq)
+ return NULL;
+
+ split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
+ lim->max_zone_append_sectors << SECTOR_SHIFT);
+ if (!split_sectors)
+ return NULL;
+ split_len = split_sectors << SECTOR_SHIFT;
+
+ split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
+ split_chunk = container_of(split, struct xfs_gc_bio, bio);
+ split_chunk->data = data;
+ ihold(VFS_I(chunk->ip));
+ split_chunk->ip = chunk->ip;
+ split_chunk->is_seq = chunk->is_seq;
+ split_chunk->scratch = chunk->scratch;
+ split_chunk->offset = chunk->offset;
+ split_chunk->len = split_len;
+ split_chunk->old_startblock = chunk->old_startblock;
+ split_chunk->new_daddr = chunk->new_daddr;
+
+ chunk->offset += split_len;
+ chunk->len -= split_len;
+ chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
+
+ /* add right before the original chunk */
+ WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&split_chunk->entry, &chunk->entry);
+ return split_chunk;
+}
+
+static void
+xfs_zone_gc_write_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_zone_gc_data *data = chunk->data;
+ struct xfs_mount *mp = chunk->ip->i_mount;
+ unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset;
+ struct xfs_gc_bio *split_chunk;
+
+ if (chunk->bio.bi_status)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ if (xfs_is_shutdown(mp)) {
+ xfs_zone_gc_free_chunk(chunk);
+ return;
+ }
+
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_move_tail(&chunk->entry, &data->writing);
+
+ bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
+ bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
+ folio_offset);
+
+ while ((split_chunk = xfs_gc_split_write(data, chunk)))
+ xfs_gc_submit_write(data, split_chunk);
+ xfs_gc_submit_write(data, chunk);
+}
+
+static void
+xfs_zone_gc_finish_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ struct xfs_inode *ip = chunk->ip;
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ if (chunk->bio.bi_status)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ if (xfs_is_shutdown(mp)) {
+ xfs_zone_gc_free_chunk(chunk);
+ return;
+ }
+
+ chunk->scratch->freed += chunk->len;
+ if (chunk->scratch->freed == chunk->scratch->offset) {
+ chunk->scratch->offset = 0;
+ chunk->scratch->freed = 0;
+ }
+
+ /*
+ * Cycle through the iolock and wait for direct I/O and layouts to
+ * ensure no one is reading from the old mapping before it goes away.
+ */
+ xfs_ilock(ip, iolock);
+ error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
+ if (!error)
+ inode_dio_wait(VFS_I(ip));
+ xfs_iunlock(ip, iolock);
+ if (error)
+ goto free;
+
+ if (chunk->is_seq)
+ chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
+ error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
+ chunk->new_daddr, chunk->old_startblock);
+free:
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ xfs_zone_gc_free_chunk(chunk);
+}
+
+static void
+xfs_zone_gc_finish_reset(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_rtgroup *rtg = chunk->bio.bi_private;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+
+ if (chunk->bio.bi_status) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ goto out;
+ }
+
+ spin_lock(&zi->zi_zone_list_lock);
+ atomic_inc(&zi->zi_nr_free_zones);
+ xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
+ spin_unlock(&zi->zi_zone_list_lock);
+
+ xfs_zoned_add_available(mp, rtg_blocks(rtg));
+
+ wake_up_all(&zi->zi_zone_wait);
+out:
+ list_del(&chunk->entry);
+ bio_put(&chunk->bio);
+}
+
+static bool
+xfs_prepare_zone_reset(
+ struct bio *bio,
+ struct xfs_rtgroup *rtg)
+{
+ trace_xfs_zone_reset(rtg);
+
+ ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
+ bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
+ if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
+ if (!bdev_max_discard_sectors(bio->bi_bdev))
+ return false;
+ bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
+ bio->bi_iter.bi_size =
+ XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
+ }
+
+ return true;
+}
+
+int
+xfs_zone_reset_sync(
+ struct xfs_rtgroup *rtg)
+{
+ int error = 0;
+ struct bio bio;
+
+ bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
+ REQ_OP_ZONE_RESET);
+ if (xfs_prepare_zone_reset(&bio, rtg))
+ error = submit_bio_wait(&bio);
+ bio_uninit(&bio);
+
+ return error;
+}
+
+static void
+xfs_reset_zones(
+ struct xfs_zone_gc_data *data,
+ struct xfs_group *reset_list)
+{
+ struct xfs_group *next = reset_list;
+
+ if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
+ xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
+ return;
+ }
+
+ do {
+ struct xfs_rtgroup *rtg = to_rtg(next);
+ struct xfs_gc_bio *chunk;
+ struct bio *bio;
+
+ xfs_log_force_inode(rtg_rmap(rtg));
+
+ next = rtg_group(rtg)->xg_next_reset;
+ rtg_group(rtg)->xg_next_reset = NULL;
+
+ bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
+ 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
+ bio->bi_private = rtg;
+ bio->bi_end_io = xfs_zone_gc_end_io;
+
+ chunk = container_of(bio, struct xfs_gc_bio, bio);
+ chunk->data = data;
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&chunk->entry, &data->resetting);
+
+ /*
+ * Also use the bio to drive the state machine when neither
+ * zone reset nor discard is supported to keep things simple.
+ */
+ if (xfs_prepare_zone_reset(bio, rtg))
+ submit_bio(bio);
+ else
+ bio_endio(bio);
+ } while (next);
+}
+
+/*
+ * Handle the work to read and write data for GC and to reset the zones,
+ * including handling all completions.
+ *
+ * Note that the order of the chunks is preserved so that we don't undo the
+ * optimal order established by xfs_zone_gc_query().
+ */
+static bool
+xfs_zone_gc_handle_work(
+ struct xfs_zone_gc_data *data,
+ struct xfs_zone_gc_iter *iter)
+{
+ struct xfs_zone_info *zi = data->mp->m_zone_info;
+ struct xfs_gc_bio *chunk, *next;
+ struct xfs_group *reset_list;
+ struct blk_plug plug;
+
+ spin_lock(&zi->zi_reset_list_lock);
+ reset_list = zi->zi_reset_list;
+ zi->zi_reset_list = NULL;
+ spin_unlock(&zi->zi_reset_list_lock);
+
+ if (!xfs_zone_reclaim_pick(data->mp, iter) ||
+ !xfs_zone_gc_space_available(data)) {
+ if (list_empty(&data->reading) &&
+ list_empty(&data->writing) &&
+ list_empty(&data->resetting) &&
+ !reset_list)
+ return false;
+ }
+
+ __set_current_state(TASK_RUNNING);
+ try_to_freeze();
+
+ if (reset_list)
+ xfs_reset_zones(data, reset_list);
+
+ list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ xfs_zone_gc_finish_reset(chunk);
+ }
+
+ list_for_each_entry_safe(chunk, next, &data->writing, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ xfs_zone_gc_finish_chunk(chunk);
+ }
+
+ blk_start_plug(&plug);
+ list_for_each_entry_safe(chunk, next, &data->reading, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ xfs_zone_gc_write_chunk(chunk);
+ }
+ blk_finish_plug(&plug);
+
+ blk_start_plug(&plug);
+ while (xfs_zone_gc_start_chunk(data, iter))
+ ;
+ blk_finish_plug(&plug);
+ return true;
+}
+
+/*
+ * Note that the current GC algorithm would break reflinks and thus duplicate
+ * data that was shared by multiple owners before. Because of that, reflinks
+ * are currently not supported on zoned file systems: they can't be created
+ * there, and file systems with both features enabled can't be mounted.
+ */
+static int
+xfs_zoned_gcd(
+ void *private)
+{
+ struct xfs_mount *mp = private;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ unsigned int nofs_flag;
+ struct xfs_zone_gc_data *data;
+ struct xfs_zone_gc_iter *iter;
+
+ data = xfs_zone_gc_data_alloc(mp);
+ if (!data)
+ return -ENOMEM;
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ goto out_free_data;
+
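+	/*
+	 * Run all memory allocations from the GC thread in NOFS context
+	 * so that memory reclaim can't recurse back into the file system
+	 * from here.
+	 */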
+ nofs_flag = memalloc_nofs_save();
+ set_freezable();
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
+ xfs_set_in_gc(mp);
+ if (xfs_zone_gc_handle_work(data, iter))
+ continue;
+
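+		/*
+		 * Only consider stopping or parking once all in-flight
+		 * GC I/O has completed and no zones are queued for a
+		 * reset.
+		 */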
+ if (list_empty(&data->reading) &&
+ list_empty(&data->writing) &&
+ list_empty(&data->resetting) &&
+ !zi->zi_reset_list) {
+ xfs_clear_in_gc(mp);
+ xfs_zoned_resv_wake_all(mp);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ if (kthread_should_park()) {
+ __set_current_state(TASK_RUNNING);
+ kthread_parkme();
+ continue;
+ }
+ }
+
+ schedule();
+ }
+ xfs_clear_in_gc(mp);
+
+ if (iter->victim_rtg)
+ xfs_rtgroup_rele(iter->victim_rtg);
+ if (zi->zi_open_gc_zone)
+ xfs_open_zone_put(zi->zi_open_gc_zone);
+
+ memalloc_nofs_restore(nofs_flag);
+ kfree(iter);
+out_free_data:
+ xfs_zone_gc_data_free(data);
+ return 0;
+}
+
+void
+xfs_zone_gc_start(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ kthread_unpark(mp->m_zone_info->zi_gc_thread);
+}
+
+void
+xfs_zone_gc_stop(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ kthread_park(mp->m_zone_info->zi_gc_thread);
+}
+
+int
+xfs_zone_gc_mount(
+ struct xfs_mount *mp)
+{
+ mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, mp,
+ "xfs-zone-gc/%s", mp->m_super->s_id);
+ if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
+ xfs_warn(mp, "unable to create zone gc thread");
+ return PTR_ERR(mp->m_zone_info->zi_gc_thread);
+ }
+
+ /* xfs_zone_gc_start will unpark for rw mounts */
+ kthread_park(mp->m_zone_info->zi_gc_thread);
+ return 0;
+}
+
+void
+xfs_zone_gc_unmount(
+ struct xfs_mount *mp)
+{
+ kthread_stop(mp->m_zone_info->zi_gc_thread);
+}
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index f56f3ca8ea00..0b720026e54a 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -82,6 +82,11 @@ struct xfs_zone_info {
struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
+int xfs_zone_reset_sync(struct xfs_rtgroup *rtg);
+bool xfs_zoned_need_gc(struct xfs_mount *mp);
+int xfs_zone_gc_mount(struct xfs_mount *mp);
+void xfs_zone_gc_unmount(struct xfs_mount *mp);
+
void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
#endif /* _XFS_ZONE_PRIV_H */
diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
index 5ee525e18759..77211f4c7033 100644
--- a/fs/xfs/xfs_zone_space_resv.c
+++ b/fs/xfs/xfs_zone_space_resv.c
@@ -159,6 +159,13 @@ xfs_zoned_reserve_available(
if (error != -ENOSPC)
break;
+ /*
+ * If there is nothing left to reclaim, give up.
+ */
+ if (!xfs_is_in_gc(mp) &&
+ !xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
+ break;
+
spin_unlock(&zi->zi_reservation_lock);
schedule();
spin_lock(&zi->zi_reservation_lock);
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread
* Re: [PATCH 26/43] xfs: implement zoned garbage collection
2024-12-11 8:54 ` [PATCH 26/43] xfs: implement zoned garbage collection Christoph Hellwig
@ 2024-12-13 22:18 ` Darrick J. Wong
2024-12-15 5:57 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 22:18 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:51AM +0100, Christoph Hellwig wrote:
> RT groups on a zoned file system need to be completely empty before their
> space can be reused. This means that partially empty groups need to be
> emptied entirely to free up space if no entirely free groups are
> available.
>
> Add a garbage collection thread that moves all data out of the least used
> zone when not enough free zones are available, and which resets all zones
> that have been emptied. To empty zones, the rmap is walked to find the
> owners and the data is read and then written to the new place.
>
> To automatically defragment files the rmap records are sorted by inode
> and logical offset. This means defragmentation of parallel writes into
> a single zone happens automatically when performing garbage collection.
> Because holding the iolock over the entire GC cycle would inject very
> noticeable latency for other accesses to the inodes, the iolock is not
> taken while performing I/O. Instead the I/O completion handler checks
> that the mapping hasn't changed from the one recorded at the start of
> the GC cycle and doesn't update the mapping if it has changed.
>
> Note: selection of garbage collection victims is extremely simple at the
> moment and will probably see additional near term improvements.
Can we do the garbage collection from userspace? I've had a freespace
defragmenter banging around in my dev tree for years:
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=defrag-freespace_2024-12-12
Which has the nice property that it knows how to query the refcount
btree to try to move the most heavily shared blocks first. For zoned
that might not matter since we /must/ evacuate the whole zone.
Regardless, it could be nice to have a userspace process that we could
trigger from the kernel at some threshold (e.g. 70% space used) to see
if it can clean out some zones before the kernel one kicks in and slows
everyone down.
Anyway I'll keep going; that was just a thought I had.
> Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/Makefile | 1 +
> fs/xfs/libxfs/xfs_group.h | 15 +-
> fs/xfs/xfs_extent_busy.c | 2 +-
> fs/xfs/xfs_mount.c | 4 +
> fs/xfs/xfs_mount.h | 3 +
> fs/xfs/xfs_super.c | 7 +
> fs/xfs/xfs_trace.h | 4 +
> fs/xfs/xfs_zone_alloc.c | 52 +-
> fs/xfs/xfs_zone_alloc.h | 8 +
> fs/xfs/xfs_zone_gc.c | 1045 ++++++++++++++++++++++++++++++++++
> fs/xfs/xfs_zone_priv.h | 5 +
> fs/xfs/xfs_zone_space_resv.c | 7 +
> 12 files changed, 1146 insertions(+), 7 deletions(-)
> create mode 100644 fs/xfs/xfs_zone_gc.c
>
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index bdedf4bdb1db..e38838409271 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -139,6 +139,7 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
> # xfs_rtbitmap is shared with libxfs
> xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
> xfs_zone_alloc.o \
> + xfs_zone_gc.o \
> xfs_zone_space_resv.o
>
> xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
> diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h
> index a70096113384..430a43e1591e 100644
> --- a/fs/xfs/libxfs/xfs_group.h
> +++ b/fs/xfs/libxfs/xfs_group.h
> @@ -19,10 +19,17 @@ struct xfs_group {
> #ifdef __KERNEL__
> /* -- kernel only structures below this line -- */
>
> - /*
> - * Track freed but not yet committed extents.
> - */
> - struct xfs_extent_busy_tree *xg_busy_extents;
> + union {
> + /*
> + * Track freed but not yet committed extents.
> + */
> + struct xfs_extent_busy_tree *xg_busy_extents;
> +
> + /*
> + * List of groups that need a zone reset for zoned file systems.
> + */
> + struct xfs_group *xg_next_reset;
> + };
Don't we need busy extents for zoned rtgroups? I was under the
impression that the busy extents code prevents us from reallocating
recently freed space until the EFI (and hence the bunmapi) transaction
are persisted to the log so that new contents written after a
reallocation + write + fdatasync won't reappear in the old file?
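Spelling my worry out as a timeline, so I can check my own reading:
    1. block X of file A is freed; the EFI/bunmapi transaction has
       committed in memory but is not yet on stable storage
    2. X is handed out again to file B, which writes new data and runs
       fdatasync
    3. power fails before the log containing A's unmap reaches disk
    4. recovery replays a log in which A still owns X, so B's fresh
       contents are now readable through A
Today the busy extent tracking is what keeps step 2 from happening too
early.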
> /*
> * Bitsets of per-ag metadata that have been checked and/or are sick.
> diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> index ea43c9a6e54c..da3161572735 100644
> --- a/fs/xfs/xfs_extent_busy.c
> +++ b/fs/xfs/xfs_extent_busy.c
> @@ -671,7 +671,7 @@ xfs_extent_busy_wait_all(
> while ((pag = xfs_perag_next(mp, pag)))
> xfs_extent_busy_wait_group(pag_group(pag));
>
> - if (xfs_has_rtgroups(mp))
> + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp))
> while ((rtg = xfs_rtgroup_next(mp, rtg)))
> xfs_extent_busy_wait_group(rtg_group(rtg));
> }
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 70ecbbaba7fd..20d564b3b564 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -1088,6 +1088,8 @@ xfs_mountfs(
> error = xfs_fs_reserve_ag_blocks(mp);
> if (error && error != -ENOSPC)
> goto out_agresv;
> +
> + xfs_zone_gc_start(mp);
> }
>
> return 0;
> @@ -1176,6 +1178,8 @@ xfs_unmountfs(
> xfs_inodegc_flush(mp);
>
> xfs_blockgc_stop(mp);
> + if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate))
> + xfs_zone_gc_stop(mp);
> xfs_fs_unreserve_ag_blocks(mp);
> xfs_qm_unmount_quotas(mp);
> if (xfs_has_zoned(mp))
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 02a3609a3322..831d9e09fe72 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -548,6 +548,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
> #define XFS_OPSTATE_RESUMING_QUOTAON 18
> /* Kernel has logged a warning about zoned RT device being used on this fs. */
> #define XFS_OPSTATE_WARNED_ZONED 19
> +/* (Zoned) GC is in progress */
> +#define XFS_OPSTATE_IN_GC 20
>
> #define __XFS_IS_OPSTATE(name, NAME) \
> static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
> @@ -592,6 +594,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
> #endif /* CONFIG_XFS_QUOTA */
> __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
> __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
> +__XFS_IS_OPSTATE(in_gc, IN_GC)
Nit: I might've called this ZONEGC_RUNNING.
if (xfs_is_zonegc_running(mp))
frob();
>
> static inline bool
> xfs_should_warn(struct xfs_mount *mp, long nr)
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index d0b7e0d02366..b289b2ba78b1 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -46,6 +46,7 @@
> #include "xfs_exchmaps_item.h"
> #include "xfs_parent.h"
> #include "xfs_rtalloc.h"
> +#include "xfs_zone_alloc.h"
> #include "scrub/stats.h"
> #include "scrub/rcbag_btree.h"
>
> @@ -1947,6 +1948,9 @@ xfs_remount_rw(
> /* Re-enable the background inode inactivation worker. */
> xfs_inodegc_start(mp);
>
> + /* Restart zone reclaim */
> + xfs_zone_gc_start(mp);
> +
> return 0;
> }
>
> @@ -1991,6 +1995,9 @@ xfs_remount_ro(
> */
> xfs_inodegc_stop(mp);
>
> + /* Stop zone reclaim */
> + xfs_zone_gc_stop(mp);
> +
> /* Free the per-AG metadata reservation pool. */
> xfs_fs_unreserve_ag_blocks(mp);
>
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 763dd3d271b9..bbaf9b2665c7 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -290,8 +290,12 @@ DECLARE_EVENT_CLASS(xfs_zone_class,
> DEFINE_EVENT(xfs_zone_class, name, \
> TP_PROTO(struct xfs_rtgroup *rtg), \
> TP_ARGS(rtg))
> +DEFINE_ZONE_EVENT(xfs_zone_emptied);
> DEFINE_ZONE_EVENT(xfs_zone_full);
> DEFINE_ZONE_EVENT(xfs_zone_activate);
> +DEFINE_ZONE_EVENT(xfs_zone_reset);
> +DEFINE_ZONE_EVENT(xfs_zone_reclaim);
> +DEFINE_ZONE_EVENT(xfs_gc_zone_activate);
>
> TRACE_EVENT(xfs_zone_free_blocks,
> TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
> diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
> index 1a746e9cfbf4..291cf39a5989 100644
> --- a/fs/xfs/xfs_zone_alloc.c
> +++ b/fs/xfs/xfs_zone_alloc.c
> @@ -34,11 +34,43 @@ xfs_open_zone_put(
> }
> }
>
> +static void
> +xfs_zone_emptied(
> + struct xfs_rtgroup *rtg)
> +{
> + struct xfs_mount *mp = rtg_mount(rtg);
> + struct xfs_zone_info *zi = mp->m_zone_info;
> +
> + trace_xfs_zone_emptied(rtg);
> +
> + /*
> + * This can be called from log recovery, where the zone_info structure
> + * hasn't been allocated yet. But we'll look for empty zones when
> + * setting it up, so don't need to track the empty zone here in that
> + * case.
> + */
> + if (!zi)
> + return;
> +
> + xfs_group_clear_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
> +
> + spin_lock(&zi->zi_reset_list_lock);
> + rtg_group(rtg)->xg_next_reset = zi->zi_reset_list;
> + zi->zi_reset_list = rtg_group(rtg);
> + spin_unlock(&zi->zi_reset_list_lock);
> +
> + wake_up_process(zi->zi_gc_thread);
> +}
> +
> static void
> xfs_zone_mark_reclaimable(
> struct xfs_rtgroup *rtg)
> {
> + struct xfs_mount *mp = rtg_mount(rtg);
> +
> xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_RECLAIMABLE);
> + if (xfs_zoned_need_gc(mp))
> + wake_up_process(mp->m_zone_info->zi_gc_thread);
> }
>
> static void
> @@ -278,9 +310,12 @@ xfs_zone_free_blocks(
> if (!READ_ONCE(rtg->rtg_open_zone)) {
> /*
> * If the zone is not open, mark it reclaimable when the first
> - * block is freed.
> + * block is freed. As an optimization kick of a zone reset if
"...kick off a zone reset..."
> + * the usage counter hits zero.
> */
> - if (rmapip->i_used_blocks + len == rtg_blocks(rtg))
> + if (rmapip->i_used_blocks == 0)
> + xfs_zone_emptied(rtg);
> + else if (rmapip->i_used_blocks + len == rtg_blocks(rtg))
> xfs_zone_mark_reclaimable(rtg);
> }
> xfs_add_frextents(mp, len);
> @@ -415,6 +450,8 @@ xfs_activate_zone(
> atomic_inc(&oz->oz_ref);
> zi->zi_nr_open_zones++;
> list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
> + if (xfs_zoned_need_gc(mp))
> + wake_up_process(zi->zi_gc_thread);
>
> /* XXX: this is a little verbose, but let's keep it for now */
> xfs_info(mp, "using zone %u (%u)",
> @@ -747,6 +784,13 @@ xfs_init_zone(
> xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
> }
>
> + if (write_pointer == rtg_blocks(rtg) && used == 0) {
> + error = xfs_zone_reset_sync(rtg);
> + if (error)
> + return error;
> + write_pointer = 0;
> + }
> +
> if (write_pointer == 0) {
> /* zone is empty */
> atomic_inc(&zi->zi_nr_free_zones);
> @@ -954,6 +998,9 @@ xfs_mount_zones(
> xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
> iz.available + iz.reclaimable);
>
> + error = xfs_zone_gc_mount(mp);
> + if (error)
> + goto out_free_open_zones;
> return 0;
>
> out_free_open_zones:
> @@ -966,6 +1013,7 @@ void
> xfs_unmount_zones(
> struct xfs_mount *mp)
> {
> + xfs_zone_gc_unmount(mp);
> xfs_free_open_zones(mp->m_zone_info);
> kfree(mp->m_zone_info);
> }
> diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
> index 6d0404c2c46c..44fa1594f73e 100644
> --- a/fs/xfs/xfs_zone_alloc.h
> +++ b/fs/xfs/xfs_zone_alloc.h
> @@ -38,6 +38,8 @@ uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
> #ifdef CONFIG_XFS_RT
> int xfs_mount_zones(struct xfs_mount *mp);
> void xfs_unmount_zones(struct xfs_mount *mp);
> +void xfs_zone_gc_start(struct xfs_mount *mp);
> +void xfs_zone_gc_stop(struct xfs_mount *mp);
> #else
> static inline int xfs_mount_zones(struct xfs_mount *mp)
> {
> @@ -46,6 +48,12 @@ static inline int xfs_mount_zones(struct xfs_mount *mp)
> static inline void xfs_unmount_zones(struct xfs_mount *mp)
> {
> }
> +static inline void xfs_zone_gc_start(struct xfs_mount *mp)
> +{
> +}
> +static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
> +{
> +}
> #endif /* CONFIG_XFS_RT */
>
> #endif /* _XFS_ZONE_ALLOC_H */
> diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
> new file mode 100644
> index 000000000000..085d7001935e
> --- /dev/null
> +++ b/fs/xfs/xfs_zone_gc.c
> @@ -0,0 +1,1045 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2023-2024 Christoph Hellwig.
> + * Copyright (c) 2024, Western Digital Corporation or its affiliates.
> + */
> +#include "xfs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_inode.h"
> +#include "xfs_btree.h"
> +#include "xfs_trans.h"
> +#include "xfs_icache.h"
> +#include "xfs_rmap.h"
> +#include "xfs_rtbitmap.h"
> +#include "xfs_rtrmap_btree.h"
> +#include "xfs_zone_alloc.h"
> +#include "xfs_zone_priv.h"
> +#include "xfs_zones.h"
> +#include "xfs_trace.h"
> +
> +/*
> + * Size of each GC scratch pad. This is also the upper bound for each
> + * GC I/O, which helps to keep latency down.
> + */
> +#define XFS_GC_CHUNK_SIZE SZ_1M
> +
> +/*
> + * Scratchpad data to read GCed data into.
> + *
> + * The offset member tracks where the next allocation starts, and freed tracks
> + * the amount of space that is not used anymore.
> + */
> +#define XFS_ZONE_GC_NR_SCRATCH 2
> +struct xfs_zone_scratch {
> + struct folio *folio;
> + unsigned int offset;
> + unsigned int freed;
> +};
> +
> +/*
> + * Chunk that is read and written for each GC operation.
> + *
> + * Note that for writes to actual zoned devices, the chunk can be split when
> + * reaching the hardware limit.
> + */
> +struct xfs_gc_bio {
> + struct xfs_zone_gc_data *data;
> +
> + /*
> + * Entry into the reading/writing/resetting list. Only accessed from
> + * the GC thread, so no locking needed.
> + */
> + struct list_head entry;
> +
> + /*
> + * State of this gc_bio. Done means the current I/O completed.
> + * Set from the bio end I/O handler, read from the GC thread.
> + */
> + unsigned long state;
> +#define XFS_GC_BIO_NEW 0
> +#define XFS_GC_BIO_DONE 1
Are these bits, or an enum in disguise?
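From the code below this reads like a two-value state machine rather
than a bit mask. A tiny userspace model of the handshake as I
understand it (all names below are mine, not the patch's):

	#include <stdatomic.h>
	#include <stdbool.h>

	enum { GC_BIO_NEW, GC_BIO_DONE };

	struct gc_bio_model {
		_Atomic int	state;	/* stands in for the unsigned long */
	};

	/* completion side (bio end_io): publish DONE, then wake the GC thread */
	static void model_end_io(struct gc_bio_model *c)
	{
		atomic_store(&c->state, GC_BIO_DONE);
	}

	/* GC thread side: poll each list in order and stop at the first
	 * entry still in flight so per-list ordering is preserved */
	static bool model_done(struct gc_bio_model *c)
	{
		return atomic_load(&c->state) == GC_BIO_DONE;
	}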
> +
> + /*
> + * Pointer to the inode and range of the inode that the GC is performed
> + * for.
> + */
> + struct xfs_inode *ip;
> + loff_t offset;
> + unsigned int len;
Are offset/len in bytes? It looks like they are.
> + /*
> + * Existing startblock (in the zone to be freed) and newly assigned
> + * daddr in the zone GCed into.
> + */
> + xfs_fsblock_t old_startblock;
> + xfs_daddr_t new_daddr;
> + struct xfs_zone_scratch *scratch;
> +
> + /* Are we writing to a sequential write required zone? */
> + bool is_seq;
> +
> + /* Bio used for reads and writes, including the bvec used by it */
> + struct bio_vec bv;
> + struct bio bio; /* must be last */
> +};
> +
> +/*
> + * Per-mount GC state.
> + */
> +struct xfs_zone_gc_data {
> + struct xfs_mount *mp;
> +
> + /* bioset used to allocate the gc_bios */
> + struct bio_set bio_set;
> +
> + /*
> +	 * Scratchpads used, and an index to indicate which one is currently used.
> + */
> + struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
> + unsigned int scratch_idx;
> +
> + /*
> +	 * Lists of bios currently being read, written and reset.
> + * These lists are only accessed by the GC thread itself, and must only
> + * be processed in order.
> + */
> + struct list_head reading;
> + struct list_head writing;
> + struct list_head resetting;
> +};
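The scratchpad bookkeeping took me a minute, so here is a minimal
userspace model of the invariant as I read it (all names invented):

	#include <assert.h>

	#define CHUNK_SIZE	(1024 * 1024)
	#define NR_SCRATCH	2

	static struct { unsigned int offset, freed; } pads[NR_SCRATCH];
	static unsigned int idx;

	/* read submission: hand out @len bytes of the current pad */
	static unsigned int scratch_carve(unsigned int len)
	{
		unsigned int ret = pads[idx].offset;

		assert(len <= CHUNK_SIZE - ret);
		pads[idx].offset += len;
		if (pads[idx].offset == CHUNK_SIZE)
			idx = (idx + 1) % NR_SCRATCH;	/* rotate pads */
		return ret;
	}

	/* write completion: retire @len bytes of pad @i; the pad is only
	 * reusable once everything handed out has been written back */
	static void scratch_retire(unsigned int i, unsigned int len)
	{
		pads[i].freed += len;
		if (pads[i].freed == pads[i].offset)
			pads[i].offset = pads[i].freed = 0;
	}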
> +
> +/*
> + * We aim to keep enough zones free in stock to fully use the open zone limit
> + * for data placement purposes.
> + */
> +bool
> +xfs_zoned_need_gc(
> + struct xfs_mount *mp)
> +{
> + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
> + return false;
> + if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
> + mp->m_groups[XG_TYPE_RTG].blocks *
> + (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
Is the right-hand side of the comparison the number of blocks in the
zones that are open for userspace to write to?
> + return true;
> + return false;
> +}
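To make the units in this comparison concrete, a userspace sketch with
made-up geometry (I'm assuming XFS_OPEN_GC_ZONES is the number of open
zones held back for GC itself):

	#include <stdbool.h>
	#include <stdint.h>

	static bool need_gc(uint64_t avail_blocks, uint32_t zone_blocks,
			    uint32_t max_open_zones, uint32_t gc_zones)
	{
		/* not enough fully free zones left to back every
		 * user-visible open zone with one? */
		return avail_blocks <
		       (uint64_t)zone_blocks * (max_open_zones - gc_zones);
	}

	/* e.g. 65536-block zones, 6 open zones, 1 held back for GC:
	 * GC kicks in below 5 * 65536 = 327680 available blocks */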
> +
> +static struct xfs_zone_gc_data *
> +xfs_zone_gc_data_alloc(
> + struct xfs_mount *mp)
> +{
> + struct xfs_zone_gc_data *data;
> + int i;
> +
> + data = kzalloc(sizeof(*data), GFP_KERNEL);
> + if (!data)
> + return NULL;
> +
> + /*
> + * We actually only need a single bio_vec. It would be nice to have
> + * a flag that only allocates the inline bvecs and not the separate
> + * bvec pool.
> + */
> + if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
> + BIOSET_NEED_BVECS))
> + goto out_free_data;
> + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
> + data->scratch[i].folio =
> + folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
> + if (!data->scratch[i].folio)
> + goto out_free_scratch;
> + }
> + INIT_LIST_HEAD(&data->reading);
> + INIT_LIST_HEAD(&data->writing);
> + INIT_LIST_HEAD(&data->resetting);
> + data->mp = mp;
> + return data;
> +
> +out_free_scratch:
> + while (--i >= 0)
> + folio_put(data->scratch[i].folio);
> + bioset_exit(&data->bio_set);
> +out_free_data:
> + kfree(data);
> + return NULL;
> +}
> +
> +static void
> +xfs_zone_gc_data_free(
> + struct xfs_zone_gc_data *data)
> +{
> + int i;
> +
> + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
> + folio_put(data->scratch[i].folio);
> + bioset_exit(&data->bio_set);
> + kfree(data);
> +}
> +
> +#define XFS_ZONE_GC_RECS 1024
> +
> +/* iterator, needs to be reinitialized for each victim zone */
> +struct xfs_zone_gc_iter {
> + struct xfs_rtgroup *victim_rtg;
> + unsigned int rec_count;
> + unsigned int rec_idx;
> + xfs_agblock_t next_startblock;
> + struct xfs_rmap_irec recs[XFS_ZONE_GC_RECS];
> +};
Hmm, each xfs_rmap_irec is 32 bytes, so this structure consumes a little
bit more than 32K of memory. How about 1023 records to be nicer to the
slab allocator?
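(Spelled out: 1024 * 32 = 32768 bytes of recs alone, and the
victim_rtg/rec_count/rec_idx/next_startblock header pushes the whole
struct just past 32 KiB, which, if I have my kmalloc behavior right,
falls through to the page allocator and gets rounded up to 64 KiB.
1023 records would keep the allocation within 32 KiB.)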
> +
> +static void
> +xfs_zone_gc_iter_init(
> + struct xfs_zone_gc_iter *iter,
> + struct xfs_rtgroup *victim_rtg)
> +
> +{
> + iter->next_startblock = 0;
> + iter->rec_count = 0;
> + iter->rec_idx = 0;
> + iter->victim_rtg = victim_rtg;
> +}
> +
> +static int
> +xfs_zone_gc_query_cb(
This function gathers rmaps for file blocks to evacuate, right?
> + struct xfs_btree_cur *cur,
> + const struct xfs_rmap_irec *irec,
> + void *private)
> +{
> + struct xfs_zone_gc_iter *iter = private;
> +
> + ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
> + ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
> + ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
I wonder if you actually want to return EFSCORRUPTED for these?
> + iter->recs[iter->rec_count] = *irec;
> + if (++iter->rec_count == XFS_ZONE_GC_RECS) {
> + iter->next_startblock =
> + irec->rm_startblock + irec->rm_blockcount;
> + return 1;
> + }
> + return 0;
> +}
> +
> +static int
> +xfs_zone_gc_rmap_rec_cmp(
> + const void *a,
> + const void *b)
> +{
> + const struct xfs_rmap_irec *reca = a;
> + const struct xfs_rmap_irec *recb = b;
> + int64_t diff;
> +
> + diff = reca->rm_owner - recb->rm_owner;
> + if (!diff)
> + diff = reca->rm_offset - recb->rm_offset;
> + return clamp(diff, -1, 1);
> +}
A silly trick I learned from Kent is that this avoids problems with
unsigned comparisons and other weird C behavior:
#define cmp_int(l, r) ((l > r) - (l < r))
and then this becomes:
int diff = cmp_int(reca->rm_owner, recb->rm_owner);
if (!diff)
diff = cmp_int(reca->rm_offset, recb->rm_offset);
return diff;
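For the archives, the failure mode the subtraction version risks:
rm_owner is a u64, so with reca->rm_owner = 1ULL << 63 and
recb->rm_owner = 0 the difference converts to a negative int64_t and
clamp() reports reca < recb even though it is larger. Inode numbers
won't realistically get that big, but cmp_int() can't get it wrong at
all.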
> +
> +static int
> +xfs_zone_gc_query(
> + struct xfs_mount *mp,
> + struct xfs_zone_gc_iter *iter)
> +{
> + struct xfs_rtgroup *rtg = iter->victim_rtg;
> + struct xfs_rmap_irec ri_low = { };
> + struct xfs_rmap_irec ri_high;
> + struct xfs_btree_cur *cur;
> + struct xfs_trans *tp;
> + int error;
> +
> + ASSERT(iter->next_startblock <= rtg_blocks(rtg));
> + if (iter->next_startblock == rtg_blocks(rtg))
> + goto done;
> +
> + ASSERT(iter->next_startblock < rtg_blocks(rtg));
> + ri_low.rm_startblock = iter->next_startblock;
> + memset(&ri_high, 0xFF, sizeof(ri_high));
> +
> + iter->rec_idx = 0;
> + iter->rec_count = 0;
> +
> + error = xfs_trans_alloc_empty(mp, &tp);
> + if (error)
> + return error;
> +
> + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
Why join the rtrmap inode when this is an empty transaction?
> + cur = xfs_rtrmapbt_init_cursor(tp, rtg);
> + error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
> + xfs_zone_gc_query_cb, iter);
> + xfs_btree_del_cursor(cur, error < 0 ? error : 0);
> + xfs_trans_cancel(tp);
> +
> + if (error < 0)
> + return error;
> +
> + /*
> + * Sort the rmap records by inode number and increasing offset to
> + * defragment the mappings.
> + *
> + * This could be further enhanced by an even bigger look ahead window,
> + * but that's better left until we have better detection of changes to
> + * inode mapping to avoid the potential of GCing already dead data.
> + */
> + sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
> + xfs_zone_gc_rmap_rec_cmp, NULL);
Indenting here ^
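As an illustration of the defragmentation effect (records invented),
suppose the rmap query returns, in startblock order:

	{ owner 129, offset 8, len 4 }
	{ owner 260, offset 0, len 4 }
	{ owner 129, offset 0, len 4 }
	{ owner 129, offset 4, len 4 }

After sorting by (rm_owner, rm_offset), inode 129's three extents are
processed back to back in file order and come out as one contiguous
run in the target zone, with inode 260 following, which is exactly the
defragmentation of interleaved parallel writes that the commit message
promises.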
> +
> + if (error == 0) {
> + /*
> + * We finished iterating through the zone.
> + */
> + iter->next_startblock = rtg_blocks(rtg);
> + if (iter->rec_count == 0)
> + goto done;
> + }
> +
> + return 0;
> +done:
> + xfs_rtgroup_rele(iter->victim_rtg);
> + iter->victim_rtg = NULL;
> + return 0;
> +}
> +
> +static bool
> +xfs_zone_gc_iter_next(
> + struct xfs_mount *mp,
> + struct xfs_zone_gc_iter *iter,
> + struct xfs_rmap_irec *chunk_rec,
> + struct xfs_inode **ipp)
> +{
> + struct xfs_rmap_irec *irec;
> + int error;
> +
> + if (!iter->victim_rtg)
> + return false;
> +
> +retry:
> + if (iter->rec_idx == iter->rec_count) {
> + error = xfs_zone_gc_query(mp, iter);
> + if (error)
> + goto fail;
> + if (!iter->victim_rtg)
> + return false;
> + }
> +
> + irec = &iter->recs[iter->rec_idx];
> + error = xfs_iget(mp, NULL, irec->rm_owner,
> + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
> + if (error) {
> + /*
> + * If the inode was already deleted, skip over it.
> + */
> + if (error == -ENOENT) {
> + iter->rec_idx++;
> + goto retry;
> + }
> + goto fail;
> + }
> +
> + if (!S_ISREG(VFS_I(*ipp)->i_mode)) {
if (!S_ISREG() || !XFS_IS_REALTIME_INODE(ip)) ?
> + iter->rec_idx++;
> + xfs_irele(*ipp);
> + goto retry;
> + }
> +
> + *chunk_rec = *irec;
> + return true;
> +
> +fail:
> + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
> + return false;
> +}
> +
> +static void
> +xfs_zone_gc_iter_advance(
> + struct xfs_zone_gc_iter *iter,
> + xfs_extlen_t count_fsb)
> +{
> + struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx];
> +
> + irec->rm_offset += count_fsb;
> + irec->rm_startblock += count_fsb;
> + irec->rm_blockcount -= count_fsb;
> + if (!irec->rm_blockcount)
> + iter->rec_idx++;
> +}
> +
> +/*
> + * Iterate through all zones marked as reclaimable and find a candidate that is
> + * either good enough for instant reclaim, or the one with the least used space.
What is instant reclaim? Is there a non-instant(aneous) reclaim?
Are we biasing towards reclaiming zones with fewer blocks to evacuate?
> + */
> +static bool
> +xfs_zone_reclaim_pick(
> + struct xfs_mount *mp,
> + struct xfs_zone_gc_iter *iter)
> +{
> + XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, 0);
> + struct xfs_rtgroup *victim_rtg = NULL, *rtg;
> + uint32_t victim_used = U32_MAX;
> + bool easy = false;
> +
> + if (xfs_is_shutdown(mp))
> + return false;
> +
> + if (iter->victim_rtg)
> + return true;
> +
> + /*
> + * Don't start new work if we are asked to stop or park.
> + */
> + if (kthread_should_stop() || kthread_should_park())
> + return false;
> +
> + if (!xfs_zoned_need_gc(mp))
> + return false;
> +
> + rcu_read_lock();
> + xas_for_each_marked(&xas, rtg, ULONG_MAX, XFS_RTG_RECLAIMABLE) {
> + u64 used = rtg_rmap(rtg)->i_used_blocks;
> +
> + /* skip zones that are just waiting for a reset */
> + if (used == 0)
> + continue;
> +
> + if (used >= victim_used)
> + continue;
> + if (!atomic_inc_not_zero(&rtg->rtg_group.xg_active_ref))
> + continue;
> +
> + if (victim_rtg)
> + xfs_rtgroup_rele(victim_rtg);
> + victim_rtg = rtg;
> + victim_used = used;
> +
> + /*
> + * Any zone that is less than 1 percent used is fair game for
> + * instant reclaim.
> + */
> + if (used < div_u64(rtg_blocks(rtg), 100)) {
> + easy = true;
> + break;
> + }
> + }
> + rcu_read_unlock();
> +
> + if (!victim_rtg)
> + return false;
> +
> + xfs_info(mp, "reclaiming zone %d, used = %u/%u (%s)",
> + rtg_rgno(victim_rtg), victim_used,
> + rtg_blocks(victim_rtg),
> + easy ? "easy" : "best");
> + trace_xfs_zone_reclaim(victim_rtg);
> + xfs_zone_gc_iter_init(iter, victim_rtg);
> + return true;
> +}
> +
> +static struct xfs_open_zone *
> +xfs_steal_open_zone_for_gc(
> + struct xfs_zone_info *zi)
> +{
> + struct xfs_open_zone *oz, *found = NULL;
> +
> + lockdep_assert_held(&zi->zi_zone_list_lock);
> +
> + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
> + if (!found ||
> + oz->oz_write_pointer < found->oz_write_pointer)
> + found = oz;
> + }
> +
> + if (found) {
> + found->oz_is_gc = true;
> + list_del_init(&found->oz_entry);
> + zi->zi_nr_open_zones--;
> + }
> + return found;
> +}
> +
> +static struct xfs_open_zone *
> +xfs_select_gc_zone(
For what purpose are we selecting a gc zone? I guess this is the zone
that we're evacuating blocks *into*? As opposed to choosing a zone to
evacuate, which I think is what xfs_zone_reclaim_pick does?
(This could use a short comment for readers to perform their own grok
checking.)
> + struct xfs_mount *mp)
> +{
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + struct xfs_open_zone *oz = zi->zi_open_gc_zone;
> +
> + if (oz && oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) {
> + /*
> + * We need to wait for pending writes to finish.
> + */
> + if (oz->oz_written < rtg_blocks(oz->oz_rtg))
> + return NULL;
> + xfs_open_zone_put(oz);
> + oz = NULL;
> + }
> +
> + if (!oz) {
> + /*
> + * If there are no free zones available for GC, pick the open
> + * zone with the least used space to GC into. This should
> + * only happen after an unclean shutdown near ENOSPC while
> + * GC was ongoing.
> + */
> + spin_lock(&zi->zi_zone_list_lock);
> + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE))
> + oz = xfs_steal_open_zone_for_gc(zi);
> + else
> + oz = xfs_open_zone(mp, true);
> + spin_unlock(&zi->zi_zone_list_lock);
> +
> + if (oz)
> + trace_xfs_gc_zone_activate(oz->oz_rtg);
> + zi->zi_open_gc_zone = oz;
> + }
> +
> + return oz;
> +}
> +
> +static unsigned int
> +xfs_zone_gc_scratch_available(
> + struct xfs_zone_gc_data *data)
> +{
> + return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
> +}
> +
> +static bool
> +xfs_zone_gc_space_available(
> + struct xfs_zone_gc_data *data)
> +{
> + struct xfs_open_zone *oz;
> +
> + oz = xfs_select_gc_zone(data->mp);
> + if (!oz)
> + return false;
> + return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
> + xfs_zone_gc_scratch_available(data);
> +}
> +
> +static void
> +xfs_zone_gc_end_io(
> + struct bio *bio)
> +{
> + struct xfs_gc_bio *chunk =
> + container_of(bio, struct xfs_gc_bio, bio);
> + struct xfs_zone_gc_data *data = chunk->data;
> +
> + WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
> + wake_up_process(data->mp->m_zone_info->zi_gc_thread);
> +}
> +
> +static bool
> +xfs_zone_gc_allocate(
What are we allocating here? The @data and the xfs_open_zone already
exist, right? AFAICT we're really just picking a zone to evacuate into,
and then returning the daddr/rtbcount so the caller can allocate a bio,
right?
> + struct xfs_zone_gc_data *data,
> + xfs_extlen_t *count_fsb,
> + xfs_daddr_t *daddr,
> + bool *is_seq)
> +{
> + struct xfs_mount *mp = data->mp;
> + struct xfs_open_zone *oz;
> +
> + oz = xfs_select_gc_zone(mp);
> + if (!oz)
> + return false;
> +
> + *count_fsb = min(*count_fsb,
> + XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
> +
> + /*
> + * Directly allocate GC blocks from the reserved pool.
> + *
> + * If we'd take them from the normal pool we could be stealing blocks a
> + * regular writer, which would then have to wait for GC and deadlock.
"...stealing blocks from a regular writer..." ?
> + */
> + spin_lock(&mp->m_sb_lock);
> + *count_fsb = min(*count_fsb,
> + rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
> + *count_fsb = min3(*count_fsb,
> + mp->m_resblks[XC_FREE_RTEXTENTS].avail,
> + mp->m_resblks[XC_FREE_RTAVAILABLE].avail);
> + mp->m_resblks[XC_FREE_RTEXTENTS].avail -= *count_fsb;
> + mp->m_resblks[XC_FREE_RTAVAILABLE].avail -= *count_fsb;
> + spin_unlock(&mp->m_sb_lock);
> +
> + if (!*count_fsb)
> + return false;
> +
> + *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
> + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
> + if (!*is_seq)
> + *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
> + oz->oz_write_pointer += *count_fsb;
> + return true;
> +}
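Worked example of the clamping with invented numbers: an rmap record
of 700 blocks, 512 blocks of scratch space left, 64 unwritten blocks
in the GC zone and ample reserves ends up with *count_fsb == 64;
xfs_zone_gc_iter_advance() then shrinks the record so the remaining
636 blocks are picked up by later chunks.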
> +
> +static bool
> +xfs_zone_gc_start_chunk(
> + struct xfs_zone_gc_data *data,
> + struct xfs_zone_gc_iter *iter)
> +{
> + struct xfs_mount *mp = data->mp;
> + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
> + struct xfs_rmap_irec irec;
> + struct xfs_gc_bio *chunk;
> + struct xfs_inode *ip;
> + struct bio *bio;
> + xfs_daddr_t daddr;
> + bool is_seq;
> +
> + if (xfs_is_shutdown(mp))
> + return false;
> +
> + if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
> + return false;
> + if (!xfs_zone_gc_allocate(data, &irec.rm_blockcount, &daddr, &is_seq)) {
> + xfs_irele(ip);
> + return false;
> + }
> +
> + bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
> +
> + chunk = container_of(bio, struct xfs_gc_bio, bio);
> + chunk->ip = ip;
> + chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
> + chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
> + chunk->old_startblock =
> + xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
> + chunk->new_daddr = daddr;
> + chunk->is_seq = is_seq;
> + chunk->scratch = &data->scratch[data->scratch_idx];
> + chunk->data = data;
> +
> + bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
> + bio->bi_end_io = xfs_zone_gc_end_io;
> + bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
> + chunk->scratch->offset);
> + chunk->scratch->offset += chunk->len;
> + if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
> + data->scratch_idx =
> + (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
> + }
> + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
> + list_add_tail(&chunk->entry, &data->reading);
> + xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
> +
> + submit_bio(bio);
> + return true;
> +}
> +
> +static void
> +xfs_zone_gc_free_chunk(
> + struct xfs_gc_bio *chunk)
> +{
> + list_del(&chunk->entry);
> + xfs_irele(chunk->ip);
> + bio_put(&chunk->bio);
> +}
> +
> +static void
> +xfs_gc_submit_write(
> + struct xfs_zone_gc_data *data,
> + struct xfs_gc_bio *chunk)
> +{
> + if (chunk->is_seq) {
> + chunk->bio.bi_opf &= ~REQ_OP_WRITE;
> + chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
> + }
> + chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
> + chunk->bio.bi_end_io = xfs_zone_gc_end_io;
> + submit_bio(&chunk->bio);
> +}
> +
> +static struct xfs_gc_bio *
> +xfs_gc_split_write(
> + struct xfs_zone_gc_data *data,
> + struct xfs_gc_bio *chunk)
> +{
> + struct queue_limits *lim =
> + &bdev_get_queue(chunk->bio.bi_bdev)->limits;
> + struct xfs_gc_bio *split_chunk;
> + int split_sectors;
> + unsigned int split_len;
> + struct bio *split;
> + unsigned int nsegs;
> +
> + if (!chunk->is_seq)
> + return NULL;
> +
> + split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
> + lim->max_zone_append_sectors << SECTOR_SHIFT);
> + if (!split_sectors)
> + return NULL;
> + split_len = split_sectors << SECTOR_SHIFT;
> +
> + split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
> + split_chunk = container_of(split, struct xfs_gc_bio, bio);
> + split_chunk->data = data;
> + ihold(VFS_I(chunk->ip));
> + split_chunk->ip = chunk->ip;
> + split_chunk->is_seq = chunk->is_seq;
> + split_chunk->scratch = chunk->scratch;
> + split_chunk->offset = chunk->offset;
> + split_chunk->len = split_len;
> + split_chunk->old_startblock = chunk->old_startblock;
> + split_chunk->new_daddr = chunk->new_daddr;
> +
> + chunk->offset += split_len;
> + chunk->len -= split_len;
> + chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
> +
> + /* add right before the original chunk */
> + WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
> + list_add_tail(&split_chunk->entry, &chunk->entry);
> + return split_chunk;
> +}
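Walking through the split with invented numbers: a 1 MiB chunk against
a 256 KiB max_zone_append_sectors limit peels off three 256 KiB
split_chunks, each inserted just ahead of the original on the list,
and the original shrinks to the final 256 KiB before it is submitted
itself, so completions still get processed in file-offset order.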
> +
> +static void
> +xfs_zone_gc_write_chunk(
> + struct xfs_gc_bio *chunk)
> +{
> + struct xfs_zone_gc_data *data = chunk->data;
> + struct xfs_mount *mp = chunk->ip->i_mount;
> + unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset;
> + struct xfs_gc_bio *split_chunk;
> +
> + if (chunk->bio.bi_status)
> + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
Media errors happen; is there a gentler way to handle a read error
besides shutting down the fs? We /do/ have all that infrastructure for
retrying IOs.
> + if (xfs_is_shutdown(mp)) {
> + xfs_zone_gc_free_chunk(chunk);
> + return;
> + }
> +
> + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
> + list_move_tail(&chunk->entry, &data->writing);
> +
> + bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
> + bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
> + folio_offset);
> +
> + while ((split_chunk = xfs_gc_split_write(data, chunk)))
> + xfs_gc_submit_write(data, split_chunk);
> + xfs_gc_submit_write(data, chunk);
> +}
> +
> +static void
> +xfs_zone_gc_finish_chunk(
> + struct xfs_gc_bio *chunk)
> +{
> + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> + struct xfs_inode *ip = chunk->ip;
> + struct xfs_mount *mp = ip->i_mount;
> + int error;
> +
> + if (chunk->bio.bi_status)
> + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
Can we pick a different zone and try again?
> + if (xfs_is_shutdown(mp)) {
> + xfs_zone_gc_free_chunk(chunk);
> + return;
> + }
> +
> + chunk->scratch->freed += chunk->len;
> + if (chunk->scratch->freed == chunk->scratch->offset) {
> + chunk->scratch->offset = 0;
> + chunk->scratch->freed = 0;
> + }
> +
> + /*
> + * Cycle through the iolock and wait for direct I/O and layouts to
> + * ensure no one is reading from the old mapping before it goes away.
> + */
> + xfs_ilock(ip, iolock);
> + error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
> + if (!error)
> + inode_dio_wait(VFS_I(ip));
> + xfs_iunlock(ip, iolock);
But we drop the io/mmaplocks, which means someone can wander in and
change the file before we get to xfs_zoned_end_io. Is that a problem?
> + if (error)
> + goto free;
> +
> + if (chunk->is_seq)
> + chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
> + error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
> + chunk->new_daddr, chunk->old_startblock);
> +free:
> + if (error)
> + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
> + xfs_zone_gc_free_chunk(chunk);
> +}
> +
> +static void
> +xfs_zone_gc_finish_reset(
> + struct xfs_gc_bio *chunk)
> +{
> + struct xfs_rtgroup *rtg = chunk->bio.bi_private;
> + struct xfs_mount *mp = rtg_mount(rtg);
> + struct xfs_zone_info *zi = mp->m_zone_info;
> +
> + if (chunk->bio.bi_status) {
> + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
> + goto out;
> + }
> +
> + spin_lock(&zi->zi_zone_list_lock);
> + atomic_inc(&zi->zi_nr_free_zones);
> + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
> + spin_unlock(&zi->zi_zone_list_lock);
> +
> + xfs_zoned_add_available(mp, rtg_blocks(rtg));
> +
> + wake_up_all(&zi->zi_zone_wait);
> +out:
> + list_del(&chunk->entry);
> + bio_put(&chunk->bio);
> +}
> +
> +static bool
> +xfs_prepare_zone_reset(
> + struct bio *bio,
> + struct xfs_rtgroup *rtg)
> +{
> + trace_xfs_zone_reset(rtg);
> +
> + ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
> + bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
> + if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
> + if (!bdev_max_discard_sectors(bio->bi_bdev))
> + return false;
> + bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
> + bio->bi_iter.bi_size =
> + XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
> + }
> +
> + return true;
> +}
> +
> +int
> +xfs_zone_reset_sync(
> + struct xfs_rtgroup *rtg)
> +{
> + int error = 0;
> + struct bio bio;
> +
> + bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
> + REQ_OP_ZONE_RESET);
> + if (xfs_prepare_zone_reset(&bio, rtg))
> + error = submit_bio_wait(&bio);
> + bio_uninit(&bio);
> +
> + return error;
> +}
The only caller of this is in xfs_zone_alloc, maybe it belongs there?
TBH I sorta expected all the functions in here to be xfs_zonegc_XXX.
> +static void
> +xfs_reset_zones(
> + struct xfs_zone_gc_data *data,
> + struct xfs_group *reset_list)
> +{
> + struct xfs_group *next = reset_list;
> +
> + if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
> + xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
> + return;
> + }
> +
> + do {
> + struct xfs_rtgroup *rtg = to_rtg(next);
> + struct xfs_gc_bio *chunk;
> + struct bio *bio;
> +
> + xfs_log_force_inode(rtg_rmap(rtg));
> +
> + next = rtg_group(rtg)->xg_next_reset;
> + rtg_group(rtg)->xg_next_reset = NULL;
> +
> + bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
> + 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
> + bio->bi_private = rtg;
> + bio->bi_end_io = xfs_zone_gc_end_io;
> +
> + chunk = container_of(bio, struct xfs_gc_bio, bio);
> + chunk->data = data;
> + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
> + list_add_tail(&chunk->entry, &data->resetting);
> +
^^^^^ weird indentation here
> + /*
> + * Also use the bio to drive the state machine when neither
> + * zone reset nor discard is supported to keep things simple.
> + */
> + if (xfs_prepare_zone_reset(bio, rtg))
> + submit_bio(bio);
> + else
> + bio_endio(bio);
> + } while (next);
> +}
> +
> +/*
> + * Handle the work to read and write data for GC and to reset the zones,
> + * including handling all completions.
> + *
> + * Note that the order of the chunks is preserved so that we don't undo the
> + * optimal order established by xfs_zone_gc_query().
> + */
> +static bool
> +xfs_zone_gc_handle_work(
> + struct xfs_zone_gc_data *data,
> + struct xfs_zone_gc_iter *iter)
> +{
> + struct xfs_zone_info *zi = data->mp->m_zone_info;
> + struct xfs_gc_bio *chunk, *next;
> + struct xfs_group *reset_list;
> + struct blk_plug plug;
> +
> + spin_lock(&zi->zi_reset_list_lock);
> + reset_list = zi->zi_reset_list;
> + zi->zi_reset_list = NULL;
> + spin_unlock(&zi->zi_reset_list_lock);
> +
> + if (!xfs_zone_reclaim_pick(data->mp, iter) ||
> + !xfs_zone_gc_space_available(data)) {
> + if (list_empty(&data->reading) &&
> + list_empty(&data->writing) &&
> + list_empty(&data->resetting) &&
> + !reset_list)
> + return false;
> + }
> +
> + __set_current_state(TASK_RUNNING);
> + try_to_freeze();
> +
> + if (reset_list)
> + xfs_reset_zones(data, reset_list);
> +
> + list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
> + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
> + break;
> + xfs_zone_gc_finish_reset(chunk);
> + }
> +
> + list_for_each_entry_safe(chunk, next, &data->writing, entry) {
> + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
> + break;
> + xfs_zone_gc_finish_chunk(chunk);
> + }
> +
> + blk_start_plug(&plug);
> + list_for_each_entry_safe(chunk, next, &data->reading, entry) {
> + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
> + break;
> + xfs_zone_gc_write_chunk(chunk);
> + }
> + blk_finish_plug(&plug);
> +
> + blk_start_plug(&plug);
> + while (xfs_zone_gc_start_chunk(data, iter))
> + ;
> + blk_finish_plug(&plug);
> + return true;
For us clueless dolts, it would be useful to have a comment somewhere
explaining the high level operation of the garbage collector -- it picks
a non-empty zone to empty and a not-full zone to write into, queries the
rmap to find all the space mappings, initiates a read of the disk
contents, writes (or zone appends) the data to the new zone, then remaps
the space in the file. When the zone becomes empty, it is reset.
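Maybe even a little flow picture (my own sketch of the above; the
function names are the ones used in this series):

	pick victim zone (xfs_zone_reclaim_pick)
	  -> query the rtrmap for its mappings (xfs_zone_gc_query)
	  -> read the old data blocks
	  -> write / zone-append them into a GC zone
	  -> remap the file extents (xfs_zoned_end_io)
	  -> once the victim is empty, reset it and mark it free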
> +}
> +
> +/*
> + * Note that the current GC algorithm would break reflinks and thus duplicate
> + * data that was shared by multiple owners before. Because of that reflinks
> + * are currently not supported on zoned file systems and can't be created or
> + * mounted.
> + */
> +static int
> +xfs_zoned_gcd(
> + void *private)
> +{
> + struct xfs_mount *mp = private;
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + unsigned int nofs_flag;
> + struct xfs_zone_gc_data *data;
> + struct xfs_zone_gc_iter *iter;
> +
> + data = xfs_zone_gc_data_alloc(mp);
> + if (!data)
> + return -ENOMEM;
If we return ENOMEM here, who gets the return value from the thread
function? I thought it was kthread_stop, and kthread_create only
returns errors encountered while setting up the thread?
> + iter = kzalloc(sizeof(*iter), GFP_KERNEL);
> + if (!iter)
> + goto out_free_data;
> +
> + nofs_flag = memalloc_nofs_save();
> + set_freezable();
> +
> + for (;;) {
> + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
> + xfs_set_in_gc(mp);
> + if (xfs_zone_gc_handle_work(data, iter))
> + continue;
> +
> + if (list_empty(&data->reading) &&
> + list_empty(&data->writing) &&
> + list_empty(&data->resetting) &&
> + !zi->zi_reset_list) {
> + xfs_clear_in_gc(mp);
> + xfs_zoned_resv_wake_all(mp);
> +
> + if (kthread_should_stop()) {
> + __set_current_state(TASK_RUNNING);
> + break;
> + }
> +
> + if (kthread_should_park()) {
> + __set_current_state(TASK_RUNNING);
> + kthread_parkme();
> + continue;
> + }
> + }
> +
> + schedule();
> + }
> + xfs_clear_in_gc(mp);
> +
> + if (iter->victim_rtg)
> + xfs_rtgroup_rele(iter->victim_rtg);
> + if (zi->zi_open_gc_zone)
> + xfs_open_zone_put(zi->zi_open_gc_zone);
> +
> + memalloc_nofs_restore(nofs_flag);
> + kfree(iter);
> +out_free_data:
> + xfs_zone_gc_data_free(data);
> + return 0;
> +}
> +
> +void
> +xfs_zone_gc_start(
> + struct xfs_mount *mp)
> +{
> + if (xfs_has_zoned(mp))
> + kthread_unpark(mp->m_zone_info->zi_gc_thread);
> +}
> +
> +void
> +xfs_zone_gc_stop(
> + struct xfs_mount *mp)
> +{
> + if (xfs_has_zoned(mp))
> + kthread_park(mp->m_zone_info->zi_gc_thread);
> +}
> +
> +int
> +xfs_zone_gc_mount(
> + struct xfs_mount *mp)
> +{
> + mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, mp,
> + "xfs-zone-gc/%s", mp->m_super->s_id);
> + if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
> + xfs_warn(mp, "unable to create zone gc thread");
> + return PTR_ERR(mp->m_zone_info->zi_gc_thread);
> + }
> +
> + /* xfs_zone_gc_start will unpark for rw mounts */
> + kthread_park(mp->m_zone_info->zi_gc_thread);
> + return 0;
> +}
> +
> +void
> +xfs_zone_gc_unmount(
> + struct xfs_mount *mp)
> +{
> + kthread_stop(mp->m_zone_info->zi_gc_thread);
> +}
> diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
> index f56f3ca8ea00..0b720026e54a 100644
> --- a/fs/xfs/xfs_zone_priv.h
> +++ b/fs/xfs/xfs_zone_priv.h
> @@ -82,6 +82,11 @@ struct xfs_zone_info {
>
> struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
>
> +int xfs_zone_reset_sync(struct xfs_rtgroup *rtg);
> +bool xfs_zoned_need_gc(struct xfs_mount *mp);
> +int xfs_zone_gc_mount(struct xfs_mount *mp);
> +void xfs_zone_gc_unmount(struct xfs_mount *mp);
> +
> void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
>
> #endif /* _XFS_ZONE_PRIV_H */
> diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
> index 5ee525e18759..77211f4c7033 100644
> --- a/fs/xfs/xfs_zone_space_resv.c
> +++ b/fs/xfs/xfs_zone_space_resv.c
> @@ -159,6 +159,13 @@ xfs_zoned_reserve_available(
> if (error != -ENOSPC)
> break;
>
> + /*
> + * If there is nothing left to reclaim, give up.
> + */
> + if (!xfs_is_in_gc(mp) &&
> + !xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
> + break;
Should the caller try again with a different zone if this happens?
--D
> +
> spin_unlock(&zi->zi_reservation_lock);
> schedule();
> spin_lock(&zi->zi_reservation_lock);
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 26/43] xfs: implement zoned garbage collection
2024-12-13 22:18 ` Darrick J. Wong
@ 2024-12-15 5:57 ` Christoph Hellwig
2024-12-17 1:27 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 5:57 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 02:18:51PM -0800, Darrick J. Wong wrote:
> Can we do the garbage collection from userspace?
Well, you can try, but it will be less efficient and more fragile. It'll
probably also be very hard to make it not deadlock.
> I've had a freespace
> defragmenter banging around in my dev tree for years:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=defrag-freespace_2024-12-12
>
> Which has the nice property that it knows how to query the refcount
> btree to try to move the most heavily shared blocks first. For zoned
> that might not matter since we /must/ evacuate the whole zone.
Is moving heavily shared blocks first actually a good idea? It is a
lot more work to move them and generates more metadata vs moving unshared
blocks. That being said it at least handles reflinks, which this currently
doesn't. I'll take a look at it for ideas on implementing shared block
support for the GC code.
> Regardless, it could be nice to have a userspace process that we could
> trigger from the kernel at some threshold (e.g. 70% space used) to see
> if it can clean out some zones before the kernel one kicks in and slows
> everyone down.
As said above I'm not sold on doing the work in userspace. But adding
config knobs to start GC earlier is on Hans' TODO list, and being able
to force it also sounds useful for some use case. I also suspect that
reusing some of this code, but driving it from the bmap btree instead
of the rmap one could be really nice for file mapping defragmentation.
> > - struct xfs_extent_busy_tree *xg_busy_extents;
> > + union {
> > + /*
> > + * Track freed but not yet committed extents.
> > + */
> > + struct xfs_extent_busy_tree *xg_busy_extents;
> > +
> > + /*
> > + * List of groups that need a zone reset for zoned file systems.
> > + */
> > + struct xfs_group *xg_next_reset;
> > + };
>
> Don't we need busy extents for zoned rtgroups? I was under the
> impression that the busy extents code prevents us from reallocating
> recently freed space until the EFI (and hence the bunmapi) transaction
> are persisted to the log so that new contents written after a
> reallocation + write + fdatasync won't reappear in the old file?
Yes, but remember blocks can't be reused in a zoned file system until
the zone has been reset. And xfs_reset_zones forces a flush on the
RT device before starting the current batch of resets, and then also
forces the log out so that all transactions that touched the rmap inode
(which includes the EFI transaction) are forced to disk.
> > @@ -592,6 +594,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
> > #endif /* CONFIG_XFS_QUOTA */
> > __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
> > __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
> > +__XFS_IS_OPSTATE(in_gc, IN_GC)
>
> Nit: I might've called this ZONEGC_RUNNING.
>
> if (xfs_is_zonegc_running(mp))
> frob();
Fine with me.
> > + * State of this gc_bio. Done means the current I/O completed.
> > + * Set from the bio end I/O handler, read from the GC thread.
> > + */
> > + unsigned long state;
> > +#define XFS_GC_BIO_NEW 0
> > +#define XFS_GC_BIO_DONE 1
>
> Are these bits, or a enum in disguise?
They are an enum in disguise (sounds like a great country song, to go
along with this recent programming theme metal song:
https://www.youtube.com/watch?v=yup8gIXxWDU)
>
> > +
> > + /*
> > + * Pointer to the inode and range of the inode that the GC is performed
> > + * for.
> > + */
> > + struct xfs_inode *ip;
> > + loff_t offset;
> > + unsigned int len;
>
> Are offset/len in bytes? It looks like they are.
Yes.
> > +xfs_zoned_need_gc(
> > + struct xfs_mount *mp)
> > +{
> > + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
> > + return false;
> > + if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
> > + mp->m_groups[XG_TYPE_RTG].blocks *
> > + (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
>
> Is the righthand side of the comparison the number of blocks in the
> zones that are open for userspace can write to?
Yes. m_max_open_zones is the maximum number of zones we can write to
at the same time. From that XFS_OPEN_GC_ZONES is deducted because GC
zones (there's only 1 right now) always use reserved blocks.
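So with, say, m_max_open_zones == 6 that makes GC kick in once fewer
than 5 zones worth of blocks are still available for user writes (the
6 is purely an example number).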
> > +struct xfs_zone_gc_iter {
> > + struct xfs_rtgroup *victim_rtg;
> > + unsigned int rec_count;
> > + unsigned int rec_idx;
> > + xfs_agblock_t next_startblock;
> > + struct xfs_rmap_irec recs[XFS_ZONE_GC_RECS];
> > +};
>
> Hmm, each xfs_rmap_irec is 32 bytes, so this structure consumes a little
> bit more than 32K of memory. How about 1023 records to be nicer to the
> slab allocator?
Sure.
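(Spelled out, based on the 32-byte figure quoted above: 1024 records *
32 bytes is exactly 32768 bytes for the array alone, so with the header
fields the struct lands just past 32K and the allocation gets rounded
up to the next power of two; 1023 records leaves enough headroom for
the header and keeps the whole thing within a 32K allocation.)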
> > +static int
> > +xfs_zone_gc_query_cb(
>
> This function gathers rmaps for file blocks to evacuate, right?
Yes.
>
> > + struct xfs_btree_cur *cur,
> > + const struct xfs_rmap_irec *irec,
> > + void *private)
> > +{
> > + struct xfs_zone_gc_iter *iter = private;
> > +
> > + ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
> > + ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
> > + ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
>
> I wonder if you actually want to return EFSCORRUPTED for these?
They could. OTOH returning all this on a rtrmap query is more than just
a corrupted file system, isn't it?
> > + const struct xfs_rmap_irec *recb = b;
> > + int64_t diff;
> > +
> > + diff = reca->rm_owner - recb->rm_owner;
> > + if (!diff)
> > + diff = reca->rm_offset - recb->rm_offset;
> > + return clamp(diff, -1, 1);
> > +}
>
> A silly trick I learned from Kent is that this avoids problems with
> unsigned comparisons and other weird C behavior:
>
> #define cmp_int(l, r) ((l > r) - (l < r))
Looks like that is used in a few places and would be nice to have
in kernel.h.
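FWIW, rewritten with that helper the comparator would look something
like this (sketch only, I made up the function name):

	#define cmp_int(l, r)	((l > r) - (l < r))

	static int
	xfs_zone_gc_rmap_rec_cmp(
		const void		*a,
		const void		*b)
	{
		const struct xfs_rmap_irec *reca = a;
		const struct xfs_rmap_irec *recb = b;
		int			diff;

		diff = cmp_int(reca->rm_owner, recb->rm_owner);
		if (!diff)
			diff = cmp_int(reca->rm_offset, recb->rm_offset);
		return diff;
	}

No int64_t truncation or unsigned wraparound to worry about, as each
comparison only ever yields -1, 0 or 1.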
> > + error = xfs_trans_alloc_empty(mp, &tp);
> > + if (error)
> > + return error;
> > +
> > + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> > + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
>
> Why join the rtrmap inode when this is an empty transaction?
Probably because I stupidly copy and pasted this from somewhere and
it didn't blow up? :)
> > +}
> > +
> > +/*
> > + * Iterate through all zones marked as reclaimable and find a candidate that is
> > + * either good enough for instant reclaim, or the one with the least used space.
>
> What is instant reclaim? Is there a non-instant(aneous) reclaim?
> Are we biasing towards reclaiming zones with fewer blocks to evacuate?
Instant reclaim is when the zone is used less than 1% and we just take
it instead of looking for the best candidate (least used blocks)
otherwise.
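Roughly, in pseudocode (the helper names and loop structure here are
illustrative, not the patch verbatim):

	for_each_reclaimable_zone(rtg) {
		if (rtg_used(rtg) < rtg_blocks(rtg) / 100)
			return rtg;	/* < 1% used: instant reclaim */
		if (rtg_used(rtg) < best_used) {
			best_used = rtg_used(rtg);
			best = rtg;	/* emptiest candidate so far */
		}
	}
	return best;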
> > +static struct xfs_open_zone *
> > +xfs_select_gc_zone(
>
> For what purpose are we selecting a gc zone? I guess this is the zone
> that we're evacuating blocks *into*? As opposed to choosing a zone to
> evacuate, which I think is what xfs_zone_reclaim_pick does?
Exactly.
> (This could use a short comment for readers to perform their own grok
> checking.)
Sure. And maybe we can also work on the naming to throw in more
consistent victim and target prefixes.
> > +
> > +static bool
> > +xfs_zone_gc_allocate(
>
> What are allocating here? The @data and the xfs_open_zone already
> exist, right? AFAICT we're really just picking a zone to evacuate into,
> and then returning the daddr/rtbcount so the caller can allocate a bio,
> right?
Yes, it allocates blocks from the gc zones. I.e. this is the GC
counterpart of xfs_zone_alloc_blocks. Maybe xfs_zone_gc_alloc_blocks
might be a better name?
> > + struct xfs_zone_gc_data *data = chunk->data;
> > + struct xfs_mount *mp = chunk->ip->i_mount;
> > + unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset;
> > + struct xfs_gc_bio *split_chunk;
> > +
> > + if (chunk->bio.bi_status)
> > + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
>
> Media errors happen, is there a gentler way to handle a read error
> besides shutting down the fs? We /do/ have all that infrastructure for
> retrying IOs.
We do have it, and as far as I can tell it's pretty useless. Retryable
errors are already retried by the device or driver, so once things bubble
up to the file system they tend to be fatal. So the only thing we do
with retrying here is to delay the inevitable trouble.
I'm actually looking into something related at the moment: for writes
XFS currently bubbles up write errors to the caller (dio) or stores
them in the mapping (buffered I/O), which for the latter means we lose
the pagecache because the dirty bits are cleared, but only users that
actually fsync or close will ever see it. And with modern media you
will only get these errors if shit really hit the fan. For normal
1 device XFS configurations we'll hit a metadata write error sooner
or later and shut the file system down, but with an external RT device
we don't and basically never shut down which is rather problematic.
So I'm tempted to add code to (at least optionally) shut down after
data write errors.
> > +static void
> > +xfs_zone_gc_finish_chunk(
> > + struct xfs_gc_bio *chunk)
> > +{
> > + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> > + struct xfs_inode *ip = chunk->ip;
> > + struct xfs_mount *mp = ip->i_mount;
> > + int error;
> > +
> > + if (chunk->bio.bi_status)
> > + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
>
> Can we pick a different zone and try again?
We could. But it will just fail again and we'll delay the failure
reporting to the upper layer, which would much rather know about that
and, say, move its data to a different node.
> > + /*
> > + * Cycle through the iolock and wait for direct I/O and layouts to
> > + * ensure no one is reading from the old mapping before it goes away.
> > + */
> > + xfs_ilock(ip, iolock);
> > + error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
> > + if (!error)
> > + inode_dio_wait(VFS_I(ip));
> > + xfs_iunlock(ip, iolock);
>
> But we drop the io/mmaplocks, which means someone can wander in and
> change the file before we get to xfs_zoned_end_io. Is that a problem?
No, that's why xfs_zoned_end_io has the special mode where the old
startblock is passed in by GC, and it won't remap when they mismatch.
xfs_zoned_end_extent has a comment describing it.
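I.e. conceptually something like this (my paraphrase of that comment,
not the actual code; the label and irec name are made up):

	/*
	 * If the file changed while the locks were dropped, the extent
	 * no longer starts at the block GC read the data from, so drop
	 * the relocated copy instead of clobbering the newer data.
	 */
	if (old_startblock != NULLFSBLOCK &&
	    old_startblock != got.br_startblock)
		goto skip_remap;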
> > +int
> > +xfs_zone_reset_sync(
> > + struct xfs_rtgroup *rtg)
> > +{
> > + int error = 0;
> > + struct bio bio;
> > +
> > + bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
> > + REQ_OP_ZONE_RESET);
> > + if (xfs_prepare_zone_reset(&bio, rtg))
> > + error = submit_bio_wait(&bio);
> > + bio_uninit(&bio);
> > +
> > + return error;
> > +}
>
> The only caller of this is in xfs_zone_alloc, maybe it belongs there?
I actually split it out recently so that we don't need a forward
declaration for xfs_zone_gc_data in xfs_zone_priv.h that was needed
previously and which is a bit ugly. It also conceptually is part of
GC, as it finishes off a GC process interrupted by a powerfail.
> TBH I sorta expected all the functions in here to be xfs_zonegc_XXX.
I can look into that.
> For us clueless dolts, it would be useful to have a comment somewhere
> explaining the high level operation of the garbage collector
Sure.
> -- it picks
> a non-empty zone to empty and a not-full zone to write into, queries the
> rmap to find all the space mappings, initiates a read of the disk
> contents, writes (or zone appends) the data to the new zone, then remaps
> the space in the file. When the zone becomes empty, it is reset.
Yes, I'll add something.
> > + struct xfs_zone_gc_data *data;
> > + struct xfs_zone_gc_iter *iter;
> > +
> > + data = xfs_zone_gc_data_alloc(mp);
> > + if (!data)
> > + return -ENOMEM;
>
> If we return ENOMEM here, who gets the return value from the thread
> function? I thought it was kthread_stop, and kthread_create only
> returns errors encountered while setting up the thread?
Hmm. I guess I can move it to the caller, although passing both the
data and iter will make it a bit complicated.
> > --- a/fs/xfs/xfs_zone_space_resv.c
> > +++ b/fs/xfs/xfs_zone_space_resv.c
> > @@ -159,6 +159,13 @@ xfs_zoned_reserve_available(
> > if (error != -ENOSPC)
> > break;
> >
> > + /*
> > + * If there is nothing left to reclaim, give up.
> > + */
> > + if (!xfs_is_in_gc(mp) &&
> > + !xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
> > + break;
>
> Should the caller try again with a different zone if this happens?
No zones involved at all at this level of code. We're before
taking iolock and just reserving space. But
!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) means there
literally isn't any reclaimable space left, and !xfs_is_in_gc means
there are also no more ongoing processes that might have taken the last
zone from reclaimable space, but haven't added it to the available
pool yet. I.e. this is the hard ENOSPC condition.
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 26/43] xfs: implement zoned garbage collection
2024-12-15 5:57 ` Christoph Hellwig
@ 2024-12-17 1:27 ` Darrick J. Wong
2024-12-17 4:06 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-17 1:27 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Sun, Dec 15, 2024 at 06:57:23AM +0100, Christoph Hellwig wrote:
> On Fri, Dec 13, 2024 at 02:18:51PM -0800, Darrick J. Wong wrote:
> > Can we do the garbage collection from userspace?
>
> Well, you can try, but it will be less efficient and more fragile. It'll
> probably also be very hard to make it not deadlock.
>
> > I've had a freespace
> > defragmenter banging around in my dev tree for years:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=defrag-freespace_2024-12-12
> >
> > Which has the nice property that it knows how to query the refcount
> > btree to try to move the most heavily shared blocks first. For zoned
> > that might not matter since we /must/ evacuate the whole zone.
>
> Is moving heavily shared blocks first actually a good idea? It is a
> lot more work to move them and generates more metadata vs moving unshared
> blocks. That being said it at least handles reflinks, which this currently
> doesn't. I'll take a look at it for ideas on implementing shared block
> support for the GC code.
Hrmm. For defragmenting free space, I thought it was best to move the
most highly shared extents first to increase the likelihood that the new
space allocation would be contiguous and not contribute to bmbt
expansion.
For zone gc we have to clear out the whole rtgroup and we don't have a
/lot/ of control so maybe that matters less. OTOH we know how much
space we can get out of the zone, so
> > Regardless, it could be nice to have a userspace process that we could
> > trigger from the kernel at some threshold (e.g. 70% space used) to see
> > if it can clean out some zones before the kernel one kicks in and slows
> > everyone down.
>
> As said above I'm not sold on doing the work in userspace. But adding
> config knobs to start GC earlier is on Hans' TODO list, and being able
> to force it also sounds useful for some use case. I also suspect that
> reusing some of this code, but driving it from the bmap btree instead
> of the rmap one could be really nice for file mapping defragmentation.
<nod> I'd definitely give the in-kernel gc a means to stop the userspace
gc if the zone runs out of space and it clearly isn't making progress.
The tricky part is how do we give the userspace gc one of the "gc
zones"?
> > > - struct xfs_extent_busy_tree *xg_busy_extents;
> > > + union {
> > > + /*
> > > + * Track freed but not yet committed extents.
> > > + */
> > > + struct xfs_extent_busy_tree *xg_busy_extents;
> > > +
> > > + /*
> > > + * List of groups that need a zone reset for zoned file systems.
> > > + */
> > > + struct xfs_group *xg_next_reset;
> > > + };
> >
> > Don't we need busy extents for zoned rtgroups? I was under the
> > impression that the busy extents code prevents us from reallocating
> > recently freed space until the EFI (and hence the bunmapi) transaction
> > are persisted to the log so that new contents written after a
> > reallocation + write + fdatasync won't reappear in the old file?
>
> Yes, but remember blocks can't be reused in a zoned file system until
> the zone has been reset. And xfs_reset_zones forces a flush on the
> RT device before starting the current batch of resets, and then also
> forces the log out so that all transactions that touched the rmap inode
> (which includes the EFI transaction) are forced to disk.
Ah, right! Would you mind putting that in a comment somewhere?
/*
* List of groups that need a zone reset. The zonegc code
* forces a log flush of the rtrmap inode before resetting the
* write pointer, so we don't need busy extent tracking.
*/
> > > @@ -592,6 +594,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
> > > #endif /* CONFIG_XFS_QUOTA */
> > > __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
> > > __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
> > > +__XFS_IS_OPSTATE(in_gc, IN_GC)
> >
> > Nit: I might've called this ZONEGC_RUNNING.
> >
> > if (xfs_is_zonegc_running(mp))
> > frob();
>
> Fine with me.
>
> > > + * State of this gc_bio. Done means the current I/O completed.
> > > + * Set from the bio end I/O handler, read from the GC thread.
> > > + */
> > > + unsigned long state;
> > > +#define XFS_GC_BIO_NEW 0
> > > +#define XFS_GC_BIO_DONE 1
> >
> > Are these bits, or a enum in disguise?
>
> They are an enum in disguise (sounds like a great country song, to go
> along with this recent programming theme metal song:
>
> https://www.youtube.com/watch?v=yup8gIXxWDU)
Sorry I spittook all over the keyboard and now I hva to go clen it up.
> >
> > > +
> > > + /*
> > > + * Pointer to the inode and range of the inode that the GC is performed
> > > + * for.
> > > + */
> > > + struct xfs_inode *ip;
> > > + loff_t offset;
> > > + unsigned int len;
> >
> > Are offset/len in bytes? It looks like they are.
>
> Yes.
>
> > > +xfs_zoned_need_gc(
> > > + struct xfs_mount *mp)
> > > +{
> > > + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
> > > + return false;
> > > + if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
> > > + mp->m_groups[XG_TYPE_RTG].blocks *
> > > + (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
> >
> > Is the righthand side of the comparison the number of blocks in the
> > zones that are open for userspace can write to?
>
> Yes. m_max_open_zones is the maximum number of zones we can write to
> at the same time. From that XFS_OPEN_GC_ZONES is deducted because GC
> zones (there's only 1 right now) always use reserved blocks.
>
> > > +struct xfs_zone_gc_iter {
> > > + struct xfs_rtgroup *victim_rtg;
> > > + unsigned int rec_count;
> > > + unsigned int rec_idx;
> > > + xfs_agblock_t next_startblock;
> > > + struct xfs_rmap_irec recs[XFS_ZONE_GC_RECS];
> > > +};
> >
> > Hmm, each xfs_rmap_irec is 32 bytes, so this structure consumes a little
> > bit more than 32K of memory. How about 1023 records to be nicer to the
> > slab allocator?
>
> Sure.
>
> > > +static int
> > > +xfs_zone_gc_query_cb(
> >
> > This function gathers rmaps for file blocks to evacuate, right?
>
> Yes.
>
> >
> > > + struct xfs_btree_cur *cur,
> > > + const struct xfs_rmap_irec *irec,
> > > + void *private)
> > > +{
> > > + struct xfs_zone_gc_iter *iter = private;
> > > +
> > > + ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
> > > + ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
> > > + ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
> >
> > I wonder if you actually want to return EFSCORRUPTED for these?
>
> They could. OTOH returning all this on a rtrmap query is more than just
> a corrupted file system, isn't it?
Oh yeah, I forgot that xfs_rmap_get_rec has its own verifiers and will
return EFSCORRUPTED for all three conditions. Ok never mind then. :)
> > > + const struct xfs_rmap_irec *recb = b;
> > > + int64_t diff;
> > > +
> > > + diff = reca->rm_owner - recb->rm_owner;
> > > + if (!diff)
> > > + diff = reca->rm_offset - recb->rm_offset;
> > > + return clamp(diff, -1, 1);
> > > +}
> >
> > A silly trick I learned from Kent is that this avoids problems with
> > unsigned comparisons and other weird C behavior:
> >
> > #define cmp_int(l, r) ((l > r) - (l < r))
>
> Looks like that is used in a few places and would be nice to have
> in kernel.h.
>
> > > + error = xfs_trans_alloc_empty(mp, &tp);
> > > + if (error)
> > > + return error;
> > > +
> > > + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> > > + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
> >
> > Why join the rtrmap inode when this is an empty transaction?
>
> Probably because I stupidly copy and pasted this from somewhere and
> it didn't blow up? :)
Well you didn't dirty the inode (or the transaction) so I guess that is
actually allowed. :)
> > > +}
> > > +
> > > +/*
> > > + * Iterate through all zones marked as reclaimable and find a candidate that is
> > > + * either good enough for instant reclaim, or the one with the least used space.
> >
> > What is instant reclaim? Is there a non-instant(aneous) reclaim?
> > Are we biasing towards reclaiming zones with fewer blocks to evacuate?
>
> Instant reclaim is when the zone is used less than 1% and we just take
> it instead of looking for the best candidate (least used blocks)
> otherwise.
Ah, ok.
> > > +static struct xfs_open_zone *
> > > +xfs_select_gc_zone(
> >
> > For what purpose are we selecting a gc zone? I guess this is the zone
> > that we're evacuating blocks *into*? As opposed to choosing a zone to
> > evacuate, which I think is what xfs_zone_reclaim_pick does?
>
> Exactly.
>
> > (This could use a short comment for readers to perform their own grok
> > checking.)
>
> Sure. And maybe we can also work on the naming to throw in more
> consistent victim and target prefixes.
<nod>
> > > +
> > > +static bool
> > > +xfs_zone_gc_allocate(
> >
> > What are allocating here? The @data and the xfs_open_zone already
> > exist, right? AFAICT we're really just picking a zone to evacuate into,
> > and then returning the daddr/rtbcount so the caller can allocate a bio,
> > right?
>
> Yes, it allocates blocks from the gc zones. I.e. this is the GC
> counterpart of xfs_zone_alloc_blocks. Maybe xfs_zone_gc_alloc_blocks
> might be a better name?
<nod>
> > > + struct xfs_zone_gc_data *data = chunk->data;
> > > + struct xfs_mount *mp = chunk->ip->i_mount;
> > > + unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset;
> > > + struct xfs_gc_bio *split_chunk;
> > > +
> > > + if (chunk->bio.bi_status)
> > > + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
> >
> > Media errors happen, is there a gentler way to handle a read error
> > besides shutting down the fs? We /do/ have all that infrastructure for
> > retrying IOs.
>
> We do have it, and as far as I can tell it's pretty useless. Retryable
> errors are already retried by the device or driver, so once things bubble
> up to the file system they tend to be fatal. So the only thing we do
> with retrying here is to delay the inevitable trouble.
>
> I'm actually looking into something related at the moment: for writes
> XFS currently bubbles up write errors to the caller (dio) or stores
> them in the mapping (buffered I/O), which for the latter means we lose
> the pagecache because the dirty bits are cleared, but only users that
> actually fsync or close will ever see it. And with modern media you
> will only get these errors if shit really hit the fan. For normal
> 1 device XFS configurations we'll hit a metadata write error sooner
> or later and shut the file system down, but with an external RT device
> we don't and basically never shut down which is rather problematic.
> So I'm tempted to add code to (at least optionally) shut down after
> data write errors.
It would be kinda nice if we could report write(back) errors via
fanotify, but that's buried so deep in the filesystems that it seems
tricky.
> > > +static void
> > > +xfs_zone_gc_finish_chunk(
> > > + struct xfs_gc_bio *chunk)
> > > +{
> > > + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> > > + struct xfs_inode *ip = chunk->ip;
> > > + struct xfs_mount *mp = ip->i_mount;
> > > + int error;
> > > +
> > > + if (chunk->bio.bi_status)
> > > + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
> >
> > Can we pick a different zone and try again?
>
> We could. But it will just fail again and we'll delay the failure
> reporting to the upper layer, which would much rather know about that
> and, say, move its data to a different node.
<nod>
> > > + /*
> > > + * Cycle through the iolock and wait for direct I/O and layouts to
> > > + * ensure no one is reading from the old mapping before it goes away.
> > > + */
> > > + xfs_ilock(ip, iolock);
> > > + error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
> > > + if (!error)
> > > + inode_dio_wait(VFS_I(ip));
> > > + xfs_iunlock(ip, iolock);
> >
> > But we drop the io/mmaplocks, which means someone can wander in and
> > change the file before we get to xfs_zoned_end_io. Is that a problem?
>
> No, that's why xfs_zoned_end_io has the special mode where the old
> startblock is passed in by GC, and it won't remap when they mismatch.
> xfs_zoned_end_extent has a comment describing it.
ah ok.
> > > +int
> > > +xfs_zone_reset_sync(
> > > + struct xfs_rtgroup *rtg)
> > > +{
> > > + int error = 0;
> > > + struct bio bio;
> > > +
> > > + bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
> > > + REQ_OP_ZONE_RESET);
> > > + if (xfs_prepare_zone_reset(&bio, rtg))
> > > + error = submit_bio_wait(&bio);
> > > + bio_uninit(&bio);
> > > +
> > > + return error;
> > > +}
> >
> > The only caller of this is in xfs_zone_alloc, maybe it belongs there?
>
> I actually split it out recently so that we don't need a forward
> declaration for xfs_zone_gc_data in xfs_zone_priv.h that was needed
> previously and which is a bit ugly. It also conceptually is part of
> GC, as it finishes off a GC process interrupted by a powerfail.
>
> > TBH I sorta expected all the functions in here to be xfs_zonegc_XXX.
>
> I can look into that.
>
> > For us clueless dolts, it would be useful to have a comment somewhere
> > explaining the high level operation of the garbage collector
>
> Sure.
>
> > -- it picks
> > a non-empty zone to empty and a not-full zone to write into, queries the
> > rmap to find all the space mappings, initiates a read of the disk
> > contents, writes (or zone appends) the data to the new zone, then remaps
> > the space in the file. When the zone becomes empty, it is reset.
>
> Yes, I'll add something.
>
> > > + struct xfs_zone_gc_data *data;
> > > + struct xfs_zone_gc_iter *iter;
> > > +
> > > + data = xfs_zone_gc_data_alloc(mp);
> > > + if (!data)
> > > + return -ENOMEM;
> >
> > If we return ENOMEM here, who gets the return value from the thread
> > function? I thought it was kthread_stop, and kthread_create only
> > returns errors encountered while setting up the thread?
>
> Hmm. I guess I can move it to the caller, although passing both the
> data and iter will make it a bit complicated.
>
> > > --- a/fs/xfs/xfs_zone_space_resv.c
> > > +++ b/fs/xfs/xfs_zone_space_resv.c
> > > @@ -159,6 +159,13 @@ xfs_zoned_reserve_available(
> > > if (error != -ENOSPC)
> > > break;
> > >
> > > + /*
> > > + * If there is nothing left to reclaim, give up.
> > > + */
> > > + if (!xfs_is_in_gc(mp) &&
> > > + !xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
> > > + break;
> >
> > Should the caller try again with a different zone if this happens?
>
> No zones involved at all at this level of code. We're before
> taking iolock and just reserving space. But
> !xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) means there
> literally isn't any reclaimable space left, and !xfs_is_in_gc means
> there are also no more ongoing processes that might have taken the last
> zone from reclaimable space, but haven't added it to the available
> pool yet. I.e. this is the hard ENOSPC condition.
Ah ok. Thanks for explaining. :D
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 26/43] xfs: implement zoned garbage collection
2024-12-17 1:27 ` Darrick J. Wong
@ 2024-12-17 4:06 ` Christoph Hellwig
2024-12-17 17:42 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-17 4:06 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Mon, Dec 16, 2024 at 05:27:53PM -0800, Darrick J. Wong wrote:
> > lot more work to move them and generates more metadata vs moving unshared
> > blocks. That being said it at least handles reflinks, which this currently
> > doesn't. I'll take a look at it for ideas on implementing shared block
> > support for the GC code.
>
> Hrmm. For defragmenting free space, I thought it was best to move the
> most highly shared extents first to increase the likelihood that the new
> space allocation would be contiguous and not contribute to bmbt
> expansion.
How does moving a highly shared extent vs a less shared extent help
with keeping free space contiguous? What matters for that in a non-zoned
interface is that the extent is between two free space or soon to be
free space extents, but the amount of sharing shouldn't really matter.
> For zone gc we have to clear out the whole rtgroup and we don't have a
> /lot/ of control so maybe that matters less. OTOH we know how much
> space we can get out of the zone, so
But yes, independent of the above question, freespace for the zone
allocator is always very contiguous.
> <nod> I'd definitely give the in-kernel gc a means to stop the userspace
> gc if the zone runs out of space and it clearly isn't making progress.
> The tricky part is how do we give the userspace gc one of the "gc
> zones"?
Yes. And how do we kill it when it doesn't act in time? How do we
even ensure it acts in time? How do we deal with userspace GC not
running or getting killed?
I have to say all my experiments with user space upcalls for activity
triggered by kernel fast path and memory reclaim activity have been
overwhelmingly negative. I won't NAK it if someone wants to experiment,
but I don't plan to spend my time on it.
> Ah, right! Would you mind putting that in a comment somewhere?
Will do.
> > 1 device XFS configurations we'll hit a metadata write error sooner
> > or later and shut the file system down, but with an external RT device
> > we don't and basically never shut down which is rather problematic.
> > So I'm tempted to add code to (at least optionally) shut down after
> > data write errors.
>
> It would be kinda nice if we could report write(back) errors via
> fanotify, but that's buried so deep in the filesystems that it seems
> tricky.
Reporting that would be more useful than just the shutdown. How we get
it there, on the other hand, might be a bit hard.
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 26/43] xfs: implement zoned garbage collection
2024-12-17 4:06 ` Christoph Hellwig
@ 2024-12-17 17:42 ` Darrick J. Wong
2024-12-18 7:13 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-17 17:42 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Tue, Dec 17, 2024 at 05:06:55AM +0100, Christoph Hellwig wrote:
> On Mon, Dec 16, 2024 at 05:27:53PM -0800, Darrick J. Wong wrote:
> > > lot more work to move them and generates more metadata vs moving unshared
> > > blocks. That being said it at least handles reflinks, which this currently
> > > doesn't. I'll take a look at it for ideas on implementing shared block
> > > support for the GC code.
> >
> > Hrmm. For defragmenting free space, I thought it was best to move the
> > most highly shared extents first to increase the likelihood that the new
> > space allocation would be contiguous and not contribute to bmbt
> > expansion.
>
> How does moving a highly shared extent vs a less shared extent help
> with keeping free space contiguous? What matters for that in a non-zoned
> interface is that the extent is between two free space or soon to be
> free space extents, but the amount of sharing shouldn't really matter.
It might help if I mention that the clearspace code I wrote is given a
range of device daddrs to evacuate, so it tries to make *that range*
contiguous and free, possibly at the expense of other parts of the
filesystem. Initially I wrote it to support evacuating near EOFS so
that you could shrink the filesystem, but Ted and others mentioned that
it can be more generally useful to recover after some database
compresses its table files and fragments the free space.
So I'm not defragmenting in the xfs_fsr sense, and maybe I should just
call it free space evacuation. If the daddr range you want to evac
contains 1x 200MB extent shared 1000 times; and 10,000 fragmented 8k
blocks, you might want to move the 200MB extent (and all 1000 mappings)
first to try to keep that contiguous. If moving the 8k fragments fails,
at least you cleared out 200MB of it.
> > For zone gc we have to clear out the whole rtgroup and we don't have a
> > /lot/ of control so maybe that matters less. OTOH we know how much
> > space we can get out of the zone, so
>
> But yes, independent of the above question, freespace for the zone
> allocator is always very contiguous.
>
> > <nod> I'd definitely give the in-kernel gc a means to stop the userspace
> > gc if the zone runs out of space and it clearly isn't making progress.
> > The tricky part is how do we give the userspace gc one of the "gc
> > zones"?
>
> Yes. And how do we kill it when it doesn't act in time? How do we
> even ensure it acts in time. How do we deal with userspace GC not
> running or getting killed?
>
> I have to say all my experiments with user space call ups for activity
> triggered by kernel fast path and memory reclaim activity have been
> overwhelmingly negative. I won't NAK any of someone wants to experiment,
> but I don't plan to spend my time on it.
<nod> That was mostly built on the speculation that on a device with
130,000 zones, there probably aren't so many writer threads that we
couldn't add another gc process to clean out a few zones. But that's
all highly speculative food for the roadmap.
> > Ah, right! Would you mind putting that in a comment somewhere?
>
> Will do.
>
> > > 1 device XFS configurations we'll hit a metadata write error sooner
> > > or later and shut the file system down, but with an external RT device
> > > we don't and basically never shut down which is rather problematic.
> > > So I'm tempted to add code to (at least optionally) shut down after
> > > data write errors.
> >
> > It would be kinda nice if we could report write(back) errors via
> > fanotify, but that's buried so deep in the filesystems that seems
> > tricky.
>
> Reporting that is more useful than just the shutdown would be useful.
> How we get it on the other hand might be a bit hard.
Yeah. The experimental healthmon code further down in my dev tree
explores that a little, but we'll see how everyone reacts to it. ;)
Also: while I was poking around with Felipe's ficlone/swapon test it
occurred to me -- does freezing the fs actually get the zonegc kthread
to finish up whatever work is in-flight at that moment?
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 26/43] xfs: implement zoned garbage collection
2024-12-17 17:42 ` Darrick J. Wong
@ 2024-12-18 7:13 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-18 7:13 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Tue, Dec 17, 2024 at 09:42:33AM -0800, Darrick J. Wong wrote:
> Also: while I was poking around with Felipe's ficlone/swapon test it
> occurred to me -- does freezing the fs actually get the zonegc kthread
> to finish up whatever work is in-flight at that moment?
Looking at the code it probably does not. Let me see if I can come up
with a test to expose that, i.e. heavy GC activity, freeze, mark the
underlying device RO and see if something explodes (based on my reading
it should right now).
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 27/43] xfs: implement buffered writes to zoned RT devices
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (25 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 26/43] xfs: implement zoned garbage collection Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-13 22:37 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 28/43] xfs: implement direct " Christoph Hellwig
` (15 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Implement buffered writes including page faults and block zeroing for
zoned RT devices. Buffered writes to zoned RT devices are split into
three phases:
1) a reservation for the worst case data block usage is taken before
acquiring the iolock. When there are enough free blocks but not
enough available ones, garbage collection is kicked off to free the
space before continuing with the write. If there isn't enough
freeable space, the block reservation is reduced and a short write
will happen as expected by normal Linux write semantics.
2) with the iolock held, the generic iomap buffered write code is
called, which through the iomap_begin operation usually just inserts
delalloc extents for the range in a single iteration. Only for
overwrites of existing data that are not block aligned, or zeroing
operations the existing extent mapping is read to fill out the srcmap
and to figure out if zeroing is required.
3) the ->map_blocks callback to the generic iomap writeback code
calls into the zoned space allocator to actually allocate on-disk
space for the range before kicking of the writeback.
Note that because all writes are out of place, truncate or hole punches
that are not aligned to block size boundaries need to allocate space.
For block zeroing from truncate, ->setattr is called with the iolock
(aka i_rwsem) already held, so a hacky deviation from the above
scheme is needed. In this case the space reservations is called with
the iolock held, but is required not to block and can dip into the
reserved block pool. This can lead to -ENOSPC when truncating a
file, which is unfortunate. But fixing the calling conventions in
the VFS is probably much easier with code requiring it already in
mainline.
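As a rough sketch the three phases line up like this in the write path
(the function names are from this patch, the glue around them is
simplified):

	/* 1: worst case block reservation, may kick off GC */
	error = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
	if (error)
		return error;

	xfs_ilock(ip, XFS_IOLOCK_EXCL);
	/* 2: plain iomap buffered write, inserts delalloc extents */
	ret = iomap_file_buffered_write(iocb, from, ...);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	/* unused parts of the reservation are returned here */

	/*
	 * 3: later, writeback calls ->map_blocks (xfs_zoned_map_blocks)
	 * and the zoned allocator picks the on-disk location at bio
	 * submission time.
	 */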
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_aops.c | 134 +++++++++++++++++++++++++--
fs/xfs/xfs_bmap_util.c | 21 +++--
fs/xfs/xfs_bmap_util.h | 12 ++-
fs/xfs/xfs_file.c | 202 +++++++++++++++++++++++++++++++++++++----
fs/xfs/xfs_iomap.c | 186 ++++++++++++++++++++++++++++++++++++-
fs/xfs/xfs_iomap.h | 6 +-
fs/xfs/xfs_iops.c | 31 ++++++-
fs/xfs/xfs_reflink.c | 2 +-
fs/xfs/xfs_trace.h | 1 +
9 files changed, 550 insertions(+), 45 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index d35ac4c19fb2..67392413216b 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (c) 2016-2023 Christoph Hellwig.
* All Rights Reserved.
*/
#include "xfs.h"
@@ -19,6 +19,8 @@
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_rtgroup.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -85,6 +87,7 @@ xfs_end_ioend(
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
+ bool is_zoned = xfs_is_zoned_inode(ip);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
@@ -115,9 +118,10 @@ xfs_end_ioend(
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_IOEND_SHARED) {
+ ASSERT(!is_zoned);
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
- offset + size);
+ offset + size, NULL);
}
goto done;
}
@@ -125,7 +129,10 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
- if (ioend->io_flags & IOMAP_IOEND_SHARED)
+ if (is_zoned)
+ error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
+ NULLFSBLOCK);
+ else if (ioend->io_flags & IOMAP_IOEND_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
@@ -175,17 +182,27 @@ xfs_end_io(
}
}
-STATIC void
+static void
xfs_end_bio(
struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
unsigned long flags;
+ /*
+ * For Appends record the actually written block number and set the
+ * boundary flag if needed.
+ */
+ if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
+ ioend->io_sector = bio->bi_iter.bi_sector;
+ xfs_mark_rtg_boundary(ioend);
+ }
+
spin_lock_irqsave(&ip->i_ioend_lock, flags);
if (list_empty(&ip->i_ioend_list))
- WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+ WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
&ip->i_ioend_work));
list_add_tail(&ioend->io_list, &ip->i_ioend_list);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
@@ -462,7 +479,7 @@ xfs_discard_folio(
* folio itself and not the start offset that is passed in.
*/
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
- folio_pos(folio) + folio_size(folio));
+ folio_pos(folio) + folio_size(folio), NULL);
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
@@ -471,14 +488,117 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
.discard_folio = xfs_discard_folio,
};
+struct xfs_zoned_writepage_ctx {
+ struct iomap_writepage_ctx ctx;
+ struct xfs_open_zone *open_zone;
+};
+
+static inline struct xfs_zoned_writepage_ctx *
+XFS_ZWPC(struct iomap_writepage_ctx *ctx)
+{
+ return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
+}
+
+static int
+xfs_zoned_map_blocks(
+ struct iomap_writepage_ctx *wpc,
+ struct inode *inode,
+ loff_t offset,
+ unsigned int len)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
+ xfs_filblks_t count_fsb;
+ struct xfs_bmbt_irec imap, del;
+ struct xfs_iext_cursor icur;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
+
+ /*
+ * All dirty data must be covered by delalloc extents. But truncate can
+ * remove delalloc extents underneath us or reduce their size.
+ * Returning a hole tells iomap to not write back any data from this
+ * range, which is the right thing to do in that case.
+ *
+ * Otherwise just tell iomap to treat ranges previously covered by a
+ * delalloc extent as mapped. The actual block allocation will be done
+ * just before submitting the bio.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
+ imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ if (imap.br_startoff > offset_fsb) {
+ imap.br_blockcount = imap.br_startoff - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
+ return 0;
+ }
+ end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+ count_fsb = end_fsb - offset_fsb;
+
+ del = imap;
+ xfs_trim_extent(&del, offset_fsb, count_fsb);
+ xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
+ XFS_BMAPI_REMAP);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ wpc->iomap.type = IOMAP_MAPPED;
+ wpc->iomap.flags = IOMAP_F_DIRTY;
+ wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
+ wpc->iomap.offset = offset;
+ wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
+ wpc->iomap.flags = IOMAP_F_ZONE_APPEND;
+ wpc->iomap.addr = 0;
+
+ trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
+ return 0;
+}
+
+static int
+xfs_zoned_submit_ioend(
+ struct iomap_writepage_ctx *wpc,
+ int status)
+{
+ wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
+ if (status)
+ return status;
+ xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
+ return 0;
+}
+
+static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
+ .map_blocks = xfs_zoned_map_blocks,
+ .submit_ioend = xfs_zoned_submit_ioend,
+ .discard_folio = xfs_discard_folio,
+};
+
STATIC int
xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct xfs_inode *ip = XFS_I(mapping->host);
struct xfs_writepage_ctx wpc = { };
+ int error;
- xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ if (xfs_is_zoned_inode(ip)) {
+ struct xfs_zoned_writepage_ctx xc = { };
+
+ error = iomap_writepages(mapping, wbc, &xc.ctx,
+ &xfs_zoned_writeback_ops);
+ if (xc.open_zone)
+ xfs_open_zone_put(xc.open_zone);
+ return error;
+ }
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index c623688e457c..c87422de2d77 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -30,6 +30,7 @@
#include "xfs_reflink.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
/* Kernel only BMAP related definitions and functions */
@@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
int whichfork,
xfs_off_t start_byte,
- xfs_off_t end_byte)
+ xfs_off_t end_byte,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -467,7 +469,10 @@ xfs_bmap_punch_delalloc_range(
continue;
}
- xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del, 0);
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del,
+ ac ? XFS_BMAPI_REMAP : 0);
+ if (xfs_is_zoned_inode(ip) && ac)
+ ac->reserved_blocks += del.br_blockcount;
if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
@@ -582,7 +587,7 @@ xfs_free_eofblocks(
if (ip->i_delayed_blks) {
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
- LLONG_MAX);
+ LLONG_MAX, NULL);
}
xfs_inode_clear_eofblocks_tag(ip);
return 0;
@@ -825,7 +830,8 @@ int
xfs_free_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len)
+ xfs_off_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t startoffset_fsb;
@@ -880,7 +886,7 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
- error = xfs_zero_range(ip, offset, len, NULL);
+ error = xfs_zero_range(ip, offset, len, ac, NULL);
if (error)
return error;
@@ -968,7 +974,8 @@ int
xfs_collapse_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len)
+ xfs_off_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
@@ -981,7 +988,7 @@ xfs_collapse_file_space(
trace_xfs_collapse_file_space(ip);
- error = xfs_free_file_space(ip, offset, len);
+ error = xfs_free_file_space(ip, offset, len, ac);
if (error)
return error;
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index b29760d36e1a..c477b3361630 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -15,6 +15,7 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
struct xfs_bmalloca;
+struct xfs_zone_alloc_ctx;
#ifdef CONFIG_XFS_RT
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
@@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
#endif /* CONFIG_XFS_RT */
void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
- xfs_off_t start_byte, xfs_off_t end_byte);
+ xfs_off_t start_byte, xfs_off_t end_byte,
+ struct xfs_zone_alloc_ctx *ac);
struct kgetbmap {
__s64 bmv_offset; /* file offset of segment in blocks */
@@ -54,13 +56,13 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
/* preallocation and hole punch interface */
int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len);
/* EOF block manipulation functions */
bool xfs_can_free_eofblocks(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 827f7819df6a..195cf60a81b0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -25,6 +25,7 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
+#include "xfs_zone_alloc.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -360,7 +361,8 @@ xfs_file_write_zero_eof(
struct iov_iter *from,
unsigned int *iolock,
size_t count,
- bool *drained_dio)
+ bool *drained_dio,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
loff_t isize;
@@ -414,7 +416,7 @@ xfs_file_write_zero_eof(
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
return error;
@@ -431,7 +433,8 @@ STATIC ssize_t
xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
- unsigned int *iolock)
+ unsigned int *iolock,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
@@ -481,7 +484,7 @@ xfs_file_write_checks(
*/
if (iocb->ki_pos > i_size_read(inode)) {
error = xfs_file_write_zero_eof(iocb, from, iolock, count,
- &drained_dio);
+ &drained_dio, ac);
if (error == 1)
goto restart;
if (error)
@@ -491,6 +494,48 @@ xfs_file_write_checks(
return kiocb_modified(iocb);
}
+static ssize_t
+xfs_zoned_write_space_reserve(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int flags,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ loff_t count = iov_iter_count(from);
+ int error;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ flags |= XFS_ZR_NOWAIT;
+
+ /*
+ * Check the rlimit and LFS boundary first so that we don't over-reserve
+ * by possibly a lot.
+ *
+ * The generic write path will redo this check later, and it might have
+ * changed by then. If it got expanded we'll stick to our earlier
+ * smaller limit, and if it is decreased the new smaller limit will be
+ * used and our extra space reservation will be returned after finishing
+ * the write.
+ */
+ error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
+ if (error)
+ return error;
+
+ /*
+ * Sloppily round up count to file system blocks.
+ *
+ * This will often reserve an extra block, but that avoids having to look
+ * at the start offset, which isn't stable for O_APPEND until taking the
+ * iolock. Also we need to reserve a block each for zeroing the old
+ * EOF block and the new start block if they are unaligned.
+ *
+ * Any remaining block will be returned after the write.
+ */
+ return xfs_zoned_space_reserve(ip,
+ XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
+}
+
static int
xfs_dio_write_end_io(
struct kiocb *iocb,
@@ -597,7 +642,7 @@ xfs_file_dio_write_aligned(
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out_unlock;
@@ -675,7 +720,7 @@ xfs_file_dio_write_unaligned(
goto out_unlock;
}
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out_unlock;
@@ -749,7 +794,7 @@ xfs_file_dax_write(
ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
@@ -793,7 +838,7 @@ xfs_file_buffered_write(
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
@@ -840,6 +885,67 @@ xfs_file_buffered_write(
return ret;
}
+STATIC ssize_t
+xfs_file_buffered_write_zoned(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int iolock = XFS_IOLOCK_EXCL;
+ bool cleared_space = false;
+ struct xfs_zone_alloc_ctx ac = { };
+ ssize_t ret;
+
+ ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
+ if (ret < 0)
+ return ret;
+
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ goto out_unreserve;
+
+ ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Truncate the iter to the length that we were actually able to
+ * allocate blocks for. This needs to happen after
+ * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
+ * writes.
+ */
+ iov_iter_truncate(from,
+ XFS_FSB_TO_B(mp, ac.reserved_blocks) -
+ (iocb->ki_pos & mp->m_blockmask));
+ if (!iov_iter_count(from))
+ goto out_unlock;
+
+retry:
+ trace_xfs_file_buffered_write(iocb, from);
+ ret = iomap_file_buffered_write(iocb, from,
+ &xfs_buffered_write_iomap_ops, &ac);
+ if (ret == -ENOSPC && !cleared_space) {
+ /*
+ * Kick off writeback to convert delalloc space and release the
+ * usually too pessimistic indirect block reservations.
+ */
+ xfs_flush_inodes(mp);
+ cleared_space = true;
+ goto retry;
+ }
+
+out_unlock:
+ xfs_iunlock(ip, iolock);
+out_unreserve:
+ xfs_zoned_space_unreserve(ip, &ac);
+ if (ret > 0) {
+ XFS_STATS_ADD(mp, xs_write_bytes, ret);
+ ret = generic_write_sync(iocb, ret);
+ }
+ return ret;
+}
+
STATIC ssize_t
xfs_file_write_iter(
struct kiocb *iocb,
@@ -887,6 +993,8 @@ xfs_file_write_iter(
return ret;
}
+ if (xfs_is_zoned_inode(ip))
+ return xfs_file_buffered_write_zoned(iocb, from);
return xfs_file_buffered_write(iocb, from);
}
@@ -941,7 +1049,8 @@ static int
xfs_falloc_collapse_range(
struct file *file,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
loff_t new_size = i_size_read(inode) - len;
@@ -957,7 +1066,7 @@ xfs_falloc_collapse_range(
if (offset + len >= i_size_read(inode))
return -EINVAL;
- error = xfs_collapse_file_space(XFS_I(inode), offset, len);
+ error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
@@ -1013,7 +1122,8 @@ xfs_falloc_zero_range(
struct file *file,
int mode,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
unsigned int blksize = i_blocksize(inode);
@@ -1026,7 +1136,7 @@ xfs_falloc_zero_range(
if (error)
return error;
- error = xfs_free_file_space(XFS_I(inode), offset, len);
+ error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
@@ -1107,12 +1217,29 @@ xfs_file_fallocate(
struct xfs_inode *ip = XFS_I(inode);
long error;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ struct xfs_zone_alloc_ctx ac = { };
if (!S_ISREG(inode->i_mode))
return -EINVAL;
if (mode & ~XFS_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
+ /*
+ * For zoned file systems, zeroing the first and last block of a hole
+ * punch requires allocating a new block to rewrite the remaining data
+ * and new zeroes out of place. Get a reservation for those before
+ * taking the iolock. Dip into the reserved pool because we are
+ * expected to be able to punch a hole even on a completely full
+ * file system.
+ */
+ if (xfs_is_zoned_inode(ip) &&
+ (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
+ FALLOC_FL_COLLAPSE_RANGE))) {
+ error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
+ if (error)
+ return error;
+ }
+
xfs_ilock(ip, iolock);
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
@@ -1133,16 +1260,16 @@ xfs_file_fallocate(
switch (mode & FALLOC_FL_MODE_MASK) {
case FALLOC_FL_PUNCH_HOLE:
- error = xfs_free_file_space(ip, offset, len);
+ error = xfs_free_file_space(ip, offset, len, &ac);
break;
case FALLOC_FL_COLLAPSE_RANGE:
- error = xfs_falloc_collapse_range(file, offset, len);
+ error = xfs_falloc_collapse_range(file, offset, len, &ac);
break;
case FALLOC_FL_INSERT_RANGE:
error = xfs_falloc_insert_range(file, offset, len);
break;
case FALLOC_FL_ZERO_RANGE:
- error = xfs_falloc_zero_range(file, mode, offset, len);
+ error = xfs_falloc_zero_range(file, mode, offset, len, &ac);
break;
case FALLOC_FL_UNSHARE_RANGE:
error = xfs_falloc_unshare_range(file, mode, offset, len);
@@ -1160,6 +1287,8 @@ xfs_file_fallocate(
out_unlock:
xfs_iunlock(ip, iolock);
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_space_unreserve(ip, &ac);
return error;
}
@@ -1485,9 +1614,10 @@ xfs_dax_read_fault(
* i_lock (XFS - extent map serialisation)
*/
static vm_fault_t
-xfs_write_fault(
+__xfs_write_fault(
struct vm_fault *vmf,
- unsigned int order)
+ unsigned int order,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
@@ -1515,13 +1645,49 @@ xfs_write_fault(
ret = xfs_dax_fault_locked(vmf, order, true);
else
ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
- NULL);
+ ac);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);
return ret;
}
+static vm_fault_t
+xfs_write_fault_zoned(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
+ unsigned int len = folio_size(page_folio(vmf->page));
+ struct xfs_zone_alloc_ctx ac = { };
+ int error;
+ vm_fault_t ret;
+
+ /*
+ * This could over-allocate as it doesn't check for truncation.
+ *
+ * But as the overallocation is limited to less than a folio and will be
+ 8 release instantly that's just fine.
+ */
+ error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
+ &ac);
+ if (error < 0)
+ return vmf_fs_error(error);
+ ret = __xfs_write_fault(vmf, order, &ac);
+ xfs_zoned_space_unreserve(ip, &ac);
+ return ret;
+}
+
+static vm_fault_t
+xfs_write_fault(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
+ return xfs_write_fault_zoned(vmf, order);
+ return __xfs_write_fault(vmf, order, NULL);
+}
+
static inline bool
xfs_is_write_fault(
struct vm_fault *vmf)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index aa1db0dc1d98..402b253ce3a2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -31,6 +31,7 @@
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
#include "xfs_icache.h"
+#include "xfs_zone_alloc.h"
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -1270,6 +1271,176 @@ xfs_bmapi_reserve_delalloc(
return error;
}
+static int
+xfs_zoned_buffered_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t count,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct iomap_iter *iter =
+ container_of(iomap, struct iomap_iter, iomap);
+ struct xfs_zone_alloc_ctx *ac = iter->private;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+ u16 iomap_flags = IOMAP_F_SHARED;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
+ xfs_filblks_t count_fsb;
+ xfs_extlen_t indlen;
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+ int error = 0;
+
+ ASSERT(!xfs_get_extsz_hint(ip));
+ ASSERT(!(flags & IOMAP_UNSHARE));
+ ASSERT(ac);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ XFS_STATS_INC(mp, xs_blk_mapw);
+
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * For zeroing operations check if there is any data to zero first.
+ *
+ * For regular writes we always need to allocate new blocks, but need to
+ * provide the source mapping when the range is unaligned to support
+ * read-modify-write of the whole block in the page cache.
+ *
+ * In either case we need to limit the reported range to the boundaries
+ * of the source map in the data fork.
+ */
+ if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) ||
+ !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) ||
+ (flags & IOMAP_ZERO)) {
+ struct xfs_bmbt_irec smap;
+ struct xfs_iext_cursor scur;
+
+ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur,
+ &smap))
+ smap.br_startoff = end_fsb; /* fake hole until EOF */
+ if (smap.br_startoff > offset_fsb) {
+ /*
+ * We never need to allocate blocks for zeroing a hole.
+ */
+ if (flags & IOMAP_ZERO) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb,
+ smap.br_startoff);
+ goto out_unlock;
+ }
+ end_fsb = min(end_fsb, smap.br_startoff);
+ } else {
+ end_fsb = min(end_fsb,
+ smap.br_startoff + smap.br_blockcount);
+ xfs_trim_extent(&smap, offset_fsb,
+ end_fsb - offset_fsb);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0,
+ xfs_iomap_inode_sequence(ip, 0));
+ if (error)
+ goto out_unlock;
+ }
+ }
+
+ if (!ip->i_cowfp)
+ xfs_ifork_init_cow(ip);
+
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+ got.br_startoff = end_fsb;
+ if (got.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &got);
+ goto done;
+ }
+
+ /*
+ * Cap the maximum length to keep the chunks of work done here somewhat
+ * symmetric with the work writeback does.
+ */
+ end_fsb = min(end_fsb, got.br_startoff);
+ count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN,
+ XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));
+
+ /*
+ * The block reservation is supposed to cover all blocks that the
+ * operation could possibly write, but there is a nasty corner case
+ * where blocks could be stolen from underneath us:
+ *
+ * 1) while this thread iterates over a larger buffered write,
+ * 2) another thread is causing a write fault that calls into
+ * ->page_mkwrite in the range this thread writes to, using up the
+ * delalloc reservation created by a previous call to this function.
+ * 3) another thread does direct I/O on the range that the write fault
+ * happened on, which causes writeback of the dirty data.
+ * 4) this then sets the stale flag, which cuts the current iomap
+ * iteration short, causing the new call to ->iomap_begin that gets
+ * us here again, but now without a sufficient reservation.
+ *
+ * This is a very unusual I/O pattern, and nothing but generic/095 is
+ * known to hit it. There's not really much we can do here, so turn this
+ * into a short write.
+ */
+ if (count_fsb > ac->reserved_blocks) {
+ xfs_warn_ratelimited(mp,
+"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O",
+ ip->i_ino, current->comm);
+ count_fsb = ac->reserved_blocks;
+ if (!count_fsb) {
+ error = -EIO;
+ goto out_unlock;
+ }
+ }
+
+ error = xfs_quota_reserve_blkres(ip, count_fsb);
+ if (error)
+ goto out_unlock;
+
+ indlen = xfs_bmap_worst_indlen(ip, count_fsb);
+ error = xfs_dec_fdblocks(mp, indlen, false);
+ if (error)
+ goto out_unlock;
+ ip->i_delayed_blks += count_fsb;
+ xfs_mod_delalloc(ip, count_fsb, indlen);
+
+ got.br_startoff = offset_fsb;
+ got.br_startblock = nullstartblock(indlen);
+ got.br_blockcount = count_fsb;
+ got.br_state = XFS_EXT_NORM;
+ xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got);
+ ac->reserved_blocks -= count_fsb;
+ iomap_flags |= IOMAP_F_NEW;
+
+ trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb),
+ XFS_COW_FORK, &got);
+done:
+ error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags,
+ xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED));
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
@@ -1296,6 +1467,10 @@ xfs_buffered_write_iomap_begin(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_is_zoned_inode(ip))
+ return xfs_zoned_buffered_write_iomap_begin(inode, offset,
+ count, flags, iomap, srcmap);
+
/* we can't use delayed allocations when using extent size hints */
if (xfs_get_extsz_hint(ip))
return xfs_direct_write_iomap_begin(inode, offset, count,
@@ -1528,10 +1703,13 @@ xfs_buffered_write_delalloc_punch(
loff_t length,
struct iomap *iomap)
{
+ struct iomap_iter *iter =
+ container_of(iomap, struct iomap_iter, iomap);
+
xfs_bmap_punch_delalloc_range(XFS_I(inode),
(iomap->flags & IOMAP_F_SHARED) ?
XFS_COW_FORK : XFS_DATA_FORK,
- offset, offset + length);
+ offset, offset + length, iter->private);
}
static int
@@ -1768,6 +1946,7 @@ xfs_zero_range(
struct xfs_inode *ip,
loff_t pos,
loff_t len,
+ struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
@@ -1778,13 +1957,14 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
- &xfs_buffered_write_iomap_ops, NULL);
+ &xfs_buffered_write_iomap_ops, ac);
}
int
xfs_truncate_page(
struct xfs_inode *ip,
loff_t pos,
+ struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
@@ -1793,5 +1973,5 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
- &xfs_buffered_write_iomap_ops, NULL);
+ &xfs_buffered_write_iomap_ops, ac);
}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 8347268af727..bc8a00cad854 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -10,6 +10,7 @@
struct xfs_inode;
struct xfs_bmbt_irec;
+struct xfs_zone_alloc_ctx;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, unsigned int flags,
@@ -24,8 +25,9 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
u16 iomap_flags, u64 sequence_cookie);
int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
- bool *did_zero);
-int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
+ struct xfs_zone_alloc_ctx *ac, bool *did_zero);
+int xfs_truncate_page(struct xfs_inode *ip, loff_t pos,
+ struct xfs_zone_alloc_ctx *ac, bool *did_zero);
static inline xfs_filblks_t
xfs_aligned_fsb_count(
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 990df072ba35..c382f6b6c9e3 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -29,6 +29,7 @@
#include "xfs_xattr.h"
#include "xfs_file.h"
#include "xfs_bmap.h"
+#include "xfs_zone_alloc.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -852,6 +853,7 @@ xfs_setattr_size(
uint lock_flags = 0;
uint resblks = 0;
bool did_zeroing = false;
+ struct xfs_zone_alloc_ctx ac = { };
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
ASSERT(S_ISREG(inode->i_mode));
@@ -887,6 +889,28 @@ xfs_setattr_size(
*/
inode_dio_wait(inode);
+ /*
+ * Normally xfs_zoned_space_reserve is supposed to be called outside the
+ * IOLOCK. For truncate we can't do that since ->setattr is called with
+ * it already held by the VFS. So for now chicken out and try to
+ * allocate space under it.
+ *
+ * To avoid deadlocks this means we can't block waiting for space, which
+ * can lead to spurious -ENOSPC if there are no directly available
+ * blocks. We mitigate this a bit by allowing zeroing to dip into the
+ * reserved pool, but eventually the VFS calling convention needs to
+ * change.
+ */
+ if (xfs_is_zoned_inode(ip)) {
+ error = xfs_zoned_space_reserve(ip, 1,
+ XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
+ if (error) {
+ if (error == -EAGAIN)
+ return -ENOSPC;
+ return error;
+ }
+ }
+
/*
* File data changes must be complete before we start the transaction to
* modify the inode. This needs to be done before joining the inode to
@@ -900,11 +924,14 @@ xfs_setattr_size(
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
- &did_zeroing);
+ &ac, &did_zeroing);
} else {
- error = xfs_truncate_page(ip, newsize, &did_zeroing);
+ error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing);
}
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_space_unreserve(ip, &ac);
+
if (error)
return error;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b7dba5ad2f34..b8e55c3badfb 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1538,7 +1538,7 @@ xfs_reflink_zero_posteof(
return 0;
trace_xfs_zero_eof(ip, isize, pos - isize);
- return xfs_zero_range(ip, isize, pos - isize, NULL);
+ return xfs_zero_range(ip, isize, pos - isize, NULL, NULL);
}
/*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index bbaf9b2665c7..c03ec9bfec1f 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1694,6 +1694,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
+DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread
* Re: [PATCH 27/43] xfs: implement buffered writes to zoned RT devices
2024-12-11 8:54 ` [PATCH 27/43] xfs: implement buffered writes to zoned RT devices Christoph Hellwig
@ 2024-12-13 22:37 ` Darrick J. Wong
2024-12-15 6:12 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 22:37 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:52AM +0100, Christoph Hellwig wrote:
> Implement buffered writes including page faults and block zeroing for
> zoned RT devices. Buffered writes to zoned RT devices are split into
> three phases:
>
> 1) a reservation for the worst case data block usage is taken before
> acquiring the iolock. When there are enough free blocks but not
> enough available ones, garbage collection is kicked off to free the
> space before continuing with the write. If there isn't enough
> freeable space, the block reservation is reduced and a short write
> will happen as expected by normal Linux write semantics.
> 2) with the iolock held, the generic iomap buffered write code is
> called, which through the iomap_begin operation usually just inserts
> delalloc extents for the range in a single iteration. Only for
> overwrites of existing data that are not block aligned, or zeroing
> operations the existing extent mapping is read to fill out the srcmap
> and to figure out if zeroing is required.
> 3) the ->map_blocks callback to the generic iomap writeback code
> calls into the zoned space allocator to actually allocate on-disk
> space for the range before kicking of the writeback.
>
> Note that because all writes are out of place, truncate or hole punches
> that are not aligned to block size boundaries need to allocate space.
> For block zeroing from truncate, ->setattr is called with the iolock
> (aka i_rwsem) already held, so a hacky deviation from the above
> scheme is needed. In this case the space reservation is taken with
> the iolock held, but is required not to block and can dip into the
> reserved block pool. This can lead to -ENOSPC when truncating a
> file, which is unfortunate. But fixing the calling conventions in
> the VFS is probably much easier with code requiring it already in
> mainline.
Urk, that's messy. :(
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_aops.c | 134 +++++++++++++++++++++++++--
> fs/xfs/xfs_bmap_util.c | 21 +++--
> fs/xfs/xfs_bmap_util.h | 12 ++-
> fs/xfs/xfs_file.c | 202 +++++++++++++++++++++++++++++++++++++----
> fs/xfs/xfs_iomap.c | 186 ++++++++++++++++++++++++++++++++++++-
> fs/xfs/xfs_iomap.h | 6 +-
> fs/xfs/xfs_iops.c | 31 ++++++-
> fs/xfs/xfs_reflink.c | 2 +-
> fs/xfs/xfs_trace.h | 1 +
> 9 files changed, 550 insertions(+), 45 deletions(-)
>
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index d35ac4c19fb2..67392413216b 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -1,7 +1,7 @@
> // SPDX-License-Identifier: GPL-2.0
> /*
> * Copyright (c) 2000-2005 Silicon Graphics, Inc.
> - * Copyright (c) 2016-2018 Christoph Hellwig.
> + * Copyright (c) 2016-2023 Christoph Hellwig.
2024, surely?
Or at this point, 2025
> * All Rights Reserved.
> */
> #include "xfs.h"
> @@ -19,6 +19,8 @@
> #include "xfs_reflink.h"
> #include "xfs_errortag.h"
> #include "xfs_error.h"
> +#include "xfs_zone_alloc.h"
> +#include "xfs_rtgroup.h"
>
> struct xfs_writepage_ctx {
> struct iomap_writepage_ctx ctx;
> @@ -85,6 +87,7 @@ xfs_end_ioend(
> {
> struct xfs_inode *ip = XFS_I(ioend->io_inode);
> struct xfs_mount *mp = ip->i_mount;
> + bool is_zoned = xfs_is_zoned_inode(ip);
> xfs_off_t offset = ioend->io_offset;
> size_t size = ioend->io_size;
> unsigned int nofs_flag;
> @@ -115,9 +118,10 @@ xfs_end_ioend(
> error = blk_status_to_errno(ioend->io_bio.bi_status);
> if (unlikely(error)) {
> if (ioend->io_flags & IOMAP_IOEND_SHARED) {
> + ASSERT(!is_zoned);
> xfs_reflink_cancel_cow_range(ip, offset, size, true);
> xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
> - offset + size);
> + offset + size, NULL);
> }
> goto done;
> }
> @@ -125,7 +129,10 @@ xfs_end_ioend(
> /*
> * Success: commit the COW or unwritten blocks if needed.
> */
> - if (ioend->io_flags & IOMAP_IOEND_SHARED)
> + if (is_zoned)
> + error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
> + NULLFSBLOCK);
> + else if (ioend->io_flags & IOMAP_IOEND_SHARED)
> error = xfs_reflink_end_cow(ip, offset, size);
> else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
> error = xfs_iomap_write_unwritten(ip, offset, size, false);
> @@ -175,17 +182,27 @@ xfs_end_io(
> }
> }
>
> -STATIC void
> +static void
> xfs_end_bio(
> struct bio *bio)
> {
> struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
> struct xfs_inode *ip = XFS_I(ioend->io_inode);
> + struct xfs_mount *mp = ip->i_mount;
> unsigned long flags;
>
> + /*
> + * For Appends record the actually written block number and set the
> + * boundary flag if needed.
> + */
> + if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
> + ioend->io_sector = bio->bi_iter.bi_sector;
> + xfs_mark_rtg_boundary(ioend);
> + }
> +
> spin_lock_irqsave(&ip->i_ioend_lock, flags);
> if (list_empty(&ip->i_ioend_list))
> - WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
> + WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
> &ip->i_ioend_work));
> list_add_tail(&ioend->io_list, &ip->i_ioend_list);
> spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
> @@ -462,7 +479,7 @@ xfs_discard_folio(
> * folio itself and not the start offset that is passed in.
> */
> xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
> - folio_pos(folio) + folio_size(folio));
> + folio_pos(folio) + folio_size(folio), NULL);
> }
>
> static const struct iomap_writeback_ops xfs_writeback_ops = {
> @@ -471,14 +488,117 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
> .discard_folio = xfs_discard_folio,
> };
>
> +struct xfs_zoned_writepage_ctx {
> + struct iomap_writepage_ctx ctx;
> + struct xfs_open_zone *open_zone;
> +};
> +
> +static inline struct xfs_zoned_writepage_ctx *
> +XFS_ZWPC(struct iomap_writepage_ctx *ctx)
> +{
> + return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
> +}
> +
> +static int
> +xfs_zoned_map_blocks(
> + struct iomap_writepage_ctx *wpc,
> + struct inode *inode,
> + loff_t offset,
> + unsigned int len)
> +{
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_mount *mp = ip->i_mount;
> + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
> + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
> + xfs_filblks_t count_fsb;
> + struct xfs_bmbt_irec imap, del;
> + struct xfs_iext_cursor icur;
> +
> + if (xfs_is_shutdown(mp))
> + return -EIO;
> +
> + XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
> +
> + /*
> + * All dirty data must be covered by delalloc extents. But truncate can
> + * remove delalloc extents underneath us or reduce their size.
> + * Returning a hole tells iomap to not write back any data from this
> + * range, which is the right thing to do in that case.
> + *
> + * Otherwise just tell iomap to treat ranges previously covered by a
> + * delalloc extent as mapped. The actual block allocation will be done
> + * just before submitting the bio.
> + */
> + xfs_ilock(ip, XFS_ILOCK_EXCL);
> + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
> + imap.br_startoff = end_fsb; /* fake a hole past EOF */
> + if (imap.br_startoff > offset_fsb) {
> + imap.br_blockcount = imap.br_startoff - offset_fsb;
> + imap.br_startoff = offset_fsb;
> + imap.br_startblock = HOLESTARTBLOCK;
> + imap.br_state = XFS_EXT_NORM;
> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
> + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
> + return 0;
> + }
> + end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
> + count_fsb = end_fsb - offset_fsb;
> +
> + del = imap;
> + xfs_trim_extent(&del, offset_fsb, count_fsb);
> + xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
> + XFS_BMAPI_REMAP);
> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +
> + wpc->iomap.type = IOMAP_MAPPED;
> + wpc->iomap.flags = IOMAP_F_DIRTY;
> + wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
> + wpc->iomap.offset = offset;
> + wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
> + wpc->iomap.flags = IOMAP_F_ZONE_APPEND;
> + wpc->iomap.addr = 0;
/me wonders if this should be set somewhere other than block 0 just in
case we screw up? That might just be paranoia since I think iomap puts
the bio if it doesn't get to ->submit_bio.
> +
> + trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
> + return 0;
> +}
> +
> +static int
> +xfs_zoned_submit_ioend(
> + struct iomap_writepage_ctx *wpc,
> + int status)
> +{
> + wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
> + if (status)
> + return status;
> + xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
> + return 0;
> +}
> +
> +static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
> + .map_blocks = xfs_zoned_map_blocks,
> + .submit_ioend = xfs_zoned_submit_ioend,
> + .discard_folio = xfs_discard_folio,
> +};
> +
> STATIC int
> xfs_vm_writepages(
> struct address_space *mapping,
> struct writeback_control *wbc)
> {
> + struct xfs_inode *ip = XFS_I(mapping->host);
> struct xfs_writepage_ctx wpc = { };
> + int error;
>
> - xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
> + xfs_iflags_clear(ip, XFS_ITRUNCATED);
> + if (xfs_is_zoned_inode(ip)) {
> + struct xfs_zoned_writepage_ctx xc = { };
I noticed that the zoned writepage ctx doesn't track data/cow fork
sequence numbers. Why is this not necessary? Can we not race with
someone else doing writeback?
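(For contrast, the non-zoned writeback context in xfs_aops.c does carry
them; roughly, as a sketch from memory rather than a quote of the tree:

	struct xfs_writepage_ctx {
		struct iomap_writepage_ctx ctx;
		unsigned int		data_seq;	/* data fork seq# */
		unsigned int		cow_seq;	/* COW fork seq# */
	};

so the question is what lets the zoned variant drop the equivalent.)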
> +
> + error = iomap_writepages(mapping, wbc, &xc.ctx,
> + &xfs_zoned_writeback_ops);
> + if (xc.open_zone)
> + xfs_open_zone_put(xc.open_zone);
> + return error;
> + }
> return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
> }
>
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index c623688e457c..c87422de2d77 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -30,6 +30,7 @@
> #include "xfs_reflink.h"
> #include "xfs_rtbitmap.h"
> #include "xfs_rtgroup.h"
> +#include "xfs_zone_alloc.h"
>
> /* Kernel only BMAP related definitions and functions */
>
> @@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range(
> struct xfs_inode *ip,
> int whichfork,
> xfs_off_t start_byte,
> - xfs_off_t end_byte)
> + xfs_off_t end_byte,
> + struct xfs_zone_alloc_ctx *ac)
> {
> struct xfs_mount *mp = ip->i_mount;
> struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
> @@ -467,7 +469,10 @@ xfs_bmap_punch_delalloc_range(
> continue;
> }
>
> - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del, 0);
> + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del,
> + ac ? XFS_BMAPI_REMAP : 0);
> + if (xfs_is_zoned_inode(ip) && ac)
> + ac->reserved_blocks += del.br_blockcount;
> if (!xfs_iext_get_extent(ifp, &icur, &got))
> break;
> }
> @@ -582,7 +587,7 @@ xfs_free_eofblocks(
> if (ip->i_delayed_blks) {
> xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
> round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
> - LLONG_MAX);
> + LLONG_MAX, NULL);
> }
> xfs_inode_clear_eofblocks_tag(ip);
> return 0;
> @@ -825,7 +830,8 @@ int
> xfs_free_file_space(
> struct xfs_inode *ip,
> xfs_off_t offset,
> - xfs_off_t len)
> + xfs_off_t len,
> + struct xfs_zone_alloc_ctx *ac)
> {
> struct xfs_mount *mp = ip->i_mount;
> xfs_fileoff_t startoffset_fsb;
> @@ -880,7 +886,7 @@ xfs_free_file_space(
> return 0;
> if (offset + len > XFS_ISIZE(ip))
> len = XFS_ISIZE(ip) - offset;
> - error = xfs_zero_range(ip, offset, len, NULL);
> + error = xfs_zero_range(ip, offset, len, ac, NULL);
> if (error)
> return error;
>
> @@ -968,7 +974,8 @@ int
> xfs_collapse_file_space(
> struct xfs_inode *ip,
> xfs_off_t offset,
> - xfs_off_t len)
> + xfs_off_t len,
> + struct xfs_zone_alloc_ctx *ac)
> {
> struct xfs_mount *mp = ip->i_mount;
> struct xfs_trans *tp;
> @@ -981,7 +988,7 @@ xfs_collapse_file_space(
>
> trace_xfs_collapse_file_space(ip);
>
> - error = xfs_free_file_space(ip, offset, len);
> + error = xfs_free_file_space(ip, offset, len, ac);
> if (error)
> return error;
>
> diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
> index b29760d36e1a..c477b3361630 100644
> --- a/fs/xfs/xfs_bmap_util.h
> +++ b/fs/xfs/xfs_bmap_util.h
> @@ -15,6 +15,7 @@ struct xfs_inode;
> struct xfs_mount;
> struct xfs_trans;
> struct xfs_bmalloca;
> +struct xfs_zone_alloc_ctx;
>
> #ifdef CONFIG_XFS_RT
> int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
> @@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
> #endif /* CONFIG_XFS_RT */
>
> void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
> - xfs_off_t start_byte, xfs_off_t end_byte);
> + xfs_off_t start_byte, xfs_off_t end_byte,
> + struct xfs_zone_alloc_ctx *ac);
>
> struct kgetbmap {
> __s64 bmv_offset; /* file offset of segment in blocks */
> @@ -54,13 +56,13 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
>
> /* preallocation and hole punch interface */
> int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
> - xfs_off_t len);
> + xfs_off_t len);
> int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
> - xfs_off_t len);
> + xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
> int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
> - xfs_off_t len);
> + xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
> int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
> - xfs_off_t len);
> + xfs_off_t len);
>
> /* EOF block manipulation functions */
> bool xfs_can_free_eofblocks(struct xfs_inode *ip);
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 827f7819df6a..195cf60a81b0 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -25,6 +25,7 @@
> #include "xfs_iomap.h"
> #include "xfs_reflink.h"
> #include "xfs_file.h"
> +#include "xfs_zone_alloc.h"
>
> #include <linux/dax.h>
> #include <linux/falloc.h>
> @@ -360,7 +361,8 @@ xfs_file_write_zero_eof(
> struct iov_iter *from,
> unsigned int *iolock,
> size_t count,
> - bool *drained_dio)
> + bool *drained_dio,
> + struct xfs_zone_alloc_ctx *ac)
> {
> struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
> loff_t isize;
> @@ -414,7 +416,7 @@ xfs_file_write_zero_eof(
> trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
>
> xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
> - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
> + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
> xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
>
> return error;
> @@ -431,7 +433,8 @@ STATIC ssize_t
> xfs_file_write_checks(
> struct kiocb *iocb,
> struct iov_iter *from,
> - unsigned int *iolock)
> + unsigned int *iolock,
> + struct xfs_zone_alloc_ctx *ac)
> {
> struct inode *inode = iocb->ki_filp->f_mapping->host;
> size_t count = iov_iter_count(from);
> @@ -481,7 +484,7 @@ xfs_file_write_checks(
> */
> if (iocb->ki_pos > i_size_read(inode)) {
> error = xfs_file_write_zero_eof(iocb, from, iolock, count,
> - &drained_dio);
> + &drained_dio, ac);
> if (error == 1)
> goto restart;
> if (error)
> @@ -491,6 +494,48 @@ xfs_file_write_checks(
> return kiocb_modified(iocb);
> }
>
> +static ssize_t
> +xfs_zoned_write_space_reserve(
> + struct xfs_inode *ip,
> + struct kiocb *iocb,
> + struct iov_iter *from,
> + unsigned int flags,
> + struct xfs_zone_alloc_ctx *ac)
> +{
> + loff_t count = iov_iter_count(from);
> + int error;
> +
> + if (iocb->ki_flags & IOCB_NOWAIT)
> + flags |= XFS_ZR_NOWAIT;
> +
> + /*
> + * Check the rlimit and LFS boundary first so that we don't over-reserve
> + * by possibly a lot.
> + *
> + * The generic write path will redo this check later, and it might have
> + * changed by then. If it got expanded we'll stick to our earlier
> + * smaller limit, and if it is decreased the new smaller limit will be
> + * used and our extra space reservation will be returned after finishing
> + * the write.
> + */
> + error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
> + if (error)
> + return error;
> +
> + /*
> + * Sloppily round up count to file system blocks.
> + *
> + * This will often reserve an extra block, but that avoids having to look
> + * at the start offset, which isn't stable for O_APPEND until taking the
> + * iolock. Also we need to reserve a block each for zeroing the old
> + * EOF block and the new start block if they are unaligned.
> + *
> + * Any remaining block will be returned after the write.
> + */
> + return xfs_zoned_space_reserve(ip,
> + XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
> +}
> +
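(Aside: the arithmetic above is easy to sanity-check in isolation.  A toy
userspace version, with an assumed 4096-byte block size standing in for
the real XFS_B_TO_FSB():

	#include <stdio.h>

	/* stand-in for XFS_B_TO_FSB() at 4096-byte blocks (assumption) */
	static unsigned long long b_to_fsb(unsigned long long bytes)
	{
		return (bytes + 4095) / 4096;	/* round up to fs blocks */
	}

	int main(void)
	{
		unsigned long long count = 7 * 1024;	/* a 7KiB write */

		/* +1 for sloppy rounding, +2 for zeroing EOF/start blocks */
		printf("reserve %llu blocks\n", b_to_fsb(count) + 1 + 2);
		return 0;
	}

which asks for 5 blocks for a 7KiB write; the surplus is handed back by
xfs_zoned_space_unreserve() once the write finishes.)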
> static int
> xfs_dio_write_end_io(
> struct kiocb *iocb,
> @@ -597,7 +642,7 @@ xfs_file_dio_write_aligned(
> ret = xfs_ilock_iocb_for_write(iocb, &iolock);
> if (ret)
> return ret;
> - ret = xfs_file_write_checks(iocb, from, &iolock);
> + ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
> if (ret)
> goto out_unlock;
>
> @@ -675,7 +720,7 @@ xfs_file_dio_write_unaligned(
> goto out_unlock;
> }
>
> - ret = xfs_file_write_checks(iocb, from, &iolock);
> + ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
> if (ret)
> goto out_unlock;
>
> @@ -749,7 +794,7 @@ xfs_file_dax_write(
> ret = xfs_ilock_iocb(iocb, iolock);
> if (ret)
> return ret;
> - ret = xfs_file_write_checks(iocb, from, &iolock);
> + ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
> if (ret)
> goto out;
>
> @@ -793,7 +838,7 @@ xfs_file_buffered_write(
> if (ret)
> return ret;
>
> - ret = xfs_file_write_checks(iocb, from, &iolock);
> + ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
> if (ret)
> goto out;
>
> @@ -840,6 +885,67 @@ xfs_file_buffered_write(
> return ret;
> }
>
> +STATIC ssize_t
> +xfs_file_buffered_write_zoned(
> + struct kiocb *iocb,
> + struct iov_iter *from)
> +{
> + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
> + struct xfs_mount *mp = ip->i_mount;
> + unsigned int iolock = XFS_IOLOCK_EXCL;
> + bool cleared_space = false;
> + struct xfs_zone_alloc_ctx ac = { };
> + ssize_t ret;
> +
> + ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
> + if (ret < 0)
> + return ret;
> +
> + ret = xfs_ilock_iocb(iocb, iolock);
> + if (ret)
> + goto out_unreserve;
> +
> + ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
> + if (ret)
> + goto out_unlock;
> +
> + /*
> + * Truncate the iter to the length that we were actually able to
> + * allocate blocks for. This needs to happen after
> + * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
> + * writes.
> + */
> + iov_iter_truncate(from,
> + XFS_FSB_TO_B(mp, ac.reserved_blocks) -
> + (iocb->ki_pos & mp->m_blockmask));
> + if (!iov_iter_count(from))
> + goto out_unlock;
> +
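(A quick worked example of this clamp, with invented numbers: 4096-byte
blocks, ac.reserved_blocks = 3 and ki_pos = 0x1800 give

	XFS_FSB_TO_B(mp, 3) - (0x1800 & 0xfff) = 12288 - 2048 = 10240

i.e. the iter is limited to the reservation minus the unaligned head of
the first block.)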
> +retry:
> + trace_xfs_file_buffered_write(iocb, from);
> + ret = iomap_file_buffered_write(iocb, from,
> + &xfs_buffered_write_iomap_ops, &ac);
> + if (ret == -ENOSPC && !cleared_space) {
> + /*
^ extra space
> + * Kick off writeback to convert delalloc space and release the
> + * usually too pessimistic indirect block reservations.
> + */
> + xfs_flush_inodes(mp);
> + cleared_space = true;
> + goto retry;
> + }
> +
> +out_unlock:
> + xfs_iunlock(ip, iolock);
> +out_unreserve:
> + xfs_zoned_space_unreserve(ip, &ac);
> + if (ret > 0) {
> + XFS_STATS_ADD(mp, xs_write_bytes, ret);
> + ret = generic_write_sync(iocb, ret);
> + }
> + return ret;
> +}
> +
> STATIC ssize_t
> xfs_file_write_iter(
> struct kiocb *iocb,
> @@ -887,6 +993,8 @@ xfs_file_write_iter(
> return ret;
> }
>
> + if (xfs_is_zoned_inode(ip))
> + return xfs_file_buffered_write_zoned(iocb, from);
> return xfs_file_buffered_write(iocb, from);
> }
>
> @@ -941,7 +1049,8 @@ static int
> xfs_falloc_collapse_range(
> struct file *file,
> loff_t offset,
> - loff_t len)
> + loff_t len,
> + struct xfs_zone_alloc_ctx *ac)
> {
> struct inode *inode = file_inode(file);
> loff_t new_size = i_size_read(inode) - len;
> @@ -957,7 +1066,7 @@ xfs_falloc_collapse_range(
> if (offset + len >= i_size_read(inode))
> return -EINVAL;
>
> - error = xfs_collapse_file_space(XFS_I(inode), offset, len);
> + error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
> if (error)
> return error;
> return xfs_falloc_setsize(file, new_size);
> @@ -1013,7 +1122,8 @@ xfs_falloc_zero_range(
> struct file *file,
> int mode,
> loff_t offset,
> - loff_t len)
> + loff_t len,
> + struct xfs_zone_alloc_ctx *ac)
> {
> struct inode *inode = file_inode(file);
> unsigned int blksize = i_blocksize(inode);
> @@ -1026,7 +1136,7 @@ xfs_falloc_zero_range(
> if (error)
> return error;
>
> - error = xfs_free_file_space(XFS_I(inode), offset, len);
> + error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
> if (error)
> return error;
>
> @@ -1107,12 +1217,29 @@ xfs_file_fallocate(
> struct xfs_inode *ip = XFS_I(inode);
> long error;
> uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> + struct xfs_zone_alloc_ctx ac = { };
>
> if (!S_ISREG(inode->i_mode))
> return -EINVAL;
> if (mode & ~XFS_FALLOC_FL_SUPPORTED)
> return -EOPNOTSUPP;
>
> + /*
> + * For zoned file systems, zeroing the first and last block of a hole
> + * punch requires allocating a new block to rewrite the remaining data
> + * and new zeroes out of place. Get a reservation for those before
> + * taking the iolock. Dip into the reserved pool because we are
> + * expected to be able to punch a hole even on a completely full
> + * file system.
> + */
> + if (xfs_is_zoned_inode(ip) &&
> + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
> + FALLOC_FL_COLLAPSE_RANGE))) {
> + error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
> + if (error)
> + return error;
> + }
> +
> xfs_ilock(ip, iolock);
> error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
> if (error)
> @@ -1133,16 +1260,16 @@ xfs_file_fallocate(
>
> switch (mode & FALLOC_FL_MODE_MASK) {
> case FALLOC_FL_PUNCH_HOLE:
> - error = xfs_free_file_space(ip, offset, len);
> + error = xfs_free_file_space(ip, offset, len, &ac);
> break;
> case FALLOC_FL_COLLAPSE_RANGE:
> - error = xfs_falloc_collapse_range(file, offset, len);
> + error = xfs_falloc_collapse_range(file, offset, len, &ac);
> break;
> case FALLOC_FL_INSERT_RANGE:
> error = xfs_falloc_insert_range(file, offset, len);
> break;
> case FALLOC_FL_ZERO_RANGE:
> - error = xfs_falloc_zero_range(file, mode, offset, len);
> + error = xfs_falloc_zero_range(file, mode, offset, len, &ac);
> break;
> case FALLOC_FL_UNSHARE_RANGE:
> error = xfs_falloc_unshare_range(file, mode, offset, len);
> @@ -1160,6 +1287,8 @@ xfs_file_fallocate(
>
> out_unlock:
> xfs_iunlock(ip, iolock);
> + if (xfs_is_zoned_inode(ip))
> + xfs_zoned_space_unreserve(ip, &ac);
> return error;
> }
>
> @@ -1485,9 +1614,10 @@ xfs_dax_read_fault(
> * i_lock (XFS - extent map serialisation)
> */
> static vm_fault_t
> -xfs_write_fault(
> +__xfs_write_fault(
> struct vm_fault *vmf,
> - unsigned int order)
> + unsigned int order,
> + struct xfs_zone_alloc_ctx *ac)
> {
> struct inode *inode = file_inode(vmf->vma->vm_file);
> struct xfs_inode *ip = XFS_I(inode);
> @@ -1515,13 +1645,49 @@ xfs_write_fault(
> ret = xfs_dax_fault_locked(vmf, order, true);
> else
> ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
> - NULL);
> + ac);
> xfs_iunlock(ip, lock_mode);
>
> sb_end_pagefault(inode->i_sb);
> return ret;
> }
>
> +static vm_fault_t
> +xfs_write_fault_zoned(
> + struct vm_fault *vmf,
> + unsigned int order)
> +{
> + struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
> + unsigned int len = folio_size(page_folio(vmf->page));
> + struct xfs_zone_alloc_ctx ac = { };
> + int error;
> + vm_fault_t ret;
> +
> + /*
> + * This could over-allocate as it doesn't check for truncation.
> + *
> + * But as the overallocation is limited to less than a folio and will be
> + 8 release instantly that's just fine.
Nit: ^ should be an asterisk.
> + */
> + error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
> + &ac);
> + if (error < 0)
> + return vmf_fs_error(error);
> + ret = __xfs_write_fault(vmf, order, &ac);
> + xfs_zoned_space_unreserve(ip, &ac);
> + return ret;
> +}
> +
> +static vm_fault_t
> +xfs_write_fault(
> + struct vm_fault *vmf,
> + unsigned int order)
> +{
> + if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
> + return xfs_write_fault_zoned(vmf, order);
> + return __xfs_write_fault(vmf, order, NULL);
> +}
> +
> static inline bool
> xfs_is_write_fault(
> struct vm_fault *vmf)
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index aa1db0dc1d98..402b253ce3a2 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -31,6 +31,7 @@
> #include "xfs_health.h"
> #include "xfs_rtbitmap.h"
> #include "xfs_icache.h"
> +#include "xfs_zone_alloc.h"
>
> #define XFS_ALLOC_ALIGN(mp, off) \
> (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
> @@ -1270,6 +1271,176 @@ xfs_bmapi_reserve_delalloc(
> return error;
> }
>
> +static int
> +xfs_zoned_buffered_write_iomap_begin(
> + struct inode *inode,
> + loff_t offset,
> + loff_t count,
> + unsigned flags,
> + struct iomap *iomap,
> + struct iomap *srcmap)
> +{
> + struct iomap_iter *iter =
> + container_of(iomap, struct iomap_iter, iomap);
I still wonder if iomap should be passing a (const struct iomap_iter *)
to the ->iomap_begin function instead of passing all these variables and
then implementations have to container_of if they want iter->private
too.
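(For readers who haven't seen the trick in question, a tiny
self-contained illustration of it -- simplified stand-in types, not the
real iomap structures:

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct iomap { int flags; };

	struct iomap_iter {		/* simplified stand-in */
		struct iomap iomap;
		void *private;
	};

	static void iomap_begin(struct iomap *iomap)
	{
		/* recover the enclosing iterator from the embedded iomap */
		struct iomap_iter *iter =
			container_of(iomap, struct iomap_iter, iomap);

		/* ...purely to get at its private pointer */
		printf("private = %s\n", (char *)iter->private);
	}

	int main(void)
	{
		struct iomap_iter iter = { .private = "zone alloc ctx" };

		iomap_begin(&iter.iomap);
		return 0;
	}

Passing the iter (or at least the private pointer) directly would make
the cast unnecessary.)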
> + struct xfs_zone_alloc_ctx *ac = iter->private;
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_mount *mp = ip->i_mount;
> + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
> + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
> + u16 iomap_flags = IOMAP_F_SHARED;
> + unsigned int lockmode = XFS_ILOCK_EXCL;
> + xfs_filblks_t count_fsb;
> + xfs_extlen_t indlen;
> + struct xfs_bmbt_irec got;
> + struct xfs_iext_cursor icur;
> + int error = 0;
> +
> + ASSERT(!xfs_get_extsz_hint(ip));
> + ASSERT(!(flags & IOMAP_UNSHARE));
> + ASSERT(ac);
> +
> + if (xfs_is_shutdown(mp))
> + return -EIO;
> +
> + error = xfs_qm_dqattach(ip);
> + if (error)
> + return error;
> +
> + error = xfs_ilock_for_iomap(ip, flags, &lockmode);
> + if (error)
> + return error;
> +
> + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
> + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
> + xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
> + error = -EFSCORRUPTED;
> + goto out_unlock;
> + }
> +
> + XFS_STATS_INC(mp, xs_blk_mapw);
> +
> + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
> + if (error)
> + goto out_unlock;
> +
> + /*
> + * For zeroing operations check if there is any data to zero first.
> + *
> + * For regular writes we always need to allocate new blocks, but need to
> + * provide the source mapping when the range is unaligned to support
> + * read-modify-write of the whole block in the page cache.
> + *
> + * In either case we need to limit the reported range to the boundaries
> + * of the source map in the data fork.
> + */
> + if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) ||
> + !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) ||
> + (flags & IOMAP_ZERO)) {
> + struct xfs_bmbt_irec smap;
> + struct xfs_iext_cursor scur;
> +
> + if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur,
> + &smap))
> + smap.br_startoff = end_fsb; /* fake hole until EOF */
> + if (smap.br_startoff > offset_fsb) {
> + /*
> + * We never need to allocate blocks for zeroing a hole.
> + */
> + if (flags & IOMAP_ZERO) {
> + xfs_hole_to_iomap(ip, iomap, offset_fsb,
> + smap.br_startoff);
> + goto out_unlock;
> + }
> + end_fsb = min(end_fsb, smap.br_startoff);
> + } else {
> + end_fsb = min(end_fsb,
> + smap.br_startoff + smap.br_blockcount);
> + xfs_trim_extent(&smap, offset_fsb,
> + end_fsb - offset_fsb);
> + error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0,
> + xfs_iomap_inode_sequence(ip, 0));
> + if (error)
> + goto out_unlock;
> + }
> + }
> +
> + if (!ip->i_cowfp)
> + xfs_ifork_init_cow(ip);
> +
> + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
> + got.br_startoff = end_fsb;
> + if (got.br_startoff <= offset_fsb) {
> + trace_xfs_reflink_cow_found(ip, &got);
> + goto done;
> + }
> +
> + /*
> + * Cap the maximum length to keep the chunks of work done here somewhat
> + * symmetric with the work writeback does.
> + */
> + end_fsb = min(end_fsb, got.br_startoff);
> + count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN,
> + XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));
> +
> + /*
> + * The block reservation is supposed to cover all blocks that the
> + * operation could possibly write, but there is a nasty corner case
> + * where blocks could be stolen from underneath us:
> + *
> + * 1) while this thread iterates over a larger buffered write,
> + * 2) another thread is causing a write fault that calls into
> + * ->page_mkwrite in the range this thread writes to, using up the
> + * delalloc reservation created by a previous call to this function.
> + * 3) another thread does direct I/O on the range that the write fault
> + * happened on, which causes writeback of the dirty data.
> + * 4) this then sets the stale flag, which cuts the current iomap
> + * iteration short, causing the new call to ->iomap_begin that gets
> + * us here again, but now without a sufficient reservation.
> + *
> + * This is a very unusual I/O pattern, and nothing but generic/095 is
> + * known to hit it. There's not really much we can do here, so turn this
> + * into a short write.
Wheeeeee. >:(
> + */
> + if (count_fsb > ac->reserved_blocks) {
> + xfs_warn_ratelimited(mp,
> +"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O",
> + ip->i_ino, current->comm);
Aren't continuations ^^^ supposed to be indented two tabs?
> + count_fsb = ac->reserved_blocks;
> + if (!count_fsb) {
> + error = -EIO;
> + goto out_unlock;
> + }
> + }
> +
> + error = xfs_quota_reserve_blkres(ip, count_fsb);
> + if (error)
> + goto out_unlock;
> +
> + indlen = xfs_bmap_worst_indlen(ip, count_fsb);
> + error = xfs_dec_fdblocks(mp, indlen, false);
> + if (error)
> + goto out_unlock;
> + ip->i_delayed_blks += count_fsb;
> + xfs_mod_delalloc(ip, count_fsb, indlen);
> +
> + got.br_startoff = offset_fsb;
> + got.br_startblock = nullstartblock(indlen);
> + got.br_blockcount = count_fsb;
> + got.br_state = XFS_EXT_NORM;
> + xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got);
> + ac->reserved_blocks -= count_fsb;
> + iomap_flags |= IOMAP_F_NEW;
> +
> + trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb),
> + XFS_COW_FORK, &got);
> +done:
> + error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags,
> + xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED));
> +out_unlock:
> + xfs_iunlock(ip, lockmode);
> + return error;
> +}
> +
> static int
> xfs_buffered_write_iomap_begin(
> struct inode *inode,
> @@ -1296,6 +1467,10 @@ xfs_buffered_write_iomap_begin(
> if (xfs_is_shutdown(mp))
> return -EIO;
>
> + if (xfs_is_zoned_inode(ip))
> + return xfs_zoned_buffered_write_iomap_begin(inode, offset,
> + count, flags, iomap, srcmap);
> +
> /* we can't use delayed allocations when using extent size hints */
> if (xfs_get_extsz_hint(ip))
> return xfs_direct_write_iomap_begin(inode, offset, count,
> @@ -1528,10 +1703,13 @@ xfs_buffered_write_delalloc_punch(
> loff_t length,
> struct iomap *iomap)
> {
> + struct iomap_iter *iter =
> + container_of(iomap, struct iomap_iter, iomap);
> +
> xfs_bmap_punch_delalloc_range(XFS_I(inode),
> (iomap->flags & IOMAP_F_SHARED) ?
> XFS_COW_FORK : XFS_DATA_FORK,
> - offset, offset + length);
> + offset, offset + length, iter->private);
> }
>
> static int
> @@ -1768,6 +1946,7 @@ xfs_zero_range(
> struct xfs_inode *ip,
> loff_t pos,
> loff_t len,
> + struct xfs_zone_alloc_ctx *ac,
> bool *did_zero)
> {
> struct inode *inode = VFS_I(ip);
> @@ -1778,13 +1957,14 @@ xfs_zero_range(
> return dax_zero_range(inode, pos, len, did_zero,
> &xfs_dax_write_iomap_ops);
> return iomap_zero_range(inode, pos, len, did_zero,
> - &xfs_buffered_write_iomap_ops, NULL);
> + &xfs_buffered_write_iomap_ops, ac);
> }
>
> int
> xfs_truncate_page(
> struct xfs_inode *ip,
> loff_t pos,
> + struct xfs_zone_alloc_ctx *ac,
> bool *did_zero)
> {
> struct inode *inode = VFS_I(ip);
> @@ -1793,5 +1973,5 @@ xfs_truncate_page(
> return dax_truncate_page(inode, pos, did_zero,
> &xfs_dax_write_iomap_ops);
> return iomap_truncate_page(inode, pos, did_zero,
> - &xfs_buffered_write_iomap_ops, NULL);
> + &xfs_buffered_write_iomap_ops, ac);
> }
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index 8347268af727..bc8a00cad854 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -10,6 +10,7 @@
>
> struct xfs_inode;
> struct xfs_bmbt_irec;
> +struct xfs_zone_alloc_ctx;
>
> int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
> xfs_fileoff_t count_fsb, unsigned int flags,
> @@ -24,8 +25,9 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
> u16 iomap_flags, u64 sequence_cookie);
>
> int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
> - bool *did_zero);
> -int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
> + struct xfs_zone_alloc_ctx *ac, bool *did_zero);
> +int xfs_truncate_page(struct xfs_inode *ip, loff_t pos,
> + struct xfs_zone_alloc_ctx *ac, bool *did_zero);
>
> static inline xfs_filblks_t
> xfs_aligned_fsb_count(
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 990df072ba35..c382f6b6c9e3 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -29,6 +29,7 @@
> #include "xfs_xattr.h"
> #include "xfs_file.h"
> #include "xfs_bmap.h"
> +#include "xfs_zone_alloc.h"
>
> #include <linux/posix_acl.h>
> #include <linux/security.h>
> @@ -852,6 +853,7 @@ xfs_setattr_size(
> uint lock_flags = 0;
> uint resblks = 0;
> bool did_zeroing = false;
> + struct xfs_zone_alloc_ctx ac = { };
>
> xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
> ASSERT(S_ISREG(inode->i_mode));
> @@ -887,6 +889,28 @@ xfs_setattr_size(
> */
> inode_dio_wait(inode);
>
> + /*
> + * Normally xfs_zoned_space_reserve is supposed to be called outside the
> + * IOLOCK. For truncate we can't do that since ->setattr is called with
> + * it already held by the VFS. So for now chicken out and try to
> + * allocate space under it.
> + *
> + * To avoid deadlocks this means we can't block waiting for space, which
> + * can lead to spurious -ENOSPC if there are no directly available
> + * blocks. We mitigate this a bit by allowing zeroing to dip into the
> + * reserved pool, but eventually the VFS calling convention needs to
> + * change.
> + */
> + if (xfs_is_zoned_inode(ip)) {
> + error = xfs_zoned_space_reserve(ip, 1,
> + XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
> + if (error) {
> + if (error == -EAGAIN)
> + return -ENOSPC;
> + return error;
> + }
> + }
> +
> /*
> * File data changes must be complete before we start the transaction to
> * modify the inode. This needs to be done before joining the inode to
> @@ -900,11 +924,14 @@ xfs_setattr_size(
> if (newsize > oldsize) {
> trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
> error = xfs_zero_range(ip, oldsize, newsize - oldsize,
> - &did_zeroing);
> + &ac, &did_zeroing);
> } else {
> - error = xfs_truncate_page(ip, newsize, &did_zeroing);
> + error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing);
> }
>
> + if (xfs_is_zoned_inode(ip))
> + xfs_zoned_space_unreserve(ip, &ac);
> +
> if (error)
> return error;
>
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index b7dba5ad2f34..b8e55c3badfb 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1538,7 +1538,7 @@ xfs_reflink_zero_posteof(
> return 0;
>
> trace_xfs_zero_eof(ip, isize, pos - isize);
> - return xfs_zero_range(ip, isize, pos - isize, NULL);
> + return xfs_zero_range(ip, isize, pos - isize, NULL, NULL);
Not really a fan of passing the *ac pointers around everywhere, but atm
I can't think of a better option.
--D
> }
>
> /*
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index bbaf9b2665c7..c03ec9bfec1f 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1694,6 +1694,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
> DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
> DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
> DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
> +DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);
>
> DECLARE_EVENT_CLASS(xfs_itrunc_class,
> TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread* Re: [PATCH 27/43] xfs: implement buffered writes to zoned RT devices
2024-12-13 22:37 ` Darrick J. Wong
@ 2024-12-15 6:12 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 6:12 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 02:37:57PM -0800, Darrick J. Wong wrote:
> > index d35ac4c19fb2..67392413216b 100644
> > --- a/fs/xfs/xfs_aops.c
> > +++ b/fs/xfs/xfs_aops.c
> > @@ -1,7 +1,7 @@
> > // SPDX-License-Identifier: GPL-2.0
> > /*
> > * Copyright (c) 2000-2005 Silicon Graphics, Inc.
> > - * Copyright (c) 2016-2018 Christoph Hellwig.
> > + * Copyright (c) 2016-2023 Christoph Hellwig.
>
> 2024, surely?
>
> Or at this point, 2025
Most of this actually was done (and pushed to a semi-public tree) in
2023. But I guess I touched it enough in 2024 to cover that.
> > + wpc->iomap.type = IOMAP_MAPPED;
> > + wpc->iomap.flags = IOMAP_F_DIRTY;
> > + wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
> > + wpc->iomap.offset = offset;
> > + wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
> > + wpc->iomap.flags = IOMAP_F_ZONE_APPEND;
> > + wpc->iomap.addr = 0;
>
> /me wonders if this should be set somewhere other than block 0 just in
> case we screw up? That might just be paranoia since I think iomap puts
> the bio if it doesn't get to ->submit_bio.
I'll give it a spin.
> > + struct xfs_inode *ip = XFS_I(mapping->host);
> > struct xfs_writepage_ctx wpc = { };
> > + int error;
> >
> > - xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
> > + xfs_iflags_clear(ip, XFS_ITRUNCATED);
> > + if (xfs_is_zoned_inode(ip)) {
> > + struct xfs_zoned_writepage_ctx xc = { };
>
> I noticed that the zoned writepage ctx doesn't track data/cow fork
> sequence numbers. Why is this not necessary? Can we not race with
> someone else doing writeback?
Unlike "regular" XFS writeback, which tries to convert large extents from
preallocations (or just large ranges), zoned writeback works on a single
folio at a time, so there is a new mapping for each folio. And these are
only dummy mappings anyway. So all work happens on folios that are
locked or have the writeback bit set and are off limits to other
writeback threads, new buffered writers or invalidation. I guess I
need to document this better.
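For reference, the per-folio mapping really is just the dummy one built
in xfs_zoned_map_blocks, condensed from the hunk quoted earlier in this
mail (no real block address is assigned until bio submission):
	wpc->iomap.type   = IOMAP_MAPPED;
	wpc->iomap.bdev   = mp->m_rtdev_targp->bt_bdev;
	wpc->iomap.offset = offset;
	wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
	wpc->iomap.flags  = IOMAP_F_ZONE_APPEND;
	wpc->iomap.addr   = 0;	/* placeholder, filled in at submission */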
> > +xfs_zoned_buffered_write_iomap_begin(
> > + struct inode *inode,
> > + loff_t offset,
> > + loff_t count,
> > + unsigned flags,
> > + struct iomap *iomap,
> > + struct iomap *srcmap)
> > +{
> > + struct iomap_iter *iter =
> > + container_of(iomap, struct iomap_iter, iomap);
>
> I still wonder if iomap should be passing a (const struct iomap_iter *)
> to the ->iomap_begin function instead of passing all these variables and
> then implementations have to container_of if they want iter->private
> too.
It should, and I plan to pass the iter when reworking this towards
the iter model. Hopefully I can get to that relatively soon after
this project lands.
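Something like this, as a rough sketch of the direction (the prototype
is illustrative, not a final signature):
	/* today: dig the iter back out of the embedded iomap */
	struct iomap_iter *iter =
		container_of(iomap, struct iomap_iter, iomap);
	struct xfs_zone_alloc_ctx *ac = iter->private;
	/* iter model: hand ->iomap_begin the iter directly */
	int (*iomap_begin)(struct iomap_iter *iter);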
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 28/43] xfs: implement direct writes to zoned RT devices
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (26 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 27/43] xfs: implement buffered writes to zoned RT devices Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-13 22:39 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 29/43] xfs: wire up zoned block freeing in xfs_rtextent_free_finish_item Christoph Hellwig
` (14 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Direct writes to zoned RT devices are extremely simple. After taking the
block reservation before acquiring the iolock, the iomap direct I/O calls
into ->iomap_begin which will return a "fake" iomap for the entire
requested range. The actual block allocation is then done from the
submit_io handler using code shared with the buffered I/O path.
The iomap_dio_ops set the bio_set to the (iomap) ioend one and initialize
the embedded ioend, which allows reusing the existing ioend based buffered
I/O completion path.
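In rough call chain form:
	xfs_file_dio_write()
	  xfs_file_dio_write_zoned()			/* reserve blocks */
	    xfs_file_dio_write_aligned()
	      iomap_dio_rw()
	        xfs_zoned_direct_write_iomap_begin()	/* "fake" iomap */
	        xfs_dio_zoned_submit_io()		/* consume reservation */
	          xfs_zone_alloc_and_submit()		/* allocate, submit bio */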
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_aops.c | 6 ++--
fs/xfs/xfs_aops.h | 3 +-
fs/xfs/xfs_file.c | 80 +++++++++++++++++++++++++++++++++++++++++-----
fs/xfs/xfs_iomap.c | 54 +++++++++++++++++++++++++++++++
fs/xfs/xfs_iomap.h | 1 +
5 files changed, 133 insertions(+), 11 deletions(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 67392413216b..a3ca14e811fd 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -137,7 +137,9 @@ xfs_end_ioend(
else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- if (!error && xfs_ioend_is_append(ioend))
+ if (!error &&
+ !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
+ xfs_ioend_is_append(ioend))
error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
done:
iomap_finish_ioends(ioend, error);
@@ -182,7 +184,7 @@ xfs_end_io(
}
}
-static void
+void
xfs_end_bio(
struct bio *bio)
{
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index e0bd68419764..5a7a0f1a0b49 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -9,6 +9,7 @@
extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;
-int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+void xfs_end_bio(struct bio *bio);
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 195cf60a81b0..1b39000b7c62 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -25,6 +25,7 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
+#include "xfs_aops.h"
#include "xfs_zone_alloc.h"
#include <linux/dax.h>
@@ -548,6 +549,9 @@ xfs_dio_write_end_io(
loff_t offset = iocb->ki_pos;
unsigned int nofs_flag;
+ ASSERT(!xfs_is_zoned_inode(ip) ||
+ !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
+
trace_xfs_end_io_direct_write(ip, offset, size);
if (xfs_is_shutdown(ip->i_mount))
@@ -627,14 +631,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
.end_io = xfs_dio_write_end_io,
};
+static void
+xfs_dio_zoned_submit_io(
+ const struct iomap_iter *iter,
+ struct bio *bio,
+ loff_t file_offset)
+{
+ struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
+ struct xfs_zone_alloc_ctx *ac = iter->private;
+ xfs_filblks_t count_fsb;
+ struct iomap_ioend *ioend;
+
+ count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
+ if (count_fsb > ac->reserved_blocks) {
+ xfs_err(mp,
+"allocation (%lld) larger than reservation (%lld).",
+ count_fsb, ac->reserved_blocks);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ bio_io_error(bio);
+ return;
+ }
+ ac->reserved_blocks -= count_fsb;
+
+ bio->bi_end_io = xfs_end_bio;
+ ioend = iomap_init_ioend(iter->inode, bio, file_offset,
+ IOMAP_IOEND_DIRECT);
+ xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
+}
+
+static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
+ .bio_set = &iomap_ioend_bioset,
+ .submit_io = xfs_dio_zoned_submit_io,
+ .end_io = xfs_dio_write_end_io,
+};
+
/*
- * Handle block aligned direct I/O writes
+ * Handle block aligned direct I/O writes.
*/
static noinline ssize_t
xfs_file_dio_write_aligned(
struct xfs_inode *ip,
struct kiocb *iocb,
- struct iov_iter *from)
+ struct iov_iter *from,
+ const struct iomap_ops *ops,
+ const struct iomap_dio_ops *dops,
+ struct xfs_zone_alloc_ctx *ac)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
@@ -642,7 +683,7 @@ xfs_file_dio_write_aligned(
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
+ ret = xfs_file_write_checks(iocb, from, &iolock, ac);
if (ret)
goto out_unlock;
@@ -656,11 +697,31 @@ xfs_file_dio_write_aligned(
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
- ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, 0, NULL, 0);
+ ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
out_unlock:
- if (iolock)
- xfs_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
+
+/*
+ * Handle block aligned direct I/O writes to zoned devices.
+ */
+static noinline ssize_t
+xfs_file_dio_write_zoned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_zone_alloc_ctx ac = { };
+ ssize_t ret;
+
+ ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
+ if (ret < 0)
+ return ret;
+ ret = xfs_file_dio_write_aligned(ip, iocb, from,
+ &xfs_zoned_direct_write_iomap_ops,
+ &xfs_dio_zoned_write_ops, &ac);
+ xfs_zoned_space_unreserve(ip, &ac);
return ret;
}
@@ -777,7 +838,10 @@ xfs_file_dio_write(
(xfs_is_always_cow_inode(ip) &&
(iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
return xfs_file_dio_write_unaligned(ip, iocb, from);
- return xfs_file_dio_write_aligned(ip, iocb, from);
+ if (xfs_is_zoned_inode(ip))
+ return xfs_file_dio_write_zoned(ip, iocb, from);
+ return xfs_file_dio_write_aligned(ip, iocb, from,
+ &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
static noinline ssize_t
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 402b253ce3a2..9626632883d0 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -965,6 +965,60 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
.iomap_begin = xfs_direct_write_iomap_begin,
};
+#ifdef CONFIG_XFS_RT
+/*
+ * This is really simple. The space has already been reserved before taking the
+ * IOLOCK, the actual block allocation is done just before submitting the bio
+ * and only recorded in the extent map on I/O completion.
+ */
+static int
+xfs_zoned_direct_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ int error;
+
+ ASSERT(!(flags & IOMAP_OVERWRITE_ONLY));
+
+ /*
+ * Needs to be pushed down into the allocator so that only writes into
+ * a single zone can be supported.
+ */
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+
+ /*
+ * Ensure the extent list is in memory so that we don't have to
+ * read it from the I/O completion handler.
+ */
+ if (xfs_need_iread_extents(&ip->i_df)) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+ }
+
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags = IOMAP_F_DIRTY;
+ iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev;
+ iomap->offset = offset;
+ iomap->length = length;
+ iomap->flags = IOMAP_F_ZONE_APPEND;
+ iomap->addr = 0;
+ return 0;
+}
+
+const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
+ .iomap_begin = xfs_zoned_direct_write_iomap_begin,
+};
+#endif /* CONFIG_XFS_RT */
+
static int
xfs_dax_write_iomap_end(
struct inode *inode,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index bc8a00cad854..d330c4a581b1 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -51,6 +51,7 @@ xfs_aligned_fsb_count(
extern const struct iomap_ops xfs_buffered_write_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
+extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 28/43] xfs: implement direct writes to zoned RT devices
2024-12-11 8:54 ` [PATCH 28/43] xfs: implement direct " Christoph Hellwig
@ 2024-12-13 22:39 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 22:39 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:53AM +0100, Christoph Hellwig wrote:
> Direct writes to zoned RT devices are extremely simple. After taking the
> block reservation before acquiring the iolock, the iomap direct I/O calls
> into ->iomap_begin which will return a "fake" iomap for the entire
> requested range. The actual block allocation is then done from the
> submit_io handler using code shared with the buffered I/O path.
>
> The iomap_dio_ops set the bio_set to the (iomap) ioend one and initialize
> the embedded ioend, which allows reusing the existing ioend based buffered
> I/O completion path.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Yeah that is a lot simpler. :)
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_aops.c | 6 ++--
> fs/xfs/xfs_aops.h | 3 +-
> fs/xfs/xfs_file.c | 80 +++++++++++++++++++++++++++++++++++++++++-----
> fs/xfs/xfs_iomap.c | 54 +++++++++++++++++++++++++++++++
> fs/xfs/xfs_iomap.h | 1 +
> 5 files changed, 133 insertions(+), 11 deletions(-)
>
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 67392413216b..a3ca14e811fd 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -137,7 +137,9 @@ xfs_end_ioend(
> else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
> error = xfs_iomap_write_unwritten(ip, offset, size, false);
>
> - if (!error && xfs_ioend_is_append(ioend))
> + if (!error &&
> + !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
> + xfs_ioend_is_append(ioend))
> error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
> done:
> iomap_finish_ioends(ioend, error);
> @@ -182,7 +184,7 @@ xfs_end_io(
> }
> }
>
> -static void
> +void
> xfs_end_bio(
> struct bio *bio)
> {
> diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> index e0bd68419764..5a7a0f1a0b49 100644
> --- a/fs/xfs/xfs_aops.h
> +++ b/fs/xfs/xfs_aops.h
> @@ -9,6 +9,7 @@
> extern const struct address_space_operations xfs_address_space_operations;
> extern const struct address_space_operations xfs_dax_aops;
>
> -int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
> +int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
> +void xfs_end_bio(struct bio *bio);
>
> #endif /* __XFS_AOPS_H__ */
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 195cf60a81b0..1b39000b7c62 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -25,6 +25,7 @@
> #include "xfs_iomap.h"
> #include "xfs_reflink.h"
> #include "xfs_file.h"
> +#include "xfs_aops.h"
> #include "xfs_zone_alloc.h"
>
> #include <linux/dax.h>
> @@ -548,6 +549,9 @@ xfs_dio_write_end_io(
> loff_t offset = iocb->ki_pos;
> unsigned int nofs_flag;
>
> + ASSERT(!xfs_is_zoned_inode(ip) ||
> + !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
> +
> trace_xfs_end_io_direct_write(ip, offset, size);
>
> if (xfs_is_shutdown(ip->i_mount))
> @@ -627,14 +631,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
> .end_io = xfs_dio_write_end_io,
> };
>
> +static void
> +xfs_dio_zoned_submit_io(
> + const struct iomap_iter *iter,
> + struct bio *bio,
> + loff_t file_offset)
> +{
> + struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
> + struct xfs_zone_alloc_ctx *ac = iter->private;
> + xfs_filblks_t count_fsb;
> + struct iomap_ioend *ioend;
> +
> + count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
> + if (count_fsb > ac->reserved_blocks) {
> + xfs_err(mp,
> +"allocation (%lld) larger than reservation (%lld).",
> + count_fsb, ac->reserved_blocks);
> + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> + bio_io_error(bio);
> + return;
> + }
> + ac->reserved_blocks -= count_fsb;
> +
> + bio->bi_end_io = xfs_end_bio;
> + ioend = iomap_init_ioend(iter->inode, bio, file_offset,
> + IOMAP_IOEND_DIRECT);
> + xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
> +}
> +
> +static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
> + .bio_set = &iomap_ioend_bioset,
> + .submit_io = xfs_dio_zoned_submit_io,
> + .end_io = xfs_dio_write_end_io,
> +};
> +
> /*
> - * Handle block aligned direct I/O writes
> + * Handle block aligned direct I/O writes.
> */
> static noinline ssize_t
> xfs_file_dio_write_aligned(
> struct xfs_inode *ip,
> struct kiocb *iocb,
> - struct iov_iter *from)
> + struct iov_iter *from,
> + const struct iomap_ops *ops,
> + const struct iomap_dio_ops *dops,
> + struct xfs_zone_alloc_ctx *ac)
> {
> unsigned int iolock = XFS_IOLOCK_SHARED;
> ssize_t ret;
> @@ -642,7 +683,7 @@ xfs_file_dio_write_aligned(
> ret = xfs_ilock_iocb_for_write(iocb, &iolock);
> if (ret)
> return ret;
> - ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
> + ret = xfs_file_write_checks(iocb, from, &iolock, ac);
> if (ret)
> goto out_unlock;
>
> @@ -656,11 +697,31 @@ xfs_file_dio_write_aligned(
> iolock = XFS_IOLOCK_SHARED;
> }
> trace_xfs_file_direct_write(iocb, from);
> - ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
> - &xfs_dio_write_ops, 0, NULL, 0);
> + ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
> out_unlock:
> - if (iolock)
> - xfs_iunlock(ip, iolock);
> + xfs_iunlock(ip, iolock);
> + return ret;
> +}
> +
> +/*
> + * Handle block aligned direct I/O writes to zoned devices.
> + */
> +static noinline ssize_t
> +xfs_file_dio_write_zoned(
> + struct xfs_inode *ip,
> + struct kiocb *iocb,
> + struct iov_iter *from)
> +{
> + struct xfs_zone_alloc_ctx ac = { };
> + ssize_t ret;
> +
> + ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
> + if (ret < 0)
> + return ret;
> + ret = xfs_file_dio_write_aligned(ip, iocb, from,
> + &xfs_zoned_direct_write_iomap_ops,
> + &xfs_dio_zoned_write_ops, &ac);
> + xfs_zoned_space_unreserve(ip, &ac);
> return ret;
> }
>
> @@ -777,7 +838,10 @@ xfs_file_dio_write(
> (xfs_is_always_cow_inode(ip) &&
> (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
> return xfs_file_dio_write_unaligned(ip, iocb, from);
> - return xfs_file_dio_write_aligned(ip, iocb, from);
> + if (xfs_is_zoned_inode(ip))
> + return xfs_file_dio_write_zoned(ip, iocb, from);
> + return xfs_file_dio_write_aligned(ip, iocb, from,
> + &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
> }
>
> static noinline ssize_t
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 402b253ce3a2..9626632883d0 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -965,6 +965,60 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
> .iomap_begin = xfs_direct_write_iomap_begin,
> };
>
> +#ifdef CONFIG_XFS_RT
> +/*
> + * This is really simple. The space has already been reserved before taking the
> + * IOLOCK, the actual block allocation is done just before submitting the bio
> + * and only recorded in the extent map on I/O completion.
> + */
> +static int
> +xfs_zoned_direct_write_iomap_begin(
> + struct inode *inode,
> + loff_t offset,
> + loff_t length,
> + unsigned flags,
> + struct iomap *iomap,
> + struct iomap *srcmap)
> +{
> + struct xfs_inode *ip = XFS_I(inode);
> + int error;
> +
> + ASSERT(!(flags & IOMAP_OVERWRITE_ONLY));
> +
> + /*
> + * Needs to be pushed down into the allocator so that only writes into
> + * a single zone can be supported.
> + */
> + if (flags & IOMAP_NOWAIT)
> + return -EAGAIN;
> +
> + /*
> + * Ensure the extent list is in memory so that we don't have to
> + * read it from the I/O completion handler.
> + */
> + if (xfs_need_iread_extents(&ip->i_df)) {
> + xfs_ilock(ip, XFS_ILOCK_EXCL);
> + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
> + if (error)
> + return error;
> + }
> +
> + iomap->type = IOMAP_MAPPED;
> + iomap->flags = IOMAP_F_DIRTY;
> + iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev;
> + iomap->offset = offset;
> + iomap->length = length;
> + iomap->flags = IOMAP_F_ZONE_APPEND;
> + iomap->addr = 0;
> + return 0;
> +}
> +
> +const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
> + .iomap_begin = xfs_zoned_direct_write_iomap_begin,
> +};
> +#endif /* CONFIG_XFS_RT */
> +
> static int
> xfs_dax_write_iomap_end(
> struct inode *inode,
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index bc8a00cad854..d330c4a581b1 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -51,6 +51,7 @@ xfs_aligned_fsb_count(
>
> extern const struct iomap_ops xfs_buffered_write_iomap_ops;
> extern const struct iomap_ops xfs_direct_write_iomap_ops;
> +extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops;
> extern const struct iomap_ops xfs_read_iomap_ops;
> extern const struct iomap_ops xfs_seek_iomap_ops;
> extern const struct iomap_ops xfs_xattr_iomap_ops;
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 29/43] xfs: wire up zoned block freeing in xfs_rtextent_free_finish_item
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (27 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 28/43] xfs: implement direct " Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-13 22:40 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 30/43] xfs: hide reserved RT blocks from statfs Christoph Hellwig
` (13 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Make xfs_rtextent_free_finish_item call into the zoned allocator to free
blocks on zoned RT devices.
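Condensed, the finish item path now dispatches on the file system type:
	if (xfs_has_zoned(mp))
		error = xfs_zone_free_blocks(tp, *rtgp, ...);	/* RMAP lock */
	else
		error = xfs_rtfree_blocks(tp, *rtgp, ...);	/* BITMAP lock */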
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_extfree_item.c | 35 +++++++++++++++++++++++++----------
1 file changed, 25 insertions(+), 10 deletions(-)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a25c713ff888..777438b853da 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -29,6 +29,7 @@
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_efi_cache;
struct kmem_cache *xfs_efd_cache;
@@ -767,21 +768,35 @@ xfs_rtextent_free_finish_item(
trace_xfs_extent_free_deferred(mp, xefi);
- if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) {
- if (*rtgp != to_rtg(xefi->xefi_group)) {
- *rtgp = to_rtg(xefi->xefi_group);
- xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP);
- xfs_rtgroup_trans_join(tp, *rtgp,
- XFS_RTGLOCK_BITMAP);
- }
- error = xfs_rtfree_blocks(tp, *rtgp,
- xefi->xefi_startblock, xefi->xefi_blockcount);
+ if (xefi->xefi_flags & XFS_EFI_CANCELLED)
+ goto done;
+
+ if (*rtgp != to_rtg(xefi->xefi_group)) {
+ unsigned int lock_flags;
+
+ if (xfs_has_zoned(mp))
+ lock_flags = XFS_RTGLOCK_RMAP;
+ else
+ lock_flags = XFS_RTGLOCK_BITMAP;
+
+ *rtgp = to_rtg(xefi->xefi_group);
+ xfs_rtgroup_lock(*rtgp, lock_flags);
+ xfs_rtgroup_trans_join(tp, *rtgp, lock_flags);
}
+
+ if (xfs_has_zoned(mp)) {
+ error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock,
+ xefi->xefi_blockcount);
+ } else {
+ error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock,
+ xefi->xefi_blockcount);
+ }
+
if (error == -EAGAIN) {
xfs_efd_from_efi(efdp);
return error;
}
-
+done:
xfs_efd_add_extent(efdp, xefi);
xfs_extent_free_cancel_item(item);
return error;
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 29/43] xfs: wire up zoned block freeing in xfs_rtextent_free_finish_item
2024-12-11 8:54 ` [PATCH 29/43] xfs: wire up zoned block freeing in xfs_rtextent_free_finish_item Christoph Hellwig
@ 2024-12-13 22:40 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 22:40 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:54AM +0100, Christoph Hellwig wrote:
> Make xfs_rtextent_free_finish_item call into the zoned allocator to free
> blocks on zoned RT devices.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Looks good,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_extfree_item.c | 35 +++++++++++++++++++++++++----------
> 1 file changed, 25 insertions(+), 10 deletions(-)
>
> diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
> index a25c713ff888..777438b853da 100644
> --- a/fs/xfs/xfs_extfree_item.c
> +++ b/fs/xfs/xfs_extfree_item.c
> @@ -29,6 +29,7 @@
> #include "xfs_inode.h"
> #include "xfs_rtbitmap.h"
> #include "xfs_rtgroup.h"
> +#include "xfs_zone_alloc.h"
>
> struct kmem_cache *xfs_efi_cache;
> struct kmem_cache *xfs_efd_cache;
> @@ -767,21 +768,35 @@ xfs_rtextent_free_finish_item(
>
> trace_xfs_extent_free_deferred(mp, xefi);
>
> - if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) {
> - if (*rtgp != to_rtg(xefi->xefi_group)) {
> - *rtgp = to_rtg(xefi->xefi_group);
> - xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP);
> - xfs_rtgroup_trans_join(tp, *rtgp,
> - XFS_RTGLOCK_BITMAP);
> - }
> - error = xfs_rtfree_blocks(tp, *rtgp,
> - xefi->xefi_startblock, xefi->xefi_blockcount);
> + if (xefi->xefi_flags & XFS_EFI_CANCELLED)
> + goto done;
> +
> + if (*rtgp != to_rtg(xefi->xefi_group)) {
> + unsigned int lock_flags;
> +
> + if (xfs_has_zoned(mp))
> + lock_flags = XFS_RTGLOCK_RMAP;
> + else
> + lock_flags = XFS_RTGLOCK_BITMAP;
> +
> + *rtgp = to_rtg(xefi->xefi_group);
> + xfs_rtgroup_lock(*rtgp, lock_flags);
> + xfs_rtgroup_trans_join(tp, *rtgp, lock_flags);
> }
> +
> + if (xfs_has_zoned(mp)) {
> + error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock,
> + xefi->xefi_blockcount);
> + } else {
> + error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock,
> + xefi->xefi_blockcount);
> + }
> +
> if (error == -EAGAIN) {
> xfs_efd_from_efi(efdp);
> return error;
> }
> -
> +done:
> xfs_efd_add_extent(efdp, xefi);
> xfs_extent_free_cancel_item(item);
> return error;
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 30/43] xfs: hide reserved RT blocks from statfs
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (28 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 29/43] xfs: wire up zoned block freeing in xfs_rtextent_free_finish_item Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-13 22:43 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 31/43] xfs: support growfs on zoned file systems Christoph Hellwig
` (12 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
File systems with a zoned RT device have a large number of reserved
blocks that are required for garbage collection, and which can't be
filled with user data. Exclude them from the available blocks reported
through stat(v)fs.
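For example (numbers purely illustrative, and assuming the 1 FSB RT
extent size required for zoned file systems): with sb_rblocks = 1048576
and a total GC reservation of 131072 blocks, statfs now reports
	f_blocks = 1048576 - 131072 = 917504
so the reported capacity matches what user data can actually occupy.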
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_super.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b289b2ba78b1..59998aac7ed7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -865,7 +865,8 @@ xfs_statfs_rt(
xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
st->f_bfree = xfs_rtbxlen_to_blen(mp, max(0LL, freertx));
- st->f_blocks = mp->m_sb.sb_rblocks;
+ st->f_blocks = mp->m_sb.sb_rblocks -
+ xfs_rtbxlen_to_blen(mp, mp->m_resblks[XC_FREE_RTEXTENTS].total);
}
static void
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 30/43] xfs: hide reserved RT blocks from statfs
2024-12-11 8:54 ` [PATCH 30/43] xfs: hide reserved RT blocks from statfs Christoph Hellwig
@ 2024-12-13 22:43 ` Darrick J. Wong
2024-12-15 6:03 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 22:43 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:55AM +0100, Christoph Hellwig wrote:
> File systems with a zoned RT device have a large number of reserved
> blocks that are required for garbage collection, and which can't be
> filled with user data. Exclude them from the available blocks reported
> through stat(v)fs.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_super.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index b289b2ba78b1..59998aac7ed7 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -865,7 +865,8 @@ xfs_statfs_rt(
> xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
>
> st->f_bfree = xfs_rtbxlen_to_blen(mp, max(0LL, freertx));
> - st->f_blocks = mp->m_sb.sb_rblocks;
> + st->f_blocks = mp->m_sb.sb_rblocks -
> + xfs_rtbxlen_to_blen(mp, mp->m_resblks[XC_FREE_RTEXTENTS].total);
I wonder, is mp->m_resblks[XC_FREE_RTEXTENTS].total considered
"unavailable"? Should that be added to xfs_freecounter_unavailable?
--D
> }
>
> static void
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 30/43] xfs: hide reserved RT blocks from statfs
2024-12-13 22:43 ` Darrick J. Wong
@ 2024-12-15 6:03 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 6:03 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 02:43:58PM -0800, Darrick J. Wong wrote:
> > - st->f_blocks = mp->m_sb.sb_rblocks;
> > + st->f_blocks = mp->m_sb.sb_rblocks -
> > + xfs_rtbxlen_to_blen(mp, mp->m_resblks[XC_FREE_RTEXTENTS].total);
>
> I wonder, is mp->m_resblks[XC_FREE_RTEXTENTS].total considered
> "unavailable"? Should that be added to xfs_freecounter_unavailable?
That messed up the set_aside calculation in xfs_dec_freecounter and
I think also xfs_reserve_blocks. I tried this early on in the project
and it made a complete mess of the free block accounting.
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 31/43] xfs: support growfs on zoned file systems
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (29 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 30/43] xfs: hide reserved RT blocks from statfs Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-13 22:45 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 32/43] xfs: allow COW forks on zoned file systems in xchk_bmap Christoph Hellwig
` (11 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Replace the inner loop growing one RT bitmap block at a time with
one just modifying the superblock counters for growing an entire
zone (aka RTG). The big restriction is that, just like at mkfs time, only
an RT extent size of a single FSB is allowed, and the file system
capacity needs to be aligned to the zone size.
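For example (illustrative geometry): with 4k blocks and a 256 MiB zone
size the group size is
	gblocks = 256 MiB / 4 KiB = 65536 blocks
and any new sb_rblocks value must be a multiple of 65536; the
div_u64_rem() check below rejects everything else.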
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_rtalloc.c | 121 ++++++++++++++++++++++++++++++++++++-------
1 file changed, 101 insertions(+), 20 deletions(-)
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 47c94ac74259..e21baa494c33 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -860,6 +860,84 @@ xfs_growfs_rt_init_rtsb(
return error;
}
+static void
+xfs_growfs_rt_sb_fields(
+ struct xfs_trans *tp,
+ const struct xfs_mount *nmp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
+ nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
+ if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
+ nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
+ if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
+ nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
+ if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
+ nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
+ if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
+ nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
+ if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT,
+ nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
+}
+
+static int
+xfs_growfs_rt_zoned(
+ struct xfs_rtgroup *rtg,
+ xfs_rfsblock_t nrblocks)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_mount *nmp;
+ struct xfs_trans *tp;
+ xfs_rtbxlen_t freed_rtx;
+ int error;
+
+ /*
+ * Calculate new sb and mount fields for this round. Also ensure the
+ * rtg_extents value is uptodate as the rtbitmap code relies on it.
+ */
+ nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks,
+ mp->m_sb.sb_rextsize);
+ if (!nmp)
+ return -ENOMEM;
+ freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents;
+
+ xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg),
+ nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents);
+
+ error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp);
+ if (error)
+ goto out_free;
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+
+ xfs_growfs_rt_sb_fields(tp, nmp);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx);
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_free;
+
+ /*
+ * Ensure the mount RT feature flag is now set, and compute new
+ * maxlevels for rt btrees.
+ */
+ mp->m_features |= XFS_FEAT_REALTIME;
+ xfs_rtrmapbt_compute_maxlevels(mp);
+ xfs_rtrefcountbt_compute_maxlevels(mp);
+ xfs_zoned_add_available(mp, freed_rtx);
+out_free:
+ kfree(nmp);
+ return error;
+}
+
static int
xfs_growfs_rt_bmblock(
struct xfs_rtgroup *rtg,
@@ -945,24 +1023,7 @@ xfs_growfs_rt_bmblock(
/*
* Update superblock fields.
*/
- if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE,
- nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
- if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS,
- nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
- if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS,
- nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
- if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS,
- nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
- if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG,
- nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
- if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT,
- nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
+ xfs_growfs_rt_sb_fields(args.tp, nmp);
/*
* Free the new extent.
@@ -1129,6 +1190,11 @@ xfs_growfs_rtg(
goto out_rele;
}
+ if (xfs_has_zoned(mp)) {
+ error = xfs_growfs_rt_zoned(rtg, nrblocks);
+ goto out_rele;
+ }
+
error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks);
if (error)
goto out_rele;
@@ -1148,8 +1214,7 @@ xfs_growfs_rtg(
if (old_rsum_cache)
kvfree(old_rsum_cache);
- xfs_rtgroup_rele(rtg);
- return 0;
+ goto out_rele;
out_error:
/*
@@ -1197,6 +1262,22 @@ xfs_growfs_check_rtgeom(
if (min_logfsbs > mp->m_sb.sb_logblocks)
return -EINVAL;
+
+ if (xfs_has_zoned(mp)) {
+ uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks;
+ uint32_t rem;
+
+ if (rextsize != 1)
+ return -EINVAL;
+ div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem);
+ if (rem) {
+ xfs_warn(mp,
+"new RT volume size (%lld) not aligned to RT group size (%d)",
+ mp->m_sb.sb_rblocks, gblocks);
+ return -EINVAL;
+ }
+ }
+
return 0;
}
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 31/43] xfs: support growfs on zoned file systems
2024-12-11 8:54 ` [PATCH 31/43] xfs: support growfs on zoned file systems Christoph Hellwig
@ 2024-12-13 22:45 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 22:45 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:56AM +0100, Christoph Hellwig wrote:
> Replace the inner loop growing one RT bitmap block at a time with
> one just modifying the superblock counters for growing an entire
> zone (aka RTG). The big restriction is that, just like at mkfs time, only
> an RT extent size of a single FSB is allowed, and the file system
> capacity needs to be aligned to the zone size.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Got it, that really is nice to do growfs a group at a time.
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_rtalloc.c | 121 ++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 101 insertions(+), 20 deletions(-)
>
> diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
> index 47c94ac74259..e21baa494c33 100644
> --- a/fs/xfs/xfs_rtalloc.c
> +++ b/fs/xfs/xfs_rtalloc.c
> @@ -860,6 +860,84 @@ xfs_growfs_rt_init_rtsb(
> return error;
> }
>
> +static void
> +xfs_growfs_rt_sb_fields(
> + struct xfs_trans *tp,
> + const struct xfs_mount *nmp)
> +{
> + struct xfs_mount *mp = tp->t_mountp;
> +
> + if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
> + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
> + nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
> + if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
> + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
> + nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
> + if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
> + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
> + nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
> + if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
> + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
> + nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
> + if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
> + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
> + nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
> + if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
> + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT,
> + nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
> +}
> +
> +static int
> +xfs_growfs_rt_zoned(
> + struct xfs_rtgroup *rtg,
> + xfs_rfsblock_t nrblocks)
> +{
> + struct xfs_mount *mp = rtg_mount(rtg);
> + struct xfs_mount *nmp;
> + struct xfs_trans *tp;
> + xfs_rtbxlen_t freed_rtx;
> + int error;
> +
> + /*
> + * Calculate new sb and mount fields for this round. Also ensure the
> + * rtg_extents value is uptodate as the rtbitmap code relies on it.
> + */
> + nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks,
> + mp->m_sb.sb_rextsize);
> + if (!nmp)
> + return -ENOMEM;
> + freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents;
> +
> + xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg),
> + nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents);
> +
> + error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp);
> + if (error)
> + goto out_free;
> +
> + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
> +
> + xfs_growfs_rt_sb_fields(tp, nmp);
> + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx);
> +
> + error = xfs_trans_commit(tp);
> + if (error)
> + goto out_free;
> +
> + /*
> + * Ensure the mount RT feature flag is now set, and compute new
> + * maxlevels for rt btrees.
> + */
> + mp->m_features |= XFS_FEAT_REALTIME;
> + xfs_rtrmapbt_compute_maxlevels(mp);
> + xfs_rtrefcountbt_compute_maxlevels(mp);
> + xfs_zoned_add_available(mp, freed_rtx);
> +out_free:
> + kfree(nmp);
> + return error;
> +}
> +
> static int
> xfs_growfs_rt_bmblock(
> struct xfs_rtgroup *rtg,
> @@ -945,24 +1023,7 @@ xfs_growfs_rt_bmblock(
> /*
> * Update superblock fields.
> */
> - if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
> - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE,
> - nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
> - if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
> - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS,
> - nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
> - if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
> - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS,
> - nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
> - if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
> - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS,
> - nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
> - if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
> - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG,
> - nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
> - if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
> - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT,
> - nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
> + xfs_growfs_rt_sb_fields(args.tp, nmp);
>
> /*
> * Free the new extent.
> @@ -1129,6 +1190,11 @@ xfs_growfs_rtg(
> goto out_rele;
> }
>
> + if (xfs_has_zoned(mp)) {
> + error = xfs_growfs_rt_zoned(rtg, nrblocks);
> + goto out_rele;
> + }
> +
> error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks);
> if (error)
> goto out_rele;
> @@ -1148,8 +1214,7 @@ xfs_growfs_rtg(
>
> if (old_rsum_cache)
> kvfree(old_rsum_cache);
> - xfs_rtgroup_rele(rtg);
> - return 0;
> + goto out_rele;
>
> out_error:
> /*
> @@ -1197,6 +1262,22 @@ xfs_growfs_check_rtgeom(
>
> if (min_logfsbs > mp->m_sb.sb_logblocks)
> return -EINVAL;
> +
> + if (xfs_has_zoned(mp)) {
> + uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks;
> + uint32_t rem;
> +
> + if (rextsize != 1)
> + return -EINVAL;
> + div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem);
> + if (rem) {
> + xfs_warn(mp,
> +"new RT volume size (%lld) not aligned to RT group size (%d)",
> + mp->m_sb.sb_rblocks, gblocks);
> + return -EINVAL;
> + }
> + }
> +
> return 0;
> }
>
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 32/43] xfs: allow COW forks on zoned file systems in xchk_bmap
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (30 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 31/43] xfs: support growfs on zoned file systems Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-13 22:47 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 33/43] xfs: support xchk_xref_is_used_rt_space on zoned file systems Christoph Hellwig
` (10 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
zoned file systems can have COW forks even without reflinks.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/scrub/bmap.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 66da7d4d56ba..cfc6f035ecaa 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -1039,7 +1039,7 @@ xchk_bmap(
switch (whichfork) {
case XFS_COW_FORK:
/* No CoW forks on non-reflink filesystems. */
- if (!xfs_has_reflink(mp)) {
+ if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
return 0;
}
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 32/43] xfs: allow COW forks on zoned file systems in xchk_bmap
2024-12-11 8:54 ` [PATCH 32/43] xfs: allow COW forks on zoned file systems in xchk_bmap Christoph Hellwig
@ 2024-12-13 22:47 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 22:47 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:57AM +0100, Christoph Hellwig wrote:
> zoned file systems can have COW forks even without reflinks.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/scrub/bmap.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
> index 66da7d4d56ba..cfc6f035ecaa 100644
> --- a/fs/xfs/scrub/bmap.c
> +++ b/fs/xfs/scrub/bmap.c
> @@ -1039,7 +1039,7 @@ xchk_bmap(
> switch (whichfork) {
> case XFS_COW_FORK:
> /* No CoW forks on non-reflink filesystems. */
> - if (!xfs_has_reflink(mp)) {
> + if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) {
Might want to update the comment
/* No CoW forks if the filesystem doesn't support out of place writes */
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> xchk_ino_set_corrupt(sc, sc->ip->i_ino);
> return 0;
> }
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 33/43] xfs: support xchk_xref_is_used_rt_space on zoned file systems
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (31 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 32/43] xfs: allow COW forks on zoned file systems in xchk_bmap Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-13 22:49 ` Darrick J. Wong
2024-12-11 8:54 ` [PATCH 34/43] xfs: support xrep_require_rtext_inuse " Christoph Hellwig
` (9 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Space usage is tracked by the rmap, which already is separately
cross-reference. But on top of that we have the write pointer and can
do a basic sanity check here that the block is not beyond the write
pointer.
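Conceptually the invariant is simply
	rgbno + len - 1 < zone write pointer
which is what the xfs_zone_rgbno_is_valid() call below encapsulates.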
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/scrub/rtbitmap.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index e8c776a34c1d..d5ff8609dbfb 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -21,6 +21,7 @@
#include "xfs_rmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_exchmaps.h"
+#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
@@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space(
xfs_extlen_t len)
{
struct xfs_rtgroup *rtg = sc->sr.rtg;
- struct xfs_inode *rbmip = rtg_bitmap(rtg);
xfs_rtxnum_t startext;
xfs_rtxnum_t endext;
bool is_free;
@@ -281,6 +281,13 @@ xchk_xref_is_used_rt_space(
if (xchk_skip_xref(sc->sm))
return;
+ if (xfs_has_zoned(sc->mp)) {
+ if (!xfs_zone_rgbno_is_valid(rtg,
+ xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1))
+ xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino);
+ return;
+ }
+
startext = xfs_rtb_to_rtx(sc->mp, rtbno);
endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext,
@@ -288,5 +295,5 @@ xchk_xref_is_used_rt_space(
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (is_free)
- xchk_ino_xref_set_corrupt(sc, rbmip->i_ino);
+ xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
}
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 33/43] xfs: support xchk_xref_is_used_rt_space on zoned file systems
2024-12-11 8:54 ` [PATCH 33/43] xfs: support xchk_xref_is_used_rt_space on zoned file systems Christoph Hellwig
@ 2024-12-13 22:49 ` Darrick J. Wong
2024-12-15 6:13 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 22:49 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:58AM +0100, Christoph Hellwig wrote:
> Space usage is tracked by the rmap, which already is separately
> cross-reference. But on top of that we have the write pointer and can
cross-referenced
> do a basic sanity check here that the block is not beyond the write
> pointer.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/scrub/rtbitmap.c | 11 +++++++++--
> 1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
> index e8c776a34c1d..d5ff8609dbfb 100644
> --- a/fs/xfs/scrub/rtbitmap.c
> +++ b/fs/xfs/scrub/rtbitmap.c
> @@ -21,6 +21,7 @@
> #include "xfs_rmap.h"
> #include "xfs_rtrmap_btree.h"
> #include "xfs_exchmaps.h"
> +#include "xfs_zone_alloc.h"
> #include "scrub/scrub.h"
> #include "scrub/common.h"
> #include "scrub/repair.h"
> @@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space(
> xfs_extlen_t len)
> {
> struct xfs_rtgroup *rtg = sc->sr.rtg;
> - struct xfs_inode *rbmip = rtg_bitmap(rtg);
> xfs_rtxnum_t startext;
> xfs_rtxnum_t endext;
> bool is_free;
> @@ -281,6 +281,13 @@ xchk_xref_is_used_rt_space(
> if (xchk_skip_xref(sc->sm))
> return;
>
> + if (xfs_has_zoned(sc->mp)) {
> + if (!xfs_zone_rgbno_is_valid(rtg,
> + xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1))
> + xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino);
> + return;
> + }
> +
> startext = xfs_rtb_to_rtx(sc->mp, rtbno);
> endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
> error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext,
> @@ -288,5 +295,5 @@ xchk_xref_is_used_rt_space(
> if (!xchk_should_check_xref(sc, &error, NULL))
> return;
> if (is_free)
> - xchk_ino_xref_set_corrupt(sc, rbmip->i_ino);
> + xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
rbmip is already the return value from rtg_bitmap()
--D
> }
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread* Re: [PATCH 33/43] xfs: support xchk_xref_is_used_rt_space on zoned file systems
2024-12-13 22:49 ` Darrick J. Wong
@ 2024-12-15 6:13 ` Christoph Hellwig
2024-12-17 17:02 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 6:13 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 02:49:12PM -0800, Darrick J. Wong wrote:
> > @@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space(
> > xfs_extlen_t len)
> > {
> > struct xfs_rtgroup *rtg = sc->sr.rtg;
> > - struct xfs_inode *rbmip = rtg_bitmap(rtg);
> > if (is_free)
> > - xchk_ino_xref_set_corrupt(sc, rbmip->i_ino);
> > + xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
>
> rbmip is already the return value from rtg_bitmap()
Yes, but it gets removed above because it only has a single user,
and it keeps me from incorrectly referencing it in the zone branch,
which I did initially and which didn't end up well :)
^ permalink raw reply [flat|nested] 143+ messages in thread* Re: [PATCH 33/43] xfs: support xchk_xref_is_used_rt_space on zoned file systems
2024-12-15 6:13 ` Christoph Hellwig
@ 2024-12-17 17:02 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-17 17:02 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Sun, Dec 15, 2024 at 07:13:49AM +0100, Christoph Hellwig wrote:
> On Fri, Dec 13, 2024 at 02:49:12PM -0800, Darrick J. Wong wrote:
> > > @@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space(
> > > xfs_extlen_t len)
> > > {
> > > struct xfs_rtgroup *rtg = sc->sr.rtg;
> > > - struct xfs_inode *rbmip = rtg_bitmap(rtg);
>
>
> > > if (is_free)
> > > - xchk_ino_xref_set_corrupt(sc, rbmip->i_ino);
> > > + xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
> >
> > rbmip is already the return value from rtg_bitmap()
>
> Yes, but it gets removed above. Because it only has a single user,
> and it keeps me from incorrectly referencing it in the zone branch,
> which I did initially and which didn't end up well :)
Oh right, my bad. With the typo in the commit message fixed,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 34/43] xfs: support xrep_require_rtext_inuse on zoned file systems
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (32 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 33/43] xfs: support xchk_xref_is_used_rt_space on zoned file systems Christoph Hellwig
@ 2024-12-11 8:54 ` Christoph Hellwig
2024-12-13 22:49 ` Darrick J. Wong
2024-12-11 8:55 ` [PATCH 35/43] xfs: enable fsmap reporting for internal RT devices Christoph Hellwig
` (8 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:54 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Space usage is tracked by the rmap, which is already separately
cross-referenced. But on top of that we have the write pointer and can
do a basic sanity check here that the block is not beyond the write
pointer.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/scrub/repair.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 90740718ac70..dd88a237d629 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -43,6 +43,7 @@
#include "xfs_rtalloc.h"
#include "xfs_metafile.h"
#include "xfs_rtrefcount_btree.h"
+#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -1048,7 +1049,13 @@ xrep_require_rtext_inuse(
xfs_rtxnum_t startrtx;
xfs_rtxnum_t endrtx;
bool is_free = false;
- int error;
+ int error = 0;
+
+ if (xfs_has_zoned(mp)) {
+ if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1))
+ return -EFSCORRUPTED;
+ return 0;
+ }
startrtx = xfs_rgbno_to_rtx(mp, rgbno);
endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1);
--
2.45.2
^ permalink raw reply related [flat|nested] 143+ messages in thread* Re: [PATCH 34/43] xfs: support xrep_require_rtext_inuse on zoned file systems
2024-12-11 8:54 ` [PATCH 34/43] xfs: support xrep_require_rtext_inuse " Christoph Hellwig
@ 2024-12-13 22:49 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 22:49 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:54:59AM +0100, Christoph Hellwig wrote:
> Space usage is tracked by the rmap, which is already separately
> cross-referenced. But on top of that we have the write pointer and can
> do a basic sanity check here that the block is not beyond the write
> pointer.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Looks ok,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/scrub/repair.c | 9 ++++++++-
> 1 file changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index 90740718ac70..dd88a237d629 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -43,6 +43,7 @@
> #include "xfs_rtalloc.h"
> #include "xfs_metafile.h"
> #include "xfs_rtrefcount_btree.h"
> +#include "xfs_zone_alloc.h"
> #include "scrub/scrub.h"
> #include "scrub/common.h"
> #include "scrub/trace.h"
> @@ -1048,7 +1049,13 @@ xrep_require_rtext_inuse(
> xfs_rtxnum_t startrtx;
> xfs_rtxnum_t endrtx;
> bool is_free = false;
> - int error;
> + int error = 0;
> +
> + if (xfs_has_zoned(mp)) {
> + if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1))
> + return -EFSCORRUPTED;
> + return 0;
> + }
>
> startrtx = xfs_rgbno_to_rtx(mp, rgbno);
> endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1);
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 35/43] xfs: enable fsmap reporting for internal RT devices
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (33 preceding siblings ...)
2024-12-11 8:54 ` [PATCH 34/43] xfs: support xrep_require_rtext_inuse " Christoph Hellwig
@ 2024-12-11 8:55 ` Christoph Hellwig
2024-12-13 23:11 ` Darrick J. Wong
2024-12-11 8:55 ` [PATCH 36/43] xfs: disable reflink for zoned file systems Christoph Hellwig
` (7 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:55 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
File systems with internal RT devices are a bit odd in that we need
to report AGs and RGs. To make this happen use separate synthetic
fmr_device values for the different sections instead of the dev_t
mapping used by other XFS configurations.
The data device is reported as file system metadata before the
start of the RGs for the synthetic RT fmr_device.
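To make the userspace-visible contract concrete, a hedged sketch of a consumer (not part of the patch): when fmh_oflags lacks FMH_OF_DEV_T, fmr_device carries the small synthetic values from the new enum xfs_device rather than an encoded dev_t:

static const char *
fmr_device_name(
	uint32_t		fmr_device)
{
	switch (fmr_device) {
	case 1:	return "data";	/* XFS_DEV_DATA */
	case 2:	return "log";	/* XFS_DEV_LOG */
	case 3:	return "rt";	/* XFS_DEV_RT */
	default: return "dev_t"; /* FMH_OF_DEV_T was set */
	}
}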
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_fs.h | 9 +++++
fs/xfs/xfs_fsmap.c | 80 +++++++++++++++++++++++++++++++++---------
2 files changed, 72 insertions(+), 17 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 5e66fb2b2cc7..12463ba766da 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1082,6 +1082,15 @@ struct xfs_rtgroup_geometry {
#define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
+/*
+ * Devices supported by a single XFS file system. Reported in fsmap's fmr_device
+ * when using internal RT devices.
+ */
+enum xfs_device {
+ XFS_DEV_DATA = 1,
+ XFS_DEV_LOG = 2,
+ XFS_DEV_RT = 3,
+};
#ifndef HAVE_BBMACROS
/*
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 917d4d0e51b3..a4bc1642fe56 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -879,17 +879,39 @@ xfs_getfsmap_rtdev_rmapbt(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = NULL;
struct xfs_btree_cur *bt_cur = NULL;
+ xfs_daddr_t rtstart_daddr;
xfs_rtblock_t start_rtb;
xfs_rtblock_t end_rtb;
xfs_rgnumber_t start_rg, end_rg;
uint64_t eofs;
int error = 0;
- eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks);
if (keys[0].fmr_physical >= eofs)
return 0;
- start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical);
- end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
+
+ rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart);
+ if (keys[0].fmr_physical < rtstart_daddr) {
+ struct xfs_fsmap_irec frec = {
+ .owner = XFS_RMAP_OWN_FS,
+ .len_daddr = rtstart_daddr,
+ };
+
+ /* Adjust the low key if we are continuing from where we left off. */
+ if (keys[0].fmr_length > 0) {
+ info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length;
+ return 0;
+ }
+
+ /* Fabricate an rmap entry for space occupied by the data dev */
+ error = xfs_getfsmap_helper(tp, info, &frec);
+ if (error)
+ return error;
+ }
+
+ start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical);
+ end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr +
+ min(eofs - 1, keys[1].fmr_physical));
info->missing_owner = XFS_FMR_OWN_FREE;
@@ -1004,22 +1026,40 @@ xfs_getfsmap_rtdev_rmapbt(
}
#endif /* CONFIG_XFS_RT */
+static uint32_t
+xfs_getfsmap_device(
+ struct xfs_mount *mp,
+ enum xfs_device dev)
+{
+ if (mp->m_sb.sb_rtstart)
+ return dev;
+
+ switch (dev) {
+ case XFS_DEV_DATA:
+ return new_encode_dev(mp->m_ddev_targp->bt_dev);
+ case XFS_DEV_LOG:
+ return new_encode_dev(mp->m_logdev_targp->bt_dev);
+ case XFS_DEV_RT:
+ if (!mp->m_rtdev_targp)
+ break;
+ return new_encode_dev(mp->m_rtdev_targp->bt_dev);
+ }
+
+ return -1;
+}
+
/* Do we recognize the device? */
STATIC bool
xfs_getfsmap_is_valid_device(
struct xfs_mount *mp,
struct xfs_fsmap *fm)
{
- if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
- fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev))
- return true;
- if (mp->m_logdev_targp &&
- fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev))
- return true;
- if (mp->m_rtdev_targp &&
- fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev))
- return true;
- return false;
+ return fm->fmr_device == 0 ||
+ fm->fmr_device == UINT_MAX ||
+ fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) ||
+ fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) ||
+ (mp->m_rtdev_targp &&
+ fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT));
}
/* Ensure that the low key is less than the high key. */
@@ -1126,7 +1166,7 @@ xfs_getfsmap(
/* Set up our device handlers. */
memset(handlers, 0, sizeof(handlers));
handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
- handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
+ handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA);
if (use_rmap)
handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
else
@@ -1134,7 +1174,7 @@ xfs_getfsmap(
if (mp->m_logdev_targp != mp->m_ddev_targp) {
handlers[1].nr_sectors = XFS_FSB_TO_BB(mp,
mp->m_sb.sb_logblocks);
- handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
+ handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG);
handlers[1].fn = xfs_getfsmap_logdev;
}
#ifdef CONFIG_XFS_RT
@@ -1144,7 +1184,7 @@ xfs_getfsmap(
*/
if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) {
handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
- handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
+ handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT);
if (use_rmap)
handlers[2].fn = xfs_getfsmap_rtdev_rmapbt;
else
@@ -1234,7 +1274,13 @@ xfs_getfsmap(
if (tp)
xfs_trans_cancel(tp);
- head->fmh_oflags = FMH_OF_DEV_T;
+
+ /*
+ * For internal RT device we need to report different synthetic devices
+ * for a single physical device, and thus can't report the actual dev_t.
+ */
+ if (!mp->m_sb.sb_rtstart)
+ head->fmh_oflags = FMH_OF_DEV_T;
return error;
}
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 35/43] xfs: enable fsmap reporting for internal RT devices
2024-12-11 8:55 ` [PATCH 35/43] xfs: enable fsmap reporting for internal RT devices Christoph Hellwig
@ 2024-12-13 23:11 ` Darrick J. Wong
2024-12-15 6:26 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 23:11 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:55:00AM +0100, Christoph Hellwig wrote:
> File systems with internal RT devices are a bit odd in that we need
> to report both AGs and RGs. To make this happen, use separate synthetic
> fmr_device values for the different sections instead of the dev_t
> mapping used by other XFS configurations.
>
> The data device is reported as file system metadata before the
> start of the RGs for the synthetic RT fmr_device.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/libxfs/xfs_fs.h | 9 +++++
> fs/xfs/xfs_fsmap.c | 80 +++++++++++++++++++++++++++++++++---------
> 2 files changed, 72 insertions(+), 17 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
> index 5e66fb2b2cc7..12463ba766da 100644
> --- a/fs/xfs/libxfs/xfs_fs.h
> +++ b/fs/xfs/libxfs/xfs_fs.h
> @@ -1082,6 +1082,15 @@ struct xfs_rtgroup_geometry {
> #define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range)
> /* XFS_IOC_GETFSUUID ---------- deprecated 140 */
>
> +/*
> + * Devices supported by a single XFS file system. Reported in fsmap's fmr_device
> + * when using internal RT devices.
> + */
> +enum xfs_device {
> + XFS_DEV_DATA = 1,
> + XFS_DEV_LOG = 2,
> + XFS_DEV_RT = 3,
> +};
>
> #ifndef HAVE_BBMACROS
> /*
> diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
> index 917d4d0e51b3..a4bc1642fe56 100644
> --- a/fs/xfs/xfs_fsmap.c
> +++ b/fs/xfs/xfs_fsmap.c
> @@ -879,17 +879,39 @@ xfs_getfsmap_rtdev_rmapbt(
> struct xfs_mount *mp = tp->t_mountp;
> struct xfs_rtgroup *rtg = NULL;
> struct xfs_btree_cur *bt_cur = NULL;
> + xfs_daddr_t rtstart_daddr;
> xfs_rtblock_t start_rtb;
> xfs_rtblock_t end_rtb;
> xfs_rgnumber_t start_rg, end_rg;
> uint64_t eofs;
> int error = 0;
>
> - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
> + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks);
> if (keys[0].fmr_physical >= eofs)
> return 0;
> - start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical);
> - end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
> +
> + rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart);
> + if (keys[0].fmr_physical < rtstart_daddr) {
> + struct xfs_fsmap_irec frec = {
> + .owner = XFS_RMAP_OWN_FS,
> + .len_daddr = rtstart_daddr,
> + };
> +
> + /* Adjust the low key if we are continuing from where we left off. */
> + if (keys[0].fmr_length > 0) {
> + info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length;
> + return 0;
> + }
> +
> + /* Fabricate an rmap entry for space occupied by the data dev */
> + error = xfs_getfsmap_helper(tp, info, &frec);
> + if (error)
> + return error;
Seeing as you report different fmr_device values for the data and rt
devices, I'd have thought that you'd want the rt fsmappings to start at
fmr_physical == 0. But then I guess for the sb_rtstart > 0 case, the
rtblock values that get written into the bmbt have that rtstart value
added in, don't they?
--D
> + }
> +
> + start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical);
> + end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr +
> + min(eofs - 1, keys[1].fmr_physical));
>
> info->missing_owner = XFS_FMR_OWN_FREE;
>
> @@ -1004,22 +1026,40 @@ xfs_getfsmap_rtdev_rmapbt(
> }
> #endif /* CONFIG_XFS_RT */
>
> +static uint32_t
> +xfs_getfsmap_device(
> + struct xfs_mount *mp,
> + enum xfs_device dev)
> +{
> + if (mp->m_sb.sb_rtstart)
> + return dev;
> +
> + switch (dev) {
> + case XFS_DEV_DATA:
> + return new_encode_dev(mp->m_ddev_targp->bt_dev);
> + case XFS_DEV_LOG:
> + return new_encode_dev(mp->m_logdev_targp->bt_dev);
> + case XFS_DEV_RT:
> + if (!mp->m_rtdev_targp)
> + break;
> + return new_encode_dev(mp->m_rtdev_targp->bt_dev);
> + }
> +
> + return -1;
> +}
> +
> /* Do we recognize the device? */
> STATIC bool
> xfs_getfsmap_is_valid_device(
> struct xfs_mount *mp,
> struct xfs_fsmap *fm)
> {
> - if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
> - fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev))
> - return true;
> - if (mp->m_logdev_targp &&
> - fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev))
> - return true;
> - if (mp->m_rtdev_targp &&
> - fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev))
> - return true;
> - return false;
> + return fm->fmr_device == 0 ||
> + fm->fmr_device == UINT_MAX ||
> + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) ||
> + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) ||
> + (mp->m_rtdev_targp &&
> + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT));
> }
>
> /* Ensure that the low key is less than the high key. */
> @@ -1126,7 +1166,7 @@ xfs_getfsmap(
> /* Set up our device handlers. */
> memset(handlers, 0, sizeof(handlers));
> handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
> - handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
> + handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA);
> if (use_rmap)
> handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
> else
> @@ -1134,7 +1174,7 @@ xfs_getfsmap(
> if (mp->m_logdev_targp != mp->m_ddev_targp) {
> handlers[1].nr_sectors = XFS_FSB_TO_BB(mp,
> mp->m_sb.sb_logblocks);
> - handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
> + handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG);
> handlers[1].fn = xfs_getfsmap_logdev;
> }
> #ifdef CONFIG_XFS_RT
> @@ -1144,7 +1184,7 @@ xfs_getfsmap(
> */
> if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) {
> handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
> - handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
> + handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT);
> if (use_rmap)
> handlers[2].fn = xfs_getfsmap_rtdev_rmapbt;
> else
> @@ -1234,7 +1274,13 @@ xfs_getfsmap(
>
> if (tp)
> xfs_trans_cancel(tp);
> - head->fmh_oflags = FMH_OF_DEV_T;
> +
> + /*
> + * For internal RT device we need to report different synthetic devices
> + * for a single physical device, and thus can't report the actual dev_t.
> + */
> + if (!mp->m_sb.sb_rtstart)
> + head->fmh_oflags = FMH_OF_DEV_T;
> return error;
> }
>
> --
> 2.45.2
>
>
^ permalink raw reply	[flat|nested] 143+ messages in thread
* Re: [PATCH 35/43] xfs: enable fsmap reporting for internal RT devices
2024-12-13 23:11 ` Darrick J. Wong
@ 2024-12-15 6:26 ` Christoph Hellwig
2024-12-17 17:06 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 6:26 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 03:11:15PM -0800, Darrick J. Wong wrote:
> > + /* Fabricate an rmap entry for space occupied by the data dev */
> > + error = xfs_getfsmap_helper(tp, info, &frec);
> > + if (error)
> > + return error;
>
> Seeing as you report different fmr_device values for the data and rt
> devices, I'd have thought that you'd want the rt fsmappings to start at
> fmr_physical == 0. But then I guess for the sb_rtstart > 0 case, the
> rtblock values that get written into the bmbt have that rtstart value
> added in, don't they?
The bmbt values are all relative to rtstart, the daddr translation is what
adds the offset. So if we want to take the offset out of the fsmap
reporting, I'll need new helpers to not add it or manually subtract it
afterwards. If that's preferred it should be doable, even if the fsmap
code keeps confusing me more each time I look at it.
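A simplified sketch of the translation described above, for the 1:1 (zone gap) layout only; the real xfs_rtb_to_daddr also handles the packed power-of-2 rtgroup case:

/*
 * bmbt extents are rtstart-relative; only the daddr conversion adds
 * the device offset back in.
 */
static xfs_daddr_t
sketch_rtb_to_daddr(
	struct xfs_mount	*mp,
	xfs_rtblock_t		rtbno)
{
	return XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + rtbno);
}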
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 35/43] xfs: enable fsmap reporting for internal RT devices
2024-12-15 6:26 ` Christoph Hellwig
@ 2024-12-17 17:06 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-17 17:06 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Sun, Dec 15, 2024 at 07:26:13AM +0100, Christoph Hellwig wrote:
> On Fri, Dec 13, 2024 at 03:11:15PM -0800, Darrick J. Wong wrote:
> > > + /* Fabricate an rmap entry for space occupied by the data dev */
> > > + error = xfs_getfsmap_helper(tp, info, &frec);
> > > + if (error)
> > > + return error;
> >
> > Seeing as you report different fmr_device values for the data and rt
> > devices, I'd have thought that you'd want the rt fsmappings to start at
> > fmr_physical == 0. But then I guess for the sb_rtstart > 0 case, the
> > rtblock values that get written into the bmbt have that rtstart value
> > added in, don't they?
>
> The bmbt values are all relative to rtstart, the daddr translation is what
> adds the offset. So if we want to take the offset out of the fsmap
> reporting, I'll need new helpers to not add it or manually subtract it
> afterwards. If that's preferred it should be doable, even if the fsmap
> code keeps confusing me more each time I look at it.
I think it's ok if you can leave it as it is. Once you move to
"virtual" fmr_device numbers (aka not a dev_t) then it's up to you to
define how the fmr_physical address space works. It's no longer a
reference to a block device that you can open/pread/etc.
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 36/43] xfs: disable reflink for zoned file systems
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (34 preceding siblings ...)
2024-12-11 8:55 ` [PATCH 35/43] xfs: enable fsmap reporting for internal RT devices Christoph Hellwig
@ 2024-12-11 8:55 ` Christoph Hellwig
2024-12-13 23:12 ` Darrick J. Wong
2024-12-11 8:55 ` [PATCH 37/43] xfs: disable rt quotas " Christoph Hellwig
` (6 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:55 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
While the zoned on-disk format supports reflinks, the GC code currently
always unshares reflinks when moving blocks to new zones, thus making the
feature unusable. Disable reflinks until the GC code is refcount aware.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_super.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 59998aac7ed7..690bb068a23a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1818,6 +1818,13 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
+ if (xfs_has_zoned(mp)) {
+ xfs_alert(mp,
+ "reflink not compatible with zoned RT device!");
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+
/*
* always-cow mode is not supported on filesystems with rt
* extent sizes larger than a single block because we'd have
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 36/43] xfs: disable reflink for zoned file systems
2024-12-11 8:55 ` [PATCH 36/43] xfs: disable reflink for zoned file systems Christoph Hellwig
@ 2024-12-13 23:12 ` Darrick J. Wong
2024-12-15 6:26 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 23:12 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:55:01AM +0100, Christoph Hellwig wrote:
> While the zoned on-disk format supports reflinks, the GC code currently
> always unshares reflinks when moving blocks to new zones, thus making the
> feature unusable. Disable reflinks until the GC code is refcount aware.
This goes back to the question I had in the gc patch -- can we let
userspace do its own reflink-aware freespace copygc, and only use the
in-kernel gc if userspace doesn't respond fast enough? I imagine
someone will want to share used blocks on zoned storage at some point.
--D
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_super.c | 7 +++++++
> 1 file changed, 7 insertions(+)
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 59998aac7ed7..690bb068a23a 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1818,6 +1818,13 @@ xfs_fs_fill_super(
> goto out_filestream_unmount;
> }
>
> + if (xfs_has_zoned(mp)) {
> + xfs_alert(mp,
> + "reflink not compatible with zoned RT device!");
> + error = -EINVAL;
> + goto out_filestream_unmount;
> + }
> +
> /*
> * always-cow mode is not supported on filesystems with rt
> * extent sizes larger than a single block because we'd have
> --
> 2.45.2
>
>
^ permalink raw reply	[flat|nested] 143+ messages in thread
* Re: [PATCH 36/43] xfs: disable reflink for zoned file systems
2024-12-13 23:12 ` Darrick J. Wong
@ 2024-12-15 6:26 ` Christoph Hellwig
2024-12-17 17:10 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 6:26 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 03:12:47PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:55:01AM +0100, Christoph Hellwig wrote:
> > While the zoned on-disk format supports reflinks, the GC code currently
> > always unshares reflinks when moving blocks to new zones, thus making the
> > feature unusable. Disable reflinks until the GC code is refcount aware.
>
> This goes back to the question I had in the gc patch -- can we let
> userspace do its own reflink-aware freespace copygc, and only use the
> in-kernel gc if userspace doesn't respond fast enough? I imagine
> someone will want to share used blocks on zoned storage at some point.
I'm pretty sure we could, if we're willing to deal with worse decision
making, worse performance and potential for deadlocks while dealing with
a bigger and more complicated code base. But why?
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 36/43] xfs: disable reflink for zoned file systems
2024-12-15 6:26 ` Christoph Hellwig
@ 2024-12-17 17:10 ` Darrick J. Wong
2024-12-18 7:09 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-17 17:10 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Sun, Dec 15, 2024 at 07:26:54AM +0100, Christoph Hellwig wrote:
> On Fri, Dec 13, 2024 at 03:12:47PM -0800, Darrick J. Wong wrote:
> > On Wed, Dec 11, 2024 at 09:55:01AM +0100, Christoph Hellwig wrote:
> > > While the zoned on-disk format supports reflinks, the GC code currently
> > > always unshares reflinks when moving blocks to new zones, thus making the
> > > feature unusable. Disable reflinks until the GC code is refcount aware.
> >
> > This goes back to the question I had in the gc patch -- can we let
> > userspace do its own reflink-aware freespace copygc, and only use the
> > in-kernel gc if userspace doesn't respond fast enough? I imagine
> > someone will want to share used blocks on zoned storage at some point.
>
> I'm pretty sure we could, if we're willing to deal with worse decision
> making, worse performance and potential for deadlocks while dealing with
> a bigger and more complicated code base. But why?
Mostly intellectual curiosity on my part about self-reorganizing
filesystems. The zonegc you've already written is good enough for now,
though the no-reflink requirement feels a bit onerous.
But hey, it's not like I have numbers showing that a userspace
copy-dedupe gc strategy is any better, so I'll not hold up this whole
series on account of that.
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 36/43] xfs: disable reflink for zoned file systems
2024-12-17 17:10 ` Darrick J. Wong
@ 2024-12-18 7:09 ` Christoph Hellwig
2024-12-18 18:16 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-18 7:09 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Tue, Dec 17, 2024 at 09:10:55AM -0800, Darrick J. Wong wrote:
> Mostly intellectual curiosity on my part about self-reorganizing
> filesystems. The zonegc you've already written is good enough for now,
> though the no-reflink requirement feels a bit onerous.
The no-reflink is mostly because we want a minimum viable merge candidate,
and our initial uses for things like lsm databases and objects stores
don't strongly need it. I hope to add reflink support ~ 2 or 3 merge
windows after the initial code.
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 36/43] xfs: disable reflink for zoned file systems
2024-12-18 7:09 ` Christoph Hellwig
@ 2024-12-18 18:16 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-18 18:16 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 18, 2024 at 08:09:34AM +0100, Christoph Hellwig wrote:
> On Tue, Dec 17, 2024 at 09:10:55AM -0800, Darrick J. Wong wrote:
> > Mostly intellectual curiosity on my part about self-reorganizing
> > filesystems. The zonegc you've already written is good enough for now,
> > though the no-reflink requirement feels a bit onerous.
>
> The no-reflink is mostly because we want a minimum viable merge candidate,
> and our initial uses for things like lsm databases and objects stores
> don't strongly need it. I hope to add reflink support ~ 2 or 3 merge
> windows after the initial code.
<nod>
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 37/43] xfs: disable rt quotas for zoned file systems
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (35 preceding siblings ...)
2024-12-11 8:55 ` [PATCH 36/43] xfs: disable reflink for zoned file systems Christoph Hellwig
@ 2024-12-11 8:55 ` Christoph Hellwig
2024-12-13 23:05 ` Darrick J. Wong
2024-12-11 8:55 ` [PATCH 38/43] xfs: enable the zoned RT device feature Christoph Hellwig
` (5 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:55 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
They'll need a little more work.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_qm.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index e1ba5af6250f..417439b58785 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1711,7 +1711,8 @@ xfs_qm_mount_quotas(
* immediately. We only support rtquota if rtgroups are enabled to
* avoid problems with older kernels.
*/
- if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) {
+ if (mp->m_sb.sb_rextents &&
+ (!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) {
xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
mp->m_qflags = 0;
goto write_changes;
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 37/43] xfs: disable rt quotas for zoned file systems
2024-12-11 8:55 ` [PATCH 37/43] xfs: disable rt quotas " Christoph Hellwig
@ 2024-12-13 23:05 ` Darrick J. Wong
2024-12-15 6:21 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 23:05 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:55:02AM +0100, Christoph Hellwig wrote:
> They'll need a little more work.
I guess we'll have to get back to this... :/
--D
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_qm.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> index e1ba5af6250f..417439b58785 100644
> --- a/fs/xfs/xfs_qm.c
> +++ b/fs/xfs/xfs_qm.c
> @@ -1711,7 +1711,8 @@ xfs_qm_mount_quotas(
> * immediately. We only support rtquota if rtgroups are enabled to
> * avoid problems with older kernels.
> */
> - if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) {
> + if (mp->m_sb.sb_rextents &&
> + (!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) {
> xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
> mp->m_qflags = 0;
> goto write_changes;
> --
> 2.45.2
>
>
^ permalink raw reply	[flat|nested] 143+ messages in thread
* Re: [PATCH 37/43] xfs: disable rt quotas for zoned file systems
2024-12-13 23:05 ` Darrick J. Wong
@ 2024-12-15 6:21 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 6:21 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 03:05:03PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:55:02AM +0100, Christoph Hellwig wrote:
> > They'll need a little more work.
>
> I guess we'll have to get back to this... :/
Yes. I've always been wanting to implement it, but not with the
very highest priority. The fact that you pulled up the RT quota series
to earlier in your patch stack threw a little monkey wrench here :)
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 38/43] xfs: enable the zoned RT device feature
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (36 preceding siblings ...)
2024-12-11 8:55 ` [PATCH 37/43] xfs: disable rt quotas " Christoph Hellwig
@ 2024-12-11 8:55 ` Christoph Hellwig
2024-12-13 22:52 ` Darrick J. Wong
2024-12-11 8:55 ` [PATCH 39/43] xfs: support zone gaps Christoph Hellwig
` (4 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:55 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Enable the zoned RT device feature. With this feature, RT
groups are written sequentially and always emptied before rewriting
the blocks. This perfectly maps to zoned devices, but can also be
used on conventional block devices.
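For context, this one-line change is what gates old kernels out: the superblock verifier already rejects any incompat bit it does not know about. A sketch of that existing gate (message text paraphrased, not quoted):

/* any bit outside XFS_SB_FEAT_INCOMPAT_ALL falls into _UNKNOWN */
if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
	xfs_warn(mp,
 "Superblock has unknown incompatible features enabled.");
	return -EINVAL;
}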
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_format.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 12979496f30a..fc56de8fe696 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -408,7 +408,8 @@ xfs_sb_has_ro_compat_feature(
XFS_SB_FEAT_INCOMPAT_NREXT64 | \
XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
XFS_SB_FEAT_INCOMPAT_PARENT | \
- XFS_SB_FEAT_INCOMPAT_METADIR)
+ XFS_SB_FEAT_INCOMPAT_METADIR | \
+ XFS_SB_FEAT_INCOMPAT_ZONED)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 38/43] xfs: enable the zoned RT device feature
2024-12-11 8:55 ` [PATCH 38/43] xfs: enable the zoned RT device feature Christoph Hellwig
@ 2024-12-13 22:52 ` Darrick J. Wong
2024-12-15 6:15 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 22:52 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:55:03AM +0100, Christoph Hellwig wrote:
> Enable the zoned RT device feature. With this feature, RT
> groups are written sequentially and always emptied before rewriting
> the blocks. This perfectly maps to zoned devices, but can also be
> used on conventional block devices.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Looks ok, though it's a bit odd that this isn't the very end of the
series.
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/libxfs/xfs_format.h | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index 12979496f30a..fc56de8fe696 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -408,7 +408,8 @@ xfs_sb_has_ro_compat_feature(
> XFS_SB_FEAT_INCOMPAT_NREXT64 | \
> XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
> XFS_SB_FEAT_INCOMPAT_PARENT | \
> - XFS_SB_FEAT_INCOMPAT_METADIR)
> + XFS_SB_FEAT_INCOMPAT_METADIR | \
> + XFS_SB_FEAT_INCOMPAT_ZONED)
>
> #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
> static inline bool
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 38/43] xfs: enable the zoned RT device feature
2024-12-13 22:52 ` Darrick J. Wong
@ 2024-12-15 6:15 ` Christoph Hellwig
0 siblings, 0 replies; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 6:15 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 02:52:45PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:55:03AM +0100, Christoph Hellwig wrote:
> > Enable the zoned RT device feature. With this feature, RT
> > groups are written sequentially and always emptied before rewriting
> > the blocks. This perfectly maps to zoned devices, but can also be
> > used on conventional block devices.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Looks ok, though it's a bit odd that this isn't the very end of the
> series.
The rest of the series adds another on-disk feature built on top of
this (the zone gaps) and new in-memory only features (debug output
in /proc/self/mountstats and hint based data placement). So I tried
to keep the bracket of adding the first bits for a new feature and
enabling it as small as possible (it's already pretty large anyway).
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 39/43] xfs: support zone gaps
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (37 preceding siblings ...)
2024-12-11 8:55 ` [PATCH 38/43] xfs: enable the zoned RT device feature Christoph Hellwig
@ 2024-12-11 8:55 ` Christoph Hellwig
2024-12-13 22:55 ` Darrick J. Wong
2024-12-11 8:55 ` [PATCH 40/43] xfs: add a max_open_zones mount option Christoph Hellwig
` (3 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:55 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Zoned devices can have gaps between the usable capacity of a zone and its
end in the LBA/daddr address space. In other words, the hardware
equivalent to the RT groups already takes care of the power of 2
alignment for us. In this case the sparse FSB/RTB address space maps 1:1
to the device address space.
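A worked sketch with invented numbers: take zones whose hardware size is 65536 FSBs (so g->blklog = 16) but whose usable capacity is only 49152 FSBs (g->blocks):

/*
 * With ZONE_GAPS set the group stride stays the full zone size, so
 * rtgroup N begins at FSB (N << 16) and group-relative blocks
 * 49152..65535 of every group are an unaddressable gap.
 */
static xfs_fsblock_t
sketch_gbno_to_fsb(
	struct xfs_groups	*g,	/* g->blklog == 16 */
	xfs_rgnumber_t		rgno,
	xfs_rgblock_t		gbno)	/* gbno < g->blocks == 49152 */
{
	return ((xfs_fsblock_t)rgno << g->blklog) | gbno;
}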
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/libxfs/xfs_format.h | 4 +++-
fs/xfs/libxfs/xfs_group.h | 6 +++++-
fs/xfs/libxfs/xfs_rtgroup.h | 13 ++++++++-----
fs/xfs/libxfs/xfs_sb.c | 3 +++
fs/xfs/libxfs/xfs_zones.c | 19 +++++++++++++++++--
fs/xfs/xfs_mount.h | 9 +++++++++
6 files changed, 45 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index fc56de8fe696..9491a09f6aa7 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -398,6 +398,7 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */
#define XFS_SB_FEAT_INCOMPAT_ZONED (1U << 31)/* zoned RT allocator */
+#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1U << 30)/* RTGs have LBA gaps */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE | \
@@ -409,7 +410,8 @@ xfs_sb_has_ro_compat_feature(
XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
XFS_SB_FEAT_INCOMPAT_PARENT | \
XFS_SB_FEAT_INCOMPAT_METADIR | \
- XFS_SB_FEAT_INCOMPAT_ZONED)
+ XFS_SB_FEAT_INCOMPAT_ZONED | \
+ XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h
index 430a43e1591e..996b29313bc2 100644
--- a/fs/xfs/libxfs/xfs_group.h
+++ b/fs/xfs/libxfs/xfs_group.h
@@ -117,7 +117,11 @@ xfs_gbno_to_daddr(
struct xfs_groups *g = &mp->m_groups[xg->xg_type];
xfs_fsblock_t fsbno;
- fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;
+ if (g->has_daddr_gaps)
+ fsbno = xfs_gbno_to_fsb(xg, gbno);
+ else
+ fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;
+
return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno);
}
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 85d8d329d417..5b3305e09ec3 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -245,11 +245,14 @@ xfs_rtb_to_daddr(
xfs_rtblock_t rtbno)
{
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
- xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
- uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks;
- return XFS_FSB_TO_BB(mp,
- g->start_fsb + start_bno + (rtbno & g->blkmask));
+ if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
+ xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
+
+ rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask);
+ }
+
+ return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno);
}
static inline xfs_rtblock_t
@@ -261,7 +264,7 @@ xfs_daddr_to_rtb(
xfs_rfsblock_t bno;
bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb;
- if (xfs_has_rtgroups(mp)) {
+ if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
xfs_rgnumber_t rgno;
uint32_t rgbno;
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index ee56fc22fd06..18e4c4908f94 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1176,6 +1176,9 @@ xfs_sb_mount_rextsize(
rgs->blklog = mp->m_sb.sb_rgblklog;
rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog);
rgs->start_fsb = mp->m_sb.sb_rtstart;
+ if (xfs_sb_has_incompat_feature(sbp,
+ XFS_SB_FEAT_INCOMPAT_ZONE_GAPS))
+ rgs->has_daddr_gaps = true;
} else {
rgs->blocks = 0;
rgs->blklog = 0;
diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c
index e170d7c13533..c17111f40821 100644
--- a/fs/xfs/libxfs/xfs_zones.c
+++ b/fs/xfs/libxfs/xfs_zones.c
@@ -135,6 +135,7 @@ xfs_zone_validate(
{
struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+ uint32_t expected_size;
/*
* Check that the zone capacity matches the rtgroup size stored in the
@@ -149,11 +150,25 @@ xfs_zone_validate(
return -EIO;
}
- if (XFS_BB_TO_FSB(mp, zone->len) != 1 << g->blklog) {
+ if (g->has_daddr_gaps) {
+ expected_size = 1 << g->blklog;
+ } else {
+ if (zone->len != zone->capacity) {
+ xfs_warn(mp,
+"zone %u has capacity != size ((0x%llx vs 0x%llx)",
+ rtg_rgno(rtg),
+ XFS_BB_TO_FSB(mp, zone->len),
+ XFS_BB_TO_FSB(mp, zone->capacity));
+ return -EIO;
+ }
+ expected_size = g->blocks;
+ }
+
+ if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) {
xfs_warn(mp,
"zone %u length (0x%llx) does match geometry (0x%x).",
rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
- 1 << g->blklog);
+ expected_size);
}
switch (zone->type) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 831d9e09fe72..ec8612c8b71d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -97,6 +97,15 @@ struct xfs_groups {
*/
uint8_t blklog;
+ /*
+ * Zoned devices can have gaps between the usable capacity of a zone and
+ * its end in the LBA/daddr address space. In other words, the hardware
+ * equivalent to the RT groups already takes care of the power of 2
+ * alignment for us. In this case the sparse FSB/RTB address space maps
+ * 1:1 to the device address space.
+ */
+ bool has_daddr_gaps;
+
/*
* Mask to extract the group-relative block number from a FSB.
* For a pre-rtgroups filesystem we pretend to have one very large
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 39/43] xfs: support zone gaps
2024-12-11 8:55 ` [PATCH 39/43] xfs: support zone gaps Christoph Hellwig
@ 2024-12-13 22:55 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 22:55 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:55:04AM +0100, Christoph Hellwig wrote:
> Zoned devices can have gaps between the usable capacity of a zone and its
> end in the LBA/daddr address space. In other words, the hardware
> equivalent to the RT groups already takes care of the power of 2
> alignment for us. In this case the sparse FSB/RTB address space maps 1:1
> to the device address space.
Heh, sparse lba ranges.
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/libxfs/xfs_format.h | 4 +++-
> fs/xfs/libxfs/xfs_group.h | 6 +++++-
> fs/xfs/libxfs/xfs_rtgroup.h | 13 ++++++++-----
> fs/xfs/libxfs/xfs_sb.c | 3 +++
> fs/xfs/libxfs/xfs_zones.c | 19 +++++++++++++++++--
> fs/xfs/xfs_mount.h | 9 +++++++++
> 6 files changed, 45 insertions(+), 9 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index fc56de8fe696..9491a09f6aa7 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -398,6 +398,7 @@ xfs_sb_has_ro_compat_feature(
> #define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
> #define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */
> #define XFS_SB_FEAT_INCOMPAT_ZONED (1U << 31)/* zoned RT allocator */
> +#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1U << 30)/* RTGs have LBA gaps */
These will have to be renumbered before merging.
Otherwise looks ok,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> #define XFS_SB_FEAT_INCOMPAT_ALL \
> (XFS_SB_FEAT_INCOMPAT_FTYPE | \
> @@ -409,7 +410,8 @@ xfs_sb_has_ro_compat_feature(
> XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
> XFS_SB_FEAT_INCOMPAT_PARENT | \
> XFS_SB_FEAT_INCOMPAT_METADIR | \
> - XFS_SB_FEAT_INCOMPAT_ZONED)
> + XFS_SB_FEAT_INCOMPAT_ZONED | \
> + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)
>
> #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
> static inline bool
> diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h
> index 430a43e1591e..996b29313bc2 100644
> --- a/fs/xfs/libxfs/xfs_group.h
> +++ b/fs/xfs/libxfs/xfs_group.h
> @@ -117,7 +117,11 @@ xfs_gbno_to_daddr(
> struct xfs_groups *g = &mp->m_groups[xg->xg_type];
> xfs_fsblock_t fsbno;
>
> - fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;
> + if (g->has_daddr_gaps)
> + fsbno = xfs_gbno_to_fsb(xg, gbno);
> + else
> + fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;
> +
> return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno);
> }
>
> diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
> index 85d8d329d417..5b3305e09ec3 100644
> --- a/fs/xfs/libxfs/xfs_rtgroup.h
> +++ b/fs/xfs/libxfs/xfs_rtgroup.h
> @@ -245,11 +245,14 @@ xfs_rtb_to_daddr(
> xfs_rtblock_t rtbno)
> {
> struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
> - xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
> - uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks;
>
> - return XFS_FSB_TO_BB(mp,
> - g->start_fsb + start_bno + (rtbno & g->blkmask));
> + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
> + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
> +
> + rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask);
> + }
> +
> + return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno);
> }
>
> static inline xfs_rtblock_t
> @@ -261,7 +264,7 @@ xfs_daddr_to_rtb(
> xfs_rfsblock_t bno;
>
> bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb;
> - if (xfs_has_rtgroups(mp)) {
> + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
> xfs_rgnumber_t rgno;
> uint32_t rgbno;
>
> diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
> index ee56fc22fd06..18e4c4908f94 100644
> --- a/fs/xfs/libxfs/xfs_sb.c
> +++ b/fs/xfs/libxfs/xfs_sb.c
> @@ -1176,6 +1176,9 @@ xfs_sb_mount_rextsize(
> rgs->blklog = mp->m_sb.sb_rgblklog;
> rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog);
> rgs->start_fsb = mp->m_sb.sb_rtstart;
> + if (xfs_sb_has_incompat_feature(sbp,
> + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS))
> + rgs->has_daddr_gaps = true;
> } else {
> rgs->blocks = 0;
> rgs->blklog = 0;
> diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c
> index e170d7c13533..c17111f40821 100644
> --- a/fs/xfs/libxfs/xfs_zones.c
> +++ b/fs/xfs/libxfs/xfs_zones.c
> @@ -135,6 +135,7 @@ xfs_zone_validate(
> {
> struct xfs_mount *mp = rtg_mount(rtg);
> struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
> + uint32_t expected_size;
>
> /*
> * Check that the zone capacity matches the rtgroup size stored in the
> @@ -149,11 +150,25 @@ xfs_zone_validate(
> return -EIO;
> }
>
> - if (XFS_BB_TO_FSB(mp, zone->len) != 1 << g->blklog) {
> + if (g->has_daddr_gaps) {
> + expected_size = 1 << g->blklog;
> + } else {
> + if (zone->len != zone->capacity) {
> + xfs_warn(mp,
> +"zone %u has capacity != size ((0x%llx vs 0x%llx)",
> + rtg_rgno(rtg),
> + XFS_BB_TO_FSB(mp, zone->len),
> + XFS_BB_TO_FSB(mp, zone->capacity));
> + return -EIO;
> + }
> + expected_size = g->blocks;
> + }
> +
> + if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) {
> xfs_warn(mp,
> "zone %u length (0x%llx) does match geometry (0x%x).",
> rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
> - 1 << g->blklog);
> + expected_size);
> }
>
> switch (zone->type) {
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 831d9e09fe72..ec8612c8b71d 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -97,6 +97,15 @@ struct xfs_groups {
> */
> uint8_t blklog;
>
> + /*
> + * Zoned devices can have gaps between the usable capacity of a zone and
> + * its end in the LBA/daddr address space. In other words, the hardware
> + * equivalent to the RT groups already takes care of the power of 2
> + * alignment for us. In this case the sparse FSB/RTB address space maps
> + * 1:1 to the device address space.
> + */
> + bool has_daddr_gaps;
> +
> /*
> * Mask to extract the group-relative block number from a FSB.
> * For a pre-rtgroups filesystem we pretend to have one very large
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 40/43] xfs: add a max_open_zones mount option
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (38 preceding siblings ...)
2024-12-11 8:55 ` [PATCH 39/43] xfs: support zone gaps Christoph Hellwig
@ 2024-12-11 8:55 ` Christoph Hellwig
2024-12-13 22:57 ` Darrick J. Wong
2024-12-11 8:55 ` [PATCH 41/43] xfs: support write life time based data placement Christoph Hellwig
` (2 subsequent siblings)
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:55 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
Allow limiting the number of open zones used below that exported by the
device. This is required to tune the number of write streams when zoned
RT devices are used on conventional devices, and can be useful on zoned
devices that support a very large number of open zones.
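The intended semantics, sketched below; the actual validation lives in the zone allocator setup elsewhere in the series, so take the details as assumptions. The option can only tighten the limit, never exceed what the hardware reports:

unsigned int max_open = bdev_max_open_zones(mp->m_rtdev_targp->bt_bdev);

/* 0 from the device means no hardware limit (conventional devices) */
if (mp->m_max_open_zones)
	max_open = min_not_zero(max_open, mp->m_max_open_zones);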
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_super.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 690bb068a23a..e24f6a608b91 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -110,7 +110,7 @@ enum {
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
- Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
+ Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
};
static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -155,6 +155,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("nodiscard", Opt_nodiscard),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
+ fsparam_u32("max_open_zones", Opt_max_open_zones),
{}
};
@@ -234,6 +235,9 @@ xfs_fs_show_options(
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
+ if (mp->m_max_open_zones)
+ seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
+
return 0;
}
@@ -1456,6 +1460,9 @@ xfs_fs_parse_param(
xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
parsing_mp->m_features |= XFS_FEAT_NOATTR2;
return 0;
+ case Opt_max_open_zones:
+ parsing_mp->m_max_open_zones = result.uint_32;
+ return 0;
default:
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
--
2.45.2
^ permalink raw reply related	[flat|nested] 143+ messages in thread
* Re: [PATCH 40/43] xfs: add a max_open_zones mount option
2024-12-11 8:55 ` [PATCH 40/43] xfs: add a max_open_zones mount option Christoph Hellwig
@ 2024-12-13 22:57 ` Darrick J. Wong
2024-12-15 6:16 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 22:57 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:55:05AM +0100, Christoph Hellwig wrote:
> Allow limiting the number of open zones used below that exported by the
> device. This is required to tune the number of write streams when zoned
> RT devices are used on conventional devices, and can be useful on zoned
> devices that support a very large number of open zones.
Can this be changed during a remount operation? Do we have to
revalidate the value?
--D
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_super.c | 9 ++++++++-
> 1 file changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 690bb068a23a..e24f6a608b91 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -110,7 +110,7 @@ enum {
> Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
> Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
> Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
> - Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
> + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
> };
>
> static const struct fs_parameter_spec xfs_fs_parameters[] = {
> @@ -155,6 +155,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
> fsparam_flag("nodiscard", Opt_nodiscard),
> fsparam_flag("dax", Opt_dax),
> fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
> + fsparam_u32("max_open_zones", Opt_max_open_zones),
> {}
> };
>
> @@ -234,6 +235,9 @@ xfs_fs_show_options(
> if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
> seq_puts(m, ",noquota");
>
> + if (mp->m_max_open_zones)
> + seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
> +
> return 0;
> }
>
> @@ -1456,6 +1460,9 @@ xfs_fs_parse_param(
> xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
> parsing_mp->m_features |= XFS_FEAT_NOATTR2;
> return 0;
> + case Opt_max_open_zones:
> + parsing_mp->m_max_open_zones = result.uint_32;
> + return 0;
> default:
> xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
> return -EINVAL;
> --
> 2.45.2
>
>
^ permalink raw reply	[flat|nested] 143+ messages in thread
* Re: [PATCH 40/43] xfs: add a max_open_zones mount option
2024-12-13 22:57 ` Darrick J. Wong
@ 2024-12-15 6:16 ` Christoph Hellwig
2024-12-17 17:12 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 6:16 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 02:57:11PM -0800, Darrick J. Wong wrote:
> On Wed, Dec 11, 2024 at 09:55:05AM +0100, Christoph Hellwig wrote:
> > Allow limiting the number of open zones used below that exported by the
> > device. This is required to tune the number of write streams when zoned
> > RT devices are used on conventional devices, and can be useful on zoned
> > devices that support a very large number of open zones.
>
> Can this be changed during a remount operation? Do we have to
> revalidate the value?
Right, no, it can't be changed during remount as there is no code added for
it in xfs_fs_reconfigure. If a strong use case to change it shows up
we could support it, but it's going to require some nasty code especially
for reducing the limit, so I'd rather not do it unless I have to.
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 40/43] xfs: add a max_open_zones mount option
2024-12-15 6:16 ` Christoph Hellwig
@ 2024-12-17 17:12 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-17 17:12 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Sun, Dec 15, 2024 at 07:16:44AM +0100, Christoph Hellwig wrote:
> On Fri, Dec 13, 2024 at 02:57:11PM -0800, Darrick J. Wong wrote:
> > On Wed, Dec 11, 2024 at 09:55:05AM +0100, Christoph Hellwig wrote:
> > > Allow limiting the number of open zones used below that exported by the
> > > device. This is required to tune the number of write streams when zoned
> > > RT devices are used on conventional devices, and can be useful on zoned
> > > devices that support a very large number of open zones.
> >
> > Can this be changed during a remount operation? Do we have to
> > revalidate the value?
>
> Right no it can't be changed during remount as there is no code added for
> it in xfs_fs_reconfigure. If a strong use case to change it shows up
> we could support it, but it's going to require some nasty code especially
> for reducing the limit, so I'd rather not do it unless I have to.
Nah let's wait until someone actually gives us a use case.
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 41/43] xfs: support write life time based data placement
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (39 preceding siblings ...)
2024-12-11 8:55 ` [PATCH 40/43] xfs: add a max_open_zones mount option Christoph Hellwig
@ 2024-12-11 8:55 ` Christoph Hellwig
2024-12-13 23:00 ` Darrick J. Wong
2024-12-11 8:55 ` [PATCH 42/43] xfs: wire up the show_stats super operation Christoph Hellwig
2024-12-11 8:55 ` [PATCH 43/43] xfs: export zone stats in /proc/*/mountstats Christoph Hellwig
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:55 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
From: Hans Holmberg <hans.holmberg@wdc.com>
Add a file write life time data placement allocation scheme that aims to
minimize fragmentation and thereby to do two things:
a) separate file data to different zones when possible.
b) colocate file data of similar life times when feasible.
To get best results, average file sizes should align with the zone
capacity that is reported through the XFS_IOC_FSGEOMETRY ioctl.
For RocksDB using leveled compaction, the lifetime hints can improve
throughput for overwrite workloads at 80% file system utilization by
~10%.
Lifetime hints can be disabled using the nolifetime mount option.
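The hints themselves come from the generic per-file write hint interface, so no new ioctl is involved; a minimal userspace example (assumes Linux 4.13+ for F_SET_RW_HINT):

#include <fcntl.h>
#include <linux/fcntl.h>
#include <stdint.h>

/* mark the file's data as short-lived so the allocator colocates it */
static int
set_short_lifetime(int fd)
{
	uint64_t hint = RWH_WRITE_LIFE_SHORT;

	return fcntl(fd, F_SET_RW_HINT, &hint);
}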
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_mount.h | 2 +
fs/xfs/xfs_super.c | 10 ++++
fs/xfs/xfs_zone_alloc.c | 126 +++++++++++++++++++++++++++++++++++-----
fs/xfs/xfs_zone_gc.c | 2 +-
fs/xfs/xfs_zone_priv.h | 9 ++-
5 files changed, 134 insertions(+), 15 deletions(-)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index ec8612c8b71d..748b7a7da407 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -365,6 +365,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */
/* Mount features */
+#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
@@ -420,6 +421,7 @@ __XFS_HAS_FEAT(large_extent_counts, NREXT64)
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
__XFS_HAS_FEAT(metadir, METADIR)
__XFS_HAS_FEAT(zoned, ZONED)
+__XFS_HAS_FEAT(nolifetime, NOLIFETIME)
static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
{
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e24f6a608b91..d2f2fa26c487 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -111,6 +111,7 @@ enum {
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
+ Opt_lifetime, Opt_nolifetime,
};
static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -156,6 +157,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
fsparam_u32("max_open_zones", Opt_max_open_zones),
+ fsparam_flag("lifetime", Opt_lifetime),
+ fsparam_flag("nolifetime", Opt_nolifetime),
{}
};
@@ -184,6 +187,7 @@ xfs_fs_show_options(
{ XFS_FEAT_LARGE_IOSIZE, ",largeio" },
{ XFS_FEAT_DAX_ALWAYS, ",dax=always" },
{ XFS_FEAT_DAX_NEVER, ",dax=never" },
+ { XFS_FEAT_NOLIFETIME, ",nolifetime" },
{ 0, NULL }
};
struct xfs_mount *mp = XFS_M(root->d_sb);
@@ -1463,6 +1467,12 @@ xfs_fs_parse_param(
case Opt_max_open_zones:
parsing_mp->m_max_open_zones = result.uint_32;
return 0;
+ case Opt_lifetime:
+ parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME;
+ return 0;
+ case Opt_nolifetime:
+ parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
+ return 0;
default:
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 291cf39a5989..2f362da0d31c 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -381,6 +381,7 @@ static struct xfs_open_zone *
xfs_init_open_zone(
struct xfs_rtgroup *rtg,
xfs_rgblock_t write_pointer,
+ enum rw_hint write_hint,
bool is_gc)
{
struct xfs_open_zone *oz;
@@ -391,6 +392,7 @@ xfs_init_open_zone(
oz->oz_rtg = rtg;
oz->oz_write_pointer = write_pointer;
oz->oz_written = write_pointer;
+ oz->oz_write_hint = write_hint;
oz->oz_is_gc = is_gc;
/*
@@ -407,6 +409,7 @@ xfs_init_open_zone(
struct xfs_open_zone *
xfs_open_zone(
struct xfs_mount *mp,
+ enum rw_hint write_hint,
bool is_gc)
{
struct xfs_zone_info *zi = mp->m_zone_info;
@@ -422,7 +425,7 @@ xfs_open_zone(
xfs_group_clear_mark(xg, XFS_RTG_FREE);
atomic_dec(&zi->zi_nr_free_zones);
zi->zi_free_zone_cursor = xg->xg_gno;
- return xfs_init_open_zone(to_rtg(xg), 0, is_gc);
+ return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc);
}
/*
@@ -434,7 +437,8 @@ xfs_open_zone(
*/
static struct xfs_open_zone *
xfs_activate_zone(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ enum rw_hint write_hint)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_open_zone *oz;
@@ -443,7 +447,7 @@ xfs_activate_zone(
XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
return NULL;
- oz = xfs_open_zone(mp, false);
+ oz = xfs_open_zone(mp, write_hint, false);
if (!oz)
return NULL;
@@ -460,16 +464,78 @@ xfs_activate_zone(
return oz;
}
+/*
+ * For data with short or medium lifetime, try to colocate it into an
+ * already open zone with a matching temperature.
+ */
+static bool
+xfs_colocate_eagerly(
+ enum rw_hint file_hint)
+{
+ switch (file_hint) {
+ case WRITE_LIFE_MEDIUM:
+ case WRITE_LIFE_SHORT:
+ case WRITE_LIFE_NONE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool
+xfs_good_hint_match(
+ struct xfs_open_zone *oz,
+ enum rw_hint file_hint)
+{
+ switch (oz->oz_write_hint) {
+ case WRITE_LIFE_LONG:
+ case WRITE_LIFE_EXTREME:
+ /* colocate long and extreme */
+ if (file_hint == WRITE_LIFE_LONG ||
+ file_hint == WRITE_LIFE_EXTREME)
+ return true;
+ break;
+ case WRITE_LIFE_MEDIUM:
+ /* colocate medium with medium */
+ if (file_hint == WRITE_LIFE_MEDIUM)
+ return true;
+ break;
+ case WRITE_LIFE_SHORT:
+ case WRITE_LIFE_NONE:
+ case WRITE_LIFE_NOT_SET:
+ /* colocate short and none */
+ if (file_hint <= WRITE_LIFE_SHORT)
+ return true;
+ break;
+ }
+ return false;
+}
+
static bool
xfs_try_use_zone(
struct xfs_zone_info *zi,
- struct xfs_open_zone *oz)
+ enum rw_hint file_hint,
+ struct xfs_open_zone *oz,
+ bool lowspace)
{
if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
return false;
+ if (!lowspace && !xfs_good_hint_match(oz, file_hint))
+ return false;
if (!atomic_inc_not_zero(&oz->oz_ref))
return false;
+ /*
+ * If we have a hint set for the data, use that for the zone even if
+ * some data was written already without any hint set, but don't change
+ * the temperature after that as that would make little sense without
+ * tracking per-temperature class written block counts, which is
+ * probably overkill anyway.
+ */
+ if (file_hint != WRITE_LIFE_NOT_SET &&
+ oz->oz_write_hint == WRITE_LIFE_NOT_SET)
+ oz->oz_write_hint = file_hint;
+
/*
* If we couldn't match by inode or life time we just pick the first
* zone with enough space above. For that we want the least busy zone
@@ -484,28 +550,38 @@ xfs_try_use_zone(
static struct xfs_open_zone *
xfs_select_open_zone_lru(
- struct xfs_zone_info *zi)
+ struct xfs_zone_info *zi,
+ enum rw_hint file_hint,
+ bool lowspace)
{
struct xfs_open_zone *oz;
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
- if (xfs_try_use_zone(zi, oz))
+ if (xfs_try_use_zone(zi, file_hint, oz, lowspace))
return oz;
return NULL;
}
static struct xfs_open_zone *
xfs_select_open_zone_mru(
- struct xfs_zone_info *zi)
+ struct xfs_zone_info *zi,
+ enum rw_hint file_hint)
{
struct xfs_open_zone *oz;
list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
- if (xfs_try_use_zone(zi, oz))
+ if (xfs_try_use_zone(zi, file_hint, oz, false))
return oz;
return NULL;
}
+static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
+{
+ if (xfs_has_nolifetime(ip->i_mount))
+ return WRITE_LIFE_NOT_SET;
+ return VFS_I(ip)->i_write_hint;
+}
+
/*
* Try to pack inodes that are written back after they were closed tight instead
* of trying to open new zones for them or spread them to the least recently
@@ -535,10 +611,19 @@ xfs_select_zone_nowait(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_zone_info *zi = mp->m_zone_info;
+ enum rw_hint write_hint = xfs_inode_write_hint(ip);
struct xfs_open_zone *oz = NULL;
- if (xfs_zoned_pack_tight(ip))
- oz = xfs_select_open_zone_mru(zi);
+ /*
+ * Try to fill up open zones with matching temperature if available.
+ * Co-locating data where that is favorable keeps empty zones in
+ * reserve for the cases where it is statistically better to separate
+ * data.
+ */
+ if (xfs_colocate_eagerly(write_hint))
+ oz = xfs_select_open_zone_lru(zi, write_hint, false);
+ else if (xfs_zoned_pack_tight(ip))
+ oz = xfs_select_open_zone_mru(zi, write_hint);
if (oz)
return oz;
@@ -546,12 +631,26 @@ xfs_select_zone_nowait(
* If we are below the open limit try to activate a zone.
*/
if (zi->zi_nr_open_zones < mp->m_max_open_zones - XFS_OPEN_GC_ZONES) {
- oz = xfs_activate_zone(mp);
+ oz = xfs_activate_zone(mp, write_hint);
if (oz)
return oz;
}
- return xfs_select_open_zone_lru(zi);
+ /*
+ * Try to colocate cold data with other cold data if we failed to open a
+ * new zone for it.
+ */
+ if (write_hint != WRITE_LIFE_NOT_SET &&
+ !xfs_colocate_eagerly(write_hint)) {
+ oz = xfs_select_open_zone_lru(zi, write_hint, false);
+ if (oz)
+ return oz;
+ }
+
+ oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false);
+ if (oz)
+ return oz;
+ return xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
}
static struct xfs_open_zone *
@@ -801,7 +900,8 @@ xfs_init_zone(
struct xfs_open_zone *oz;
atomic_inc(&rtg_group(rtg)->xg_active_ref);
- oz = xfs_init_open_zone(rtg, write_pointer, false);
+ oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET,
+ false);
list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
zi->zi_nr_open_zones++;
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index 085d7001935e..e9b2c8ed5e9f 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -489,7 +489,7 @@ xfs_select_gc_zone(
if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE))
oz = xfs_steal_open_zone_for_gc(zi);
else
- oz = xfs_open_zone(mp, true);
+ oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
spin_unlock(&zi->zi_zone_list_lock);
if (oz)
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index 0b720026e54a..eb7187e09551 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -26,6 +26,12 @@ struct xfs_open_zone {
xfs_rgblock_t oz_write_pointer;
xfs_rgblock_t oz_written;
+ /*
+ * Write hint (data temperature) assigned to this zone, or
+ * WRITE_LIFE_NOT_SET if none was set.
+ */
+ enum rw_hint oz_write_hint;
+
/*
* Is this open zone used for garbage collection? There can only be a
* single open GC zone, which is pointed to by zi_open_gc_zone in
@@ -80,7 +86,8 @@ struct xfs_zone_info {
struct xfs_group *zi_reset_list;
};
-struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
+struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp,
+ enum rw_hint write_hint, bool is_gc);
int xfs_zone_reset_sync(struct xfs_rtgroup *rtg);
bool xfs_zoned_need_gc(struct xfs_mount *mp);
--
2.45.2
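For reference, the matching logic above relies on the numeric ordering of
enum rw_hint, which mirrors the RWH_WRITE_LIFE_* fcntl constants
(NOT_SET = 0, NONE = 1, SHORT = 2, MEDIUM = 3, LONG = 4, EXTREME = 5);
that ordering is what makes the "file_hint <= WRITE_LIFE_SHORT" shortcut
in xfs_good_hint_match() valid. The allocator consumes the hint through
VFS_I(ip)->i_write_hint, which applications set with the standard fcntl
interface. A minimal user-space sketch (the helper name is illustrative):

    #define _GNU_SOURCE
    #include <fcntl.h>      /* F_SET_RW_HINT, RWH_WRITE_LIFE_* */
    #include <stdint.h>
    #include <stdio.h>

    /* Mark fd's data short-lived so the zone allocator can colocate it. */
    static int set_short_lived(int fd)
    {
            uint64_t hint = RWH_WRITE_LIFE_SHORT;

            /* Sets inode->i_write_hint, read by xfs_inode_write_hint(). */
            if (fcntl(fd, F_SET_RW_HINT, &hint) < 0) {
                    perror("F_SET_RW_HINT");
                    return -1;
            }
            return 0;
    }

With a hint set, xfs_select_zone_nowait() walks the fallback chain shown
above: eager colocation for short/medium/none hints, MRU packing for
recently closed inodes, activating a new zone, colocating cold data, and
finally any open zone with space, ignoring hints entirely when low on
space.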
^ permalink raw reply related [flat|nested] 143+ messages in thread
* Re: [PATCH 41/43] xfs: support write life time based data placement
2024-12-11 8:55 ` [PATCH 41/43] xfs: support write life time based data placement Christoph Hellwig
@ 2024-12-13 23:00 ` Darrick J. Wong
2024-12-15 6:19 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 23:00 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:55:06AM +0100, Christoph Hellwig wrote:
> From: Hans Holmberg <hans.holmberg@wdc.com>
>
> Add a write life time based data placement allocation scheme that aims
> to minimize fragmentation by doing two things:
>
> a) separate file data to different zones when possible.
> b) colocate file data of similar life times when feasible.
>
> To get best results, average file sizes should align with the zone
> capacity that is reported through the XFS_IOC_FSGEOMETRY ioctl.
>
> For RocksDB using leveled compaction, the lifetime hints can improve
> throughput for overwrite workloads at 80% file system utilization by
> ~10%.
>
> Lifetime hints can be disabled using the nolifetime mount option.
>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_mount.h | 2 +
> fs/xfs/xfs_super.c | 10 ++++
> fs/xfs/xfs_zone_alloc.c | 126 +++++++++++++++++++++++++++++++++++-----
> fs/xfs/xfs_zone_gc.c | 2 +-
> fs/xfs/xfs_zone_priv.h | 9 ++-
> 5 files changed, 134 insertions(+), 15 deletions(-)
>
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index ec8612c8b71d..748b7a7da407 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -365,6 +365,7 @@ typedef struct xfs_mount {
> #define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */
>
> /* Mount features */
> +#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */
> #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
> #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
> #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
> @@ -420,6 +421,7 @@ __XFS_HAS_FEAT(large_extent_counts, NREXT64)
> __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
> __XFS_HAS_FEAT(metadir, METADIR)
> __XFS_HAS_FEAT(zoned, ZONED)
> +__XFS_HAS_FEAT(nolifetime, NOLIFETIME)
>
> static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
> {
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index e24f6a608b91..d2f2fa26c487 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -111,6 +111,7 @@ enum {
> Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
> Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
> Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
> + Opt_lifetime, Opt_nolifetime,
> };
>
> static const struct fs_parameter_spec xfs_fs_parameters[] = {
> @@ -156,6 +157,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
> fsparam_flag("dax", Opt_dax),
> fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
> fsparam_u32("max_open_zones", Opt_max_open_zones),
> + fsparam_flag("lifetime", Opt_lifetime),
> + fsparam_flag("nolifetime", Opt_nolifetime),
> {}
> };
>
> @@ -184,6 +187,7 @@ xfs_fs_show_options(
> { XFS_FEAT_LARGE_IOSIZE, ",largeio" },
> { XFS_FEAT_DAX_ALWAYS, ",dax=always" },
> { XFS_FEAT_DAX_NEVER, ",dax=never" },
> + { XFS_FEAT_NOLIFETIME, ",nolifetime" },
> { 0, NULL }
> };
> struct xfs_mount *mp = XFS_M(root->d_sb);
> @@ -1463,6 +1467,12 @@ xfs_fs_parse_param(
> case Opt_max_open_zones:
> parsing_mp->m_max_open_zones = result.uint_32;
> return 0;
> + case Opt_lifetime:
> + parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME;
> + return 0;
> + case Opt_nolifetime:
> + parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
> + return 0;
> default:
> xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
> return -EINVAL;
> diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
> index 291cf39a5989..2f362da0d31c 100644
> --- a/fs/xfs/xfs_zone_alloc.c
> +++ b/fs/xfs/xfs_zone_alloc.c
> @@ -381,6 +381,7 @@ static struct xfs_open_zone *
> xfs_init_open_zone(
> struct xfs_rtgroup *rtg,
> xfs_rgblock_t write_pointer,
> + enum rw_hint write_hint,
> bool is_gc)
> {
> struct xfs_open_zone *oz;
> @@ -391,6 +392,7 @@ xfs_init_open_zone(
> oz->oz_rtg = rtg;
> oz->oz_write_pointer = write_pointer;
> oz->oz_written = write_pointer;
> + oz->oz_write_hint = write_hint;
> oz->oz_is_gc = is_gc;
>
> /*
> @@ -407,6 +409,7 @@ xfs_init_open_zone(
> struct xfs_open_zone *
> xfs_open_zone(
> struct xfs_mount *mp,
> + enum rw_hint write_hint,
> bool is_gc)
> {
> struct xfs_zone_info *zi = mp->m_zone_info;
> @@ -422,7 +425,7 @@ xfs_open_zone(
> xfs_group_clear_mark(xg, XFS_RTG_FREE);
> atomic_dec(&zi->zi_nr_free_zones);
> zi->zi_free_zone_cursor = xg->xg_gno;
> - return xfs_init_open_zone(to_rtg(xg), 0, is_gc);
> + return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc);
> }
>
> /*
> @@ -434,7 +437,8 @@ xfs_open_zone(
> */
> static struct xfs_open_zone *
> xfs_activate_zone(
> - struct xfs_mount *mp)
> + struct xfs_mount *mp,
> + enum rw_hint write_hint)
> {
> struct xfs_zone_info *zi = mp->m_zone_info;
> struct xfs_open_zone *oz;
> @@ -443,7 +447,7 @@ xfs_activate_zone(
> XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
> return NULL;
>
> - oz = xfs_open_zone(mp, false);
> + oz = xfs_open_zone(mp, write_hint, false);
> if (!oz)
> return NULL;
>
> @@ -460,16 +464,78 @@ xfs_activate_zone(
> return oz;
> }
>
> +/*
> + * For data with short or medium lifetime, try to colocate it into an
> + * already open zone with a matching temperature.
> + */
> +static bool
> +xfs_colocate_eagerly(
> + enum rw_hint file_hint)
> +{
> + switch (file_hint) {
> + case WRITE_LIFE_MEDIUM:
> + case WRITE_LIFE_SHORT:
> + case WRITE_LIFE_NONE:
> + return true;
> + default:
> + return false;
> + }
> +}
> +
> +static bool
> +xfs_good_hint_match(
> + struct xfs_open_zone *oz,
> + enum rw_hint file_hint)
> +{
> + switch (oz->oz_write_hint) {
> + case WRITE_LIFE_LONG:
> + case WRITE_LIFE_EXTREME:
> + /* colocate long and extreme */
> + if (file_hint == WRITE_LIFE_LONG ||
> + file_hint == WRITE_LIFE_EXTREME)
> + return true;
> + break;
> + case WRITE_LIFE_MEDIUM:
> + /* colocate medium with medium */
> + if (file_hint == WRITE_LIFE_MEDIUM)
> + return true;
> + break;
> + case WRITE_LIFE_SHORT:
> + case WRITE_LIFE_NONE:
> + case WRITE_LIFE_NOT_SET:
> + /* colocate short and none */
> + if (file_hint <= WRITE_LIFE_SHORT)
> + return true;
> + break;
> + }
> + return false;
> +}
> +
> static bool
> xfs_try_use_zone(
> struct xfs_zone_info *zi,
> - struct xfs_open_zone *oz)
> + enum rw_hint file_hint,
> + struct xfs_open_zone *oz,
> + bool lowspace)
> {
> if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
> return false;
> + if (!lowspace && !xfs_good_hint_match(oz, file_hint))
> + return false;
> if (!atomic_inc_not_zero(&oz->oz_ref))
> return false;
>
> + /*
> + * If we have a hint set for the data, use that for the zone even if
> + * some data was written already without any hint set, but don't change
> + * the temperature after that as that would make little sense without
> + * tracking per-temperature class written block counts, which is
> + * probably overkill anyway.
> + */
> + if (file_hint != WRITE_LIFE_NOT_SET &&
> + oz->oz_write_hint == WRITE_LIFE_NOT_SET)
> + oz->oz_write_hint = file_hint;
> +
> /*
> * If we couldn't match by inode or life time we just pick the first
> * zone with enough space above. For that we want the least busy zone
> @@ -484,28 +550,38 @@ xfs_try_use_zone(
>
> static struct xfs_open_zone *
> xfs_select_open_zone_lru(
> - struct xfs_zone_info *zi)
> + struct xfs_zone_info *zi,
> + enum rw_hint file_hint,
> + bool lowspace)
> {
> struct xfs_open_zone *oz;
>
> list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
> - if (xfs_try_use_zone(zi, oz))
> + if (xfs_try_use_zone(zi, file_hint, oz, lowspace))
> return oz;
> return NULL;
> }
>
> static struct xfs_open_zone *
> xfs_select_open_zone_mru(
> - struct xfs_zone_info *zi)
> + struct xfs_zone_info *zi,
> + enum rw_hint file_hint)
> {
> struct xfs_open_zone *oz;
>
> list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
> - if (xfs_try_use_zone(zi, oz))
> + if (xfs_try_use_zone(zi, file_hint, oz, false))
> return oz;
> return NULL;
> }
>
> +static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
> +{
> + if (xfs_has_nolifetime(ip->i_mount))
> + return WRITE_LIFE_NOT_SET;
> + return VFS_I(ip)->i_write_hint;
> +}
> +
> /*
> * Try to pack inodes that are written back after they were closed tight instead
> * of trying to open new zones for them or spread them to the least recently
> @@ -535,10 +611,19 @@ xfs_select_zone_nowait(
> {
> struct xfs_mount *mp = ip->i_mount;
> struct xfs_zone_info *zi = mp->m_zone_info;
> + enum rw_hint write_hint = xfs_inode_write_hint(ip);
> struct xfs_open_zone *oz = NULL;
>
> - if (xfs_zoned_pack_tight(ip))
> - oz = xfs_select_open_zone_mru(zi);
> + /*
> + * Try to fill up open zones with matching temperature if available.
> + * Co-locating data where that is favorable keeps empty zones in
> + * reserve for the cases where it is statistically better to separate
> + * data.
> + */
> + if (xfs_colocate_eagerly(write_hint))
> + oz = xfs_select_open_zone_lru(zi, write_hint, false);
> + else if (xfs_zoned_pack_tight(ip))
> + oz = xfs_select_open_zone_mru(zi, write_hint);
> if (oz)
> return oz;
>
> @@ -546,12 +631,26 @@ xfs_select_zone_nowait(
> * If we are below the open limit try to activate a zone.
> */
> if (zi->zi_nr_open_zones < mp->m_max_open_zones - XFS_OPEN_GC_ZONES) {
> - oz = xfs_activate_zone(mp);
> + oz = xfs_activate_zone(mp, write_hint);
> if (oz)
> return oz;
> }
>
> - return xfs_select_open_zone_lru(zi);
> + /*
> + * Try to colocate cold data with other cold data if we failed to open a
> + * new zone for it.
> + */
> + if (write_hint != WRITE_LIFE_NOT_SET &&
> + !xfs_colocate_eagerly(write_hint)) {
> + oz = xfs_select_open_zone_lru(zi, write_hint, false);
> + if (oz)
> + return oz;
> + }
> +
> + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false);
> + if (oz)
> + return oz;
> + return xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
> }
>
> static struct xfs_open_zone *
> @@ -801,7 +900,8 @@ xfs_init_zone(
> struct xfs_open_zone *oz;
>
> atomic_inc(&rtg_group(rtg)->xg_active_ref);
> - oz = xfs_init_open_zone(rtg, write_pointer, false);
> + oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET,
> + false);
> list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
> zi->zi_nr_open_zones++;
>
> diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
> index 085d7001935e..e9b2c8ed5e9f 100644
> --- a/fs/xfs/xfs_zone_gc.c
> +++ b/fs/xfs/xfs_zone_gc.c
> @@ -489,7 +489,7 @@ xfs_select_gc_zone(
> if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE))
> oz = xfs_steal_open_zone_for_gc(zi);
> else
> - oz = xfs_open_zone(mp, true);
> + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
I wonder, is it possible to remember (at least incore) what write hint
was associated with an open zone all the way to gc time so that zones
with compatible hints can be gc'd into a new zone with the same hint?
Or is that overkill?
--D
> spin_unlock(&zi->zi_zone_list_lock);
>
> if (oz)
> diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
> index 0b720026e54a..eb7187e09551 100644
> --- a/fs/xfs/xfs_zone_priv.h
> +++ b/fs/xfs/xfs_zone_priv.h
> @@ -26,6 +26,12 @@ struct xfs_open_zone {
> xfs_rgblock_t oz_write_pointer;
> xfs_rgblock_t oz_written;
>
> + /*
> + * Write hint (data temperature) assigned to this zone, or
> + * WRITE_LIFE_NOT_SET if none was set.
> + */
> + enum rw_hint oz_write_hint;
> +
> /*
> * Is this open zone used for garbage collection? There can only be a
> * single open GC zone, which is pointed to by zi_open_gc_zone in
> @@ -80,7 +86,8 @@ struct xfs_zone_info {
> struct xfs_group *zi_reset_list;
> };
>
> -struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
> +struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp,
> + enum rw_hint write_hint, bool is_gc);
>
> int xfs_zone_reset_sync(struct xfs_rtgroup *rtg);
> bool xfs_zoned_need_gc(struct xfs_mount *mp);
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 41/43] xfs: support write life time based data placement
2024-12-13 23:00 ` Darrick J. Wong
@ 2024-12-15 6:19 ` Christoph Hellwig
2024-12-17 17:14 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-15 6:19 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Fri, Dec 13, 2024 at 03:00:51PM -0800, Darrick J. Wong wrote:
> > if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE))
> > oz = xfs_steal_open_zone_for_gc(zi);
> > else
> > - oz = xfs_open_zone(mp, true);
> > + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
>
> I wonder, is it possible to remember (at least incore) what write hint
> was associated with an open zone all the way to gc time so that zones
> with compatible hints can be gc'd into a new zone with the same hint?
> Or is that overkill?
We've been thinking about that a lot. Right now we don't have an
immediate use case for it, but it sure would be nice to have it without
needing another incompat bit. But then we'd need to find some space
(3 bits to be exact) in the on-disk inode for it that doesn't make
otherwise useful space unavailable for more widely useful things.
If you have a good idea I'll look into implementing it.
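For context, 3 bits suffice because enum rw_hint spans six values (0..5).
A purely hypothetical sketch of the kind of packing being discussed; the
field and macro names below are invented for illustration and are not
part of any on-disk format:

    /* Hypothetical: stash a 0..5 write hint in three spare flag bits. */
    #define HYP_WRITE_HINT_SHIFT    13
    #define HYP_WRITE_HINT_MASK     (0x7ULL << HYP_WRITE_HINT_SHIFT)

    static inline uint64_t hyp_set_write_hint(uint64_t flags, uint8_t hint)
    {
            return (flags & ~HYP_WRITE_HINT_MASK) |
                   ((uint64_t)(hint & 0x7) << HYP_WRITE_HINT_SHIFT);
    }

    static inline uint8_t hyp_get_write_hint(uint64_t flags)
    {
            return (flags & HYP_WRITE_HINT_MASK) >> HYP_WRITE_HINT_SHIFT;
    }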
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 41/43] xfs: support write life time based data placement
2024-12-15 6:19 ` Christoph Hellwig
@ 2024-12-17 17:14 ` Darrick J. Wong
2024-12-18 7:10 ` Christoph Hellwig
0 siblings, 1 reply; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-17 17:14 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Sun, Dec 15, 2024 at 07:19:02AM +0100, Christoph Hellwig wrote:
> On Fri, Dec 13, 2024 at 03:00:51PM -0800, Darrick J. Wong wrote:
> > > if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE))
> > > oz = xfs_steal_open_zone_for_gc(zi);
> > > else
> > > - oz = xfs_open_zone(mp, true);
> > > + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
> >
> > I wonder, is it possible to remember (at least incore) what write hint
> > was associated with an open zone all the way to gc time so that zones
> > with compatible hints can be gc'd into a new zone with the same hint?
> > Or is that overkill?
>
> We've been thinking about that a lot. Right now we don't have an
> immediate use case for it, but it sure would be nice to have it without
> needing another incompat bit. But then we'd need to find some space
> (3 bits to be exact) in the on-disk inode for it that doesn't make
> > otherwise useful space unavailable for more widely useful things.
> If you have a good idea I'll look into implementing it.
How about reusing the dmapi fields in xfs_dinode, seeing as we forced
them to zero in the base metadir series? Or do you have another use in
mind for those 6 bytes?
--D
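For reference, the legacy DMAPI members of struct xfs_dinode being
referred to should be the following pair (as declared in
fs/xfs/libxfs/xfs_format.h), which together account for the 6 bytes:

    __be32  di_dmevmask;    /* DMIG event mask */
    __be16  di_dmstate;     /* DMIG state info */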
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 41/43] xfs: support write life time based data placement
2024-12-17 17:14 ` Darrick J. Wong
@ 2024-12-18 7:10 ` Christoph Hellwig
2024-12-18 18:19 ` Darrick J. Wong
0 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-18 7:10 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Carlos Maiolino, Hans Holmberg, linux-xfs
On Tue, Dec 17, 2024 at 09:14:47AM -0800, Darrick J. Wong wrote:
> > We've been thinking about that a lot. Right now we don't have an
> > immediate use case for it, but it sure would be nice to have it without
> > needing another incompat bit. But then we'd need to find some space
> > (3 bits to be exact) in the on-disk inode for it that doesn't make
> > > otherwise useful space unavailable for more widely useful things.
> > If you have a good idea I'll look into implementing it.
>
> How about reusing the dmapi fields in xfs_dinode, seeing as we forced
> them to zero in the base metadir series? Or do you have another use in
> mind for those 6 bytes?
I've always seen those as a general space reserve for things that could
be useful for all inodes, as they are fairly large and contiguous. For
these three bits I'd rather stick them where it doesn't hurt too much.
But maybe I'm overthinking it.
^ permalink raw reply [flat|nested] 143+ messages in thread
* Re: [PATCH 41/43] xfs: support write life time based data placement
2024-12-18 7:10 ` Christoph Hellwig
@ 2024-12-18 18:19 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-18 18:19 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 18, 2024 at 08:10:48AM +0100, Christoph Hellwig wrote:
> On Tue, Dec 17, 2024 at 09:14:47AM -0800, Darrick J. Wong wrote:
> > > We've been thinking about that a lot. Right now we don't have an
> > > immediate use case for it, but it sure would be nice to have it without
> > > needing another incompat bit. But then we'd need to find some space
> > > (3 bits to be exact) in the on-disk inode for it that doesn't make
> > > > otherwise useful space unavailable for more widely useful things.
> > > If you have a good idea I'll look into implementing it.
> >
> > How about reusing the dmapi fields in xfs_dinode, seeing as we forced
> > them to zero in the base metadir series? Or do you have another use in
> > mind for those 6 bytes?
>
> I've always seen those as a general space reserve for things that could
> be useful for all inodes, as they are fairly large and contiguous. For
> these three bits I'd rather stick them where it doesn't hurt too much.
> But maybe I'm overthinking it.
Is anyone planning to add persistent write hints to xfs files?
--D
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 42/43] xfs: wire up the show_stats super operation
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (40 preceding siblings ...)
2024-12-11 8:55 ` [PATCH 41/43] xfs: support write life time based data placement Christoph Hellwig
@ 2024-12-11 8:55 ` Christoph Hellwig
2024-12-13 23:01 ` Darrick J. Wong
2024-12-11 8:55 ` [PATCH 43/43] xfs: export zone stats in /proc/*/mountstats Christoph Hellwig
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:55 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
The show_stats super operation allows a file system to dump plain text
statistics on a per-mount basis into /proc/*/mountstats. Wire up a no-op
version which will grow useful information for zoned file systems later.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_super.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d2f2fa26c487..47468623fdc6 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1238,6 +1238,14 @@ xfs_fs_shutdown(
xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED);
}
+static int
+xfs_fs_show_stats(
+ struct seq_file *m,
+ struct dentry *root)
+{
+ return 0;
+}
+
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
@@ -1252,6 +1260,7 @@ static const struct super_operations xfs_super_operations = {
.nr_cached_objects = xfs_fs_nr_cached_objects,
.free_cached_objects = xfs_fs_free_cached_objects,
.shutdown = xfs_fs_shutdown,
+ .show_stats = xfs_fs_show_stats,
};
static int
--
2.45.2
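For reference, ->show_stats is invoked when /proc/<pid>/mountstats is
read; with this no-op hook an XFS mount just gets its standard header
line there, along the lines of (device and mount point illustrative):

    device /dev/nvme0n1 mounted on /mnt with fstype xfs

The following patch then fills in the zoned allocator details after that
line.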
^ permalink raw reply related [flat|nested] 143+ messages in thread
* Re: [PATCH 42/43] xfs: wire up the show_stats super operation
2024-12-11 8:55 ` [PATCH 42/43] xfs: wire up the show_stats super operation Christoph Hellwig
@ 2024-12-13 23:01 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 23:01 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:55:07AM +0100, Christoph Hellwig wrote:
> The show_stats super operation allows a file system to dump plain text
> statistics on a per-mount basis into /proc/*/mountstats. Wire up a no-op
> version which will grow useful information for zoned file systems later.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_super.c | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index d2f2fa26c487..47468623fdc6 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1238,6 +1238,14 @@ xfs_fs_shutdown(
> xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED);
> }
>
> +static int
> +xfs_fs_show_stats(
> + struct seq_file *m,
> + struct dentry *root)
> +{
> + return 0;
> +}
> +
> static const struct super_operations xfs_super_operations = {
> .alloc_inode = xfs_fs_alloc_inode,
> .destroy_inode = xfs_fs_destroy_inode,
> @@ -1252,6 +1260,7 @@ static const struct super_operations xfs_super_operations = {
> .nr_cached_objects = xfs_fs_nr_cached_objects,
> .free_cached_objects = xfs_fs_free_cached_objects,
> .shutdown = xfs_fs_shutdown,
> + .show_stats = xfs_fs_show_stats,
> };
>
> static int
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread
* [PATCH 43/43] xfs: export zone stats in /proc/*/mountstats
2024-12-11 8:54 RFC: support for zoned devices Christoph Hellwig
` (41 preceding siblings ...)
2024-12-11 8:55 ` [PATCH 42/43] xfs: wire up the show_stats super operation Christoph Hellwig
@ 2024-12-11 8:55 ` Christoph Hellwig
2024-12-13 23:04 ` Darrick J. Wong
42 siblings, 1 reply; 143+ messages in thread
From: Christoph Hellwig @ 2024-12-11 8:55 UTC (permalink / raw)
To: Carlos Maiolino; +Cc: Darrick J. Wong, Hans Holmberg, linux-xfs
From: Hans Holmberg <hans.holmberg@wdc.com>
Add the per-zone life time hint and the used block distribution for
fully written zones to /proc/*/mountstats, grouping reclaimable zones
into fixed-percentage buckets (0..9%, 10..19%, and so on) and counting
full zones as 100% used, along with a few statistics about the zone
allocator and the open and reclaimable zones.
This gives good insight into data fragmentation and data placement
success rate.
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Co-developed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/Makefile | 1 +
fs/xfs/xfs_super.c | 4 ++
fs/xfs/xfs_zone_alloc.h | 1 +
fs/xfs/xfs_zone_info.c | 120 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 126 insertions(+)
create mode 100644 fs/xfs/xfs_zone_info.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index e38838409271..5bf501cf8271 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -140,6 +140,7 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
xfs_zone_alloc.o \
xfs_zone_gc.o \
+ xfs_zone_info.o \
xfs_zone_space_resv.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 47468623fdc6..df384c4de192 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1243,6 +1243,10 @@ xfs_fs_show_stats(
struct seq_file *m,
struct dentry *root)
{
+ struct xfs_mount *mp = XFS_M(root->d_sb);
+
+ if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT))
+ xfs_zoned_show_stats(m, mp);
return 0;
}
diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
index 44fa1594f73e..94ab32826c83 100644
--- a/fs/xfs/xfs_zone_alloc.h
+++ b/fs/xfs/xfs_zone_alloc.h
@@ -34,6 +34,7 @@ void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
enum xfs_free_counter ctr);
+void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp);
#ifdef CONFIG_XFS_RT
int xfs_mount_zones(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c
new file mode 100644
index 000000000000..689c9acb24d7
--- /dev/null
+++ b/fs/xfs/xfs_zone_info.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2024 Christoph Hellwig.
+ * Copyright (c) 2024, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+
+static const char xfs_write_hint_shorthand[6][16] = {
+ "NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"};
+
+static inline const char *
+xfs_write_hint_to_str(
+ uint8_t write_hint)
+{
+ if (write_hint > WRITE_LIFE_EXTREME)
+ return "UNKNOWN";
+ return xfs_write_hint_shorthand[write_hint];
+}
+
+static void
+xfs_show_open_zone(
+ struct seq_file *m,
+ struct xfs_open_zone *oz)
+{
+ seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n",
+ rtg_rgno(oz->oz_rtg),
+ oz->oz_write_pointer, oz->oz_written,
+ rtg_rmap(oz->oz_rtg)->i_used_blocks,
+ xfs_write_hint_to_str(oz->oz_write_hint));
+}
+
+#define XFS_USED_BUCKETS 10
+#define XFS_USED_BUCKET_PCT (100 / XFS_USED_BUCKETS)
+
+static unsigned int
+xfs_zone_to_bucket(
+ struct xfs_rtgroup *rtg)
+{
+ return div_u64(rtg_rmap(rtg)->i_used_blocks * XFS_USED_BUCKETS,
+ rtg->rtg_extents);
+}
+
+static void
+xfs_show_full_zone_used_distribution(
+ struct seq_file *m,
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, 0);
+ unsigned int buckets[XFS_USED_BUCKETS] = {0};
+ unsigned int reclaimable = 0, full, i;
+ struct xfs_rtgroup *rtg;
+
+ lockdep_assert_held(&zi->zi_zone_list_lock);
+
+ rcu_read_lock();
+ xas_for_each_marked(&xas, rtg, ULONG_MAX, XFS_RTG_RECLAIMABLE) {
+ buckets[xfs_zone_to_bucket(rtg)]++;
+ reclaimable++;
+ }
+ rcu_read_unlock();
+
+ for (i = 0; i < XFS_USED_BUCKETS; i++)
+ seq_printf(m, "\t %2u..%2u%%: %u\n", i * XFS_USED_BUCKET_PCT,
+ (i + 1) * XFS_USED_BUCKET_PCT - 1, buckets[i]);
+
+ full = mp->m_sb.sb_rgcount;
+ if (zi->zi_open_gc_zone)
+ full--;
+ full -= zi->zi_nr_open_zones;
+ full -= atomic_read(&zi->zi_nr_free_zones);
+ full -= reclaimable;
+
+ seq_printf(m, "\t 100%%: %u\n", full);
+}
+
+void
+xfs_zoned_show_stats(
+ struct seq_file *m,
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *oz;
+
+ seq_puts(m, "\n");
+
+ seq_printf(m, "\tuser free blocks: %lld\n",
+ xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+ seq_printf(m, "\treserved free blocks: %lld\n",
+ mp->m_resblks[XC_FREE_RTEXTENTS].avail);
+ seq_printf(m, "\tuser available blocks: %lld\n",
+ xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE));
+ seq_printf(m, "\treserved available blocks: %lld\n",
+ mp->m_resblks[XC_FREE_RTAVAILABLE].avail);
+ seq_printf(m, "\treservations required: %d\n",
+ !list_empty_careful(&zi->zi_reclaim_reservations));
+ seq_printf(m, "\tGC required: %d\n",
+ xfs_zoned_need_gc(mp));
+
+ spin_lock(&zi->zi_zone_list_lock);
+ seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones));
+ seq_puts(m, "\topen zones:\n");
+ list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
+ xfs_show_open_zone(m, oz);
+ if (zi->zi_open_gc_zone) {
+ seq_puts(m, "\topen gc zone:\n");
+ xfs_show_open_zone(m, zi->zi_open_gc_zone);
+ }
+ seq_puts(m, "\tused blocks distribution (fully written zones):\n");
+ xfs_show_full_zone_used_distribution(m, mp);
+ spin_unlock(&zi->zi_zone_list_lock);
+}
--
2.45.2
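To make the bucket math concrete: xfs_zone_to_bucket() computes
used_blocks * 10 / rtg_extents, so a fully written zone with
rtg_extents = 1000 and i_used_blocks = 437 lands in the 40..49% bucket.
Assuming a small zoned device, the per-mount text emitted above would
look roughly like the following; the format follows the seq_printf
calls in xfs_zoned_show_stats(), all numbers are illustrative, and the
middle buckets are trimmed here:

	user free blocks: 1826816
	reserved free blocks: 16384
	user available blocks: 1810432
	reserved available blocks: 16384
	reservations required: 0
	GC required: 0
	free zones: 27
	open zones:
	 zone 5, wp 4096, written 4096, used 4096, hint SHORT
	 zone 9, wp 65536, written 65536, used 61440, hint NOT_SET
	used blocks distribution (fully written zones):
	  0.. 9%: 2
	 10..19%: 0
	 [...]
	 90..99%: 1
	 100%: 12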
^ permalink raw reply related [flat|nested] 143+ messages in thread
* Re: [PATCH 43/43] xfs: export zone stats in /proc/*/mountstats
2024-12-11 8:55 ` [PATCH 43/43] xfs: export zone stats in /proc/*/mountstats Christoph Hellwig
@ 2024-12-13 23:04 ` Darrick J. Wong
0 siblings, 0 replies; 143+ messages in thread
From: Darrick J. Wong @ 2024-12-13 23:04 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Carlos Maiolino, Hans Holmberg, linux-xfs
On Wed, Dec 11, 2024 at 09:55:08AM +0100, Christoph Hellwig wrote:
> From: Hans Holmberg <hans.holmberg@wdc.com>
>
> Add the per-zone life time hint and the used block distribution for
> fully written zones to /proc/*/mountstats, grouping reclaimable zones
> into fixed-percentage buckets (0..9%, 10..19%, and so on) and counting
> full zones as 100% used, along with a few statistics about the zone
> allocator and the open and reclaimable zones.
>
> This gives good insight into data fragmentation and data placement
> success rate.
>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Co-developed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/Makefile | 1 +
> fs/xfs/xfs_super.c | 4 ++
> fs/xfs/xfs_zone_alloc.h | 1 +
> fs/xfs/xfs_zone_info.c | 120 ++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 126 insertions(+)
> create mode 100644 fs/xfs/xfs_zone_info.c
>
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index e38838409271..5bf501cf8271 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -140,6 +140,7 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
> xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
> xfs_zone_alloc.o \
> xfs_zone_gc.o \
> + xfs_zone_info.o \
> xfs_zone_space_resv.o
>
> xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 47468623fdc6..df384c4de192 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1243,6 +1243,10 @@ xfs_fs_show_stats(
> struct seq_file *m,
> struct dentry *root)
> {
> + struct xfs_mount *mp = XFS_M(root->d_sb);
> +
> + if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT))
> + xfs_zoned_show_stats(m, mp);
> return 0;
> }
>
> diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
> index 44fa1594f73e..94ab32826c83 100644
> --- a/fs/xfs/xfs_zone_alloc.h
> +++ b/fs/xfs/xfs_zone_alloc.h
> @@ -34,6 +34,7 @@ void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
>
> uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
> enum xfs_free_counter ctr);
> +void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp);
>
> #ifdef CONFIG_XFS_RT
> int xfs_mount_zones(struct xfs_mount *mp);
> diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c
> new file mode 100644
> index 000000000000..689c9acb24d7
> --- /dev/null
> +++ b/fs/xfs/xfs_zone_info.c
> @@ -0,0 +1,120 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2023-2024 Christoph Hellwig.
> + * Copyright (c) 2024, Western Digital Corporation or its affiliates.
> + */
> +#include "xfs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_inode.h"
> +#include "xfs_rtgroup.h"
> +#include "xfs_zone_alloc.h"
> +#include "xfs_zone_priv.h"
> +
> +static const char xfs_write_hint_shorthand[6][16] = {
> + "NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"};
> +
> +static inline const char *
> +xfs_write_hint_to_str(
> + uint8_t write_hint)
> +{
> + if (write_hint > WRITE_LIFE_EXTREME)
> + return "UNKNOWN";
> + return xfs_write_hint_shorthand[write_hint];
> +}
> +
> +static void
> +xfs_show_open_zone(
> + struct seq_file *m,
> + struct xfs_open_zone *oz)
> +{
> + seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n",
> + rtg_rgno(oz->oz_rtg),
> + oz->oz_write_pointer, oz->oz_written,
> + rtg_rmap(oz->oz_rtg)->i_used_blocks,
> + xfs_write_hint_to_str(oz->oz_write_hint));
> +}
> +
> +#define XFS_USED_BUCKETS 10
> +#define XFS_USED_BUCKET_PCT (100 / XFS_USED_BUCKETS)
> +
> +static unsigned int
> +xfs_zone_to_bucket(
> + struct xfs_rtgroup *rtg)
> +{
> + return div_u64(rtg_rmap(rtg)->i_used_blocks * XFS_USED_BUCKETS,
> + rtg->rtg_extents);
> +}
> +
> +static void
> +xfs_show_full_zone_used_distribution(
> + struct seq_file *m,
> + struct xfs_mount *mp)
> +{
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, 0);
> + unsigned int buckets[XFS_USED_BUCKETS] = {0};
> + unsigned int reclaimable = 0, full, i;
> + struct xfs_rtgroup *rtg;
> +
> + lockdep_assert_held(&zi->zi_zone_list_lock);
> +
> + rcu_read_lock();
> + xas_for_each_marked(&xas, rtg, ULONG_MAX, XFS_RTG_RECLAIMABLE) {
> + buckets[xfs_zone_to_bucket(rtg)]++;
> + reclaimable++;
> + }
> + rcu_read_unlock();
> +
> + for (i = 0; i < XFS_USED_BUCKETS; i++)
> + seq_printf(m, "\t %2u..%2u%%: %u\n", i * XFS_USED_BUCKET_PCT,
> + (i + 1) * XFS_USED_BUCKET_PCT - 1, buckets[i]);
> +
> + full = mp->m_sb.sb_rgcount;
> + if (zi->zi_open_gc_zone)
> + full--;
> + full -= zi->zi_nr_open_zones;
> + full -= atomic_read(&zi->zi_nr_free_zones);
> + full -= reclaimable;
> +
> + seq_printf(m, "\t 100%%: %u\n", full);
> +}
> +
> +void
> +xfs_zoned_show_stats(
> + struct seq_file *m,
> + struct xfs_mount *mp)
> +{
> + struct xfs_zone_info *zi = mp->m_zone_info;
> + struct xfs_open_zone *oz;
> +
> + seq_puts(m, "\n");
> +
> + seq_printf(m, "\tuser free blocks: %lld\n",
> + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
> + seq_printf(m, "\treserved free blocks: %lld\n",
> + mp->m_resblks[XC_FREE_RTEXTENTS].avail);
> + seq_printf(m, "\tuser available blocks: %lld\n",
> + xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE));
> + seq_printf(m, "\treserved available blocks: %lld\n",
> + mp->m_resblks[XC_FREE_RTAVAILABLE].avail);
> + seq_printf(m, "\treservations required: %d\n",
> + !list_empty_careful(&zi->zi_reclaim_reservations));
Might want to mention that these are zoned rt stats, not for the data
device.
--D
> + seq_printf(m, "\tGC required: %d\n",
> + xfs_zoned_need_gc(mp));
> +
> + spin_lock(&zi->zi_zone_list_lock);
> + seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones));
> + seq_puts(m, "\topen zones:\n");
> + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
> + xfs_show_open_zone(m, oz);
> + if (zi->zi_open_gc_zone) {
> + seq_puts(m, "\topen gc zone:\n");
> + xfs_show_open_zone(m, zi->zi_open_gc_zone);
> + }
> + seq_puts(m, "\tused blocks distribution (fully written zones):\n");
> + xfs_show_full_zone_used_distribution(m, mp);
> + spin_unlock(&zi->zi_zone_list_lock);
> +}
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 143+ messages in thread