* [PATCH] xfs: cache open zone in inode->i_private
@ 2025-10-15 6:30 Christoph Hellwig
2025-10-16 5:17 ` Hans Holmberg
0 siblings, 1 reply; 3+ messages in thread
From: Christoph Hellwig @ 2025-10-15 6:30 UTC (permalink / raw)
To: cem; +Cc: hans.holmberg, linux-xfs
The MRU cache for open zones is unfortunately still not ideal, as it can
time out pretty easily when doing heavy I/O to hard disks using up most
or all open zones. One option would be to just increase the timeout,
but while looking into that I realized we're just better off caching it
indefinitely as there is no real downside to that once we don't hold a
reference to the cache open zone.
So switch the open zone to RCU freeing, and then stash the last used
open zone into inode->i_private. This helps to significantly reduce
fragmentation by keeping I/O localized to zones for workloads that
write using many open files to HDD.
Fixes: 4e4d52075577 ("xfs: add the zoned space allocator")
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/xfs/xfs_mount.h | 1 -
fs/xfs/xfs_super.c | 6 ++
fs/xfs/xfs_zone_alloc.c | 134 +++++++++++++---------------------------
fs/xfs/xfs_zone_priv.h | 2 +
4 files changed, 50 insertions(+), 93 deletions(-)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f046d1215b04..b871dfde372b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -236,7 +236,6 @@ typedef struct xfs_mount {
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
unsigned int m_zonegc_low_space;
- struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */
/* max_atomic_write mount option value */
unsigned long long m_awu_max_bytes;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e85a156dc17d..464ae1e657d9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -786,6 +786,12 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
+
+ if (IS_ENABLED(CONFIG_XFS_RT) &&
+ S_ISREG(inode->i_mode) && inode->i_private) {
+ xfs_open_zone_put(inode->i_private);
+ inode->i_private = NULL;
+ }
}
static void
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 1147bacb2da8..07adbefae114 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -26,14 +26,22 @@
#include "xfs_trace.h"
#include "xfs_mru_cache.h"
+static void
+xfs_open_zone_free_rcu(
+ struct callback_head *cb)
+{
+ struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu);
+
+ xfs_rtgroup_rele(oz->oz_rtg);
+ kfree(oz);
+}
+
void
xfs_open_zone_put(
struct xfs_open_zone *oz)
{
- if (atomic_dec_and_test(&oz->oz_ref)) {
- xfs_rtgroup_rele(oz->oz_rtg);
- kfree(oz);
- }
+ if (atomic_dec_and_test(&oz->oz_ref))
+ call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}
static inline uint32_t
@@ -745,98 +753,44 @@ xfs_mark_rtg_boundary(
ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}
-/*
- * Cache the last zone written to for an inode so that it is considered first
- * for subsequent writes.
- */
-struct xfs_zone_cache_item {
- struct xfs_mru_cache_elem mru;
- struct xfs_open_zone *oz;
-};
-
-static inline struct xfs_zone_cache_item *
-xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
-{
- return container_of(mru, struct xfs_zone_cache_item, mru);
-}
-
-static void
-xfs_zone_cache_free_func(
- void *data,
- struct xfs_mru_cache_elem *mru)
-{
- struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
-
- xfs_open_zone_put(item->oz);
- kfree(item);
-}
-
/*
* Check if we have a cached last open zone available for the inode and
* if yes return a reference to it.
*/
static struct xfs_open_zone *
-xfs_cached_zone(
- struct xfs_mount *mp,
- struct xfs_inode *ip)
+xfs_get_cached_zone(
+ struct xfs_inode *ip)
{
- struct xfs_mru_cache_elem *mru;
- struct xfs_open_zone *oz;
+ struct xfs_open_zone *oz;
+
+ /*
+ * The caching in i_private is unbound, and the zone could have been
+ * reused for GC in the meantime. Skip it in that case.
+ */
+ rcu_read_lock();
+ oz = VFS_I(ip)->i_private;
+ if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref)))
+ oz = NULL;
+ rcu_read_unlock();
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (!mru)
- return NULL;
- oz = xfs_zone_cache_item(mru)->oz;
- if (oz) {
- /*
- * GC only steals open zones at mount time, so no GC zones
- * should end up in the cache.
- */
- ASSERT(!oz->oz_is_gc);
- ASSERT(atomic_read(&oz->oz_ref) > 0);
- atomic_inc(&oz->oz_ref);
- }
- xfs_mru_cache_done(mp->m_zone_cache);
return oz;
}
/*
- * Update the last used zone cache for a given inode.
- *
- * The caller must have a reference on the open zone.
+ * Try to stash our zone in the inode so that it is reused for future
+ * allocations.
*/
static void
-xfs_zone_cache_create_association(
- struct xfs_inode *ip,
- struct xfs_open_zone *oz)
+xfs_set_cached_zone(
+ struct xfs_inode *ip,
+ struct xfs_open_zone *oz)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_zone_cache_item *item = NULL;
- struct xfs_mru_cache_elem *mru;
+ struct xfs_open_zone *old_oz;
- ASSERT(atomic_read(&oz->oz_ref) > 0);
atomic_inc(&oz->oz_ref);
-
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (mru) {
- /*
- * If we have an association already, update it to point to the
- * new zone.
- */
- item = xfs_zone_cache_item(mru);
- xfs_open_zone_put(item->oz);
- item->oz = oz;
- xfs_mru_cache_done(mp->m_zone_cache);
- return;
- }
-
- item = kmalloc(sizeof(*item), GFP_KERNEL);
- if (!item) {
- xfs_open_zone_put(oz);
- return;
- }
- item->oz = oz;
- xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
+ old_oz = xchg(&VFS_I(ip)->i_private, oz);
+ if (old_oz)
+ xfs_open_zone_put(old_oz);
}
static void
@@ -880,15 +834,14 @@ xfs_zone_alloc_and_submit(
* the inode is still associated with a zone and use that if so.
*/
if (!*oz)
- *oz = xfs_cached_zone(mp, ip);
+ *oz = xfs_get_cached_zone(ip);
if (!*oz) {
select_zone:
*oz = xfs_select_zone(mp, write_hint, pack_tight);
if (!*oz)
goto out_error;
-
- xfs_zone_cache_create_association(ip, *oz);
+ xfs_set_cached_zone(ip, *oz);
}
alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -966,6 +919,12 @@ xfs_free_open_zones(
xfs_open_zone_put(oz);
}
spin_unlock(&zi->zi_open_zones_lock);
+
+ /*
+ * Wait for all open zones to be freed so that they drop the group
+ * references:
+ */
+ rcu_barrier();
}
struct xfs_init_zones {
@@ -1279,14 +1238,6 @@ xfs_mount_zones(
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
-
- /*
- * Set up a mru cache to track inode to open zone for data placement
- * purposes. The magic values for group count and life time is the
- * same as the defaults for file streams, which seems sane enough.
- */
- xfs_mru_cache_create(&mp->m_zone_cache, mp,
- 5000, 10, xfs_zone_cache_free_func);
return 0;
out_free_zone_info:
@@ -1300,5 +1251,4 @@ xfs_unmount_zones(
{
xfs_zone_gc_unmount(mp);
xfs_free_zone_info(mp->m_zone_info);
- xfs_mru_cache_destroy(mp->m_zone_cache);
}
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index 35e6de3d25ed..4322e26dd99a 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -44,6 +44,8 @@ struct xfs_open_zone {
* the life time of an open zone.
*/
struct xfs_rtgroup *oz_rtg;
+
+ struct rcu_head oz_rcu;
};
/*
--
2.47.3
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] xfs: cache open zone in inode->i_private
2025-10-15 6:30 [PATCH] xfs: cache open zone in inode->i_private Christoph Hellwig
@ 2025-10-16 5:17 ` Hans Holmberg
2025-10-16 6:06 ` hch
0 siblings, 1 reply; 3+ messages in thread
From: Hans Holmberg @ 2025-10-16 5:17 UTC (permalink / raw)
To: hch, cem@kernel.org; +Cc: linux-xfs@vger.kernel.org
On 15/10/2025 15:30, Christoph Hellwig wrote:
> The MRU cache for open zones is unfortunately still not ideal, as it can
> time out pretty easily when doing heavy I/O to hard disks using up most
> or all open zones. One option would be to just increase the timeout,
> but while looking into that I realized we're just better off caching it
> indefinitely as there is no real downside to that once we don't hold a
> reference to the cache open zone.
>
> So switch the open zone to RCU freeing, and then stash the last used
> open zone into inode->i_private. This helps to significantly reduce
> fragmentation by keeping I/O localized to zones for workloads that
> write using many open files to HDD.
>
> Fixes: 4e4d52075577 ("xfs: add the zoned space allocator")
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/xfs/xfs_mount.h | 1 -
> fs/xfs/xfs_super.c | 6 ++
> fs/xfs/xfs_zone_alloc.c | 134 +++++++++++++---------------------------
> fs/xfs/xfs_zone_priv.h | 2 +
> 4 files changed, 50 insertions(+), 93 deletions(-)
>
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index f046d1215b04..b871dfde372b 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -236,7 +236,6 @@ typedef struct xfs_mount {
> bool m_update_sb; /* sb needs update in mount */
> unsigned int m_max_open_zones;
> unsigned int m_zonegc_low_space;
> - struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */
>
> /* max_atomic_write mount option value */
> unsigned long long m_awu_max_bytes;
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index e85a156dc17d..464ae1e657d9 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -786,6 +786,12 @@ xfs_fs_evict_inode(
>
> truncate_inode_pages_final(&inode->i_data);
> clear_inode(inode);
> +
> + if (IS_ENABLED(CONFIG_XFS_RT) &&
> + S_ISREG(inode->i_mode) && inode->i_private) {
> + xfs_open_zone_put(inode->i_private);
> + inode->i_private = NULL;
> + }
> }
>
> static void
> diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
> index 1147bacb2da8..07adbefae114 100644
> --- a/fs/xfs/xfs_zone_alloc.c
> +++ b/fs/xfs/xfs_zone_alloc.c
> @@ -26,14 +26,22 @@
> #include "xfs_trace.h"
> #include "xfs_mru_cache.h"
>
> +static void
> +xfs_open_zone_free_rcu(
> + struct callback_head *cb)
> +{
> + struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu);
> +
> + xfs_rtgroup_rele(oz->oz_rtg);
> + kfree(oz);
> +}
> +
> void
> xfs_open_zone_put(
> struct xfs_open_zone *oz)
> {
> - if (atomic_dec_and_test(&oz->oz_ref)) {
> - xfs_rtgroup_rele(oz->oz_rtg);
> - kfree(oz);
> - }
> + if (atomic_dec_and_test(&oz->oz_ref))
> + call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
> }
>
> static inline uint32_t
> @@ -745,98 +753,44 @@ xfs_mark_rtg_boundary(
> ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
> }
>
> -/*
> - * Cache the last zone written to for an inode so that it is considered first
> - * for subsequent writes.
> - */
> -struct xfs_zone_cache_item {
> - struct xfs_mru_cache_elem mru;
> - struct xfs_open_zone *oz;
> -};
> -
> -static inline struct xfs_zone_cache_item *
> -xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
> -{
> - return container_of(mru, struct xfs_zone_cache_item, mru);
> -}
> -
> -static void
> -xfs_zone_cache_free_func(
> - void *data,
> - struct xfs_mru_cache_elem *mru)
> -{
> - struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
> -
> - xfs_open_zone_put(item->oz);
> - kfree(item);
> -}
> -
> /*
> * Check if we have a cached last open zone available for the inode and
> * if yes return a reference to it.
> */
> static struct xfs_open_zone *
> -xfs_cached_zone(
> - struct xfs_mount *mp,
> - struct xfs_inode *ip)
> +xfs_get_cached_zone(
> + struct xfs_inode *ip)
> {
> - struct xfs_mru_cache_elem *mru;
> - struct xfs_open_zone *oz;
> + struct xfs_open_zone *oz;
> +
> + /*
> + * The caching in i_private is unbound, and the zone could have been
> + * reused for GC in the meantime. Skip it in that case.
> + */
> + rcu_read_lock();
> + oz = VFS_I(ip)->i_private;
> + if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref)))
> + oz = NULL;
We still won't steal any open zones for gc(except at mount time), so
the keep the assert and comment below in stead?
> + rcu_read_unlock();
>
> - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
> - if (!mru)
> - return NULL;
> - oz = xfs_zone_cache_item(mru)->oz;
> - if (oz) {
> - /*
> - * GC only steals open zones at mount time, so no GC zones
> - * should end up in the cache.
> - */
> - ASSERT(!oz->oz_is_gc);
> - ASSERT(atomic_read(&oz->oz_ref) > 0);
> - atomic_inc(&oz->oz_ref);
> - }
> - xfs_mru_cache_done(mp->m_zone_cache);
> return oz;
> }
>
> /*
> - * Update the last used zone cache for a given inode.
> - *
> - * The caller must have a reference on the open zone.
> + * Try to stash our zone in the inode so that it is reused for future
> + * allocations.
> */
> static void
> -xfs_zone_cache_create_association(
> - struct xfs_inode *ip,
> - struct xfs_open_zone *oz)
> +xfs_set_cached_zone(
> + struct xfs_inode *ip,
> + struct xfs_open_zone *oz)
> {
> - struct xfs_mount *mp = ip->i_mount;
> - struct xfs_zone_cache_item *item = NULL;
> - struct xfs_mru_cache_elem *mru;
> + struct xfs_open_zone *old_oz;
>
> - ASSERT(atomic_read(&oz->oz_ref) > 0);
> atomic_inc(&oz->oz_ref);
> -
> - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
> - if (mru) {
> - /*
> - * If we have an association already, update it to point to the
> - * new zone.
> - */
> - item = xfs_zone_cache_item(mru);
> - xfs_open_zone_put(item->oz);
> - item->oz = oz;
> - xfs_mru_cache_done(mp->m_zone_cache);
> - return;
> - }
> -
> - item = kmalloc(sizeof(*item), GFP_KERNEL);
> - if (!item) {
> - xfs_open_zone_put(oz);
> - return;
> - }
> - item->oz = oz;
> - xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
> + old_oz = xchg(&VFS_I(ip)->i_private, oz);
> + if (old_oz)
> + xfs_open_zone_put(old_oz);
> }
>
> static void
> @@ -880,15 +834,14 @@ xfs_zone_alloc_and_submit(
> * the inode is still associated with a zone and use that if so.
> */
> if (!*oz)
> - *oz = xfs_cached_zone(mp, ip);
> + *oz = xfs_get_cached_zone(ip);
>
> if (!*oz) {
> select_zone:
> *oz = xfs_select_zone(mp, write_hint, pack_tight);
> if (!*oz)
> goto out_error;
> -
> - xfs_zone_cache_create_association(ip, *oz);
> + xfs_set_cached_zone(ip, *oz);
> }
>
> alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
> @@ -966,6 +919,12 @@ xfs_free_open_zones(
> xfs_open_zone_put(oz);
> }
> spin_unlock(&zi->zi_open_zones_lock);
> +
> + /*
> + * Wait for all open zones to be freed so that they drop the group
> + * references:
> + */
> + rcu_barrier();
> }
>
> struct xfs_init_zones {
> @@ -1279,14 +1238,6 @@ xfs_mount_zones(
> error = xfs_zone_gc_mount(mp);
> if (error)
> goto out_free_zone_info;
> -
> - /*
> - * Set up a mru cache to track inode to open zone for data placement
> - * purposes. The magic values for group count and life time is the
> - * same as the defaults for file streams, which seems sane enough.
> - */
> - xfs_mru_cache_create(&mp->m_zone_cache, mp,
> - 5000, 10, xfs_zone_cache_free_func);
> return 0;
>
> out_free_zone_info:
> @@ -1300,5 +1251,4 @@ xfs_unmount_zones(
> {
> xfs_zone_gc_unmount(mp);
> xfs_free_zone_info(mp->m_zone_info);
> - xfs_mru_cache_destroy(mp->m_zone_cache);
> }
> diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
> index 35e6de3d25ed..4322e26dd99a 100644
> --- a/fs/xfs/xfs_zone_priv.h
> +++ b/fs/xfs/xfs_zone_priv.h
> @@ -44,6 +44,8 @@ struct xfs_open_zone {
> * the life time of an open zone.
> */
> struct xfs_rtgroup *oz_rtg;
> +
> + struct rcu_head oz_rcu;
> };
>
> /*
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] xfs: cache open zone in inode->i_private
2025-10-16 5:17 ` Hans Holmberg
@ 2025-10-16 6:06 ` hch
0 siblings, 0 replies; 3+ messages in thread
From: hch @ 2025-10-16 6:06 UTC (permalink / raw)
To: Hans Holmberg; +Cc: hch, cem@kernel.org, linux-xfs@vger.kernel.org
On Thu, Oct 16, 2025 at 05:17:31AM +0000, Hans Holmberg wrote:
> > + * reused for GC in the meantime. Skip it in that case.
> > + */
> > + rcu_read_lock();
> > + oz = VFS_I(ip)->i_private;
> > + if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref)))
> > + oz = NULL;
>
> We still won't steal any open zones for gc(except at mount time), so
> the keep the assert and comment below in stead?
True. When writing this I thought that a zone pointed to could be
reused as the GC zone, and it could. But the open zone would not
have the oz_is_gc set in that case, but instead we're protected by
it being marked as full inside the oz still.
I'll bring back the old assert and will write extensive comments
explaining all this for v2.
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2025-10-16 6:06 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-15 6:30 [PATCH] xfs: cache open zone in inode->i_private Christoph Hellwig
2025-10-16 5:17 ` Hans Holmberg
2025-10-16 6:06 ` hch
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox