* [PATCH v2] xfs: cache open zone in inode->i_private
@ 2025-10-17 3:51 Christoph Hellwig
2025-10-17 5:27 ` Christoph Hellwig
0 siblings, 1 reply; 6+ messages in thread
From: Christoph Hellwig @ 2025-10-17 3:51 UTC (permalink / raw)
To: cem; +Cc: hans.holmberg, linux-xfs
The MRU cache for open zones is unfortunately still not ideal, as it can
time out pretty easily when doing heavy I/O to hard disks using up most
or all open zones. One option would be to just increase the timeout,
but while looking into that I realized we're just better off caching it
indefinitely as there is no real downside to that once we don't hold a
reference to the cached open zone.
So switch the open zone to RCU freeing, and then stash the last used
open zone into inode->i_private. This helps to significantly reduce
fragmentation by keeping I/O localized to zones for workloads that
write using many open files to HDD.
Fixes: 4e4d52075577 ("xfs: add the zoned space allocator")
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
Changes since v1:
- keep the is_gc assert and update the comment for it
fs/xfs/xfs_mount.h | 1 -
fs/xfs/xfs_super.c | 6 ++
fs/xfs/xfs_zone_alloc.c | 123 +++++++++++++---------------------------
fs/xfs/xfs_zone_priv.h | 2 +
4 files changed, 46 insertions(+), 86 deletions(-)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f046d1215b04..b871dfde372b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -236,7 +236,6 @@ typedef struct xfs_mount {
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
unsigned int m_zonegc_low_space;
- struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */
/* max_atomic_write mount option value */
unsigned long long m_awu_max_bytes;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e85a156dc17d..464ae1e657d9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -786,6 +786,12 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
+
+ if (IS_ENABLED(CONFIG_XFS_RT) &&
+ S_ISREG(inode->i_mode) && inode->i_private) {
+ xfs_open_zone_put(inode->i_private);
+ inode->i_private = NULL;
+ }
}
static void
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index c342595acc3e..e4b2aadaf765 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -26,14 +26,22 @@
#include "xfs_trace.h"
#include "xfs_mru_cache.h"
+static void
+xfs_open_zone_free_rcu(
+ struct callback_head *cb)
+{
+ struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu);
+
+ xfs_rtgroup_rele(oz->oz_rtg);
+ kfree(oz);
+}
+
void
xfs_open_zone_put(
struct xfs_open_zone *oz)
{
- if (atomic_dec_and_test(&oz->oz_ref)) {
- xfs_rtgroup_rele(oz->oz_rtg);
- kfree(oz);
- }
+ if (atomic_dec_and_test(&oz->oz_ref))
+ call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}
static inline uint32_t
@@ -745,98 +753,47 @@ xfs_mark_rtg_boundary(
ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}
-/*
- * Cache the last zone written to for an inode so that it is considered first
- * for subsequent writes.
- */
-struct xfs_zone_cache_item {
- struct xfs_mru_cache_elem mru;
- struct xfs_open_zone *oz;
-};
-
-static inline struct xfs_zone_cache_item *
-xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
-{
- return container_of(mru, struct xfs_zone_cache_item, mru);
-}
-
-static void
-xfs_zone_cache_free_func(
- void *data,
- struct xfs_mru_cache_elem *mru)
-{
- struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
-
- xfs_open_zone_put(item->oz);
- kfree(item);
-}
-
/*
* Check if we have a cached last open zone available for the inode and
* if yes return a reference to it.
*/
static struct xfs_open_zone *
-xfs_cached_zone(
- struct xfs_mount *mp,
- struct xfs_inode *ip)
+xfs_get_cached_zone(
+ struct xfs_inode *ip)
{
- struct xfs_mru_cache_elem *mru;
- struct xfs_open_zone *oz;
+ struct xfs_open_zone *oz;
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (!mru)
- return NULL;
- oz = xfs_zone_cache_item(mru)->oz;
+ rcu_read_lock();
+ oz = VFS_I(ip)->i_private;
if (oz) {
/*
* GC only steals open zones at mount time, so no GC zones
* should end up in the cache.
*/
ASSERT(!oz->oz_is_gc);
- ASSERT(atomic_read(&oz->oz_ref) > 0);
- atomic_inc(&oz->oz_ref);
+ if (!atomic_inc_not_zero(&oz->oz_ref))
+ oz = NULL;
}
- xfs_mru_cache_done(mp->m_zone_cache);
+ rcu_read_unlock();
+
return oz;
}
/*
- * Update the last used zone cache for a given inode.
- *
- * The caller must have a reference on the open zone.
+ * Try to stash our zone in the inode so that it is reused for future
+ * allocations.
*/
static void
-xfs_zone_cache_create_association(
- struct xfs_inode *ip,
- struct xfs_open_zone *oz)
+xfs_set_cached_zone(
+ struct xfs_inode *ip,
+ struct xfs_open_zone *oz)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_zone_cache_item *item = NULL;
- struct xfs_mru_cache_elem *mru;
+ struct xfs_open_zone *old_oz;
- ASSERT(atomic_read(&oz->oz_ref) > 0);
atomic_inc(&oz->oz_ref);
-
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (mru) {
- /*
- * If we have an association already, update it to point to the
- * new zone.
- */
- item = xfs_zone_cache_item(mru);
- xfs_open_zone_put(item->oz);
- item->oz = oz;
- xfs_mru_cache_done(mp->m_zone_cache);
- return;
- }
-
- item = kmalloc(sizeof(*item), GFP_KERNEL);
- if (!item) {
- xfs_open_zone_put(oz);
- return;
- }
- item->oz = oz;
- xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
+ old_oz = xchg(&VFS_I(ip)->i_private, oz);
+ if (old_oz)
+ xfs_open_zone_put(old_oz);
}
static void
@@ -880,15 +837,14 @@ xfs_zone_alloc_and_submit(
* the inode is still associated with a zone and use that if so.
*/
if (!*oz)
- *oz = xfs_cached_zone(mp, ip);
+ *oz = xfs_get_cached_zone(ip);
if (!*oz) {
select_zone:
*oz = xfs_select_zone(mp, write_hint, pack_tight);
if (!*oz)
goto out_error;
-
- xfs_zone_cache_create_association(ip, *oz);
+ xfs_set_cached_zone(ip, *oz);
}
alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -966,6 +922,12 @@ xfs_free_open_zones(
xfs_open_zone_put(oz);
}
spin_unlock(&zi->zi_open_zones_lock);
+
+ /*
+ * Wait for all open zones to be freed so that they drop the group
+ * references:
+ */
+ rcu_barrier();
}
struct xfs_init_zones {
@@ -1303,14 +1265,6 @@ xfs_mount_zones(
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
-
- /*
- * Set up a mru cache to track inode to open zone for data placement
- * purposes. The magic values for group count and life time is the
- * same as the defaults for file streams, which seems sane enough.
- */
- xfs_mru_cache_create(&mp->m_zone_cache, mp,
- 5000, 10, xfs_zone_cache_free_func);
return 0;
out_free_zone_info:
@@ -1324,5 +1278,4 @@ xfs_unmount_zones(
{
xfs_zone_gc_unmount(mp);
xfs_free_zone_info(mp->m_zone_info);
- xfs_mru_cache_destroy(mp->m_zone_cache);
}
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index 35e6de3d25ed..4322e26dd99a 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -44,6 +44,8 @@ struct xfs_open_zone {
* the life time of an open zone.
*/
struct xfs_rtgroup *oz_rtg;
+
+ struct rcu_head oz_rcu;
};
/*
--
2.47.3
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH v2] xfs: cache open zone in inode->i_private
@ 2025-10-17 3:55 Christoph Hellwig
2025-10-18 4:17 ` Damien Le Moal
` (2 more replies)
0 siblings, 3 replies; 6+ messages in thread
From: Christoph Hellwig @ 2025-10-17 3:55 UTC (permalink / raw)
To: cem; +Cc: hans.holmberg, linux-xfs
The MRU cache for open zones is unfortunately still not ideal, as it can
time out pretty easily when doing heavy I/O to hard disks using up most
or all open zones. One option would be to just increase the timeout,
but while looking into that I realized we're just better off caching it
indefinitely as there is no real downside to that once we don't hold a
reference to the cached open zone.
So switch the open zone to RCU freeing, and then stash the last used
open zone into inode->i_private. This helps to significantly reduce
fragmentation by keeping I/O localized to zones for workloads that
write using many open files to HDD.
Fixes: 4e4d52075577 ("xfs: add the zoned space allocator")
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
Changes since v1:
- keep the is_gc assert
- add a lot of comments explaining the caching logic
fs/xfs/xfs_mount.h | 1 -
fs/xfs/xfs_super.c | 6 ++
fs/xfs/xfs_zone_alloc.c | 129 ++++++++++++++--------------------------
fs/xfs/xfs_zone_priv.h | 2 +
4 files changed, 53 insertions(+), 85 deletions(-)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f046d1215b04..b871dfde372b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -236,7 +236,6 @@ typedef struct xfs_mount {
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
unsigned int m_zonegc_low_space;
- struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */
/* max_atomic_write mount option value */
unsigned long long m_awu_max_bytes;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e85a156dc17d..464ae1e657d9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -786,6 +786,12 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
+
+ if (IS_ENABLED(CONFIG_XFS_RT) &&
+ S_ISREG(inode->i_mode) && inode->i_private) {
+ xfs_open_zone_put(inode->i_private);
+ inode->i_private = NULL;
+ }
}
static void
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index c342595acc3e..e7e439918f6d 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -26,14 +26,22 @@
#include "xfs_trace.h"
#include "xfs_mru_cache.h"
+static void
+xfs_open_zone_free_rcu(
+ struct callback_head *cb)
+{
+ struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu);
+
+ xfs_rtgroup_rele(oz->oz_rtg);
+ kfree(oz);
+}
+
void
xfs_open_zone_put(
struct xfs_open_zone *oz)
{
- if (atomic_dec_and_test(&oz->oz_ref)) {
- xfs_rtgroup_rele(oz->oz_rtg);
- kfree(oz);
- }
+ if (atomic_dec_and_test(&oz->oz_ref))
+ call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}
static inline uint32_t
@@ -745,98 +753,55 @@ xfs_mark_rtg_boundary(
ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}
-/*
- * Cache the last zone written to for an inode so that it is considered first
- * for subsequent writes.
- */
-struct xfs_zone_cache_item {
- struct xfs_mru_cache_elem mru;
- struct xfs_open_zone *oz;
-};
-
-static inline struct xfs_zone_cache_item *
-xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
-{
- return container_of(mru, struct xfs_zone_cache_item, mru);
-}
-
-static void
-xfs_zone_cache_free_func(
- void *data,
- struct xfs_mru_cache_elem *mru)
-{
- struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
-
- xfs_open_zone_put(item->oz);
- kfree(item);
-}
-
/*
* Check if we have a cached last open zone available for the inode and
* if yes return a reference to it.
*/
static struct xfs_open_zone *
-xfs_cached_zone(
- struct xfs_mount *mp,
- struct xfs_inode *ip)
+xfs_get_cached_zone(
+ struct xfs_inode *ip)
{
- struct xfs_mru_cache_elem *mru;
- struct xfs_open_zone *oz;
+ struct xfs_open_zone *oz;
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (!mru)
- return NULL;
- oz = xfs_zone_cache_item(mru)->oz;
+ rcu_read_lock();
+ oz = VFS_I(ip)->i_private;
if (oz) {
/*
* GC only steals open zones at mount time, so no GC zones
* should end up in the cache.
*/
ASSERT(!oz->oz_is_gc);
- ASSERT(atomic_read(&oz->oz_ref) > 0);
- atomic_inc(&oz->oz_ref);
+ if (!atomic_inc_not_zero(&oz->oz_ref))
+ oz = NULL;
}
- xfs_mru_cache_done(mp->m_zone_cache);
+ rcu_read_unlock();
+
return oz;
}
/*
- * Update the last used zone cache for a given inode.
+ * Stash our zone in the inode so that it is reused for future allocations.
*
- * The caller must have a reference on the open zone.
+ * The open_zone structure will be pinned until either the inode is freed or
+ * until the cached open zone is replaced with a different one because the
+ * current one was full when we tried to use it. This means we keep any
+ * open zone around forever as long as any inode that used it for the last
+ * write is cached, which slightly increases the memory use of cached inodes
+ * that were ever written to, but significantly simplifies the cached zone
+ * lookup. Because the open_zone is clearly marked as full when all data
+ * in the underlying RTG was written, the caching is always safe.
*/
static void
-xfs_zone_cache_create_association(
- struct xfs_inode *ip,
- struct xfs_open_zone *oz)
+xfs_set_cached_zone(
+ struct xfs_inode *ip,
+ struct xfs_open_zone *oz)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_zone_cache_item *item = NULL;
- struct xfs_mru_cache_elem *mru;
+ struct xfs_open_zone *old_oz;
- ASSERT(atomic_read(&oz->oz_ref) > 0);
atomic_inc(&oz->oz_ref);
-
- mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
- if (mru) {
- /*
- * If we have an association already, update it to point to the
- * new zone.
- */
- item = xfs_zone_cache_item(mru);
- xfs_open_zone_put(item->oz);
- item->oz = oz;
- xfs_mru_cache_done(mp->m_zone_cache);
- return;
- }
-
- item = kmalloc(sizeof(*item), GFP_KERNEL);
- if (!item) {
- xfs_open_zone_put(oz);
- return;
- }
- item->oz = oz;
- xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
+ old_oz = xchg(&VFS_I(ip)->i_private, oz);
+ if (old_oz)
+ xfs_open_zone_put(old_oz);
}
static void
@@ -880,15 +845,14 @@ xfs_zone_alloc_and_submit(
* the inode is still associated with a zone and use that if so.
*/
if (!*oz)
- *oz = xfs_cached_zone(mp, ip);
+ *oz = xfs_get_cached_zone(ip);
if (!*oz) {
select_zone:
*oz = xfs_select_zone(mp, write_hint, pack_tight);
if (!*oz)
goto out_error;
-
- xfs_zone_cache_create_association(ip, *oz);
+ xfs_set_cached_zone(ip, *oz);
}
alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -966,6 +930,12 @@ xfs_free_open_zones(
xfs_open_zone_put(oz);
}
spin_unlock(&zi->zi_open_zones_lock);
+
+ /*
+ * Wait for all open zones to be freed so that they drop the group
+ * references:
+ */
+ rcu_barrier();
}
struct xfs_init_zones {
@@ -1303,14 +1273,6 @@ xfs_mount_zones(
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
-
- /*
- * Set up a mru cache to track inode to open zone for data placement
- * purposes. The magic values for group count and life time is the
- * same as the defaults for file streams, which seems sane enough.
- */
- xfs_mru_cache_create(&mp->m_zone_cache, mp,
- 5000, 10, xfs_zone_cache_free_func);
return 0;
out_free_zone_info:
@@ -1324,5 +1286,4 @@ xfs_unmount_zones(
{
xfs_zone_gc_unmount(mp);
xfs_free_zone_info(mp->m_zone_info);
- xfs_mru_cache_destroy(mp->m_zone_cache);
}
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index 35e6de3d25ed..4322e26dd99a 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -44,6 +44,8 @@ struct xfs_open_zone {
* the life time of an open zone.
*/
struct xfs_rtgroup *oz_rtg;
+
+ struct rcu_head oz_rcu;
};
/*
--
2.47.3
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH v2] xfs: cache open zone in inode->i_private
2025-10-17 3:51 Christoph Hellwig
@ 2025-10-17 5:27 ` Christoph Hellwig
0 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2025-10-17 5:27 UTC (permalink / raw)
To: cem; +Cc: hans.holmberg, linux-xfs
Disregard this version, it was missing the fold of the comment
update. I thought I had stopped git-send-email before it went out,
but that seems to have failed.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] xfs: cache open zone in inode->i_private
2025-10-17 3:55 [PATCH v2] xfs: cache open zone in inode->i_private Christoph Hellwig
@ 2025-10-18 4:17 ` Damien Le Moal
2025-10-20 8:17 ` Hans Holmberg
2025-10-21 9:37 ` Carlos Maiolino
2 siblings, 0 replies; 6+ messages in thread
From: Damien Le Moal @ 2025-10-18 4:17 UTC (permalink / raw)
To: Christoph Hellwig, cem; +Cc: hans.holmberg, linux-xfs
On 10/17/25 12:55, Christoph Hellwig wrote:
> The MRU cache for open zones is unfortunately still not ideal, as it can
> time out pretty easily when doing heavy I/O to hard disks using up most
> or all open zones. One option would be to just increase the timeout,
> but while looking into that I realized we're just better off caching it
> indefinitely as there is no real downside to that once we don't hold a
> reference to the cached open zone.
>
> So switch the open zone to RCU freeing, and then stash the last used
> open zone into inode->i_private. This helps to significantly reduce
> fragmentation by keeping I/O localized to zones for workloads that
> write using many open files to HDD.
>
> Fixes: 4e4d52075577 ("xfs: add the zoned space allocator")
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Tested-by: Damien Le Moal <dlemoal@kernel.org>
--
Damien Le Moal
Western Digital Research
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] xfs: cache open zone in inode->i_private
2025-10-17 3:55 [PATCH v2] xfs: cache open zone in inode->i_private Christoph Hellwig
2025-10-18 4:17 ` Damien Le Moal
@ 2025-10-20 8:17 ` Hans Holmberg
2025-10-21 9:37 ` Carlos Maiolino
2 siblings, 0 replies; 6+ messages in thread
From: Hans Holmberg @ 2025-10-20 8:17 UTC (permalink / raw)
To: hch, cem@kernel.org; +Cc: linux-xfs@vger.kernel.org
On 17/10/2025 05:56, Christoph Hellwig wrote:
> The MRU cache for open zones is unfortunately still not ideal, as it can
> time out pretty easily when doing heavy I/O to hard disks using up most
> or all open zones. One option would be to just increase the timeout,
> but while looking into that I realized we're just better off caching it
> indefinitely as there is no real downside to that once we don't hold a
> reference to the cached open zone.
>
> So switch the open zone to RCU freeing, and then stash the last used
> open zone into inode->i_private. This helps to significantly reduce
> fragmentation by keeping I/O localized to zones for workloads that
> write using many open files to HDD.
>
> Fixes: 4e4d52075577 ("xfs: add the zoned space allocator")
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>
> Changes since v1:
> - keep the is_gc assert
> - add a lot of comments explaining the caching logic
>
> fs/xfs/xfs_mount.h | 1 -
> fs/xfs/xfs_super.c | 6 ++
> fs/xfs/xfs_zone_alloc.c | 129 ++++++++++++++--------------------------
> fs/xfs/xfs_zone_priv.h | 2 +
> 4 files changed, 53 insertions(+), 85 deletions(-)
>
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index f046d1215b04..b871dfde372b 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -236,7 +236,6 @@ typedef struct xfs_mount {
> bool m_update_sb; /* sb needs update in mount */
> unsigned int m_max_open_zones;
> unsigned int m_zonegc_low_space;
> - struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */
>
> /* max_atomic_write mount option value */
> unsigned long long m_awu_max_bytes;
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index e85a156dc17d..464ae1e657d9 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -786,6 +786,12 @@ xfs_fs_evict_inode(
>
> truncate_inode_pages_final(&inode->i_data);
> clear_inode(inode);
> +
> + if (IS_ENABLED(CONFIG_XFS_RT) &&
> + S_ISREG(inode->i_mode) && inode->i_private) {
> + xfs_open_zone_put(inode->i_private);
> + inode->i_private = NULL;
> + }
> }
>
> static void
> diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
> index c342595acc3e..e7e439918f6d 100644
> --- a/fs/xfs/xfs_zone_alloc.c
> +++ b/fs/xfs/xfs_zone_alloc.c
> @@ -26,14 +26,22 @@
> #include "xfs_trace.h"
> #include "xfs_mru_cache.h"
>
> +static void
> +xfs_open_zone_free_rcu(
> + struct callback_head *cb)
> +{
> + struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu);
> +
> + xfs_rtgroup_rele(oz->oz_rtg);
> + kfree(oz);
> +}
> +
> void
> xfs_open_zone_put(
> struct xfs_open_zone *oz)
> {
> - if (atomic_dec_and_test(&oz->oz_ref)) {
> - xfs_rtgroup_rele(oz->oz_rtg);
> - kfree(oz);
> - }
> + if (atomic_dec_and_test(&oz->oz_ref))
> + call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
> }
>
> static inline uint32_t
> @@ -745,98 +753,55 @@ xfs_mark_rtg_boundary(
> ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
> }
>
> -/*
> - * Cache the last zone written to for an inode so that it is considered first
> - * for subsequent writes.
> - */
> -struct xfs_zone_cache_item {
> - struct xfs_mru_cache_elem mru;
> - struct xfs_open_zone *oz;
> -};
> -
> -static inline struct xfs_zone_cache_item *
> -xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
> -{
> - return container_of(mru, struct xfs_zone_cache_item, mru);
> -}
> -
> -static void
> -xfs_zone_cache_free_func(
> - void *data,
> - struct xfs_mru_cache_elem *mru)
> -{
> - struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
> -
> - xfs_open_zone_put(item->oz);
> - kfree(item);
> -}
> -
> /*
> * Check if we have a cached last open zone available for the inode and
> * if yes return a reference to it.
> */
> static struct xfs_open_zone *
> -xfs_cached_zone(
> - struct xfs_mount *mp,
> - struct xfs_inode *ip)
> +xfs_get_cached_zone(
> + struct xfs_inode *ip)
> {
> - struct xfs_mru_cache_elem *mru;
> - struct xfs_open_zone *oz;
> + struct xfs_open_zone *oz;
>
> - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
> - if (!mru)
> - return NULL;
> - oz = xfs_zone_cache_item(mru)->oz;
> + rcu_read_lock();
> + oz = VFS_I(ip)->i_private;
> if (oz) {
> /*
> * GC only steals open zones at mount time, so no GC zones
> * should end up in the cache.
> */
> ASSERT(!oz->oz_is_gc);
> - ASSERT(atomic_read(&oz->oz_ref) > 0);
> - atomic_inc(&oz->oz_ref);
> + if (!atomic_inc_not_zero(&oz->oz_ref))
> + oz = NULL;
> }
> - xfs_mru_cache_done(mp->m_zone_cache);
> + rcu_read_unlock();
> +
> return oz;
> }
>
> /*
> - * Update the last used zone cache for a given inode.
> + * Stash our zone in the inode so that it is reused for future allocations.
> *
> - * The caller must have a reference on the open zone.
> + * The open_zone structure will be pinned until either the inode is freed or
> + * until the cached open zone is replaced with a different one because the
> + * current one was full when we tried to use it. This means we keep any
> + * open zone around forever as long as any inode that used it for the last
> + * write is cached, which slightly increases the memory use of cached inodes
> + * that were every written to, but significantly simplifies the cached zone
> + * lookup. Because the open_zone is clearly marked as full when all data
> + * in the underlying RTG was written, the caching is always safe.
> */
> static void
> -xfs_zone_cache_create_association(
> - struct xfs_inode *ip,
> - struct xfs_open_zone *oz)
> +xfs_set_cached_zone(
> + struct xfs_inode *ip,
> + struct xfs_open_zone *oz)
> {
> - struct xfs_mount *mp = ip->i_mount;
> - struct xfs_zone_cache_item *item = NULL;
> - struct xfs_mru_cache_elem *mru;
> + struct xfs_open_zone *old_oz;
>
> - ASSERT(atomic_read(&oz->oz_ref) > 0);
> atomic_inc(&oz->oz_ref);
> -
> - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
> - if (mru) {
> - /*
> - * If we have an association already, update it to point to the
> - * new zone.
> - */
> - item = xfs_zone_cache_item(mru);
> - xfs_open_zone_put(item->oz);
> - item->oz = oz;
> - xfs_mru_cache_done(mp->m_zone_cache);
> - return;
> - }
> -
> - item = kmalloc(sizeof(*item), GFP_KERNEL);
> - if (!item) {
> - xfs_open_zone_put(oz);
> - return;
> - }
> - item->oz = oz;
> - xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
> + old_oz = xchg(&VFS_I(ip)->i_private, oz);
> + if (old_oz)
> + xfs_open_zone_put(old_oz);
> }
>
> static void
> @@ -880,15 +845,14 @@ xfs_zone_alloc_and_submit(
> * the inode is still associated with a zone and use that if so.
> */
> if (!*oz)
> - *oz = xfs_cached_zone(mp, ip);
> + *oz = xfs_get_cached_zone(ip);
>
> if (!*oz) {
> select_zone:
> *oz = xfs_select_zone(mp, write_hint, pack_tight);
> if (!*oz)
> goto out_error;
> -
> - xfs_zone_cache_create_association(ip, *oz);
> + xfs_set_cached_zone(ip, *oz);
> }
>
> alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
> @@ -966,6 +930,12 @@ xfs_free_open_zones(
> xfs_open_zone_put(oz);
> }
> spin_unlock(&zi->zi_open_zones_lock);
> +
> + /*
> + * Wait for all open zones to be freed so that they drop the group
> + * references:
> + */
> + rcu_barrier();
> }
>
> struct xfs_init_zones {
> @@ -1303,14 +1273,6 @@ xfs_mount_zones(
> error = xfs_zone_gc_mount(mp);
> if (error)
> goto out_free_zone_info;
> -
> - /*
> - * Set up a mru cache to track inode to open zone for data placement
> - * purposes. The magic values for group count and life time is the
> - * same as the defaults for file streams, which seems sane enough.
> - */
> - xfs_mru_cache_create(&mp->m_zone_cache, mp,
> - 5000, 10, xfs_zone_cache_free_func);
> return 0;
>
> out_free_zone_info:
> @@ -1324,5 +1286,4 @@ xfs_unmount_zones(
> {
> xfs_zone_gc_unmount(mp);
> xfs_free_zone_info(mp->m_zone_info);
> - xfs_mru_cache_destroy(mp->m_zone_cache);
> }
> diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
> index 35e6de3d25ed..4322e26dd99a 100644
> --- a/fs/xfs/xfs_zone_priv.h
> +++ b/fs/xfs/xfs_zone_priv.h
> @@ -44,6 +44,8 @@ struct xfs_open_zone {
> * the life time of an open zone.
> */
> struct xfs_rtgroup *oz_rtg;
> +
> + struct rcu_head oz_rcu;
> };
>
> /*
Looks good,
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] xfs: cache open zone in inode->i_private
2025-10-17 3:55 [PATCH v2] xfs: cache open zone in inode->i_private Christoph Hellwig
2025-10-18 4:17 ` Damien Le Moal
2025-10-20 8:17 ` Hans Holmberg
@ 2025-10-21 9:37 ` Carlos Maiolino
2 siblings, 0 replies; 6+ messages in thread
From: Carlos Maiolino @ 2025-10-21 9:37 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: hans.holmberg, linux-xfs
On Fri, 17 Oct 2025 05:55:41 +0200, Christoph Hellwig wrote:
> The MRU cache for open zones is unfortunately still not ideal, as it can
> time out pretty easily when doing heavy I/O to hard disks using up most
> or all open zones. One option would be to just increase the timeout,
> but while looking into that I realized we're just better off caching it
> indefinitely as there is no real downside to that once we don't hold a
> reference to the cached open zone.
>
> [...]
Applied to for-next, thanks!
[1/1] xfs: cache open zone in inode->i_private
commit: ca3d643a970139f5456f90dd555a0955752d70cb
Best regards,
--
Carlos Maiolino <cem@kernel.org>
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2025-10-21 9:37 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-17 3:55 [PATCH v2] xfs: cache open zone in inode->i_private Christoph Hellwig
2025-10-18 4:17 ` Damien Le Moal
2025-10-20 8:17 ` Hans Holmberg
2025-10-21 9:37 ` Carlos Maiolino
-- strict thread matches above, loose matches on Subject: below --
2025-10-17 3:51 Christoph Hellwig
2025-10-17 5:27 ` Christoph Hellwig
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).