linux-xfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] xfs: cache open zone in inode->i_private
@ 2025-10-17  3:51 Christoph Hellwig
  2025-10-17  5:27 ` Christoph Hellwig
  0 siblings, 1 reply; 6+ messages in thread
From: Christoph Hellwig @ 2025-10-17  3:51 UTC (permalink / raw)
  To: cem; +Cc: hans.holmberg, linux-xfs

The MRU cache for open zones is unfortunately still not ideal, as it can
time out pretty easily when doing heavy I/O to hard disks using up most
or all open zones.  One option would be to just increase the timeout,
but while looking into that I realized we're just better off caching it
indefinitely as there is no real downside to that once we don't hold a
reference to the cached open zone.

So switch the open zone to RCU freeing, and then stash the last used
open zone into inode->i_private.  This helps to significantly reduce
fragmentation by keeping I/O localized to zones for workloads that
write using many open files to HDD.

Fixes: 4e4d52075577 ("xfs: add the zoned space allocator")
Signed-off-by: Christoph Hellwig <hch@lst.de>
---

Changes since v1:
 - keep the is_gc assert and update the comment for it

 fs/xfs/xfs_mount.h      |   1 -
 fs/xfs/xfs_super.c      |   6 ++
 fs/xfs/xfs_zone_alloc.c | 123 +++++++++++++---------------------------
 fs/xfs/xfs_zone_priv.h  |   2 +
 4 files changed, 46 insertions(+), 86 deletions(-)

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f046d1215b04..b871dfde372b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -236,7 +236,6 @@ typedef struct xfs_mount {
 	bool			m_update_sb;	/* sb needs update in mount */
 	unsigned int		m_max_open_zones;
 	unsigned int		m_zonegc_low_space;
-	struct xfs_mru_cache	*m_zone_cache;  /* Inode to open zone cache */
 
 	/* max_atomic_write mount option value */
 	unsigned long long	m_awu_max_bytes;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e85a156dc17d..464ae1e657d9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -786,6 +786,12 @@ xfs_fs_evict_inode(
 
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
+
+	if (IS_ENABLED(CONFIG_XFS_RT) &&
+	    S_ISREG(inode->i_mode) && inode->i_private) {
+		xfs_open_zone_put(inode->i_private);
+		inode->i_private = NULL;
+	}
 }
 
 static void
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index c342595acc3e..e4b2aadaf765 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -26,14 +26,22 @@
 #include "xfs_trace.h"
 #include "xfs_mru_cache.h"
 
+static void
+xfs_open_zone_free_rcu(
+	struct callback_head	*cb)
+{
+	struct xfs_open_zone	*oz = container_of(cb, typeof(*oz), oz_rcu);
+
+	xfs_rtgroup_rele(oz->oz_rtg);
+	kfree(oz);
+}
+
 void
 xfs_open_zone_put(
 	struct xfs_open_zone	*oz)
 {
-	if (atomic_dec_and_test(&oz->oz_ref)) {
-		xfs_rtgroup_rele(oz->oz_rtg);
-		kfree(oz);
-	}
+	if (atomic_dec_and_test(&oz->oz_ref))
+		call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
 }
 
 static inline uint32_t
@@ -745,98 +753,47 @@ xfs_mark_rtg_boundary(
 		ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
 }
 
-/*
- * Cache the last zone written to for an inode so that it is considered first
- * for subsequent writes.
- */
-struct xfs_zone_cache_item {
-	struct xfs_mru_cache_elem	mru;
-	struct xfs_open_zone		*oz;
-};
-
-static inline struct xfs_zone_cache_item *
-xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
-{
-	return container_of(mru, struct xfs_zone_cache_item, mru);
-}
-
-static void
-xfs_zone_cache_free_func(
-	void				*data,
-	struct xfs_mru_cache_elem	*mru)
-{
-	struct xfs_zone_cache_item	*item = xfs_zone_cache_item(mru);
-
-	xfs_open_zone_put(item->oz);
-	kfree(item);
-}
-
 /*
  * Check if we have a cached last open zone available for the inode and
  * if yes return a reference to it.
  */
 static struct xfs_open_zone *
-xfs_cached_zone(
-	struct xfs_mount		*mp,
-	struct xfs_inode		*ip)
+xfs_get_cached_zone(
+	struct xfs_inode	*ip)
 {
-	struct xfs_mru_cache_elem	*mru;
-	struct xfs_open_zone		*oz;
+	struct xfs_open_zone	*oz;
 
-	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
-	if (!mru)
-		return NULL;
-	oz = xfs_zone_cache_item(mru)->oz;
+	rcu_read_lock();
+	oz = VFS_I(ip)->i_private;
 	if (oz) {
 		/*
 		 * GC only steals open zones at mount time, so no GC zones
 		 * should end up in the cache.
 		 */
 		ASSERT(!oz->oz_is_gc);
-		ASSERT(atomic_read(&oz->oz_ref) > 0);
-		atomic_inc(&oz->oz_ref);
+		if (!atomic_inc_not_zero(&oz->oz_ref))
+			oz = NULL;
 	}
-	xfs_mru_cache_done(mp->m_zone_cache);
+	rcu_read_unlock();
+
 	return oz;
 }
 
 /*
- * Update the last used zone cache for a given inode.
- *
- * The caller must have a reference on the open zone.
+ * Try to stash our zone in the inode so that it is reused for future
+ * allocations.
  */
 static void
-xfs_zone_cache_create_association(
-	struct xfs_inode		*ip,
-	struct xfs_open_zone		*oz)
+xfs_set_cached_zone(
+	struct xfs_inode	*ip,
+	struct xfs_open_zone	*oz)
 {
-	struct xfs_mount		*mp = ip->i_mount;
-	struct xfs_zone_cache_item	*item = NULL;
-	struct xfs_mru_cache_elem	*mru;
+	struct xfs_open_zone	*old_oz;
 
-	ASSERT(atomic_read(&oz->oz_ref) > 0);
 	atomic_inc(&oz->oz_ref);
-
-	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
-	if (mru) {
-		/*
-		 * If we have an association already, update it to point to the
-		 * new zone.
-		 */
-		item = xfs_zone_cache_item(mru);
-		xfs_open_zone_put(item->oz);
-		item->oz = oz;
-		xfs_mru_cache_done(mp->m_zone_cache);
-		return;
-	}
-
-	item = kmalloc(sizeof(*item), GFP_KERNEL);
-	if (!item) {
-		xfs_open_zone_put(oz);
-		return;
-	}
-	item->oz = oz;
-	xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
+	old_oz = xchg(&VFS_I(ip)->i_private, oz);
+	if (old_oz)
+		xfs_open_zone_put(old_oz);
 }
 
 static void
@@ -880,15 +837,14 @@ xfs_zone_alloc_and_submit(
 	 * the inode is still associated with a zone and use that if so.
 	 */
 	if (!*oz)
-		*oz = xfs_cached_zone(mp, ip);
+		*oz = xfs_get_cached_zone(ip);
 
 	if (!*oz) {
 select_zone:
 		*oz = xfs_select_zone(mp, write_hint, pack_tight);
 		if (!*oz)
 			goto out_error;
-
-		xfs_zone_cache_create_association(ip, *oz);
+		xfs_set_cached_zone(ip, *oz);
 	}
 
 	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -966,6 +922,12 @@ xfs_free_open_zones(
 		xfs_open_zone_put(oz);
 	}
 	spin_unlock(&zi->zi_open_zones_lock);
+
+	/*
+	 * Wait for all open zones to be freed so that they drop the group
+	 * references:
+	 */
+	rcu_barrier();
 }
 
 struct xfs_init_zones {
@@ -1303,14 +1265,6 @@ xfs_mount_zones(
 	error = xfs_zone_gc_mount(mp);
 	if (error)
 		goto out_free_zone_info;
-
-	/*
-	 * Set up a mru cache to track inode to open zone for data placement
-	 * purposes. The magic values for group count and life time is the
-	 * same as the defaults for file streams, which seems sane enough.
-	 */
-	xfs_mru_cache_create(&mp->m_zone_cache, mp,
-			5000, 10, xfs_zone_cache_free_func);
 	return 0;
 
 out_free_zone_info:
@@ -1324,5 +1278,4 @@ xfs_unmount_zones(
 {
 	xfs_zone_gc_unmount(mp);
 	xfs_free_zone_info(mp->m_zone_info);
-	xfs_mru_cache_destroy(mp->m_zone_cache);
 }
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index 35e6de3d25ed..4322e26dd99a 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -44,6 +44,8 @@ struct xfs_open_zone {
 	 * the life time of an open zone.
 	 */
 	struct xfs_rtgroup	*oz_rtg;
+
+	struct rcu_head		oz_rcu;
 };
 
 /*
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v2] xfs: cache open zone in inode->i_private
@ 2025-10-17  3:55 Christoph Hellwig
  2025-10-18  4:17 ` Damien Le Moal
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Christoph Hellwig @ 2025-10-17  3:55 UTC (permalink / raw)
  To: cem; +Cc: hans.holmberg, linux-xfs

The MRU cache for open zones is unfortunately still not ideal, as it can
time out pretty easily when doing heavy I/O to hard disks using up most
or all open zones.  One option would be to just increase the timeout,
but while looking into that I realized we're just better off caching it
indefinitely as there is no real downside to that once we don't hold a
reference to the cached open zone.

So switch the open zone to RCU freeing, and then stash the last used
open zone into inode->i_private.  This helps to significantly reduce
fragmentation by keeping I/O localized to zones for workloads that
write using many open files to HDD.

Fixes: 4e4d52075577 ("xfs: add the zoned space allocator")
Signed-off-by: Christoph Hellwig <hch@lst.de>
---

Changes since v1:
 - keep the is_gc assert
 - add a lot of comments explaining the caching logic

 fs/xfs/xfs_mount.h      |   1 -
 fs/xfs/xfs_super.c      |   6 ++
 fs/xfs/xfs_zone_alloc.c | 129 ++++++++++++++--------------------------
 fs/xfs/xfs_zone_priv.h  |   2 +
 4 files changed, 53 insertions(+), 85 deletions(-)

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f046d1215b04..b871dfde372b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -236,7 +236,6 @@ typedef struct xfs_mount {
 	bool			m_update_sb;	/* sb needs update in mount */
 	unsigned int		m_max_open_zones;
 	unsigned int		m_zonegc_low_space;
-	struct xfs_mru_cache	*m_zone_cache;  /* Inode to open zone cache */
 
 	/* max_atomic_write mount option value */
 	unsigned long long	m_awu_max_bytes;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e85a156dc17d..464ae1e657d9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -786,6 +786,12 @@ xfs_fs_evict_inode(
 
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
+
+	if (IS_ENABLED(CONFIG_XFS_RT) &&
+	    S_ISREG(inode->i_mode) && inode->i_private) {
+		xfs_open_zone_put(inode->i_private);
+		inode->i_private = NULL;
+	}
 }
 
 static void
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index c342595acc3e..e7e439918f6d 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -26,14 +26,22 @@
 #include "xfs_trace.h"
 #include "xfs_mru_cache.h"
 
+static void
+xfs_open_zone_free_rcu(
+	struct callback_head	*cb)
+{
+	struct xfs_open_zone	*oz = container_of(cb, typeof(*oz), oz_rcu);
+
+	xfs_rtgroup_rele(oz->oz_rtg);
+	kfree(oz);
+}
+
 void
 xfs_open_zone_put(
 	struct xfs_open_zone	*oz)
 {
-	if (atomic_dec_and_test(&oz->oz_ref)) {
-		xfs_rtgroup_rele(oz->oz_rtg);
-		kfree(oz);
-	}
+	if (atomic_dec_and_test(&oz->oz_ref))
+		call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
 }
 
 static inline uint32_t
@@ -745,98 +753,55 @@ xfs_mark_rtg_boundary(
 		ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
 }
 
-/*
- * Cache the last zone written to for an inode so that it is considered first
- * for subsequent writes.
- */
-struct xfs_zone_cache_item {
-	struct xfs_mru_cache_elem	mru;
-	struct xfs_open_zone		*oz;
-};
-
-static inline struct xfs_zone_cache_item *
-xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
-{
-	return container_of(mru, struct xfs_zone_cache_item, mru);
-}
-
-static void
-xfs_zone_cache_free_func(
-	void				*data,
-	struct xfs_mru_cache_elem	*mru)
-{
-	struct xfs_zone_cache_item	*item = xfs_zone_cache_item(mru);
-
-	xfs_open_zone_put(item->oz);
-	kfree(item);
-}
-
 /*
  * Check if we have a cached last open zone available for the inode and
  * if yes return a reference to it.
  */
 static struct xfs_open_zone *
-xfs_cached_zone(
-	struct xfs_mount		*mp,
-	struct xfs_inode		*ip)
+xfs_get_cached_zone(
+	struct xfs_inode	*ip)
 {
-	struct xfs_mru_cache_elem	*mru;
-	struct xfs_open_zone		*oz;
+	struct xfs_open_zone	*oz;
 
-	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
-	if (!mru)
-		return NULL;
-	oz = xfs_zone_cache_item(mru)->oz;
+	rcu_read_lock();
+	oz = VFS_I(ip)->i_private;
 	if (oz) {
 		/*
 		 * GC only steals open zones at mount time, so no GC zones
 		 * should end up in the cache.
 		 */
 		ASSERT(!oz->oz_is_gc);
-		ASSERT(atomic_read(&oz->oz_ref) > 0);
-		atomic_inc(&oz->oz_ref);
+		if (!atomic_inc_not_zero(&oz->oz_ref))
+			oz = NULL;
 	}
-	xfs_mru_cache_done(mp->m_zone_cache);
+	rcu_read_unlock();
+
 	return oz;
 }
 
 /*
- * Update the last used zone cache for a given inode.
+ * Stash our zone in the inode so that it is reused for future allocations.
  *
- * The caller must have a reference on the open zone.
+ * The open_zone structure will be pinned until either the inode is freed or
+ * until the cached open zone is replaced with a different one because the
+ * current one was full when we tried to use it.  This means we keep any
+ * open zone around forever as long as any inode that used it for the last
+ * write is cached, which slightly increases the memory use of cached inodes
+ * that were ever written to, but significantly simplifies the cached zone
+ * lookup.  Because the open_zone is clearly marked as full when all data
+ * in the underlying RTG was written, the caching is always safe.
  */
 static void
-xfs_zone_cache_create_association(
-	struct xfs_inode		*ip,
-	struct xfs_open_zone		*oz)
+xfs_set_cached_zone(
+	struct xfs_inode	*ip,
+	struct xfs_open_zone	*oz)
 {
-	struct xfs_mount		*mp = ip->i_mount;
-	struct xfs_zone_cache_item	*item = NULL;
-	struct xfs_mru_cache_elem	*mru;
+	struct xfs_open_zone	*old_oz;
 
-	ASSERT(atomic_read(&oz->oz_ref) > 0);
 	atomic_inc(&oz->oz_ref);
-
-	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
-	if (mru) {
-		/*
-		 * If we have an association already, update it to point to the
-		 * new zone.
-		 */
-		item = xfs_zone_cache_item(mru);
-		xfs_open_zone_put(item->oz);
-		item->oz = oz;
-		xfs_mru_cache_done(mp->m_zone_cache);
-		return;
-	}
-
-	item = kmalloc(sizeof(*item), GFP_KERNEL);
-	if (!item) {
-		xfs_open_zone_put(oz);
-		return;
-	}
-	item->oz = oz;
-	xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
+	old_oz = xchg(&VFS_I(ip)->i_private, oz);
+	if (old_oz)
+		xfs_open_zone_put(old_oz);
 }
 
 static void
@@ -880,15 +845,14 @@ xfs_zone_alloc_and_submit(
 	 * the inode is still associated with a zone and use that if so.
 	 */
 	if (!*oz)
-		*oz = xfs_cached_zone(mp, ip);
+		*oz = xfs_get_cached_zone(ip);
 
 	if (!*oz) {
 select_zone:
 		*oz = xfs_select_zone(mp, write_hint, pack_tight);
 		if (!*oz)
 			goto out_error;
-
-		xfs_zone_cache_create_association(ip, *oz);
+		xfs_set_cached_zone(ip, *oz);
 	}
 
 	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -966,6 +930,12 @@ xfs_free_open_zones(
 		xfs_open_zone_put(oz);
 	}
 	spin_unlock(&zi->zi_open_zones_lock);
+
+	/*
+	 * Wait for all open zones to be freed so that they drop the group
+	 * references:
+	 */
+	rcu_barrier();
 }
 
 struct xfs_init_zones {
@@ -1303,14 +1273,6 @@ xfs_mount_zones(
 	error = xfs_zone_gc_mount(mp);
 	if (error)
 		goto out_free_zone_info;
-
-	/*
-	 * Set up a mru cache to track inode to open zone for data placement
-	 * purposes. The magic values for group count and life time is the
-	 * same as the defaults for file streams, which seems sane enough.
-	 */
-	xfs_mru_cache_create(&mp->m_zone_cache, mp,
-			5000, 10, xfs_zone_cache_free_func);
 	return 0;
 
 out_free_zone_info:
@@ -1324,5 +1286,4 @@ xfs_unmount_zones(
 {
 	xfs_zone_gc_unmount(mp);
 	xfs_free_zone_info(mp->m_zone_info);
-	xfs_mru_cache_destroy(mp->m_zone_cache);
 }
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index 35e6de3d25ed..4322e26dd99a 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -44,6 +44,8 @@ struct xfs_open_zone {
 	 * the life time of an open zone.
 	 */
 	struct xfs_rtgroup	*oz_rtg;
+
+	struct rcu_head		oz_rcu;
 };
 
 /*
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] xfs: cache open zone in inode->i_private
  2025-10-17  3:51 [PATCH v2] xfs: cache open zone in inode->i_private Christoph Hellwig
@ 2025-10-17  5:27 ` Christoph Hellwig
  0 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2025-10-17  5:27 UTC (permalink / raw)
  To: cem; +Cc: hans.holmberg, linux-xfs

Disregard this version, it was missing the fold of the comment
update.  I thought I had stopped git-send-email before it went out,
but that seems to have failed.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] xfs: cache open zone in inode->i_private
  2025-10-17  3:55 Christoph Hellwig
@ 2025-10-18  4:17 ` Damien Le Moal
  2025-10-20  8:17 ` Hans Holmberg
  2025-10-21  9:37 ` Carlos Maiolino
  2 siblings, 0 replies; 6+ messages in thread
From: Damien Le Moal @ 2025-10-18  4:17 UTC (permalink / raw)
  To: Christoph Hellwig, cem; +Cc: hans.holmberg, linux-xfs

On 10/17/25 12:55, Christoph Hellwig wrote:
> The MRU cache for open zones is unfortunately still not ideal, as it can
> time out pretty easily when doing heavy I/O to hard disks using up most
> or all open zones.  One option would be to just increase the timeout,
> but while looking into that I realized we're just better off caching it
> indefinitely as there is no real downside to that once we don't hold a
> reference to the cache open zone.
> 
> So switch the open zone to RCU freeing, and then stash the last used
> open zone into inode->i_private.  This helps to significantly reduce
> fragmentation by keeping I/O localized to zones for workloads that
> write using many open files to HDD.
> 
> Fixes: 4e4d52075577 ("xfs: add the zoned space allocator")
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Tested-by: Damien Le Moal <dlemoal@kernel.org>

-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] xfs: cache open zone in inode->i_private
  2025-10-17  3:55 Christoph Hellwig
  2025-10-18  4:17 ` Damien Le Moal
@ 2025-10-20  8:17 ` Hans Holmberg
  2025-10-21  9:37 ` Carlos Maiolino
  2 siblings, 0 replies; 6+ messages in thread
From: Hans Holmberg @ 2025-10-20  8:17 UTC (permalink / raw)
  To: hch, cem@kernel.org; +Cc: linux-xfs@vger.kernel.org

On 17/10/2025 05:56, Christoph Hellwig wrote:
> The MRU cache for open zones is unfortunately still not ideal, as it can
> time out pretty easily when doing heavy I/O to hard disks using up most
> or all open zones.  One option would be to just increase the timeout,
> but while looking into that I realized we're just better off caching it
> indefinitely as there is no real downside to that once we don't hold a
> reference to the cache open zone.
> 
> So switch the open zone to RCU freeing, and then stash the last used
> open zone into inode->i_private.  This helps to significantly reduce
> fragmentation by keeping I/O localized to zones for workloads that
> write using many open files to HDD.
> 
> Fixes: 4e4d52075577 ("xfs: add the zoned space allocator")
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> 
> Changes since v1:
>  - keep the is_gc assert
>  - add a lot of comments explaining the caching logic
> 
>  fs/xfs/xfs_mount.h      |   1 -
>  fs/xfs/xfs_super.c      |   6 ++
>  fs/xfs/xfs_zone_alloc.c | 129 ++++++++++++++--------------------------
>  fs/xfs/xfs_zone_priv.h  |   2 +
>  4 files changed, 53 insertions(+), 85 deletions(-)
> 
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index f046d1215b04..b871dfde372b 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -236,7 +236,6 @@ typedef struct xfs_mount {
>  	bool			m_update_sb;	/* sb needs update in mount */
>  	unsigned int		m_max_open_zones;
>  	unsigned int		m_zonegc_low_space;
> -	struct xfs_mru_cache	*m_zone_cache;  /* Inode to open zone cache */
>  
>  	/* max_atomic_write mount option value */
>  	unsigned long long	m_awu_max_bytes;
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index e85a156dc17d..464ae1e657d9 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -786,6 +786,12 @@ xfs_fs_evict_inode(
>  
>  	truncate_inode_pages_final(&inode->i_data);
>  	clear_inode(inode);
> +
> +	if (IS_ENABLED(CONFIG_XFS_RT) &&
> +	    S_ISREG(inode->i_mode) && inode->i_private) {
> +		xfs_open_zone_put(inode->i_private);
> +		inode->i_private = NULL;
> +	}
>  }
>  
>  static void
> diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
> index c342595acc3e..e7e439918f6d 100644
> --- a/fs/xfs/xfs_zone_alloc.c
> +++ b/fs/xfs/xfs_zone_alloc.c
> @@ -26,14 +26,22 @@
>  #include "xfs_trace.h"
>  #include "xfs_mru_cache.h"
>  
> +static void
> +xfs_open_zone_free_rcu(
> +	struct callback_head	*cb)
> +{
> +	struct xfs_open_zone	*oz = container_of(cb, typeof(*oz), oz_rcu);
> +
> +	xfs_rtgroup_rele(oz->oz_rtg);
> +	kfree(oz);
> +}
> +
>  void
>  xfs_open_zone_put(
>  	struct xfs_open_zone	*oz)
>  {
> -	if (atomic_dec_and_test(&oz->oz_ref)) {
> -		xfs_rtgroup_rele(oz->oz_rtg);
> -		kfree(oz);
> -	}
> +	if (atomic_dec_and_test(&oz->oz_ref))
> +		call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
>  }
>  
>  static inline uint32_t
> @@ -745,98 +753,55 @@ xfs_mark_rtg_boundary(
>  		ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
>  }
>  
> -/*
> - * Cache the last zone written to for an inode so that it is considered first
> - * for subsequent writes.
> - */
> -struct xfs_zone_cache_item {
> -	struct xfs_mru_cache_elem	mru;
> -	struct xfs_open_zone		*oz;
> -};
> -
> -static inline struct xfs_zone_cache_item *
> -xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
> -{
> -	return container_of(mru, struct xfs_zone_cache_item, mru);
> -}
> -
> -static void
> -xfs_zone_cache_free_func(
> -	void				*data,
> -	struct xfs_mru_cache_elem	*mru)
> -{
> -	struct xfs_zone_cache_item	*item = xfs_zone_cache_item(mru);
> -
> -	xfs_open_zone_put(item->oz);
> -	kfree(item);
> -}
> -
>  /*
>   * Check if we have a cached last open zone available for the inode and
>   * if yes return a reference to it.
>   */
>  static struct xfs_open_zone *
> -xfs_cached_zone(
> -	struct xfs_mount		*mp,
> -	struct xfs_inode		*ip)
> +xfs_get_cached_zone(
> +	struct xfs_inode	*ip)
>  {
> -	struct xfs_mru_cache_elem	*mru;
> -	struct xfs_open_zone		*oz;
> +	struct xfs_open_zone	*oz;
>  
> -	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
> -	if (!mru)
> -		return NULL;
> -	oz = xfs_zone_cache_item(mru)->oz;
> +	rcu_read_lock();
> +	oz = VFS_I(ip)->i_private;
>  	if (oz) {
>  		/*
>  		 * GC only steals open zones at mount time, so no GC zones
>  		 * should end up in the cache.
>  		 */
>  		ASSERT(!oz->oz_is_gc);
> -		ASSERT(atomic_read(&oz->oz_ref) > 0);
> -		atomic_inc(&oz->oz_ref);
> +		if (!atomic_inc_not_zero(&oz->oz_ref))
> +			oz = NULL;
>  	}
> -	xfs_mru_cache_done(mp->m_zone_cache);
> +	rcu_read_unlock();
> +
>  	return oz;
>  }
>  
>  /*
> - * Update the last used zone cache for a given inode.
> + * Stash our zone in the inode so that is is reused for future allocations.
>   *
> - * The caller must have a reference on the open zone.
> + * The open_zone structure will be pinned until either the inode is freed or
> + * until the cached open zone is replaced with a different one because the
> + * current one was full when we tried to use it.  This means we keep any
> + * open zone around forever as long as any inode that used it for the last
> + * write is cached, which slightly increases the memory use of cached inodes
> + * that were every written to, but significantly simplifies the cached zone
> + * lookup.  Because the open_zone is clearly marked as full when all data
> + * in the underlying RTG was written, the caching is always safe.
>   */
>  static void
> -xfs_zone_cache_create_association(
> -	struct xfs_inode		*ip,
> -	struct xfs_open_zone		*oz)
> +xfs_set_cached_zone(
> +	struct xfs_inode	*ip,
> +	struct xfs_open_zone	*oz)
>  {
> -	struct xfs_mount		*mp = ip->i_mount;
> -	struct xfs_zone_cache_item	*item = NULL;
> -	struct xfs_mru_cache_elem	*mru;
> +	struct xfs_open_zone	*old_oz;
>  
> -	ASSERT(atomic_read(&oz->oz_ref) > 0);
>  	atomic_inc(&oz->oz_ref);
> -
> -	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
> -	if (mru) {
> -		/*
> -		 * If we have an association already, update it to point to the
> -		 * new zone.
> -		 */
> -		item = xfs_zone_cache_item(mru);
> -		xfs_open_zone_put(item->oz);
> -		item->oz = oz;
> -		xfs_mru_cache_done(mp->m_zone_cache);
> -		return;
> -	}
> -
> -	item = kmalloc(sizeof(*item), GFP_KERNEL);
> -	if (!item) {
> -		xfs_open_zone_put(oz);
> -		return;
> -	}
> -	item->oz = oz;
> -	xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
> +	old_oz = xchg(&VFS_I(ip)->i_private, oz);
> +	if (old_oz)
> +		xfs_open_zone_put(old_oz);
>  }
>  
>  static void
> @@ -880,15 +845,14 @@ xfs_zone_alloc_and_submit(
>  	 * the inode is still associated with a zone and use that if so.
>  	 */
>  	if (!*oz)
> -		*oz = xfs_cached_zone(mp, ip);
> +		*oz = xfs_get_cached_zone(ip);
>  
>  	if (!*oz) {
>  select_zone:
>  		*oz = xfs_select_zone(mp, write_hint, pack_tight);
>  		if (!*oz)
>  			goto out_error;
> -
> -		xfs_zone_cache_create_association(ip, *oz);
> +		xfs_set_cached_zone(ip, *oz);
>  	}
>  
>  	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
> @@ -966,6 +930,12 @@ xfs_free_open_zones(
>  		xfs_open_zone_put(oz);
>  	}
>  	spin_unlock(&zi->zi_open_zones_lock);
> +
> +	/*
> +	 * Wait for all open zones to be freed so that they drop the group
> +	 * references:
> +	 */
> +	rcu_barrier();
>  }
>  
>  struct xfs_init_zones {
> @@ -1303,14 +1273,6 @@ xfs_mount_zones(
>  	error = xfs_zone_gc_mount(mp);
>  	if (error)
>  		goto out_free_zone_info;
> -
> -	/*
> -	 * Set up a mru cache to track inode to open zone for data placement
> -	 * purposes. The magic values for group count and life time is the
> -	 * same as the defaults for file streams, which seems sane enough.
> -	 */
> -	xfs_mru_cache_create(&mp->m_zone_cache, mp,
> -			5000, 10, xfs_zone_cache_free_func);
>  	return 0;
>  
>  out_free_zone_info:
> @@ -1324,5 +1286,4 @@ xfs_unmount_zones(
>  {
>  	xfs_zone_gc_unmount(mp);
>  	xfs_free_zone_info(mp->m_zone_info);
> -	xfs_mru_cache_destroy(mp->m_zone_cache);
>  }
> diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
> index 35e6de3d25ed..4322e26dd99a 100644
> --- a/fs/xfs/xfs_zone_priv.h
> +++ b/fs/xfs/xfs_zone_priv.h
> @@ -44,6 +44,8 @@ struct xfs_open_zone {
>  	 * the life time of an open zone.
>  	 */
>  	struct xfs_rtgroup	*oz_rtg;
> +
> +	struct rcu_head		oz_rcu;
>  };
>  
>  /*

Looks good,

Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] xfs: cache open zone in inode->i_private
  2025-10-17  3:55 Christoph Hellwig
  2025-10-18  4:17 ` Damien Le Moal
  2025-10-20  8:17 ` Hans Holmberg
@ 2025-10-21  9:37 ` Carlos Maiolino
  2 siblings, 0 replies; 6+ messages in thread
From: Carlos Maiolino @ 2025-10-21  9:37 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: hans.holmberg, linux-xfs

On Fri, 17 Oct 2025 05:55:41 +0200, Christoph Hellwig wrote:
> The MRU cache for open zones is unfortunately still not ideal, as it can
> time out pretty easily when doing heavy I/O to hard disks using up most
> or all open zones.  One option would be to just increase the timeout,
> but while looking into that I realized we're just better off caching it
> indefinitely as there is no real downside to that once we don't hold a
> reference to the cache open zone.
> 
> [...]

Applied to for-next, thanks!

[1/1] xfs: cache open zone in inode->i_private
      commit: ca3d643a970139f5456f90dd555a0955752d70cb

Best regards,
-- 
Carlos Maiolino <cem@kernel.org>


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-10-21  9:37 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-17  3:51 [PATCH v2] xfs: cache open zone in inode->i_private Christoph Hellwig
2025-10-17  5:27 ` Christoph Hellwig
  -- strict thread matches above, loose matches on Subject: below --
2025-10-17  3:55 Christoph Hellwig
2025-10-18  4:17 ` Damien Le Moal
2025-10-20  8:17 ` Hans Holmberg
2025-10-21  9:37 ` Carlos Maiolino

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).