public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-02-26 15:51 use folios and vmalloc for buffer cache " Christoph Hellwig
@ 2025-02-26 15:51 ` Christoph Hellwig
  2025-02-26 18:02   ` Darrick J. Wong
  0 siblings, 1 reply; 34+ messages in thread
From: Christoph Hellwig @ 2025-02-26 15:51 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

The fallback buffer allocation path currently open codes a suboptimal
version of vmalloc to allocate pages that are then mapped into
vmalloc space.  Switch to using vmalloc instead, which uses all the
optimizations in the common vmalloc code, and removes the need to
track the backing pages in the xfs_buf structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c     | 209 ++++++++++---------------------------------
 fs/xfs/xfs_buf.h     |   7 --
 fs/xfs/xfs_buf_mem.c |  11 +--
 3 files changed, 49 insertions(+), 178 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 15087f24372f..fb127589c6b4 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -60,13 +60,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
 	return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
 }
 
-static inline int
-xfs_buf_vmap_len(
-	struct xfs_buf	*bp)
-{
-	return (bp->b_page_count * PAGE_SIZE);
-}
-
 /*
  * Bump the I/O in flight count on the buftarg if we haven't yet done so for
  * this buffer. The count is incremented once per buffer (per hold cycle)
@@ -248,30 +241,6 @@ _xfs_buf_alloc(
 	return 0;
 }
 
-static void
-xfs_buf_free_pages(
-	struct xfs_buf	*bp)
-{
-	uint		i;
-
-	ASSERT(bp->b_flags & _XBF_PAGES);
-
-	if (is_vmalloc_addr(bp->b_addr))
-		vm_unmap_ram(bp->b_addr, bp->b_page_count);
-
-	for (i = 0; i < bp->b_page_count; i++) {
-		if (bp->b_pages[i])
-			folio_put(page_folio(bp->b_pages[i]));
-	}
-	mm_account_reclaimed_pages(
-			DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE));
-
-	if (bp->b_pages != bp->b_page_array)
-		kfree(bp->b_pages);
-	bp->b_pages = NULL;
-	bp->b_flags &= ~_XBF_PAGES;
-}
-
 static void
 xfs_buf_free_callback(
 	struct callback_head	*cb)
@@ -286,16 +255,23 @@ static void
 xfs_buf_free(
 	struct xfs_buf		*bp)
 {
+	unsigned int		size = BBTOB(bp->b_length);
+
 	trace_xfs_buf_free(bp, _RET_IP_);
 
 	ASSERT(list_empty(&bp->b_lru));
 
+	if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
+		mm_account_reclaimed_pages(DIV_ROUND_UP(size, PAGE_SHIFT));
+
 	if (xfs_buftarg_is_mem(bp->b_target))
 		xmbuf_unmap_page(bp);
-	else if (bp->b_flags & _XBF_PAGES)
-		xfs_buf_free_pages(bp);
+	else if (is_vmalloc_addr(bp->b_addr))
+		vfree(bp->b_addr);
 	else if (bp->b_flags & _XBF_KMEM)
 		kfree(bp->b_addr);
+	else
+		folio_put(virt_to_folio(bp->b_addr));
 
 	call_rcu(&bp->b_rcu, xfs_buf_free_callback);
 }
@@ -324,9 +300,6 @@ xfs_buf_alloc_kmem(
 		bp->b_addr = NULL;
 		return -ENOMEM;
 	}
-	bp->b_pages = bp->b_page_array;
-	bp->b_pages[0] = kmem_to_page(bp->b_addr);
-	bp->b_page_count = 1;
 	bp->b_flags |= _XBF_KMEM;
 	return 0;
 }
@@ -347,9 +320,9 @@ xfs_buf_alloc_kmem(
  * by the rest of the code - the buffer memory spans a single contiguous memory
  * region that we don't have to map and unmap to access the data directly.
  *
- * The third type of buffer is the multi-page buffer. These are always made
- * up of single pages so that they can be fed to vmap_ram() to return a
- * contiguous memory region we can access the data through.
+ * The third type of buffer is the vmalloc()d buffer. This provides the buffer
+ * with the required contiguous memory region but backed by discontiguous
+ * physical pages.
  */
 static int
 xfs_buf_alloc_backing_mem(
@@ -359,7 +332,6 @@ xfs_buf_alloc_backing_mem(
 	size_t		size = BBTOB(bp->b_length);
 	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
 	struct folio	*folio;
-	long		filled = 0;
 
 	if (xfs_buftarg_is_mem(bp->b_target))
 		return xmbuf_map_page(bp);
@@ -412,98 +384,18 @@ xfs_buf_alloc_backing_mem(
 		goto fallback;
 	}
 	bp->b_addr = folio_address(folio);
-	bp->b_page_array[0] = &folio->page;
-	bp->b_pages = bp->b_page_array;
-	bp->b_page_count = 1;
-	bp->b_flags |= _XBF_PAGES;
 	return 0;
 
 fallback:
-	/* Fall back to allocating an array of single page folios. */
-	bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
-	if (bp->b_page_count <= XB_PAGES) {
-		bp->b_pages = bp->b_page_array;
-	} else {
-		bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
-					gfp_mask);
-		if (!bp->b_pages)
-			return -ENOMEM;
-	}
-	bp->b_flags |= _XBF_PAGES;
-
-	/*
-	 * Bulk filling of pages can take multiple calls. Not filling the entire
-	 * array is not an allocation failure, so don't back off if we get at
-	 * least one extra page.
-	 */
 	for (;;) {
-		long	last = filled;
-
-		filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
-					  bp->b_pages);
-		if (filled == bp->b_page_count) {
-			XFS_STATS_INC(bp->b_mount, xb_page_found);
+		bp->b_addr = __vmalloc(size, gfp_mask);
+		if (bp->b_addr)
 			break;
-		}
-
-		if (filled != last)
-			continue;
-
-		if (flags & XBF_READ_AHEAD) {
-			xfs_buf_free_pages(bp);
+		if (flags & XBF_READ_AHEAD)
 			return -ENOMEM;
-		}
-
 		XFS_STATS_INC(bp->b_mount, xb_page_retries);
 		memalloc_retry_wait(gfp_mask);
 	}
-	return 0;
-}
-
-/*
- *	Map buffer into kernel address-space if necessary.
- */
-STATIC int
-_xfs_buf_map_pages(
-	struct xfs_buf		*bp,
-	xfs_buf_flags_t		flags)
-{
-	ASSERT(bp->b_flags & _XBF_PAGES);
-	if (bp->b_page_count == 1) {
-		/* A single page buffer is always mappable */
-		bp->b_addr = page_address(bp->b_pages[0]);
-	} else {
-		int retried = 0;
-		unsigned nofs_flag;
-
-		/*
-		 * vm_map_ram() will allocate auxiliary structures (e.g.
-		 * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
-		 * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
-		 * from the same call site that can be run from both above and
-		 * below memory reclaim causes lockdep false positives. Hence we
-		 * always need to force this allocation to nofs context because
-		 * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
-		 * prevent false positive lockdep reports.
-		 *
-		 * XXX(dgc): I think dquot reclaim is the only place we can get
-		 * to this function from memory reclaim context now. If we fix
-		 * that like we've fixed inode reclaim to avoid writeback from
-		 * reclaim, this nofs wrapping can go away.
-		 */
-		nofs_flag = memalloc_nofs_save();
-		do {
-			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-						-1);
-			if (bp->b_addr)
-				break;
-			vm_unmap_aliases();
-		} while (retried++ <= 1);
-		memalloc_nofs_restore(nofs_flag);
-
-		if (!bp->b_addr)
-			return -ENOMEM;
-	}
 
 	return 0;
 }
@@ -623,7 +515,7 @@ xfs_buf_find_lock(
 			return -ENOENT;
 		}
 		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+		bp->b_flags &= _XBF_KMEM;
 		bp->b_ops = NULL;
 	}
 	return 0;
@@ -809,18 +701,6 @@ xfs_buf_get_map(
 			xfs_perag_put(pag);
 	}
 
-	/* We do not hold a perag reference anymore. */
-	if (!bp->b_addr) {
-		error = _xfs_buf_map_pages(bp, flags);
-		if (unlikely(error)) {
-			xfs_warn_ratelimited(btp->bt_mount,
-				"%s: failed to map %u pages", __func__,
-				bp->b_page_count);
-			xfs_buf_relse(bp);
-			return error;
-		}
-	}
-
 	/*
 	 * Clear b_error if this is a lookup from a caller that doesn't expect
 	 * valid data to be found in the buffer.
@@ -1061,14 +941,6 @@ xfs_buf_get_uncached(
 	if (error)
 		goto fail_free_buf;
 
-	if (!bp->b_addr)
-		error = _xfs_buf_map_pages(bp, 0);
-	if (unlikely(error)) {
-		xfs_warn(target->bt_mount,
-			"%s: failed to map pages", __func__);
-		goto fail_free_buf;
-	}
-
 	trace_xfs_buf_get_uncached(bp, _RET_IP_);
 	*bpp = bp;
 	return 0;
@@ -1409,7 +1281,8 @@ xfs_buf_ioend(
 	if (bp->b_flags & XBF_READ) {
 		if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
 			invalidate_kernel_vmap_range(bp->b_addr,
-					xfs_buf_vmap_len(bp));
+					DIV_ROUND_UP(BBTOB(bp->b_length),
+							PAGE_SIZE));
 		if (!bp->b_error && bp->b_ops)
 			bp->b_ops->verify_read(bp);
 		if (!bp->b_error)
@@ -1561,29 +1434,43 @@ static void
 xfs_buf_submit_bio(
 	struct xfs_buf		*bp)
 {
-	unsigned int		size = BBTOB(bp->b_length);
-	unsigned int		map = 0, p;
+	unsigned int		map = 0;
 	struct blk_plug		plug;
 	struct bio		*bio;
 
-	bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
-			xfs_buf_bio_op(bp), GFP_NOIO);
-	bio->bi_private = bp;
-	bio->bi_end_io = xfs_buf_bio_end_io;
+	if (is_vmalloc_addr(bp->b_addr)) {
+		unsigned int	size = BBTOB(bp->b_length);
+		unsigned int	alloc_size = DIV_ROUND_UP(size, PAGE_SIZE);
+		void		*data = bp->b_addr;
 
-	if (bp->b_page_count == 1) {
-		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
-				offset_in_page(bp->b_addr));
-	} else {
-		for (p = 0; p < bp->b_page_count; p++)
-			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
-		bio->bi_iter.bi_size = size; /* limit to the actual size used */
+		bio = bio_alloc(bp->b_target->bt_bdev, size >> PAGE_SHIFT,
+				xfs_buf_bio_op(bp), GFP_NOIO);
+
+		do {
+			unsigned int	len = min(size, PAGE_SIZE);
+
+			ASSERT(offset_in_page(data) == 0);
+			__bio_add_page(bio, vmalloc_to_page(data), len, 0);
+			data += len;
+			size -= len;
+		} while (size);
 
-		if (is_vmalloc_addr(bp->b_addr))
-			flush_kernel_vmap_range(bp->b_addr,
-					xfs_buf_vmap_len(bp));
+		flush_kernel_vmap_range(bp->b_addr, alloc_size);
+	} else {
+		/*
+		 * Single folio or slab allocation.  Must be contigous and thus
+		 * only a single bvec is needed.
+		 */
+		bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
+				GFP_NOIO);
+		__bio_add_page(bio, virt_to_page(bp->b_addr),
+				BBTOB(bp->b_length),
+				offset_in_page(bp->b_addr));
 	}
 
+	bio->bi_private = bp;
+	bio->bi_end_io = xfs_buf_bio_end_io;
+
 	/*
 	 * If there is more than one map segment, split out a new bio for each
 	 * map except of the last one.  The last map is handled by the
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 57faed82e93c..3089e5d5f042 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -37,7 +37,6 @@ struct xfs_buf;
 #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
 
 /* flags used only internally */
-#define _XBF_PAGES	 (1u << 20)/* backed by refcounted pages */
 #define _XBF_KMEM	 (1u << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q	 (1u << 22)/* buffer on a delwri queue */
 
@@ -63,7 +62,6 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_STALE,		"STALE" }, \
 	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
 	{ _XBF_LOGRECOVERY,	"LOG_RECOVERY" }, \
-	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_KMEM,		"KMEM" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
 	/* The following interface flags should never be set */ \
@@ -125,8 +123,6 @@ struct xfs_buftarg {
 	struct xfs_buf_cache	bt_cache[];
 };
 
-#define XB_PAGES	2
-
 struct xfs_buf_map {
 	xfs_daddr_t		bm_bn;	/* block number for I/O */
 	int			bm_len;	/* size of I/O */
@@ -188,13 +184,10 @@ struct xfs_buf {
 	struct xfs_buf_log_item	*b_log_item;
 	struct list_head	b_li_list;	/* Log items list head */
 	struct xfs_trans	*b_transp;
-	struct page		**b_pages;	/* array of page pointers */
-	struct page		*b_page_array[XB_PAGES]; /* inline pages */
 	struct xfs_buf_map	*b_maps;	/* compound buffer map */
 	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
 	int			b_map_count;
 	atomic_t		b_pin_count;	/* pin count */
-	unsigned int		b_page_count;	/* size of page array */
 	int			b_error;	/* error code on I/O */
 	void			(*b_iodone)(struct xfs_buf *bp);
 
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 07bebbfb16ee..e2f6c5524771 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -169,9 +169,6 @@ xmbuf_map_page(
 	unlock_page(page);
 
 	bp->b_addr = page_address(page);
-	bp->b_pages = bp->b_page_array;
-	bp->b_pages[0] = page;
-	bp->b_page_count = 1;
 	return 0;
 }
 
@@ -180,16 +177,10 @@ void
 xmbuf_unmap_page(
 	struct xfs_buf		*bp)
 {
-	struct page		*page = bp->b_pages[0];
-
 	ASSERT(xfs_buftarg_is_mem(bp->b_target));
 
-	put_page(page);
-
+	put_page(virt_to_page(bp->b_addr));
 	bp->b_addr = NULL;
-	bp->b_pages[0] = NULL;
-	bp->b_pages = NULL;
-	bp->b_page_count = 0;
 }
 
 /* Is this a valid daddr within the buftarg? */
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-02-26 15:51 ` [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer " Christoph Hellwig
@ 2025-02-26 18:02   ` Darrick J. Wong
  2025-03-04 14:10     ` Christoph Hellwig
  0 siblings, 1 reply; 34+ messages in thread
From: Darrick J. Wong @ 2025-02-26 18:02 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Carlos Maiolino, Dave Chinner, linux-xfs

On Wed, Feb 26, 2025 at 07:51:38AM -0800, Christoph Hellwig wrote:
> The fallback buffer allocation path currently open codes a suboptimal
> version of vmalloc to allocate pages that are then mapped into
> vmalloc space.  Switch to using vmalloc instead, which uses all the
> optimizations in the common vmalloc code, and removes the need to
> track the backing pages in the xfs_buf structure.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_buf.c     | 209 ++++++++++---------------------------------
>  fs/xfs/xfs_buf.h     |   7 --
>  fs/xfs/xfs_buf_mem.c |  11 +--
>  3 files changed, 49 insertions(+), 178 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 15087f24372f..fb127589c6b4 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c

<snip>

> @@ -412,98 +384,18 @@ xfs_buf_alloc_backing_mem(
>  		goto fallback;
>  	}
>  	bp->b_addr = folio_address(folio);
> -	bp->b_page_array[0] = &folio->page;
> -	bp->b_pages = bp->b_page_array;
> -	bp->b_page_count = 1;
> -	bp->b_flags |= _XBF_PAGES;
>  	return 0;
>  
>  fallback:
> -	/* Fall back to allocating an array of single page folios. */
> -	bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
> -	if (bp->b_page_count <= XB_PAGES) {
> -		bp->b_pages = bp->b_page_array;
> -	} else {
> -		bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
> -					gfp_mask);
> -		if (!bp->b_pages)
> -			return -ENOMEM;
> -	}
> -	bp->b_flags |= _XBF_PAGES;
> -
> -	/*
> -	 * Bulk filling of pages can take multiple calls. Not filling the entire
> -	 * array is not an allocation failure, so don't back off if we get at
> -	 * least one extra page.
> -	 */
>  	for (;;) {
> -		long	last = filled;
> -
> -		filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
> -					  bp->b_pages);
> -		if (filled == bp->b_page_count) {
> -			XFS_STATS_INC(bp->b_mount, xb_page_found);
> +		bp->b_addr = __vmalloc(size, gfp_mask);
> +		if (bp->b_addr)
>  			break;
> -		}
> -
> -		if (filled != last)
> -			continue;
> -
> -		if (flags & XBF_READ_AHEAD) {
> -			xfs_buf_free_pages(bp);
> +		if (flags & XBF_READ_AHEAD)
>  			return -ENOMEM;
> -		}
> -
>  		XFS_STATS_INC(bp->b_mount, xb_page_retries);
>  		memalloc_retry_wait(gfp_mask);
>  	}
> -	return 0;
> -}
> -
> -/*
> - *	Map buffer into kernel address-space if necessary.
> - */
> -STATIC int
> -_xfs_buf_map_pages(
> -	struct xfs_buf		*bp,
> -	xfs_buf_flags_t		flags)
> -{
> -	ASSERT(bp->b_flags & _XBF_PAGES);
> -	if (bp->b_page_count == 1) {
> -		/* A single page buffer is always mappable */
> -		bp->b_addr = page_address(bp->b_pages[0]);
> -	} else {
> -		int retried = 0;
> -		unsigned nofs_flag;
> -
> -		/*
> -		 * vm_map_ram() will allocate auxiliary structures (e.g.
> -		 * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
> -		 * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
> -		 * from the same call site that can be run from both above and
> -		 * below memory reclaim causes lockdep false positives. Hence we
> -		 * always need to force this allocation to nofs context because
> -		 * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
> -		 * prevent false positive lockdep reports.
> -		 *
> -		 * XXX(dgc): I think dquot reclaim is the only place we can get
> -		 * to this function from memory reclaim context now. If we fix
> -		 * that like we've fixed inode reclaim to avoid writeback from
> -		 * reclaim, this nofs wrapping can go away.

Heh, I should've got rid of this comment when I added the code pinning
dquot buffers to the dquot log item at transaction commit/quotacheck
dirty time.

> -		 */
> -		nofs_flag = memalloc_nofs_save();
> -		do {
> -			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
> -						-1);
> -			if (bp->b_addr)
> -				break;
> -			vm_unmap_aliases();
> -		} while (retried++ <= 1);
> -		memalloc_nofs_restore(nofs_flag);
> -
> -		if (!bp->b_addr)
> -			return -ENOMEM;
> -	}
>  
>  	return 0;
>  }
> @@ -1409,7 +1281,8 @@ xfs_buf_ioend(
>  	if (bp->b_flags & XBF_READ) {
>  		if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
>  			invalidate_kernel_vmap_range(bp->b_addr,
> -					xfs_buf_vmap_len(bp));
> +					DIV_ROUND_UP(BBTOB(bp->b_length),
> +							PAGE_SIZE));

The second argument to invalidate_kernel_vmap_range is the number of
bytes, right?  Isn't this BBTOB() without the DIV_ROUND_UP?  Or do you
actually want roundup(BBTOB(b_length), PAGE_SIZE) here?

>  		if (!bp->b_error && bp->b_ops)
>  			bp->b_ops->verify_read(bp);
>  		if (!bp->b_error)
> @@ -1561,29 +1434,43 @@ static void
>  xfs_buf_submit_bio(
>  	struct xfs_buf		*bp)
>  {
> -	unsigned int		size = BBTOB(bp->b_length);
> -	unsigned int		map = 0, p;
> +	unsigned int		map = 0;
>  	struct blk_plug		plug;
>  	struct bio		*bio;
>  
> -	bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
> -			xfs_buf_bio_op(bp), GFP_NOIO);
> -	bio->bi_private = bp;
> -	bio->bi_end_io = xfs_buf_bio_end_io;
> +	if (is_vmalloc_addr(bp->b_addr)) {
> +		unsigned int	size = BBTOB(bp->b_length);
> +		unsigned int	alloc_size = DIV_ROUND_UP(size, PAGE_SIZE);
> +		void		*data = bp->b_addr;
>  
> -	if (bp->b_page_count == 1) {
> -		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
> -				offset_in_page(bp->b_addr));
> -	} else {
> -		for (p = 0; p < bp->b_page_count; p++)
> -			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
> -		bio->bi_iter.bi_size = size; /* limit to the actual size used */
> +		bio = bio_alloc(bp->b_target->bt_bdev, size >> PAGE_SHIFT,

Is the second argument (size >> PAGE_SHIFT) supposed to be the number of
pages that we're going to __bio_add_page to the bio?

In which case, shouldn't it be alloc_size ?

> +				xfs_buf_bio_op(bp), GFP_NOIO);
> +
> +		do {
> +			unsigned int	len = min(size, PAGE_SIZE);
> +
> +			ASSERT(offset_in_page(data) == 0);
> +			__bio_add_page(bio, vmalloc_to_page(data), len, 0);
> +			data += len;
> +			size -= len;
> +		} while (size);
>  
> -		if (is_vmalloc_addr(bp->b_addr))
> -			flush_kernel_vmap_range(bp->b_addr,
> -					xfs_buf_vmap_len(bp));
> +		flush_kernel_vmap_range(bp->b_addr, alloc_size);

...and this one is roundup(size, PAGE_SIZE) isn't it?

> +	} else {
> +		/*
> +		 * Single folio or slab allocation.  Must be contigous and thus

s/contigous/contiguous/

--D

> +		 * only a single bvec is needed.
> +		 */
> +		bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
> +				GFP_NOIO);
> +		__bio_add_page(bio, virt_to_page(bp->b_addr),
> +				BBTOB(bp->b_length),
> +				offset_in_page(bp->b_addr));
>  	}
>  
> +	bio->bi_private = bp;
> +	bio->bi_end_io = xfs_buf_bio_end_io;
> +
>  	/*
>  	 * If there is more than one map segment, split out a new bio for each
>  	 * map except of the last one.  The last map is handled by the
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index 57faed82e93c..3089e5d5f042 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -37,7 +37,6 @@ struct xfs_buf;
>  #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
>  
>  /* flags used only internally */
> -#define _XBF_PAGES	 (1u << 20)/* backed by refcounted pages */
>  #define _XBF_KMEM	 (1u << 21)/* backed by heap memory */
>  #define _XBF_DELWRI_Q	 (1u << 22)/* buffer on a delwri queue */
>  
> @@ -63,7 +62,6 @@ typedef unsigned int xfs_buf_flags_t;
>  	{ XBF_STALE,		"STALE" }, \
>  	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
>  	{ _XBF_LOGRECOVERY,	"LOG_RECOVERY" }, \
> -	{ _XBF_PAGES,		"PAGES" }, \
>  	{ _XBF_KMEM,		"KMEM" }, \
>  	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
>  	/* The following interface flags should never be set */ \
> @@ -125,8 +123,6 @@ struct xfs_buftarg {
>  	struct xfs_buf_cache	bt_cache[];
>  };
>  
> -#define XB_PAGES	2
> -
>  struct xfs_buf_map {
>  	xfs_daddr_t		bm_bn;	/* block number for I/O */
>  	int			bm_len;	/* size of I/O */
> @@ -188,13 +184,10 @@ struct xfs_buf {
>  	struct xfs_buf_log_item	*b_log_item;
>  	struct list_head	b_li_list;	/* Log items list head */
>  	struct xfs_trans	*b_transp;
> -	struct page		**b_pages;	/* array of page pointers */
> -	struct page		*b_page_array[XB_PAGES]; /* inline pages */
>  	struct xfs_buf_map	*b_maps;	/* compound buffer map */
>  	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
>  	int			b_map_count;
>  	atomic_t		b_pin_count;	/* pin count */
> -	unsigned int		b_page_count;	/* size of page array */
>  	int			b_error;	/* error code on I/O */
>  	void			(*b_iodone)(struct xfs_buf *bp);
>  
> diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> index 07bebbfb16ee..e2f6c5524771 100644
> --- a/fs/xfs/xfs_buf_mem.c
> +++ b/fs/xfs/xfs_buf_mem.c
> @@ -169,9 +169,6 @@ xmbuf_map_page(
>  	unlock_page(page);
>  
>  	bp->b_addr = page_address(page);
> -	bp->b_pages = bp->b_page_array;
> -	bp->b_pages[0] = page;
> -	bp->b_page_count = 1;
>  	return 0;
>  }
>  
> @@ -180,16 +177,10 @@ void
>  xmbuf_unmap_page(
>  	struct xfs_buf		*bp)
>  {
> -	struct page		*page = bp->b_pages[0];
> -
>  	ASSERT(xfs_buftarg_is_mem(bp->b_target));
>  
> -	put_page(page);
> -
> +	put_page(virt_to_page(bp->b_addr));
>  	bp->b_addr = NULL;
> -	bp->b_pages[0] = NULL;
> -	bp->b_pages = NULL;
> -	bp->b_page_count = 0;
>  }
>  
>  /* Is this a valid daddr within the buftarg? */
> -- 
> 2.45.2
> 
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-02-26 18:02   ` Darrick J. Wong
@ 2025-03-04 14:10     ` Christoph Hellwig
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-04 14:10 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, Carlos Maiolino, Dave Chinner, linux-xfs

On Wed, Feb 26, 2025 at 10:02:34AM -0800, Darrick J. Wong wrote:
> >  		if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
> >  			invalidate_kernel_vmap_range(bp->b_addr,
> > -					xfs_buf_vmap_len(bp));
> > +					DIV_ROUND_UP(BBTOB(bp->b_length),
> > +							PAGE_SIZE));
> 
> The second argument to invalidate_kernel_vmap_range is the number of
> bytes, right?

Yes.

> Isn't this BBTOB() without the DIV_ROUND_UP?  Or do you
> actually want roundup(BBTOB(b_length), PAGE_SIZE) here?

Yes.

> > -	if (bp->b_page_count == 1) {
> > -		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
> > -				offset_in_page(bp->b_addr));
> > -	} else {
> > -		for (p = 0; p < bp->b_page_count; p++)
> > -			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
> > -		bio->bi_iter.bi_size = size; /* limit to the actual size used */
> > +		bio = bio_alloc(bp->b_target->bt_bdev, size >> PAGE_SHIFT,
> 
> Is the second argument (size >> PAGE_SHIFT) supposed to be the number of
> pages that we're going to __bio_add_page to the bio?

Yes.

> In which case, shouldn't it be alloc_size ?

Yes.

> > +		} while (size);
> >  
> > -		if (is_vmalloc_addr(bp->b_addr))
> > -			flush_kernel_vmap_range(bp->b_addr,
> > -					xfs_buf_vmap_len(bp));
> > +		flush_kernel_vmap_range(bp->b_addr, alloc_size);
> 
> ...and this one is roundup(size, PAGE_SIZE) isn't it?

Yes.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* use folios and vmalloc for buffer cache backing memory v2
@ 2025-03-05 14:05 Christoph Hellwig
  2025-03-05 14:05 ` [PATCH 01/12] xfs: unmapped buffer item size straddling mismatch Christoph Hellwig
                   ` (11 more replies)
  0 siblings, 12 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

Hi all,

this is another spin on converting the XFS buffer cache to use folios and
generally simplify the memory allocation in it.  It is based on Dave's
last folio series (which itself had pulled in bits from my earlier
vmalloc series).

It converts the backing memory allocation for all large buffers that are
power of two sized to large folios, converts > PAGE_SIZE but not power of
two allocations to vmalloc instead of vm_map_ram and generally cleans up
a lot of code around the memory allocation and reduces the size of the
xfs_buf structure by removing the embedded pages array and pages pointer.

I've benchmarked it using buffer heavy workloads, most notable fs_mark
run on null_blk without any fsync or O_SYNC to stress the buffer memory
allocator.  The performance results are disappointingly boring
unfortunately: for 4k directory block I see no significant change
(although the variance for both loads is very high to start with), and
for 64k directory block I see a minimal 1-2% gain that is barely about
the variance.  So based on the performance results alone I would not
propose this series, but I think it actually cleans the code up very
nicely.

Changes since v1:
 - use a WARN_ON_ONCE for the slab alignment guarantee check
 - fix confusion about units passed to the vmap flushing helpers
 - remove a duplicate setting of __GFP_ZERO
 - use howmany more
 - improve a code comment
 - spelling fixes

Diffstat:
 libxfs/xfs_ialloc.c    |    2 
 libxfs/xfs_inode_buf.c |    2 
 scrub/inode_repair.c   |    3 
 xfs_buf.c              |  368 ++++++++++++++++---------------------------------
 xfs_buf.h              |   25 +--
 xfs_buf_item.c         |  114 ---------------
 xfs_buf_item_recover.c |    8 -
 xfs_buf_mem.c          |   43 +----
 xfs_buf_mem.h          |    6 
 xfs_inode.c            |    3 
 xfs_trace.h            |    4 
 11 files changed, 159 insertions(+), 419 deletions(-)

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH 01/12] xfs: unmapped buffer item size straddling mismatch
  2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
@ 2025-03-05 14:05 ` Christoph Hellwig
  2025-03-05 14:05 ` [PATCH 02/12] xfs: add a fast path to xfs_buf_zero when b_addr is set Christoph Hellwig
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

From: Dave Chinner <dchinner@redhat.com>

We never log large contiguous regions of unmapped buffers, so this
bug is never triggered by the current code. However, the slowpath
for formatting buffer straddling regions is broken.

That is, the size and shape of the log vector calculated across a
straddle does not match how the formatting code formats a straddle.
This results in a log vector with an uninitialised iovec and this
causes a crash when xlog_write_full() goes to copy the iovec into
the journal.

Whilst touching this code, don't bother checking mapped or single
folio buffers for discontiguous regions because they don't have
them. This significantly reduces the overhead of this check when
logging large buffers as calling xfs_buf_offset() is not free and
it occurs a *lot* in those cases.

Fixes: 929f8b0deb83 ("xfs: optimise xfs_buf_item_size/format for contiguous regions")
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf_item.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 47549cfa61cd..0ee6fa9efd18 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -57,6 +57,10 @@ xfs_buf_log_format_size(
 			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
 }
 
+/*
+ * We only have to worry about discontiguous buffer range straddling on unmapped
+ * buffers. Everything else will have a contiguous data region we can copy from.
+ */
 static inline bool
 xfs_buf_item_straddle(
 	struct xfs_buf		*bp,
@@ -66,6 +70,9 @@ xfs_buf_item_straddle(
 {
 	void			*first, *last;
 
+	if (bp->b_page_count == 1 || !(bp->b_flags & XBF_UNMAPPED))
+		return false;
+
 	first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
 	last = xfs_buf_offset(bp,
 			offset + ((first_bit + nbits) << XFS_BLF_SHIFT));
@@ -133,11 +140,13 @@ xfs_buf_item_size_segment(
 	return;
 
 slow_scan:
-	/* Count the first bit we jumped out of the above loop from */
-	(*nvecs)++;
-	*nbytes += XFS_BLF_CHUNK;
+	ASSERT(bp->b_addr == NULL);
 	last_bit = first_bit;
+	nbits = 1;
 	while (last_bit != -1) {
+
+		*nbytes += XFS_BLF_CHUNK;
+
 		/*
 		 * This takes the bit number to start looking from and
 		 * returns the next set bit from there.  It returns -1
@@ -152,6 +161,8 @@ xfs_buf_item_size_segment(
 		 * else keep scanning the current set of bits.
 		 */
 		if (next_bit == -1) {
+			if (first_bit != last_bit)
+				(*nvecs)++;
 			break;
 		} else if (next_bit != last_bit + 1 ||
 		           xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
@@ -163,7 +174,6 @@ xfs_buf_item_size_segment(
 			last_bit++;
 			nbits++;
 		}
-		*nbytes += XFS_BLF_CHUNK;
 	}
 }
 
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 02/12] xfs: add a fast path to xfs_buf_zero when b_addr is set
  2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
  2025-03-05 14:05 ` [PATCH 01/12] xfs: unmapped buffer item size straddling mismatch Christoph Hellwig
@ 2025-03-05 14:05 ` Christoph Hellwig
  2025-03-05 14:05 ` [PATCH 03/12] xfs: remove xfs_buf.b_offset Christoph Hellwig
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

No need to walk the page list if bp->b_addr is valid.  That also means
b_offset doesn't need to be taken into account in the unmapped loop as
b_offset is only set for kmem backed buffers which are always mapped.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/xfs_buf.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 5d560e9073f4..ba0bdff3ad57 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1633,13 +1633,18 @@ xfs_buf_zero(
 {
 	size_t			bend;
 
+	if (bp->b_addr) {
+		memset(bp->b_addr + boff, 0, bsize);
+		return;
+	}
+
 	bend = boff + bsize;
 	while (boff < bend) {
 		struct page	*page;
 		int		page_index, page_offset, csize;
 
-		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
-		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
+		page_index = boff >> PAGE_SHIFT;
+		page_offset = boff & ~PAGE_MASK;
 		page = bp->b_pages[page_index];
 		csize = min_t(size_t, PAGE_SIZE - page_offset,
 				      BBTOB(bp->b_length) - boff);
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 03/12] xfs: remove xfs_buf.b_offset
  2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
  2025-03-05 14:05 ` [PATCH 01/12] xfs: unmapped buffer item size straddling mismatch Christoph Hellwig
  2025-03-05 14:05 ` [PATCH 02/12] xfs: add a fast path to xfs_buf_zero when b_addr is set Christoph Hellwig
@ 2025-03-05 14:05 ` Christoph Hellwig
  2025-03-05 14:05 ` [PATCH 04/12] xfs: remove xfs_buf_is_vmapped Christoph Hellwig
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

b_offset is only set for slab backed buffers and always set to
offset_in_page(bp->b_addr), which can be done just as easily in the only
user of b_offset.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/xfs_buf.c | 3 +--
 fs/xfs/xfs_buf.h | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index ba0bdff3ad57..972ea34ecfd4 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -278,7 +278,6 @@ xfs_buf_alloc_kmem(
 		bp->b_addr = NULL;
 		return -ENOMEM;
 	}
-	bp->b_offset = offset_in_page(bp->b_addr);
 	bp->b_pages = bp->b_page_array;
 	bp->b_pages[0] = kmem_to_page(bp->b_addr);
 	bp->b_page_count = 1;
@@ -1474,7 +1473,7 @@ xfs_buf_submit_bio(
 
 	if (bp->b_flags & _XBF_KMEM) {
 		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
-				bp->b_offset);
+				offset_in_page(bp->b_addr));
 	} else {
 		for (p = 0; p < bp->b_page_count; p++)
 			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 80e06eecaf56..c92a328252cc 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -194,8 +194,6 @@ struct xfs_buf {
 	int			b_map_count;
 	atomic_t		b_pin_count;	/* pin count */
 	unsigned int		b_page_count;	/* size of page array */
-	unsigned int		b_offset;	/* page offset of b_addr,
-						   only for _XBF_KMEM buffers */
 	int			b_error;	/* error code on I/O */
 	void			(*b_iodone)(struct xfs_buf *bp);
 
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 04/12] xfs: remove xfs_buf_is_vmapped
  2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2025-03-05 14:05 ` [PATCH 03/12] xfs: remove xfs_buf.b_offset Christoph Hellwig
@ 2025-03-05 14:05 ` Christoph Hellwig
  2025-03-05 14:05 ` [PATCH 05/12] xfs: refactor backing memory allocations for buffers Christoph Hellwig
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

No need to look at the page count if we can simply call is_vmalloc_addr
on bp->b_addr.  This prepares for eventually removing the b_page_count
field.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/xfs_buf.c | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 972ea34ecfd4..58eaf5a13c12 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -55,20 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
 	return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
 }
 
-static inline int
-xfs_buf_is_vmapped(
-	struct xfs_buf	*bp)
-{
-	/*
-	 * Return true if the buffer is vmapped.
-	 *
-	 * b_addr is null if the buffer is not mapped, but the code is clever
-	 * enough to know it doesn't have to map a single page, so the check has
-	 * to be both for b_addr and bp->b_page_count > 1.
-	 */
-	return bp->b_addr && bp->b_page_count > 1;
-}
-
 static inline int
 xfs_buf_vmap_len(
 	struct xfs_buf	*bp)
@@ -212,7 +198,7 @@ xfs_buf_free_pages(
 
 	ASSERT(bp->b_flags & _XBF_PAGES);
 
-	if (xfs_buf_is_vmapped(bp))
+	if (is_vmalloc_addr(bp->b_addr))
 		vm_unmap_ram(bp->b_addr, bp->b_page_count);
 
 	for (i = 0; i < bp->b_page_count; i++) {
@@ -1298,7 +1284,7 @@ __xfs_buf_ioend(
 	trace_xfs_buf_iodone(bp, _RET_IP_);
 
 	if (bp->b_flags & XBF_READ) {
-		if (!bp->b_error && xfs_buf_is_vmapped(bp))
+		if (!bp->b_error && bp->b_addr && is_vmalloc_addr(bp->b_addr))
 			invalidate_kernel_vmap_range(bp->b_addr,
 					xfs_buf_vmap_len(bp));
 		if (!bp->b_error && bp->b_ops)
@@ -1479,7 +1465,7 @@ xfs_buf_submit_bio(
 			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
 		bio->bi_iter.bi_size = size; /* limit to the actual size used */
 
-		if (xfs_buf_is_vmapped(bp))
+		if (bp->b_addr && is_vmalloc_addr(bp->b_addr))
 			flush_kernel_vmap_range(bp->b_addr,
 					xfs_buf_vmap_len(bp));
 	}
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 05/12] xfs: refactor backing memory allocations for buffers
  2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
                   ` (3 preceding siblings ...)
  2025-03-05 14:05 ` [PATCH 04/12] xfs: remove xfs_buf_is_vmapped Christoph Hellwig
@ 2025-03-05 14:05 ` Christoph Hellwig
  2025-03-05 14:05 ` [PATCH 06/12] xfs: remove the kmalloc to page allocator fallback Christoph Hellwig
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

Lift handling of shmem and slab backed buffers into xfs_buf_alloc_pages
and rename the result to xfs_buf_alloc_backing_mem.  This shares more
code and ensures uncached buffers can also use slab, which slightly
reduces the memory usage of growfs on 512 byte sector size file systems,
but more importantly means the allocation invariants are the same for
cached and uncached buffers.  Document these new invariants with a big
fat comment mostly stolen from a patch by Dave Chinner.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/xfs_buf.c | 55 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 19 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 58eaf5a13c12..18ec1c1fbca1 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -271,19 +271,49 @@ xfs_buf_alloc_kmem(
 	return 0;
 }
 
+/*
+ * Allocate backing memory for a buffer.
+ *
+ * For tmpfs-backed buffers used by in-memory btrees this directly maps the
+ * tmpfs page cache folios.
+ *
+ * For real file system buffers there are two different kinds backing memory:
+ *
+ * The first type backs the buffer by a kmalloc allocation.  This is done for
+ * less than PAGE_SIZE allocations to avoid wasting memory.
+ *
+ * The second type of buffer is the multi-page buffer. These are always made
+ * up of single pages so that they can be fed to vmap_ram() to return a
+ * contiguous memory region we can access the data through, or mark it as
+ * XBF_UNMAPPED and access the data directly through individual page_address()
+ * calls.
+ */
 static int
-xfs_buf_alloc_pages(
+xfs_buf_alloc_backing_mem(
 	struct xfs_buf	*bp,
 	xfs_buf_flags_t	flags)
 {
+	size_t		size = BBTOB(bp->b_length);
 	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
 	long		filled = 0;
 
+	if (xfs_buftarg_is_mem(bp->b_target))
+		return xmbuf_map_page(bp);
+
+	/*
+	 * For buffers that fit entirely within a single page, first attempt to
+	 * allocate the memory from the heap to minimise memory usage.  If we
+	 * can't get heap memory for these small buffers, we fall back to using
+	 * the page allocator.
+	 */
+	if (size < PAGE_SIZE && xfs_buf_alloc_kmem(new_bp, flags) == 0)
+		return 0;
+
 	if (flags & XBF_READ_AHEAD)
 		gfp_mask |= __GFP_NORETRY;
 
 	/* Make sure that we have a page list */
-	bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
+	bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
 	if (bp->b_page_count <= XB_PAGES) {
 		bp->b_pages = bp->b_page_array;
 	} else {
@@ -564,18 +594,7 @@ xfs_buf_find_insert(
 	if (error)
 		goto out_drop_pag;
 
-	if (xfs_buftarg_is_mem(new_bp->b_target)) {
-		error = xmbuf_map_page(new_bp);
-	} else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
-		   xfs_buf_alloc_kmem(new_bp, flags) < 0) {
-		/*
-		 * For buffers that fit entirely within a single page, first
-		 * attempt to allocate the memory from the heap to minimise
-		 * memory usage. If we can't get heap memory for these small
-		 * buffers, we fall back to using the page allocator.
-		 */
-		error = xfs_buf_alloc_pages(new_bp, flags);
-	}
+	error = xfs_buf_alloc_backing_mem(new_bp, flags);
 	if (error)
 		goto out_free_buf;
 
@@ -939,14 +958,12 @@ xfs_buf_get_uncached(
 	if (error)
 		return error;
 
-	if (xfs_buftarg_is_mem(bp->b_target))
-		error = xmbuf_map_page(bp);
-	else
-		error = xfs_buf_alloc_pages(bp, flags);
+	error = xfs_buf_alloc_backing_mem(bp, flags);
 	if (error)
 		goto fail_free_buf;
 
-	error = _xfs_buf_map_pages(bp, 0);
+	if (!bp->b_addr)
+		error = _xfs_buf_map_pages(bp, 0);
 	if (unlikely(error)) {
 		xfs_warn(target->bt_mount,
 			"%s: failed to map pages", __func__);
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 06/12] xfs: remove the kmalloc to page allocator fallback
  2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
                   ` (4 preceding siblings ...)
  2025-03-05 14:05 ` [PATCH 05/12] xfs: refactor backing memory allocations for buffers Christoph Hellwig
@ 2025-03-05 14:05 ` Christoph Hellwig
  2025-03-05 18:18   ` Darrick J. Wong
  2025-03-05 21:02   ` Dave Chinner
  2025-03-05 14:05 ` [PATCH 07/12] xfs: convert buffer cache to use high order folios Christoph Hellwig
                   ` (5 subsequent siblings)
  11 siblings, 2 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

Since commit 59bb47985c1d ("mm, sl[aou]b: guarantee natural alignment
for kmalloc(power-of-two)"), kmalloc and friends guarantee that power of
two sized allocations are naturally aligned.  Limit our use of kmalloc
for buffers to these power of two sizes and remove the fallback to
the page allocator for this case, but keep a check in addition to
trusting the slab allocator to get the alignment right.

Also refactor the kmalloc path to reuse various calculations for the
size and gfp flags.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 18ec1c1fbca1..073246d4352f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -243,23 +243,23 @@ xfs_buf_free(
 
 static int
 xfs_buf_alloc_kmem(
-	struct xfs_buf	*bp,
-	xfs_buf_flags_t	flags)
+	struct xfs_buf		*bp,
+	size_t			size,
+	gfp_t			gfp_mask)
 {
-	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL;
-	size_t		size = BBTOB(bp->b_length);
-
-	/* Assure zeroed buffer for non-read cases. */
-	if (!(flags & XBF_READ))
-		gfp_mask |= __GFP_ZERO;
+	ASSERT(is_power_of_2(size));
+	ASSERT(size < PAGE_SIZE);
 
-	bp->b_addr = kmalloc(size, gfp_mask);
+	bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL);
 	if (!bp->b_addr)
 		return -ENOMEM;
 
-	if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
-	    ((unsigned long)bp->b_addr & PAGE_MASK)) {
-		/* b_addr spans two pages - use alloc_page instead */
+	/*
+	 * Slab guarantees that we get back naturally aligned allocations for
+	 * power of two sizes.  Keep this check as the canary in the coal mine
+	 * if anything changes in slab.
+	 */
+	if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) {
 		kfree(bp->b_addr);
 		bp->b_addr = NULL;
 		return -ENOMEM;
@@ -300,18 +300,22 @@ xfs_buf_alloc_backing_mem(
 	if (xfs_buftarg_is_mem(bp->b_target))
 		return xmbuf_map_page(bp);
 
-	/*
-	 * For buffers that fit entirely within a single page, first attempt to
-	 * allocate the memory from the heap to minimise memory usage.  If we
-	 * can't get heap memory for these small buffers, we fall back to using
-	 * the page allocator.
-	 */
-	if (size < PAGE_SIZE && xfs_buf_alloc_kmem(new_bp, flags) == 0)
-		return 0;
+	/* Assure zeroed buffer for non-read cases. */
+	if (!(flags & XBF_READ))
+		gfp_mask |= __GFP_ZERO;
 
 	if (flags & XBF_READ_AHEAD)
 		gfp_mask |= __GFP_NORETRY;
 
+	/*
+	 * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that
+	 * is properly aligned.  The slab allocator now guarantees an aligned
+	 * allocation for all power of two sizes, we matches most of the smaller
+	 * than PAGE_SIZE buffers used by XFS.
+	 */
+	if (size < PAGE_SIZE && is_power_of_2(size))
+		return xfs_buf_alloc_kmem(bp, size, gfp_mask);
+
 	/* Make sure that we have a page list */
 	bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
 	if (bp->b_page_count <= XB_PAGES) {
@@ -324,10 +328,6 @@ xfs_buf_alloc_backing_mem(
 	}
 	bp->b_flags |= _XBF_PAGES;
 
-	/* Assure zeroed buffer for non-read cases. */
-	if (!(flags & XBF_READ))
-		gfp_mask |= __GFP_ZERO;
-
 	/*
 	 * Bulk filling of pages can take multiple calls. Not filling the entire
 	 * array is not an allocation failure, so don't back off if we get at
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 07/12] xfs: convert buffer cache to use high order folios
  2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
                   ` (5 preceding siblings ...)
  2025-03-05 14:05 ` [PATCH 06/12] xfs: remove the kmalloc to page allocator fallback Christoph Hellwig
@ 2025-03-05 14:05 ` Christoph Hellwig
  2025-03-05 18:20   ` Darrick J. Wong
  2025-03-05 20:50   ` Dave Chinner
  2025-03-05 14:05 ` [PATCH 08/12] xfs: kill XBF_UNMAPPED Christoph Hellwig
                   ` (4 subsequent siblings)
  11 siblings, 2 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

Now that we have the buffer cache using the folio API, we can extend
the use of folios to allocate high order folios for multi-page
buffers rather than an array of single pages that are then vmapped
into a contiguous range.

This creates a new type of single folio buffers that can have arbitrary
order in addition to the existing multi-folio buffers made up of many
single page folios that get vmapped.  The single folio is for now
stashed into the existing b_pages array, but that will go away entirely
later in the series and remove the temporary page vs folio typing issues
that only work because the two structures currently can be used largely
interchangeably.

The code that allocates buffers will optimistically attempt a high
order folio allocation as a fast path if the buffer size is a power
of two and thus fits into a folio. If this high order allocation
fails, then we fall back to the existing multi-folio allocation
code. This now forms the slow allocation path, and hopefully will be
largely unused in normal conditions except for buffers with size
that are not a power of two like larger remote xattrs.

This should improve performance of large buffer operations (e.g.
large directory block sizes) as we should now mostly avoid the
expense of vmapping large buffers (and the vmap lock contention that
can occur) as well as avoid the runtime pressure that frequently
accessing kernel vmapped pages put on the TLBs.

Based on a patch from Dave Chinner <dchinner@redhat.com>, but mutilated
beyond recognition.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c | 52 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 073246d4352f..f0666ef57bd2 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -203,9 +203,9 @@ xfs_buf_free_pages(
 
 	for (i = 0; i < bp->b_page_count; i++) {
 		if (bp->b_pages[i])
-			__free_page(bp->b_pages[i]);
+			folio_put(page_folio(bp->b_pages[i]));
 	}
-	mm_account_reclaimed_pages(bp->b_page_count);
+	mm_account_reclaimed_pages(howmany(BBTOB(bp->b_length), PAGE_SIZE));
 
 	if (bp->b_pages != bp->b_page_array)
 		kfree(bp->b_pages);
@@ -277,12 +277,17 @@ xfs_buf_alloc_kmem(
  * For tmpfs-backed buffers used by in-memory btrees this directly maps the
  * tmpfs page cache folios.
  *
- * For real file system buffers there are two different kinds backing memory:
+ * For real file system buffers there are three different kinds backing memory:
  *
  * The first type backs the buffer by a kmalloc allocation.  This is done for
  * less than PAGE_SIZE allocations to avoid wasting memory.
  *
- * The second type of buffer is the multi-page buffer. These are always made
+ * The second type is a single folio buffer - this may be a high order folio or
+ * just a single page sized folio, but either way they get treated the same way
+ * by the rest of the code - the buffer memory spans a single contiguous memory
+ * region that we don't have to map and unmap to access the data directly.
+ *
+ * The third type of buffer is the multi-page buffer. These are always made
  * up of single pages so that they can be fed to vmap_ram() to return a
  * contiguous memory region we can access the data through, or mark it as
  * XBF_UNMAPPED and access the data directly through individual page_address()
@@ -295,6 +300,7 @@ xfs_buf_alloc_backing_mem(
 {
 	size_t		size = BBTOB(bp->b_length);
 	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
+	struct folio	*folio;
 	long		filled = 0;
 
 	if (xfs_buftarg_is_mem(bp->b_target))
@@ -316,7 +322,41 @@ xfs_buf_alloc_backing_mem(
 	if (size < PAGE_SIZE && is_power_of_2(size))
 		return xfs_buf_alloc_kmem(bp, size, gfp_mask);
 
-	/* Make sure that we have a page list */
+	/*
+	 * Don't bother with the retry loop for single PAGE allocations: vmalloc
+	 * won't do any better.
+	 */
+	if (size <= PAGE_SIZE)
+		gfp_mask |= __GFP_NOFAIL;
+
+	/*
+	 * Optimistically attempt a single high order folio allocation for
+	 * larger than PAGE_SIZE buffers.
+	 *
+	 * Allocating a high order folio makes the assumption that buffers are a
+	 * power-of-2 size, matching the power-of-2 folios sizes available.
+	 *
+	 * The exception here are user xattr data buffers, which can be arbitrarily
+	 * sized up to 64kB plus structure metadata, skip straight to the vmalloc
+	 * path for them instead of wasting memory here.
+	 */
+	if (size > PAGE_SIZE && !is_power_of_2(size))
+		goto fallback;
+	folio = folio_alloc(gfp_mask, get_order(size));
+	if (!folio) {
+		if (size <= PAGE_SIZE)
+			return -ENOMEM;
+		goto fallback;
+	}
+	bp->b_addr = folio_address(folio);
+	bp->b_page_array[0] = &folio->page;
+	bp->b_pages = bp->b_page_array;
+	bp->b_page_count = 1;
+	bp->b_flags |= _XBF_PAGES;
+	return 0;
+
+fallback:
+	/* Fall back to allocating an array of single page folios. */
 	bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
 	if (bp->b_page_count <= XB_PAGES) {
 		bp->b_pages = bp->b_page_array;
@@ -1474,7 +1514,7 @@ xfs_buf_submit_bio(
 	bio->bi_private = bp;
 	bio->bi_end_io = xfs_buf_bio_end_io;
 
-	if (bp->b_flags & _XBF_KMEM) {
+	if (bp->b_page_count == 1) {
 		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
 				offset_in_page(bp->b_addr));
 	} else {
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 08/12] xfs: kill XBF_UNMAPPED
  2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
                   ` (6 preceding siblings ...)
  2025-03-05 14:05 ` [PATCH 07/12] xfs: convert buffer cache to use high order folios Christoph Hellwig
@ 2025-03-05 14:05 ` Christoph Hellwig
  2025-03-05 14:05 ` [PATCH 09/12] xfs: buffer items don't straddle pages anymore Christoph Hellwig
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

Unmapped buffer access is a pain, so kill it. The switch to large
folios means we rarely pay a vmap penalty for large buffers,
so this functionality is largely unnecessary now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_ialloc.c    |  2 +-
 fs/xfs/libxfs/xfs_inode_buf.c |  2 +-
 fs/xfs/scrub/inode_repair.c   |  3 +-
 fs/xfs/xfs_buf.c              | 58 +++--------------------------------
 fs/xfs/xfs_buf.h              | 16 +++++++---
 fs/xfs/xfs_buf_item.c         |  2 +-
 fs/xfs/xfs_buf_item_recover.c |  8 +----
 fs/xfs/xfs_inode.c            |  3 +-
 8 files changed, 21 insertions(+), 73 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index f3a840a425f5..24b133930368 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -364,7 +364,7 @@ xfs_ialloc_inode_init(
 				(j * M_IGEO(mp)->blocks_per_cluster));
 		error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 				mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
-				XBF_UNMAPPED, &fbuf);
+				0, &fbuf);
 		if (error)
 			return error;
 
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index f24fa628fecf..2f575b88cd7c 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -137,7 +137,7 @@ xfs_imap_to_bp(
 	int			error;
 
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-			imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops);
+			imap->im_len, 0, bpp, &xfs_inode_buf_ops);
 	if (xfs_metadata_is_sick(error))
 		xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno),
 				XFS_SICK_AG_INODES);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 13ff1c933cb8..2d2ff07e63e5 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -1558,8 +1558,7 @@ xrep_dinode_core(
 
 	/* Read the inode cluster buffer. */
 	error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
-			ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
-			NULL);
+			ri->imap.im_blkno, ri->imap.im_len, 0, &bp, NULL);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f0666ef57bd2..2b4b8c104b0c 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -145,7 +145,7 @@ _xfs_buf_alloc(
 	 * We don't want certain flags to appear in b_flags unless they are
 	 * specifically set by later operations on the buffer.
 	 */
-	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
+	flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
 
 	/*
 	 * A new buffer is held and locked by the owner.  This ensures that the
@@ -289,9 +289,7 @@ xfs_buf_alloc_kmem(
  *
  * The third type of buffer is the multi-page buffer. These are always made
  * up of single pages so that they can be fed to vmap_ram() to return a
- * contiguous memory region we can access the data through, or mark it as
- * XBF_UNMAPPED and access the data directly through individual page_address()
- * calls.
+ * contiguous memory region we can access the data through.
  */
 static int
 xfs_buf_alloc_backing_mem(
@@ -409,8 +407,6 @@ _xfs_buf_map_pages(
 	if (bp->b_page_count == 1) {
 		/* A single page buffer is always mappable */
 		bp->b_addr = page_address(bp->b_pages[0]);
-	} else if (flags & XBF_UNMAPPED) {
-		bp->b_addr = NULL;
 	} else {
 		int retried = 0;
 		unsigned nofs_flag;
@@ -1341,7 +1337,7 @@ __xfs_buf_ioend(
 	trace_xfs_buf_iodone(bp, _RET_IP_);
 
 	if (bp->b_flags & XBF_READ) {
-		if (!bp->b_error && bp->b_addr && is_vmalloc_addr(bp->b_addr))
+		if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
 			invalidate_kernel_vmap_range(bp->b_addr,
 					xfs_buf_vmap_len(bp));
 		if (!bp->b_error && bp->b_ops)
@@ -1522,7 +1518,7 @@ xfs_buf_submit_bio(
 			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
 		bio->bi_iter.bi_size = size; /* limit to the actual size used */
 
-		if (bp->b_addr && is_vmalloc_addr(bp->b_addr))
+		if (is_vmalloc_addr(bp->b_addr))
 			flush_kernel_vmap_range(bp->b_addr,
 					xfs_buf_vmap_len(bp));
 	}
@@ -1653,52 +1649,6 @@ xfs_buf_submit(
 	xfs_buf_submit_bio(bp);
 }
 
-void *
-xfs_buf_offset(
-	struct xfs_buf		*bp,
-	size_t			offset)
-{
-	struct page		*page;
-
-	if (bp->b_addr)
-		return bp->b_addr + offset;
-
-	page = bp->b_pages[offset >> PAGE_SHIFT];
-	return page_address(page) + (offset & (PAGE_SIZE-1));
-}
-
-void
-xfs_buf_zero(
-	struct xfs_buf		*bp,
-	size_t			boff,
-	size_t			bsize)
-{
-	size_t			bend;
-
-	if (bp->b_addr) {
-		memset(bp->b_addr + boff, 0, bsize);
-		return;
-	}
-
-	bend = boff + bsize;
-	while (boff < bend) {
-		struct page	*page;
-		int		page_index, page_offset, csize;
-
-		page_index = boff >> PAGE_SHIFT;
-		page_offset = boff & ~PAGE_MASK;
-		page = bp->b_pages[page_index];
-		csize = min_t(size_t, PAGE_SIZE - page_offset,
-				      BBTOB(bp->b_length) - boff);
-
-		ASSERT((csize + page_offset) <= PAGE_SIZE);
-
-		memset(page_address(page) + page_offset, 0, csize);
-
-		boff += csize;
-	}
-}
-
 /*
  * Log a message about and stale a buffer that a caller has decided is corrupt.
  *
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c92a328252cc..8db522f19b0c 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -48,7 +48,6 @@ struct xfs_buf;
 #define XBF_LIVESCAN	 (1u << 28)
 #define XBF_INCORE	 (1u << 29)/* lookup only, return if found in cache */
 #define XBF_TRYLOCK	 (1u << 30)/* lock requested, but do not wait */
-#define XBF_UNMAPPED	 (1u << 31)/* do not map the buffer */
 
 
 typedef unsigned int xfs_buf_flags_t;
@@ -68,8 +67,7 @@ typedef unsigned int xfs_buf_flags_t;
 	/* The following interface flags should never be set */ \
 	{ XBF_LIVESCAN,		"LIVESCAN" }, \
 	{ XBF_INCORE,		"INCORE" }, \
-	{ XBF_TRYLOCK,		"TRYLOCK" }, \
-	{ XBF_UNMAPPED,		"UNMAPPED" }
+	{ XBF_TRYLOCK,		"TRYLOCK" }
 
 /*
  * Internal state flags.
@@ -313,12 +311,20 @@ extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
 #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
 extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa);
 void xfs_buf_ioend_fail(struct xfs_buf *);
-void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
 void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
 #define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address)
 
 /* Buffer Utility Routines */
-extern void *xfs_buf_offset(struct xfs_buf *, size_t);
+static inline void *xfs_buf_offset(struct xfs_buf *bp, size_t offset)
+{
+	return bp->b_addr + offset;
+}
+
+static inline void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize)
+{
+	memset(bp->b_addr + boff, 0, bsize);
+}
+
 extern void xfs_buf_stale(struct xfs_buf *bp);
 
 /* Delayed Write Buffer Routines */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 0ee6fa9efd18..41f0bc9aa5f4 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -70,7 +70,7 @@ xfs_buf_item_straddle(
 {
 	void			*first, *last;
 
-	if (bp->b_page_count == 1 || !(bp->b_flags & XBF_UNMAPPED))
+	if (bp->b_page_count == 1)
 		return false;
 
 	first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 05a2f6927c12..d4c5cef5bc43 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -1006,7 +1006,6 @@ xlog_recover_buf_commit_pass2(
 	struct xfs_mount		*mp = log->l_mp;
 	struct xfs_buf			*bp;
 	int				error;
-	uint				buf_flags;
 	xfs_lsn_t			lsn;
 
 	/*
@@ -1025,13 +1024,8 @@ xlog_recover_buf_commit_pass2(
 	}
 
 	trace_xfs_log_recover_buf_recover(log, buf_f);
-
-	buf_flags = 0;
-	if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
-		buf_flags |= XBF_UNMAPPED;
-
 	error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
-			  buf_flags, &bp, NULL);
+			  0, &bp, NULL);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b1f9f156ec88..36cfd9c457ce 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1721,8 +1721,7 @@ xfs_ifree_cluster(
 		 * to mark all the active inodes on the buffer stale.
 		 */
 		error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
-				mp->m_bsize * igeo->blocks_per_cluster,
-				XBF_UNMAPPED, &bp);
+				mp->m_bsize * igeo->blocks_per_cluster, 0, &bp);
 		if (error)
 			return error;
 
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 09/12] xfs: buffer items don't straddle pages anymore
  2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
                   ` (7 preceding siblings ...)
  2025-03-05 14:05 ` [PATCH 08/12] xfs: kill XBF_UNMAPPED Christoph Hellwig
@ 2025-03-05 14:05 ` Christoph Hellwig
  2025-03-05 14:05 ` [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory Christoph Hellwig
                   ` (2 subsequent siblings)
  11 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

From: Dave Chinner <dchinner@redhat.com>

Unmapped buffers don't exist anymore, so the page straddling
detection and slow path code can go away now.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_buf_item.c | 124 ------------------------------------------
 1 file changed, 124 deletions(-)

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 41f0bc9aa5f4..19eb0b7a3e58 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -57,31 +57,6 @@ xfs_buf_log_format_size(
 			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
 }
 
-/*
- * We only have to worry about discontiguous buffer range straddling on unmapped
- * buffers. Everything else will have a contiguous data region we can copy from.
- */
-static inline bool
-xfs_buf_item_straddle(
-	struct xfs_buf		*bp,
-	uint			offset,
-	int			first_bit,
-	int			nbits)
-{
-	void			*first, *last;
-
-	if (bp->b_page_count == 1)
-		return false;
-
-	first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
-	last = xfs_buf_offset(bp,
-			offset + ((first_bit + nbits) << XFS_BLF_SHIFT));
-
-	if (last - first != nbits * XFS_BLF_CHUNK)
-		return true;
-	return false;
-}
-
 /*
  * Return the number of log iovecs and space needed to log the given buf log
  * item segment.
@@ -98,11 +73,8 @@ xfs_buf_item_size_segment(
 	int				*nvecs,
 	int				*nbytes)
 {
-	struct xfs_buf			*bp = bip->bli_buf;
 	int				first_bit;
 	int				nbits;
-	int				next_bit;
-	int				last_bit;
 
 	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
 	if (first_bit == -1)
@@ -115,15 +87,6 @@ xfs_buf_item_size_segment(
 		nbits = xfs_contig_bits(blfp->blf_data_map,
 					blfp->blf_map_size, first_bit);
 		ASSERT(nbits > 0);
-
-		/*
-		 * Straddling a page is rare because we don't log contiguous
-		 * chunks of unmapped buffers anywhere.
-		 */
-		if (nbits > 1 &&
-		    xfs_buf_item_straddle(bp, offset, first_bit, nbits))
-			goto slow_scan;
-
 		(*nvecs)++;
 		*nbytes += nbits * XFS_BLF_CHUNK;
 
@@ -138,43 +101,6 @@ xfs_buf_item_size_segment(
 	} while (first_bit != -1);
 
 	return;
-
-slow_scan:
-	ASSERT(bp->b_addr == NULL);
-	last_bit = first_bit;
-	nbits = 1;
-	while (last_bit != -1) {
-
-		*nbytes += XFS_BLF_CHUNK;
-
-		/*
-		 * This takes the bit number to start looking from and
-		 * returns the next set bit from there.  It returns -1
-		 * if there are no more bits set or the start bit is
-		 * beyond the end of the bitmap.
-		 */
-		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
-					last_bit + 1);
-		/*
-		 * If we run out of bits, leave the loop,
-		 * else if we find a new set of bits bump the number of vecs,
-		 * else keep scanning the current set of bits.
-		 */
-		if (next_bit == -1) {
-			if (first_bit != last_bit)
-				(*nvecs)++;
-			break;
-		} else if (next_bit != last_bit + 1 ||
-		           xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
-			last_bit = next_bit;
-			first_bit = next_bit;
-			(*nvecs)++;
-			nbits = 1;
-		} else {
-			last_bit++;
-			nbits++;
-		}
-	}
 }
 
 /*
@@ -287,8 +213,6 @@ xfs_buf_item_format_segment(
 	struct xfs_buf		*bp = bip->bli_buf;
 	uint			base_size;
 	int			first_bit;
-	int			last_bit;
-	int			next_bit;
 	uint			nbits;
 
 	/* copy the flags across from the base format item */
@@ -333,15 +257,6 @@ xfs_buf_item_format_segment(
 		nbits = xfs_contig_bits(blfp->blf_data_map,
 					blfp->blf_map_size, first_bit);
 		ASSERT(nbits > 0);
-
-		/*
-		 * Straddling a page is rare because we don't log contiguous
-		 * chunks of unmapped buffers anywhere.
-		 */
-		if (nbits > 1 &&
-		    xfs_buf_item_straddle(bp, offset, first_bit, nbits))
-			goto slow_scan;
-
 		xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
 					first_bit, nbits);
 		blfp->blf_size++;
@@ -357,45 +272,6 @@ xfs_buf_item_format_segment(
 	} while (first_bit != -1);
 
 	return;
-
-slow_scan:
-	ASSERT(bp->b_addr == NULL);
-	last_bit = first_bit;
-	nbits = 1;
-	for (;;) {
-		/*
-		 * This takes the bit number to start looking from and
-		 * returns the next set bit from there.  It returns -1
-		 * if there are no more bits set or the start bit is
-		 * beyond the end of the bitmap.
-		 */
-		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
-					(uint)last_bit + 1);
-		/*
-		 * If we run out of bits fill in the last iovec and get out of
-		 * the loop.  Else if we start a new set of bits then fill in
-		 * the iovec for the series we were looking at and start
-		 * counting the bits in the new one.  Else we're still in the
-		 * same set of bits so just keep counting and scanning.
-		 */
-		if (next_bit == -1) {
-			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
-						first_bit, nbits);
-			blfp->blf_size++;
-			break;
-		} else if (next_bit != last_bit + 1 ||
-		           xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
-			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
-						first_bit, nbits);
-			blfp->blf_size++;
-			first_bit = next_bit;
-			last_bit = next_bit;
-			nbits = 1;
-		} else {
-			last_bit++;
-			nbits++;
-		}
-	}
 }
 
 /*
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
                   ` (8 preceding siblings ...)
  2025-03-05 14:05 ` [PATCH 09/12] xfs: buffer items don't straddle pages anymore Christoph Hellwig
@ 2025-03-05 14:05 ` Christoph Hellwig
  2025-03-05 18:22   ` Darrick J. Wong
  2025-03-05 21:20   ` Dave Chinner
  2025-03-05 14:05 ` [PATCH 11/12] xfs: cleanup mapping tmpfs folios into the buffer cache Christoph Hellwig
  2025-03-05 14:05 ` [PATCH 12/12] xfs: trace what memory backs a buffer Christoph Hellwig
  11 siblings, 2 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

The fallback buffer allocation path currently open codes a suboptimal
version of vmalloc to allocate pages that are then mapped into
vmalloc space.  Switch to using vmalloc instead, which uses all the
optimizations in the common vmalloc code, and removes the need to
track the backing pages in the xfs_buf structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c     | 207 ++++++++++---------------------------------
 fs/xfs/xfs_buf.h     |   7 --
 fs/xfs/xfs_buf_mem.c |  11 +--
 3 files changed, 48 insertions(+), 177 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 2b4b8c104b0c..f28ca5cb5bd8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -55,13 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
 	return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
 }
 
-static inline int
-xfs_buf_vmap_len(
-	struct xfs_buf	*bp)
-{
-	return (bp->b_page_count * PAGE_SIZE);
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -190,29 +183,6 @@ _xfs_buf_alloc(
 	return 0;
 }
 
-static void
-xfs_buf_free_pages(
-	struct xfs_buf	*bp)
-{
-	uint		i;
-
-	ASSERT(bp->b_flags & _XBF_PAGES);
-
-	if (is_vmalloc_addr(bp->b_addr))
-		vm_unmap_ram(bp->b_addr, bp->b_page_count);
-
-	for (i = 0; i < bp->b_page_count; i++) {
-		if (bp->b_pages[i])
-			folio_put(page_folio(bp->b_pages[i]));
-	}
-	mm_account_reclaimed_pages(howmany(BBTOB(bp->b_length), PAGE_SIZE));
-
-	if (bp->b_pages != bp->b_page_array)
-		kfree(bp->b_pages);
-	bp->b_pages = NULL;
-	bp->b_flags &= ~_XBF_PAGES;
-}
-
 static void
 xfs_buf_free_callback(
 	struct callback_head	*cb)
@@ -227,16 +197,23 @@ static void
 xfs_buf_free(
 	struct xfs_buf		*bp)
 {
+	unsigned int		size = BBTOB(bp->b_length);
+
 	trace_xfs_buf_free(bp, _RET_IP_);
 
 	ASSERT(list_empty(&bp->b_lru));
 
+	if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
+		mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
+
 	if (xfs_buftarg_is_mem(bp->b_target))
 		xmbuf_unmap_page(bp);
-	else if (bp->b_flags & _XBF_PAGES)
-		xfs_buf_free_pages(bp);
+	else if (is_vmalloc_addr(bp->b_addr))
+		vfree(bp->b_addr);
 	else if (bp->b_flags & _XBF_KMEM)
 		kfree(bp->b_addr);
+	else
+		folio_put(virt_to_folio(bp->b_addr));
 
 	call_rcu(&bp->b_rcu, xfs_buf_free_callback);
 }
@@ -264,9 +241,6 @@ xfs_buf_alloc_kmem(
 		bp->b_addr = NULL;
 		return -ENOMEM;
 	}
-	bp->b_pages = bp->b_page_array;
-	bp->b_pages[0] = kmem_to_page(bp->b_addr);
-	bp->b_page_count = 1;
 	bp->b_flags |= _XBF_KMEM;
 	return 0;
 }
@@ -287,9 +261,9 @@ xfs_buf_alloc_kmem(
  * by the rest of the code - the buffer memory spans a single contiguous memory
  * region that we don't have to map and unmap to access the data directly.
  *
- * The third type of buffer is the multi-page buffer. These are always made
- * up of single pages so that they can be fed to vmap_ram() to return a
- * contiguous memory region we can access the data through.
+ * The third type of buffer is the vmalloc()d buffer. This provides the buffer
+ * with the required contiguous memory region but backed by discontiguous
+ * physical pages.
  */
 static int
 xfs_buf_alloc_backing_mem(
@@ -299,7 +273,6 @@ xfs_buf_alloc_backing_mem(
 	size_t		size = BBTOB(bp->b_length);
 	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
 	struct folio	*folio;
-	long		filled = 0;
 
 	if (xfs_buftarg_is_mem(bp->b_target))
 		return xmbuf_map_page(bp);
@@ -347,98 +320,18 @@ xfs_buf_alloc_backing_mem(
 		goto fallback;
 	}
 	bp->b_addr = folio_address(folio);
-	bp->b_page_array[0] = &folio->page;
-	bp->b_pages = bp->b_page_array;
-	bp->b_page_count = 1;
-	bp->b_flags |= _XBF_PAGES;
 	return 0;
 
 fallback:
-	/* Fall back to allocating an array of single page folios. */
-	bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
-	if (bp->b_page_count <= XB_PAGES) {
-		bp->b_pages = bp->b_page_array;
-	} else {
-		bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
-					gfp_mask);
-		if (!bp->b_pages)
-			return -ENOMEM;
-	}
-	bp->b_flags |= _XBF_PAGES;
-
-	/*
-	 * Bulk filling of pages can take multiple calls. Not filling the entire
-	 * array is not an allocation failure, so don't back off if we get at
-	 * least one extra page.
-	 */
 	for (;;) {
-		long	last = filled;
-
-		filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
-					  bp->b_pages);
-		if (filled == bp->b_page_count) {
-			XFS_STATS_INC(bp->b_mount, xb_page_found);
+		bp->b_addr = __vmalloc(size, gfp_mask);
+		if (bp->b_addr)
 			break;
-		}
-
-		if (filled != last)
-			continue;
-
-		if (flags & XBF_READ_AHEAD) {
-			xfs_buf_free_pages(bp);
+		if (flags & XBF_READ_AHEAD)
 			return -ENOMEM;
-		}
-
 		XFS_STATS_INC(bp->b_mount, xb_page_retries);
 		memalloc_retry_wait(gfp_mask);
 	}
-	return 0;
-}
-
-/*
- *	Map buffer into kernel address-space if necessary.
- */
-STATIC int
-_xfs_buf_map_pages(
-	struct xfs_buf		*bp,
-	xfs_buf_flags_t		flags)
-{
-	ASSERT(bp->b_flags & _XBF_PAGES);
-	if (bp->b_page_count == 1) {
-		/* A single page buffer is always mappable */
-		bp->b_addr = page_address(bp->b_pages[0]);
-	} else {
-		int retried = 0;
-		unsigned nofs_flag;
-
-		/*
-		 * vm_map_ram() will allocate auxiliary structures (e.g.
-		 * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
-		 * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
-		 * from the same call site that can be run from both above and
-		 * below memory reclaim causes lockdep false positives. Hence we
-		 * always need to force this allocation to nofs context because
-		 * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
-		 * prevent false positive lockdep reports.
-		 *
-		 * XXX(dgc): I think dquot reclaim is the only place we can get
-		 * to this function from memory reclaim context now. If we fix
-		 * that like we've fixed inode reclaim to avoid writeback from
-		 * reclaim, this nofs wrapping can go away.
-		 */
-		nofs_flag = memalloc_nofs_save();
-		do {
-			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-						-1);
-			if (bp->b_addr)
-				break;
-			vm_unmap_aliases();
-		} while (retried++ <= 1);
-		memalloc_nofs_restore(nofs_flag);
-
-		if (!bp->b_addr)
-			return -ENOMEM;
-	}
 
 	return 0;
 }
@@ -558,7 +451,7 @@ xfs_buf_find_lock(
 			return -ENOENT;
 		}
 		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+		bp->b_flags &= _XBF_KMEM;
 		bp->b_ops = NULL;
 	}
 	return 0;
@@ -744,18 +637,6 @@ xfs_buf_get_map(
 			xfs_perag_put(pag);
 	}
 
-	/* We do not hold a perag reference anymore. */
-	if (!bp->b_addr) {
-		error = _xfs_buf_map_pages(bp, flags);
-		if (unlikely(error)) {
-			xfs_warn_ratelimited(btp->bt_mount,
-				"%s: failed to map %u pages", __func__,
-				bp->b_page_count);
-			xfs_buf_relse(bp);
-			return error;
-		}
-	}
-
 	/*
 	 * Clear b_error if this is a lookup from a caller that doesn't expect
 	 * valid data to be found in the buffer.
@@ -998,14 +879,6 @@ xfs_buf_get_uncached(
 	if (error)
 		goto fail_free_buf;
 
-	if (!bp->b_addr)
-		error = _xfs_buf_map_pages(bp, 0);
-	if (unlikely(error)) {
-		xfs_warn(target->bt_mount,
-			"%s: failed to map pages", __func__);
-		goto fail_free_buf;
-	}
-
 	trace_xfs_buf_get_uncached(bp, _RET_IP_);
 	*bpp = bp;
 	return 0;
@@ -1339,7 +1212,7 @@ __xfs_buf_ioend(
 	if (bp->b_flags & XBF_READ) {
 		if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
 			invalidate_kernel_vmap_range(bp->b_addr,
-					xfs_buf_vmap_len(bp));
+				roundup(BBTOB(bp->b_length), PAGE_SIZE));
 		if (!bp->b_error && bp->b_ops)
 			bp->b_ops->verify_read(bp);
 		if (!bp->b_error)
@@ -1500,29 +1373,43 @@ static void
 xfs_buf_submit_bio(
 	struct xfs_buf		*bp)
 {
-	unsigned int		size = BBTOB(bp->b_length);
-	unsigned int		map = 0, p;
+	unsigned int		map = 0;
 	struct blk_plug		plug;
 	struct bio		*bio;
 
-	bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
-			xfs_buf_bio_op(bp), GFP_NOIO);
-	bio->bi_private = bp;
-	bio->bi_end_io = xfs_buf_bio_end_io;
+	if (is_vmalloc_addr(bp->b_addr)) {
+		unsigned int	size = BBTOB(bp->b_length);
+		unsigned int	alloc_size = roundup(size, PAGE_SIZE);
+		void		*data = bp->b_addr;
 
-	if (bp->b_page_count == 1) {
-		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
-				offset_in_page(bp->b_addr));
-	} else {
-		for (p = 0; p < bp->b_page_count; p++)
-			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
-		bio->bi_iter.bi_size = size; /* limit to the actual size used */
+		bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
+				xfs_buf_bio_op(bp), GFP_NOIO);
+
+		do {
+			unsigned int	len = min(size, PAGE_SIZE);
 
-		if (is_vmalloc_addr(bp->b_addr))
-			flush_kernel_vmap_range(bp->b_addr,
-					xfs_buf_vmap_len(bp));
+			ASSERT(offset_in_page(data) == 0);
+			__bio_add_page(bio, vmalloc_to_page(data), len, 0);
+			data += len;
+			size -= len;
+		} while (size);
+
+		flush_kernel_vmap_range(bp->b_addr, alloc_size);
+	} else {
+		/*
+		 * Single folio or slab allocation.  Must be contiguous and thus
+		 * only a single bvec is needed.
+		 */
+		bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
+				GFP_NOIO);
+		__bio_add_page(bio, virt_to_page(bp->b_addr),
+				BBTOB(bp->b_length),
+				offset_in_page(bp->b_addr));
 	}
 
+	bio->bi_private = bp;
+	bio->bi_end_io = xfs_buf_bio_end_io;
+
 	/*
 	 * If there is more than one map segment, split out a new bio for each
 	 * map except of the last one.  The last map is handled by the
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8db522f19b0c..db43bdc17f55 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -36,7 +36,6 @@ struct xfs_buf;
 #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
 
 /* flags used only internally */
-#define _XBF_PAGES	 (1u << 20)/* backed by refcounted pages */
 #define _XBF_KMEM	 (1u << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q	 (1u << 22)/* buffer on a delwri queue */
 
@@ -61,7 +60,6 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_STALE,		"STALE" }, \
 	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
 	{ _XBF_LOGRECOVERY,	"LOG_RECOVERY" }, \
-	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_KMEM,		"KMEM" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
 	/* The following interface flags should never be set */ \
@@ -122,8 +120,6 @@ struct xfs_buftarg {
 	struct xfs_buf_cache	bt_cache[];
 };
 
-#define XB_PAGES	2
-
 struct xfs_buf_map {
 	xfs_daddr_t		bm_bn;	/* block number for I/O */
 	int			bm_len;	/* size of I/O */
@@ -185,13 +181,10 @@ struct xfs_buf {
 	struct xfs_buf_log_item	*b_log_item;
 	struct list_head	b_li_list;	/* Log items list head */
 	struct xfs_trans	*b_transp;
-	struct page		**b_pages;	/* array of page pointers */
-	struct page		*b_page_array[XB_PAGES]; /* inline pages */
 	struct xfs_buf_map	*b_maps;	/* compound buffer map */
 	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
 	int			b_map_count;
 	atomic_t		b_pin_count;	/* pin count */
-	unsigned int		b_page_count;	/* size of page array */
 	int			b_error;	/* error code on I/O */
 	void			(*b_iodone)(struct xfs_buf *bp);
 
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 5b64a2b3b113..b207754d2ee0 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -169,9 +169,6 @@ xmbuf_map_page(
 	unlock_page(page);
 
 	bp->b_addr = page_address(page);
-	bp->b_pages = bp->b_page_array;
-	bp->b_pages[0] = page;
-	bp->b_page_count = 1;
 	return 0;
 }
 
@@ -180,16 +177,10 @@ void
 xmbuf_unmap_page(
 	struct xfs_buf		*bp)
 {
-	struct page		*page = bp->b_pages[0];
-
 	ASSERT(xfs_buftarg_is_mem(bp->b_target));
 
-	put_page(page);
-
+	put_page(virt_to_page(bp->b_addr));
 	bp->b_addr = NULL;
-	bp->b_pages[0] = NULL;
-	bp->b_pages = NULL;
-	bp->b_page_count = 0;
 }
 
 /* Is this a valid daddr within the buftarg? */
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 11/12] xfs: cleanup mapping tmpfs folios into the buffer cache
  2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
                   ` (9 preceding siblings ...)
  2025-03-05 14:05 ` [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory Christoph Hellwig
@ 2025-03-05 14:05 ` Christoph Hellwig
  2025-03-05 18:34   ` Darrick J. Wong
  2025-03-05 14:05 ` [PATCH 12/12] xfs: trace what memory backs a buffer Christoph Hellwig
  11 siblings, 1 reply; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

Directly assign b_addr based on the tmpfs folios without a detour
through pages, reuse the folio_put path used for non-tmpfs buffers
and replace all references to pages in comments with folios.

Partially based on a patch from Dave Chinner <dchinner@redhat.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_buf.c     |  6 ++----
 fs/xfs/xfs_buf_mem.c | 34 ++++++++++------------------------
 fs/xfs/xfs_buf_mem.h |  6 ++----
 3 files changed, 14 insertions(+), 32 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f28ca5cb5bd8..c7f4cafda705 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -206,9 +206,7 @@ xfs_buf_free(
 	if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
 		mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
 
-	if (xfs_buftarg_is_mem(bp->b_target))
-		xmbuf_unmap_page(bp);
-	else if (is_vmalloc_addr(bp->b_addr))
+	if (is_vmalloc_addr(bp->b_addr))
 		vfree(bp->b_addr);
 	else if (bp->b_flags & _XBF_KMEM)
 		kfree(bp->b_addr);
@@ -275,7 +273,7 @@ xfs_buf_alloc_backing_mem(
 	struct folio	*folio;
 
 	if (xfs_buftarg_is_mem(bp->b_target))
-		return xmbuf_map_page(bp);
+		return xmbuf_map_backing_mem(bp);
 
 	/* Assure zeroed buffer for non-read cases. */
 	if (!(flags & XBF_READ))
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index b207754d2ee0..b4ffd80b7cb6 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -74,7 +74,7 @@ xmbuf_alloc(
 
 	/*
 	 * We don't want to bother with kmapping data during repair, so don't
-	 * allow highmem pages to back this mapping.
+	 * allow highmem folios to back this mapping.
 	 */
 	mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
 
@@ -127,14 +127,13 @@ xmbuf_free(
 	kfree(btp);
 }
 
-/* Directly map a shmem page into the buffer cache. */
+/* Directly map a shmem folio into the buffer cache. */
 int
-xmbuf_map_page(
+xmbuf_map_backing_mem(
 	struct xfs_buf		*bp)
 {
 	struct inode		*inode = file_inode(bp->b_target->bt_file);
 	struct folio		*folio = NULL;
-	struct page		*page;
 	loff_t                  pos = BBTOB(xfs_buf_daddr(bp));
 	int			error;
 
@@ -159,30 +158,17 @@ xmbuf_map_page(
 		return -EIO;
 	}
 
-	page = folio_file_page(folio, pos >> PAGE_SHIFT);
-
 	/*
-	 * Mark the page dirty so that it won't be reclaimed once we drop the
-	 * (potentially last) reference in xmbuf_unmap_page.
+	 * Mark the folio dirty so that it won't be reclaimed once we drop the
+	 * (potentially last) reference in xfs_buf_free.
 	 */
-	set_page_dirty(page);
-	unlock_page(page);
+	folio_set_dirty(folio);
+	folio_unlock(folio);
 
-	bp->b_addr = page_address(page);
+	bp->b_addr = folio_address(folio);
 	return 0;
 }
 
-/* Unmap a shmem page that was mapped into the buffer cache. */
-void
-xmbuf_unmap_page(
-	struct xfs_buf		*bp)
-{
-	ASSERT(xfs_buftarg_is_mem(bp->b_target));
-
-	put_page(virt_to_page(bp->b_addr));
-	bp->b_addr = NULL;
-}
-
 /* Is this a valid daddr within the buftarg? */
 bool
 xmbuf_verify_daddr(
@@ -196,7 +182,7 @@ xmbuf_verify_daddr(
 	return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
 }
 
-/* Discard the page backing this buffer. */
+/* Discard the folio backing this buffer. */
 static void
 xmbuf_stale(
 	struct xfs_buf		*bp)
@@ -211,7 +197,7 @@ xmbuf_stale(
 }
 
 /*
- * Finalize a buffer -- discard the backing page if it's stale, or run the
+ * Finalize a buffer -- discard the backing folio if it's stale, or run the
  * write verifier to detect problems.
  */
 int
diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h
index eed4a7b63232..67d525cc1513 100644
--- a/fs/xfs/xfs_buf_mem.h
+++ b/fs/xfs/xfs_buf_mem.h
@@ -19,16 +19,14 @@ int xmbuf_alloc(struct xfs_mount *mp, const char *descr,
 		struct xfs_buftarg **btpp);
 void xmbuf_free(struct xfs_buftarg *btp);
 
-int xmbuf_map_page(struct xfs_buf *bp);
-void xmbuf_unmap_page(struct xfs_buf *bp);
 bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr);
 void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
 int xmbuf_finalize(struct xfs_buf *bp);
 #else
 # define xfs_buftarg_is_mem(...)	(false)
-# define xmbuf_map_page(...)		(-ENOMEM)
-# define xmbuf_unmap_page(...)		((void)0)
 # define xmbuf_verify_daddr(...)	(false)
 #endif /* CONFIG_XFS_MEMORY_BUFS */
 
+int xmbuf_map_backing_mem(struct xfs_buf *bp);
+
 #endif /* __XFS_BUF_MEM_H__ */
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 12/12] xfs: trace what memory backs a buffer
  2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
                   ` (10 preceding siblings ...)
  2025-03-05 14:05 ` [PATCH 11/12] xfs: cleanup mapping tmpfs folios into the buffer cache Christoph Hellwig
@ 2025-03-05 14:05 ` Christoph Hellwig
  11 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 14:05 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

Add three trace points for the different backing memory allocators for
buffers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/xfs_buf.c   | 4 ++++
 fs/xfs/xfs_trace.h | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c7f4cafda705..0014cfab3414 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -240,6 +240,7 @@ xfs_buf_alloc_kmem(
 		return -ENOMEM;
 	}
 	bp->b_flags |= _XBF_KMEM;
+	trace_xfs_buf_backing_kmem(bp, _RET_IP_);
 	return 0;
 }
 
@@ -315,9 +316,11 @@ xfs_buf_alloc_backing_mem(
 	if (!folio) {
 		if (size <= PAGE_SIZE)
 			return -ENOMEM;
+		trace_xfs_buf_backing_fallback(bp, _RET_IP_);
 		goto fallback;
 	}
 	bp->b_addr = folio_address(folio);
+	trace_xfs_buf_backing_folio(bp, _RET_IP_);
 	return 0;
 
 fallback:
@@ -331,6 +334,7 @@ xfs_buf_alloc_backing_mem(
 		memalloc_retry_wait(gfp_mask);
 	}
 
+	trace_xfs_buf_backing_vmalloc(bp, _RET_IP_);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index bfc2f1249022..4a3724043713 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -545,6 +545,10 @@ DEFINE_BUF_EVENT(xfs_buf_iodone_async);
 DEFINE_BUF_EVENT(xfs_buf_error_relse);
 DEFINE_BUF_EVENT(xfs_buf_drain_buftarg);
 DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
+DEFINE_BUF_EVENT(xfs_buf_backing_folio);
+DEFINE_BUF_EVENT(xfs_buf_backing_kmem);
+DEFINE_BUF_EVENT(xfs_buf_backing_vmalloc);
+DEFINE_BUF_EVENT(xfs_buf_backing_fallback);
 
 /* not really buffer traces, but the buf provides useful information */
 DEFINE_BUF_EVENT(xfs_btree_corrupt);
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH 06/12] xfs: remove the kmalloc to page allocator fallback
  2025-03-05 14:05 ` [PATCH 06/12] xfs: remove the kmalloc to page allocator fallback Christoph Hellwig
@ 2025-03-05 18:18   ` Darrick J. Wong
  2025-03-05 23:32     ` Christoph Hellwig
  2025-03-05 21:02   ` Dave Chinner
  1 sibling, 1 reply; 34+ messages in thread
From: Darrick J. Wong @ 2025-03-05 18:18 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Carlos Maiolino, Dave Chinner, linux-xfs

On Wed, Mar 05, 2025 at 07:05:23AM -0700, Christoph Hellwig wrote:
> Since commit 59bb47985c1d ("mm, sl[aou]b: guarantee natural alignment
> for kmalloc(power-of-two)", kmalloc and friends guarantee that power of
> two sized allocations are naturally aligned.  Limit our use of kmalloc
> for buffers to these power of two sizes and remove the fallback to
> the page allocator for this case, but keep a check in addition to
> trusting the slab allocator to get the alignment right.
> 
> Also refactor the kmalloc path to reuse various calculations for the
> size and gfp flags.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_buf.c | 48 ++++++++++++++++++++++++------------------------
>  1 file changed, 24 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 18ec1c1fbca1..073246d4352f 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -243,23 +243,23 @@ xfs_buf_free(
>  
>  static int
>  xfs_buf_alloc_kmem(
> -	struct xfs_buf	*bp,
> -	xfs_buf_flags_t	flags)
> +	struct xfs_buf		*bp,
> +	size_t			size,
> +	gfp_t			gfp_mask)
>  {
> -	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL;
> -	size_t		size = BBTOB(bp->b_length);
> -
> -	/* Assure zeroed buffer for non-read cases. */
> -	if (!(flags & XBF_READ))
> -		gfp_mask |= __GFP_ZERO;
> +	ASSERT(is_power_of_2(size));
> +	ASSERT(size < PAGE_SIZE);
>  
> -	bp->b_addr = kmalloc(size, gfp_mask);
> +	bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL);
>  	if (!bp->b_addr)
>  		return -ENOMEM;
>  
> -	if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
> -	    ((unsigned long)bp->b_addr & PAGE_MASK)) {
> -		/* b_addr spans two pages - use alloc_page instead */
> +	/*
> +	 * Slab guarantees that we get back naturally aligned allocations for
> +	 * power of two sizes.  Keep this check as the canary in the coal mine
> +	 * if anything changes in slab.
> +	 */
> +	if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) {
>  		kfree(bp->b_addr);
>  		bp->b_addr = NULL;
>  		return -ENOMEM;
> @@ -300,18 +300,22 @@ xfs_buf_alloc_backing_mem(
>  	if (xfs_buftarg_is_mem(bp->b_target))
>  		return xmbuf_map_page(bp);
>  
> -	/*
> -	 * For buffers that fit entirely within a single page, first attempt to
> -	 * allocate the memory from the heap to minimise memory usage.  If we
> -	 * can't get heap memory for these small buffers, we fall back to using
> -	 * the page allocator.
> -	 */
> -	if (size < PAGE_SIZE && xfs_buf_alloc_kmem(new_bp, flags) == 0)
> -		return 0;
> +	/* Assure zeroed buffer for non-read cases. */
> +	if (!(flags & XBF_READ))
> +		gfp_mask |= __GFP_ZERO;
>  
>  	if (flags & XBF_READ_AHEAD)
>  		gfp_mask |= __GFP_NORETRY;
>  
> +	/*
> +	 * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that
> +	 * is properly aligned.  The slab allocator now guarantees an aligned
> +	 * allocation for all power of two sizes, we matches most of the smaller

Same suggestion:

"...which matches most of the smaller than PAGE_SIZE buffers..."

as last time.

with the comment fixed,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> +	 * than PAGE_SIZE buffers used by XFS.
> +	 */
> +	if (size < PAGE_SIZE && is_power_of_2(size))
> +		return xfs_buf_alloc_kmem(bp, size, gfp_mask);
> +
>  	/* Make sure that we have a page list */
>  	bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
>  	if (bp->b_page_count <= XB_PAGES) {
> @@ -324,10 +328,6 @@ xfs_buf_alloc_backing_mem(
>  	}
>  	bp->b_flags |= _XBF_PAGES;
>  
> -	/* Assure zeroed buffer for non-read cases. */
> -	if (!(flags & XBF_READ))
> -		gfp_mask |= __GFP_ZERO;
> -
>  	/*
>  	 * Bulk filling of pages can take multiple calls. Not filling the entire
>  	 * array is not an allocation failure, so don't back off if we get at
> -- 
> 2.45.2
> 
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 07/12] xfs: convert buffer cache to use high order folios
  2025-03-05 14:05 ` [PATCH 07/12] xfs: convert buffer cache to use high order folios Christoph Hellwig
@ 2025-03-05 18:20   ` Darrick J. Wong
  2025-03-05 20:50   ` Dave Chinner
  1 sibling, 0 replies; 34+ messages in thread
From: Darrick J. Wong @ 2025-03-05 18:20 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Carlos Maiolino, Dave Chinner, linux-xfs

On Wed, Mar 05, 2025 at 07:05:24AM -0700, Christoph Hellwig wrote:
> Now that we have the buffer cache using the folio API, we can extend
> the use of folios to allocate high order folios for multi-page
> buffers rather than an array of single pages that are then vmapped
> into a contiguous range.
> 
> This creates a new type of single folio buffers that can have arbitrary
> order in addition to the existing multi-folio buffers made up of many
> single page folios that get vmapped.  The single folio is for now
> stashed into the existing b_pages array, but that will go away entirely
> later in the series and remove the temporary page vs folio typing issues
> that only work because the two structures currently can be used largely
> interchangeable.
> 
> The code that allocates buffers will optimistically attempt a high
> order folio allocation as a fast path if the buffer size is a power
> of two and thus fits into a folio. If this high order allocation
> fails, then we fall back to the existing multi-folio allocation
> code. This now forms the slow allocation path, and hopefully will be
> largely unused in normal conditions except for buffers with size
> that are not a power of two like larger remote xattrs.
> 
> This should improve performance of large buffer operations (e.g.
> large directory block sizes) as we should now mostly avoid the
> expense of vmapping large buffers (and the vmap lock contention that
> can occur) as well as avoid the runtime pressure that frequently
> accessing kernel vmapped pages put on the TLBs.
> 
> Based on a patch from Dave Chinner <dchinner@redhat.com>, but mutilated
> beyond recognition.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good now!
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_buf.c | 52 ++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 46 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 073246d4352f..f0666ef57bd2 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -203,9 +203,9 @@ xfs_buf_free_pages(
>  
>  	for (i = 0; i < bp->b_page_count; i++) {
>  		if (bp->b_pages[i])
> -			__free_page(bp->b_pages[i]);
> +			folio_put(page_folio(bp->b_pages[i]));
>  	}
> -	mm_account_reclaimed_pages(bp->b_page_count);
> +	mm_account_reclaimed_pages(howmany(BBTOB(bp->b_length), PAGE_SIZE));
>  
>  	if (bp->b_pages != bp->b_page_array)
>  		kfree(bp->b_pages);
> @@ -277,12 +277,17 @@ xfs_buf_alloc_kmem(
>   * For tmpfs-backed buffers used by in-memory btrees this directly maps the
>   * tmpfs page cache folios.
>   *
> - * For real file system buffers there are two different kinds backing memory:
> + * For real file system buffers there are three different kinds backing memory:
>   *
>   * The first type backs the buffer by a kmalloc allocation.  This is done for
>   * less than PAGE_SIZE allocations to avoid wasting memory.
>   *
> - * The second type of buffer is the multi-page buffer. These are always made
> + * The second type is a single folio buffer - this may be a high order folio or
> + * just a single page sized folio, but either way they get treated the same way
> + * by the rest of the code - the buffer memory spans a single contiguous memory
> + * region that we don't have to map and unmap to access the data directly.
> + *
> + * The third type of buffer is the multi-page buffer. These are always made
>   * up of single pages so that they can be fed to vmap_ram() to return a
>   * contiguous memory region we can access the data through, or mark it as
>   * XBF_UNMAPPED and access the data directly through individual page_address()
> @@ -295,6 +300,7 @@ xfs_buf_alloc_backing_mem(
>  {
>  	size_t		size = BBTOB(bp->b_length);
>  	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
> +	struct folio	*folio;
>  	long		filled = 0;
>  
>  	if (xfs_buftarg_is_mem(bp->b_target))
> @@ -316,7 +322,41 @@ xfs_buf_alloc_backing_mem(
>  	if (size < PAGE_SIZE && is_power_of_2(size))
>  		return xfs_buf_alloc_kmem(bp, size, gfp_mask);
>  
> -	/* Make sure that we have a page list */
> +	/*
> +	 * Don't bother with the retry loop for single PAGE allocations: vmalloc
> +	 * won't do any better.
> +	 */
> +	if (size <= PAGE_SIZE)
> +		gfp_mask |= __GFP_NOFAIL;
> +
> +	/*
> +	 * Optimistically attempt a single high order folio allocation for
> +	 * larger than PAGE_SIZE buffers.
> +	 *
> +	 * Allocating a high order folio makes the assumption that buffers are a
> +	 * power-of-2 size, matching the power-of-2 folios sizes available.
> +	 *
> +	 * The exception here are user xattr data buffers, which can be arbitrarily
> +	 * sized up to 64kB plus structure metadata, skip straight to the vmalloc
> +	 * path for them instead of wasting memory here.
> +	 */
> +	if (size > PAGE_SIZE && !is_power_of_2(size))
> +		goto fallback;
> +	folio = folio_alloc(gfp_mask, get_order(size));
> +	if (!folio) {
> +		if (size <= PAGE_SIZE)
> +			return -ENOMEM;
> +		goto fallback;
> +	}
> +	bp->b_addr = folio_address(folio);
> +	bp->b_page_array[0] = &folio->page;
> +	bp->b_pages = bp->b_page_array;
> +	bp->b_page_count = 1;
> +	bp->b_flags |= _XBF_PAGES;
> +	return 0;
> +
> +fallback:
> +	/* Fall back to allocating an array of single page folios. */
>  	bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
>  	if (bp->b_page_count <= XB_PAGES) {
>  		bp->b_pages = bp->b_page_array;
> @@ -1474,7 +1514,7 @@ xfs_buf_submit_bio(
>  	bio->bi_private = bp;
>  	bio->bi_end_io = xfs_buf_bio_end_io;
>  
> -	if (bp->b_flags & _XBF_KMEM) {
> +	if (bp->b_page_count == 1) {
>  		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
>  				offset_in_page(bp->b_addr));
>  	} else {
> -- 
> 2.45.2
> 
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-03-05 14:05 ` [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory Christoph Hellwig
@ 2025-03-05 18:22   ` Darrick J. Wong
  2025-03-05 21:20   ` Dave Chinner
  1 sibling, 0 replies; 34+ messages in thread
From: Darrick J. Wong @ 2025-03-05 18:22 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Carlos Maiolino, Dave Chinner, linux-xfs

On Wed, Mar 05, 2025 at 07:05:27AM -0700, Christoph Hellwig wrote:
> The fallback buffer allocation path currently open codes a suboptimal
> version of vmalloc to allocate pages that are then mapped into
> vmalloc space.  Switch to using vmalloc instead, which uses all the
> optimizations in the common vmalloc code, and removes the need to
> track the backing pages in the xfs_buf structure.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good now,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_buf.c     | 207 ++++++++++---------------------------------
>  fs/xfs/xfs_buf.h     |   7 --
>  fs/xfs/xfs_buf_mem.c |  11 +--
>  3 files changed, 48 insertions(+), 177 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 2b4b8c104b0c..f28ca5cb5bd8 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -55,13 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
>  	return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
>  }
>  
> -static inline int
> -xfs_buf_vmap_len(
> -	struct xfs_buf	*bp)
> -{
> -	return (bp->b_page_count * PAGE_SIZE);
> -}
> -
>  /*
>   * When we mark a buffer stale, we remove the buffer from the LRU and clear the
>   * b_lru_ref count so that the buffer is freed immediately when the buffer
> @@ -190,29 +183,6 @@ _xfs_buf_alloc(
>  	return 0;
>  }
>  
> -static void
> -xfs_buf_free_pages(
> -	struct xfs_buf	*bp)
> -{
> -	uint		i;
> -
> -	ASSERT(bp->b_flags & _XBF_PAGES);
> -
> -	if (is_vmalloc_addr(bp->b_addr))
> -		vm_unmap_ram(bp->b_addr, bp->b_page_count);
> -
> -	for (i = 0; i < bp->b_page_count; i++) {
> -		if (bp->b_pages[i])
> -			folio_put(page_folio(bp->b_pages[i]));
> -	}
> -	mm_account_reclaimed_pages(howmany(BBTOB(bp->b_length), PAGE_SIZE));
> -
> -	if (bp->b_pages != bp->b_page_array)
> -		kfree(bp->b_pages);
> -	bp->b_pages = NULL;
> -	bp->b_flags &= ~_XBF_PAGES;
> -}
> -
>  static void
>  xfs_buf_free_callback(
>  	struct callback_head	*cb)
> @@ -227,16 +197,23 @@ static void
>  xfs_buf_free(
>  	struct xfs_buf		*bp)
>  {
> +	unsigned int		size = BBTOB(bp->b_length);
> +
>  	trace_xfs_buf_free(bp, _RET_IP_);
>  
>  	ASSERT(list_empty(&bp->b_lru));
>  
> +	if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
> +		mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
> +
>  	if (xfs_buftarg_is_mem(bp->b_target))
>  		xmbuf_unmap_page(bp);
> -	else if (bp->b_flags & _XBF_PAGES)
> -		xfs_buf_free_pages(bp);
> +	else if (is_vmalloc_addr(bp->b_addr))
> +		vfree(bp->b_addr);
>  	else if (bp->b_flags & _XBF_KMEM)
>  		kfree(bp->b_addr);
> +	else
> +		folio_put(virt_to_folio(bp->b_addr));
>  
>  	call_rcu(&bp->b_rcu, xfs_buf_free_callback);
>  }
> @@ -264,9 +241,6 @@ xfs_buf_alloc_kmem(
>  		bp->b_addr = NULL;
>  		return -ENOMEM;
>  	}
> -	bp->b_pages = bp->b_page_array;
> -	bp->b_pages[0] = kmem_to_page(bp->b_addr);
> -	bp->b_page_count = 1;
>  	bp->b_flags |= _XBF_KMEM;
>  	return 0;
>  }
> @@ -287,9 +261,9 @@ xfs_buf_alloc_kmem(
>   * by the rest of the code - the buffer memory spans a single contiguous memory
>   * region that we don't have to map and unmap to access the data directly.
>   *
> - * The third type of buffer is the multi-page buffer. These are always made
> - * up of single pages so that they can be fed to vmap_ram() to return a
> - * contiguous memory region we can access the data through.
> + * The third type of buffer is the vmalloc()d buffer. This provides the buffer
> + * with the required contiguous memory region but backed by discontiguous
> + * physical pages.
>   */
>  static int
>  xfs_buf_alloc_backing_mem(
> @@ -299,7 +273,6 @@ xfs_buf_alloc_backing_mem(
>  	size_t		size = BBTOB(bp->b_length);
>  	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
>  	struct folio	*folio;
> -	long		filled = 0;
>  
>  	if (xfs_buftarg_is_mem(bp->b_target))
>  		return xmbuf_map_page(bp);
> @@ -347,98 +320,18 @@ xfs_buf_alloc_backing_mem(
>  		goto fallback;
>  	}
>  	bp->b_addr = folio_address(folio);
> -	bp->b_page_array[0] = &folio->page;
> -	bp->b_pages = bp->b_page_array;
> -	bp->b_page_count = 1;
> -	bp->b_flags |= _XBF_PAGES;
>  	return 0;
>  
>  fallback:
> -	/* Fall back to allocating an array of single page folios. */
> -	bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
> -	if (bp->b_page_count <= XB_PAGES) {
> -		bp->b_pages = bp->b_page_array;
> -	} else {
> -		bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
> -					gfp_mask);
> -		if (!bp->b_pages)
> -			return -ENOMEM;
> -	}
> -	bp->b_flags |= _XBF_PAGES;
> -
> -	/*
> -	 * Bulk filling of pages can take multiple calls. Not filling the entire
> -	 * array is not an allocation failure, so don't back off if we get at
> -	 * least one extra page.
> -	 */
>  	for (;;) {
> -		long	last = filled;
> -
> -		filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
> -					  bp->b_pages);
> -		if (filled == bp->b_page_count) {
> -			XFS_STATS_INC(bp->b_mount, xb_page_found);
> +		bp->b_addr = __vmalloc(size, gfp_mask);
> +		if (bp->b_addr)
>  			break;
> -		}
> -
> -		if (filled != last)
> -			continue;
> -
> -		if (flags & XBF_READ_AHEAD) {
> -			xfs_buf_free_pages(bp);
> +		if (flags & XBF_READ_AHEAD)
>  			return -ENOMEM;
> -		}
> -
>  		XFS_STATS_INC(bp->b_mount, xb_page_retries);
>  		memalloc_retry_wait(gfp_mask);
>  	}
> -	return 0;
> -}
> -
> -/*
> - *	Map buffer into kernel address-space if necessary.
> - */
> -STATIC int
> -_xfs_buf_map_pages(
> -	struct xfs_buf		*bp,
> -	xfs_buf_flags_t		flags)
> -{
> -	ASSERT(bp->b_flags & _XBF_PAGES);
> -	if (bp->b_page_count == 1) {
> -		/* A single page buffer is always mappable */
> -		bp->b_addr = page_address(bp->b_pages[0]);
> -	} else {
> -		int retried = 0;
> -		unsigned nofs_flag;
> -
> -		/*
> -		 * vm_map_ram() will allocate auxiliary structures (e.g.
> -		 * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
> -		 * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
> -		 * from the same call site that can be run from both above and
> -		 * below memory reclaim causes lockdep false positives. Hence we
> -		 * always need to force this allocation to nofs context because
> -		 * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
> -		 * prevent false positive lockdep reports.
> -		 *
> -		 * XXX(dgc): I think dquot reclaim is the only place we can get
> -		 * to this function from memory reclaim context now. If we fix
> -		 * that like we've fixed inode reclaim to avoid writeback from
> -		 * reclaim, this nofs wrapping can go away.
> -		 */
> -		nofs_flag = memalloc_nofs_save();
> -		do {
> -			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
> -						-1);
> -			if (bp->b_addr)
> -				break;
> -			vm_unmap_aliases();
> -		} while (retried++ <= 1);
> -		memalloc_nofs_restore(nofs_flag);
> -
> -		if (!bp->b_addr)
> -			return -ENOMEM;
> -	}
>  
>  	return 0;
>  }
> @@ -558,7 +451,7 @@ xfs_buf_find_lock(
>  			return -ENOENT;
>  		}
>  		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
> -		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
> +		bp->b_flags &= _XBF_KMEM;
>  		bp->b_ops = NULL;
>  	}
>  	return 0;
> @@ -744,18 +637,6 @@ xfs_buf_get_map(
>  			xfs_perag_put(pag);
>  	}
>  
> -	/* We do not hold a perag reference anymore. */
> -	if (!bp->b_addr) {
> -		error = _xfs_buf_map_pages(bp, flags);
> -		if (unlikely(error)) {
> -			xfs_warn_ratelimited(btp->bt_mount,
> -				"%s: failed to map %u pages", __func__,
> -				bp->b_page_count);
> -			xfs_buf_relse(bp);
> -			return error;
> -		}
> -	}
> -
>  	/*
>  	 * Clear b_error if this is a lookup from a caller that doesn't expect
>  	 * valid data to be found in the buffer.
> @@ -998,14 +879,6 @@ xfs_buf_get_uncached(
>  	if (error)
>  		goto fail_free_buf;
>  
> -	if (!bp->b_addr)
> -		error = _xfs_buf_map_pages(bp, 0);
> -	if (unlikely(error)) {
> -		xfs_warn(target->bt_mount,
> -			"%s: failed to map pages", __func__);
> -		goto fail_free_buf;
> -	}
> -
>  	trace_xfs_buf_get_uncached(bp, _RET_IP_);
>  	*bpp = bp;
>  	return 0;
> @@ -1339,7 +1212,7 @@ __xfs_buf_ioend(
>  	if (bp->b_flags & XBF_READ) {
>  		if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
>  			invalidate_kernel_vmap_range(bp->b_addr,
> -					xfs_buf_vmap_len(bp));
> +				roundup(BBTOB(bp->b_length), PAGE_SIZE));
>  		if (!bp->b_error && bp->b_ops)
>  			bp->b_ops->verify_read(bp);
>  		if (!bp->b_error)
> @@ -1500,29 +1373,43 @@ static void
>  xfs_buf_submit_bio(
>  	struct xfs_buf		*bp)
>  {
> -	unsigned int		size = BBTOB(bp->b_length);
> -	unsigned int		map = 0, p;
> +	unsigned int		map = 0;
>  	struct blk_plug		plug;
>  	struct bio		*bio;
>  
> -	bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
> -			xfs_buf_bio_op(bp), GFP_NOIO);
> -	bio->bi_private = bp;
> -	bio->bi_end_io = xfs_buf_bio_end_io;
> +	if (is_vmalloc_addr(bp->b_addr)) {
> +		unsigned int	size = BBTOB(bp->b_length);
> +		unsigned int	alloc_size = roundup(size, PAGE_SIZE);
> +		void		*data = bp->b_addr;
>  
> -	if (bp->b_page_count == 1) {
> -		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
> -				offset_in_page(bp->b_addr));
> -	} else {
> -		for (p = 0; p < bp->b_page_count; p++)
> -			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
> -		bio->bi_iter.bi_size = size; /* limit to the actual size used */
> +		bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
> +				xfs_buf_bio_op(bp), GFP_NOIO);
> +
> +		do {
> +			unsigned int	len = min(size, PAGE_SIZE);
>  
> -		if (is_vmalloc_addr(bp->b_addr))
> -			flush_kernel_vmap_range(bp->b_addr,
> -					xfs_buf_vmap_len(bp));
> +			ASSERT(offset_in_page(data) == 0);
> +			__bio_add_page(bio, vmalloc_to_page(data), len, 0);
> +			data += len;
> +			size -= len;
> +		} while (size);
> +
> +		flush_kernel_vmap_range(bp->b_addr, alloc_size);
> +	} else {
> +		/*
> +		 * Single folio or slab allocation.  Must be contiguous and thus
> +		 * only a single bvec is needed.
> +		 */
> +		bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
> +				GFP_NOIO);
> +		__bio_add_page(bio, virt_to_page(bp->b_addr),
> +				BBTOB(bp->b_length),
> +				offset_in_page(bp->b_addr));
>  	}
>  
> +	bio->bi_private = bp;
> +	bio->bi_end_io = xfs_buf_bio_end_io;
> +
>  	/*
>  	 * If there is more than one map segment, split out a new bio for each
>  	 * map except of the last one.  The last map is handled by the
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index 8db522f19b0c..db43bdc17f55 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -36,7 +36,6 @@ struct xfs_buf;
>  #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
>  
>  /* flags used only internally */
> -#define _XBF_PAGES	 (1u << 20)/* backed by refcounted pages */
>  #define _XBF_KMEM	 (1u << 21)/* backed by heap memory */
>  #define _XBF_DELWRI_Q	 (1u << 22)/* buffer on a delwri queue */
>  
> @@ -61,7 +60,6 @@ typedef unsigned int xfs_buf_flags_t;
>  	{ XBF_STALE,		"STALE" }, \
>  	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
>  	{ _XBF_LOGRECOVERY,	"LOG_RECOVERY" }, \
> -	{ _XBF_PAGES,		"PAGES" }, \
>  	{ _XBF_KMEM,		"KMEM" }, \
>  	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
>  	/* The following interface flags should never be set */ \
> @@ -122,8 +120,6 @@ struct xfs_buftarg {
>  	struct xfs_buf_cache	bt_cache[];
>  };
>  
> -#define XB_PAGES	2
> -
>  struct xfs_buf_map {
>  	xfs_daddr_t		bm_bn;	/* block number for I/O */
>  	int			bm_len;	/* size of I/O */
> @@ -185,13 +181,10 @@ struct xfs_buf {
>  	struct xfs_buf_log_item	*b_log_item;
>  	struct list_head	b_li_list;	/* Log items list head */
>  	struct xfs_trans	*b_transp;
> -	struct page		**b_pages;	/* array of page pointers */
> -	struct page		*b_page_array[XB_PAGES]; /* inline pages */
>  	struct xfs_buf_map	*b_maps;	/* compound buffer map */
>  	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
>  	int			b_map_count;
>  	atomic_t		b_pin_count;	/* pin count */
> -	unsigned int		b_page_count;	/* size of page array */
>  	int			b_error;	/* error code on I/O */
>  	void			(*b_iodone)(struct xfs_buf *bp);
>  
> diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> index 5b64a2b3b113..b207754d2ee0 100644
> --- a/fs/xfs/xfs_buf_mem.c
> +++ b/fs/xfs/xfs_buf_mem.c
> @@ -169,9 +169,6 @@ xmbuf_map_page(
>  	unlock_page(page);
>  
>  	bp->b_addr = page_address(page);
> -	bp->b_pages = bp->b_page_array;
> -	bp->b_pages[0] = page;
> -	bp->b_page_count = 1;
>  	return 0;
>  }
>  
> @@ -180,16 +177,10 @@ void
>  xmbuf_unmap_page(
>  	struct xfs_buf		*bp)
>  {
> -	struct page		*page = bp->b_pages[0];
> -
>  	ASSERT(xfs_buftarg_is_mem(bp->b_target));
>  
> -	put_page(page);
> -
> +	put_page(virt_to_page(bp->b_addr));
>  	bp->b_addr = NULL;
> -	bp->b_pages[0] = NULL;
> -	bp->b_pages = NULL;
> -	bp->b_page_count = 0;
>  }
>  
>  /* Is this a valid daddr within the buftarg? */
> -- 
> 2.45.2
> 
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 11/12] xfs: cleanup mapping tmpfs folios into the buffer cache
  2025-03-05 14:05 ` [PATCH 11/12] xfs: cleanup mapping tmpfs folios into the buffer cache Christoph Hellwig
@ 2025-03-05 18:34   ` Darrick J. Wong
  0 siblings, 0 replies; 34+ messages in thread
From: Darrick J. Wong @ 2025-03-05 18:34 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Carlos Maiolino, Dave Chinner, linux-xfs

On Wed, Mar 05, 2025 at 07:05:28AM -0700, Christoph Hellwig wrote:
> Directly assign b_addr based on the tmpfs folios without a detour
> through pages, reuse the folio_put path used for non-tmpfs buffers
> and replace all references to pages in comments with folios.
> 
> Partially based on a patch from Dave Chinner <dchinner@redhat.com>.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Seems fine to me, so
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_buf.c     |  6 ++----
>  fs/xfs/xfs_buf_mem.c | 34 ++++++++++------------------------
>  fs/xfs/xfs_buf_mem.h |  6 ++----
>  3 files changed, 14 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index f28ca5cb5bd8..c7f4cafda705 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -206,9 +206,7 @@ xfs_buf_free(
>  	if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
>  		mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
>  
> -	if (xfs_buftarg_is_mem(bp->b_target))
> -		xmbuf_unmap_page(bp);
> -	else if (is_vmalloc_addr(bp->b_addr))
> +	if (is_vmalloc_addr(bp->b_addr))
>  		vfree(bp->b_addr);
>  	else if (bp->b_flags & _XBF_KMEM)
>  		kfree(bp->b_addr);
> @@ -275,7 +273,7 @@ xfs_buf_alloc_backing_mem(
>  	struct folio	*folio;
>  
>  	if (xfs_buftarg_is_mem(bp->b_target))
> -		return xmbuf_map_page(bp);
> +		return xmbuf_map_backing_mem(bp);
>  
>  	/* Assure zeroed buffer for non-read cases. */
>  	if (!(flags & XBF_READ))
> diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> index b207754d2ee0..b4ffd80b7cb6 100644
> --- a/fs/xfs/xfs_buf_mem.c
> +++ b/fs/xfs/xfs_buf_mem.c
> @@ -74,7 +74,7 @@ xmbuf_alloc(
>  
>  	/*
>  	 * We don't want to bother with kmapping data during repair, so don't
> -	 * allow highmem pages to back this mapping.
> +	 * allow highmem folios to back this mapping.
>  	 */
>  	mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
>  
> @@ -127,14 +127,13 @@ xmbuf_free(
>  	kfree(btp);
>  }
>  
> -/* Directly map a shmem page into the buffer cache. */
> +/* Directly map a shmem folio into the buffer cache. */
>  int
> -xmbuf_map_page(
> +xmbuf_map_backing_mem(
>  	struct xfs_buf		*bp)
>  {
>  	struct inode		*inode = file_inode(bp->b_target->bt_file);
>  	struct folio		*folio = NULL;
> -	struct page		*page;
>  	loff_t                  pos = BBTOB(xfs_buf_daddr(bp));
>  	int			error;
>  
> @@ -159,30 +158,17 @@ xmbuf_map_page(
>  		return -EIO;
>  	}
>  
> -	page = folio_file_page(folio, pos >> PAGE_SHIFT);
> -
>  	/*
> -	 * Mark the page dirty so that it won't be reclaimed once we drop the
> -	 * (potentially last) reference in xmbuf_unmap_page.
> +	 * Mark the folio dirty so that it won't be reclaimed once we drop the
> +	 * (potentially last) reference in xfs_buf_free.
>  	 */
> -	set_page_dirty(page);
> -	unlock_page(page);
> +	folio_set_dirty(folio);
> +	folio_unlock(folio);
>  
> -	bp->b_addr = page_address(page);
> +	bp->b_addr = folio_address(folio);

If tmpfs gives us a large folio, don't we need to add offset_in_folio to
b_addr?  Or does this just work and xfs/801 (aka force the use of
hugepages on tmpfs) passes with no complaints?

--D

>  	return 0;
>  }
>  
> -/* Unmap a shmem page that was mapped into the buffer cache. */
> -void
> -xmbuf_unmap_page(
> -	struct xfs_buf		*bp)
> -{
> -	ASSERT(xfs_buftarg_is_mem(bp->b_target));
> -
> -	put_page(virt_to_page(bp->b_addr));
> -	bp->b_addr = NULL;
> -}
> -
>  /* Is this a valid daddr within the buftarg? */
>  bool
>  xmbuf_verify_daddr(
> @@ -196,7 +182,7 @@ xmbuf_verify_daddr(
>  	return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
>  }
>  
> -/* Discard the page backing this buffer. */
> +/* Discard the folio backing this buffer. */
>  static void
>  xmbuf_stale(
>  	struct xfs_buf		*bp)
> @@ -211,7 +197,7 @@ xmbuf_stale(
>  }
>  
>  /*
> - * Finalize a buffer -- discard the backing page if it's stale, or run the
> + * Finalize a buffer -- discard the backing folio if it's stale, or run the
>   * write verifier to detect problems.
>   */
>  int
> diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h
> index eed4a7b63232..67d525cc1513 100644
> --- a/fs/xfs/xfs_buf_mem.h
> +++ b/fs/xfs/xfs_buf_mem.h
> @@ -19,16 +19,14 @@ int xmbuf_alloc(struct xfs_mount *mp, const char *descr,
>  		struct xfs_buftarg **btpp);
>  void xmbuf_free(struct xfs_buftarg *btp);
>  
> -int xmbuf_map_page(struct xfs_buf *bp);
> -void xmbuf_unmap_page(struct xfs_buf *bp);
>  bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr);
>  void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
>  int xmbuf_finalize(struct xfs_buf *bp);
>  #else
>  # define xfs_buftarg_is_mem(...)	(false)
> -# define xmbuf_map_page(...)		(-ENOMEM)
> -# define xmbuf_unmap_page(...)		((void)0)
>  # define xmbuf_verify_daddr(...)	(false)
>  #endif /* CONFIG_XFS_MEMORY_BUFS */
>  
> +int xmbuf_map_backing_mem(struct xfs_buf *bp);
> +
>  #endif /* __XFS_BUF_MEM_H__ */
> -- 
> 2.45.2
> 
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 07/12] xfs: convert buffer cache to use high order folios
  2025-03-05 14:05 ` [PATCH 07/12] xfs: convert buffer cache to use high order folios Christoph Hellwig
  2025-03-05 18:20   ` Darrick J. Wong
@ 2025-03-05 20:50   ` Dave Chinner
  2025-03-05 23:33     ` Christoph Hellwig
  2025-03-10 13:18     ` Christoph Hellwig
  1 sibling, 2 replies; 34+ messages in thread
From: Dave Chinner @ 2025-03-05 20:50 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Carlos Maiolino, Darrick J. Wong, Dave Chinner, linux-xfs

On Wed, Mar 05, 2025 at 07:05:24AM -0700, Christoph Hellwig wrote:
> Now that we have the buffer cache using the folio API, we can extend
> the use of folios to allocate high order folios for multi-page
> buffers rather than an array of single pages that are then vmapped
> into a contiguous range.
> 
> This creates a new type of single folio buffers that can have arbitrary
> order in addition to the existing multi-folio buffers made up of many
> single page folios that get vmapped.  The single folio is for now
> stashed into the existing b_pages array, but that will go away entirely
> later in the series and remove the temporary page vs folio typing issues
> that only work because the two structures currently can be used largely
> interchangeable.
> 
> The code that allocates buffers will optimistically attempt a high
> order folio allocation as a fast path if the buffer size is a power
> of two and thus fits into a folio. If this high order allocation
> fails, then we fall back to the existing multi-folio allocation
> code. This now forms the slow allocation path, and hopefully will be
> largely unused in normal conditions except for buffers with size
> that are not a power of two like larger remote xattrs.
> 
> This should improve performance of large buffer operations (e.g.
> large directory block sizes) as we should now mostly avoid the
> expense of vmapping large buffers (and the vmap lock contention that
> can occur) as well as avoid the runtime pressure that frequently
> accessing kernel vmapped pages put on the TLBs.
> 
> Based on a patch from Dave Chinner <dchinner@redhat.com>, but mutilated
> beyond recognition.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_buf.c | 52 ++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 46 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 073246d4352f..f0666ef57bd2 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -203,9 +203,9 @@ xfs_buf_free_pages(
>  
>  	for (i = 0; i < bp->b_page_count; i++) {
>  		if (bp->b_pages[i])
> -			__free_page(bp->b_pages[i]);
> +			folio_put(page_folio(bp->b_pages[i]));
>  	}
> -	mm_account_reclaimed_pages(bp->b_page_count);
> +	mm_account_reclaimed_pages(howmany(BBTOB(bp->b_length), PAGE_SIZE));
>  
>  	if (bp->b_pages != bp->b_page_array)
>  		kfree(bp->b_pages);
> @@ -277,12 +277,17 @@ xfs_buf_alloc_kmem(
>   * For tmpfs-backed buffers used by in-memory btrees this directly maps the
>   * tmpfs page cache folios.
>   *
> - * For real file system buffers there are two different kinds backing memory:
> + * For real file system buffers there are three different kinds backing memory:
>   *
>   * The first type backs the buffer by a kmalloc allocation.  This is done for
>   * less than PAGE_SIZE allocations to avoid wasting memory.
>   *
> - * The second type of buffer is the multi-page buffer. These are always made
> + * The second type is a single folio buffer - this may be a high order folio or
> + * just a single page sized folio, but either way they get treated the same way
> + * by the rest of the code - the buffer memory spans a single contiguous memory
> + * region that we don't have to map and unmap to access the data directly.
> + *
> + * The third type of buffer is the multi-page buffer. These are always made
>   * up of single pages so that they can be fed to vmap_ram() to return a
>   * contiguous memory region we can access the data through, or mark it as
>   * XBF_UNMAPPED and access the data directly through individual page_address()
> @@ -295,6 +300,7 @@ xfs_buf_alloc_backing_mem(
>  {
>  	size_t		size = BBTOB(bp->b_length);
>  	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
> +	struct folio	*folio;
>  	long		filled = 0;
>  
>  	if (xfs_buftarg_is_mem(bp->b_target))
> @@ -316,7 +322,41 @@ xfs_buf_alloc_backing_mem(
>  	if (size < PAGE_SIZE && is_power_of_2(size))
>  		return xfs_buf_alloc_kmem(bp, size, gfp_mask);
>  
> -	/* Make sure that we have a page list */
> +	/*
> +	 * Don't bother with the retry loop for single PAGE allocations: vmalloc
> +	 * won't do any better.
> +	 */
> +	if (size <= PAGE_SIZE)
> +		gfp_mask |= __GFP_NOFAIL;
> +
> +	/*
> +	 * Optimistically attempt a single high order folio allocation for
> +	 * larger than PAGE_SIZE buffers.
> +	 *
> +	 * Allocating a high order folio makes the assumption that buffers are a
> +	 * power-of-2 size, matching the power-of-2 folios sizes available.
> +	 *
> +	 * The exception here are user xattr data buffers, which can be arbitrarily
> +	 * sized up to 64kB plus structure metadata, skip straight to the vmalloc
> +	 * path for them instead of wasting memory here.
> +	 */
> +	if (size > PAGE_SIZE && !is_power_of_2(size))
> +		goto fallback;
> +	folio = folio_alloc(gfp_mask, get_order(size));

The only thing extra that I would do here is take a leaf from the
kmalloc() call in xlog_kvmalloc() and turn off direct reclaim for
this allocation because >= 32kB allocations are considered "costly"
and so will enter the compaction code if direct reclaim is enabled.

Given that we fall back to vmalloc, clearing __GFP_DIRECT_RECLAIM
and setting __GFP_NORETRY here means that we don't burn lots of CPU
on memory compaction if there is no high order folios available for
immediate allocation. And on a busy machine, compaction is likely to
fail frequently and so this is all wasted CPU time.

This may be one of the reasons why you don't see any change in real
performance with 64kB directory blocks - we spend more time in
folio allocation because of compaction overhead than we gain back
from avoiding the use of vmapped buffers....

i.e.
	if (size > PAGE_SIZE) {
		if (!is_power_of_2(size))
			goto fallback;
		gfp_mask &= ~__GFP_DIRECT_RECLAIM;
		gfp_mask |= __GFP_NORETRY;
	}
	folio = folio_alloc(gfp_mask, get_order(size));

-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 06/12] xfs: remove the kmalloc to page allocator fallback
  2025-03-05 14:05 ` [PATCH 06/12] xfs: remove the kmalloc to page allocator fallback Christoph Hellwig
  2025-03-05 18:18   ` Darrick J. Wong
@ 2025-03-05 21:02   ` Dave Chinner
  2025-03-05 23:38     ` Christoph Hellwig
  1 sibling, 1 reply; 34+ messages in thread
From: Dave Chinner @ 2025-03-05 21:02 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Carlos Maiolino, Darrick J. Wong, Dave Chinner, linux-xfs

On Wed, Mar 05, 2025 at 07:05:23AM -0700, Christoph Hellwig wrote:
> Since commit 59bb47985c1d ("mm, sl[aou]b: guarantee natural alignment
> for kmalloc(power-of-two)", kmalloc and friends guarantee that power of
> two sized allocations are naturally aligned.  Limit our use of kmalloc
> for buffers to these power of two sizes and remove the fallback to
> the page allocator for this case, but keep a check in addition to
> trusting the slab allocator to get the alignment right.
> 
> Also refactor the kmalloc path to reuse various calculations for the
> size and gfp flags.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
.....
> @@ -300,18 +300,22 @@ xfs_buf_alloc_backing_mem(
>  	if (xfs_buftarg_is_mem(bp->b_target))
>  		return xmbuf_map_page(bp);
>  
> -	/*
> -	 * For buffers that fit entirely within a single page, first attempt to
> -	 * allocate the memory from the heap to minimise memory usage.  If we
> -	 * can't get heap memory for these small buffers, we fall back to using
> -	 * the page allocator.
> -	 */
> -	if (size < PAGE_SIZE && xfs_buf_alloc_kmem(new_bp, flags) == 0)
> -		return 0;
> +	/* Assure zeroed buffer for non-read cases. */
> +	if (!(flags & XBF_READ))
> +		gfp_mask |= __GFP_ZERO;

We should probably drop this zeroing altogether.

The higher level code cannot assume a buffer gained for write has
been zeroed by the xfs_trans_get_buf() path contains zeros. e.g. if
the buffer was in cache when the get_buf() call occurs, it
will contain whatever was in the buffer, not zeros. This occurs
even if the buffer was STALE in cache at the time of the get()
operation.

Hence callers must always initialise the entire buffer themselves
(and they do!), hence allocating zeroed buffers when we are going
to zero it ourselves anyway is really unnecessary overhead...

This may not matter for 4kB block size filesystems, but it may make
a difference for 64kB block size filesystems, especially when we are
only doing a get() on the buffer to mark it stale in a transaction
and never actually use the contents of it...

-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-03-05 14:05 ` [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory Christoph Hellwig
  2025-03-05 18:22   ` Darrick J. Wong
@ 2025-03-05 21:20   ` Dave Chinner
  2025-03-05 22:54     ` Darrick J. Wong
  2025-03-05 23:35     ` Christoph Hellwig
  1 sibling, 2 replies; 34+ messages in thread
From: Dave Chinner @ 2025-03-05 21:20 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Carlos Maiolino, Darrick J. Wong, Dave Chinner, linux-xfs

On Wed, Mar 05, 2025 at 07:05:27AM -0700, Christoph Hellwig wrote:
> The fallback buffer allocation path currently open codes a suboptimal
> version of vmalloc to allocate pages that are then mapped into
> vmalloc space.  Switch to using vmalloc instead, which uses all the
> optimizations in the common vmalloc code, and removes the need to
> track the backing pages in the xfs_buf structure.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
.....

> @@ -1500,29 +1373,43 @@ static void
>  xfs_buf_submit_bio(
>  	struct xfs_buf		*bp)
>  {
> -	unsigned int		size = BBTOB(bp->b_length);
> -	unsigned int		map = 0, p;
> +	unsigned int		map = 0;
>  	struct blk_plug		plug;
>  	struct bio		*bio;
>  
> -	bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
> -			xfs_buf_bio_op(bp), GFP_NOIO);
> -	bio->bi_private = bp;
> -	bio->bi_end_io = xfs_buf_bio_end_io;
> +	if (is_vmalloc_addr(bp->b_addr)) {
> +		unsigned int	size = BBTOB(bp->b_length);
> +		unsigned int	alloc_size = roundup(size, PAGE_SIZE);
> +		void		*data = bp->b_addr;
>  
> -	if (bp->b_page_count == 1) {
> -		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
> -				offset_in_page(bp->b_addr));
> -	} else {
> -		for (p = 0; p < bp->b_page_count; p++)
> -			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
> -		bio->bi_iter.bi_size = size; /* limit to the actual size used */
> +		bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
> +				xfs_buf_bio_op(bp), GFP_NOIO);
> +
> +		do {
> +			unsigned int	len = min(size, PAGE_SIZE);
>  
> -		if (is_vmalloc_addr(bp->b_addr))
> -			flush_kernel_vmap_range(bp->b_addr,
> -					xfs_buf_vmap_len(bp));
> +			ASSERT(offset_in_page(data) == 0);
> +			__bio_add_page(bio, vmalloc_to_page(data), len, 0);
> +			data += len;
> +			size -= len;
> +		} while (size);
> +
> +		flush_kernel_vmap_range(bp->b_addr, alloc_size);
> +	} else {
> +		/*
> +		 * Single folio or slab allocation.  Must be contiguous and thus
> +		 * only a single bvec is needed.
> +		 */
> +		bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
> +				GFP_NOIO);
> +		__bio_add_page(bio, virt_to_page(bp->b_addr),
> +				BBTOB(bp->b_length),
> +				offset_in_page(bp->b_addr));
>  	}

How does offset_in_page() work with a high order folio? It can only
return a value between 0 and (PAGE_SIZE - 1). i.e. shouldn't this
be:

		folio = kmem_to_folio(bp->b_addr);

		bio_add_folio_nofail(bio, folio, BBTOB(bp->b_length),
				offset_in_folio(folio, bp->b_addr));


-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-03-05 21:20   ` Dave Chinner
@ 2025-03-05 22:54     ` Darrick J. Wong
  2025-03-05 23:28       ` Dave Chinner
  2025-03-05 23:35     ` Christoph Hellwig
  1 sibling, 1 reply; 34+ messages in thread
From: Darrick J. Wong @ 2025-03-05 22:54 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, Carlos Maiolino, Dave Chinner, linux-xfs

On Thu, Mar 06, 2025 at 08:20:08AM +1100, Dave Chinner wrote:
> On Wed, Mar 05, 2025 at 07:05:27AM -0700, Christoph Hellwig wrote:
> > The fallback buffer allocation path currently open codes a suboptimal
> > version of vmalloc to allocate pages that are then mapped into
> > vmalloc space.  Switch to using vmalloc instead, which uses all the
> > optimizations in the common vmalloc code, and removes the need to
> > track the backing pages in the xfs_buf structure.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> .....
> 
> > @@ -1500,29 +1373,43 @@ static void
> >  xfs_buf_submit_bio(
> >  	struct xfs_buf		*bp)
> >  {
> > -	unsigned int		size = BBTOB(bp->b_length);
> > -	unsigned int		map = 0, p;
> > +	unsigned int		map = 0;
> >  	struct blk_plug		plug;
> >  	struct bio		*bio;
> >  
> > -	bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
> > -			xfs_buf_bio_op(bp), GFP_NOIO);
> > -	bio->bi_private = bp;
> > -	bio->bi_end_io = xfs_buf_bio_end_io;
> > +	if (is_vmalloc_addr(bp->b_addr)) {
> > +		unsigned int	size = BBTOB(bp->b_length);
> > +		unsigned int	alloc_size = roundup(size, PAGE_SIZE);
> > +		void		*data = bp->b_addr;
> >  
> > -	if (bp->b_page_count == 1) {
> > -		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
> > -				offset_in_page(bp->b_addr));
> > -	} else {
> > -		for (p = 0; p < bp->b_page_count; p++)
> > -			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
> > -		bio->bi_iter.bi_size = size; /* limit to the actual size used */
> > +		bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
> > +				xfs_buf_bio_op(bp), GFP_NOIO);
> > +
> > +		do {
> > +			unsigned int	len = min(size, PAGE_SIZE);
> >  
> > -		if (is_vmalloc_addr(bp->b_addr))
> > -			flush_kernel_vmap_range(bp->b_addr,
> > -					xfs_buf_vmap_len(bp));
> > +			ASSERT(offset_in_page(data) == 0);
> > +			__bio_add_page(bio, vmalloc_to_page(data), len, 0);
> > +			data += len;
> > +			size -= len;
> > +		} while (size);
> > +
> > +		flush_kernel_vmap_range(bp->b_addr, alloc_size);
> > +	} else {
> > +		/*
> > +		 * Single folio or slab allocation.  Must be contiguous and thus
> > +		 * only a single bvec is needed.
> > +		 */
> > +		bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
> > +				GFP_NOIO);
> > +		__bio_add_page(bio, virt_to_page(bp->b_addr),
> > +				BBTOB(bp->b_length),
> > +				offset_in_page(bp->b_addr));
> >  	}
> 
> How does offset_in_page() work with a high order folio? It can only
> return a value between 0 and (PAGE_SIZE - 1). i.e. shouldn't this
> be:
> 
> 		folio = kmem_to_folio(bp->b_addr);
> 
> 		bio_add_folio_nofail(bio, folio, BBTOB(bp->b_length),
> 				offset_in_folio(folio, bp->b_addr));

I think offset_in_folio() returns 0 in the !kmem && !vmalloc case
because we allocate the folio and set b_addr to folio_address(folio);
and we never call the kmem alloc code for sizes greater than PAGE_SIZE.

--D

> 
> 
> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-03-05 22:54     ` Darrick J. Wong
@ 2025-03-05 23:28       ` Dave Chinner
  2025-03-05 23:45         ` Christoph Hellwig
  0 siblings, 1 reply; 34+ messages in thread
From: Dave Chinner @ 2025-03-05 23:28 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, Carlos Maiolino, Dave Chinner, linux-xfs

On Wed, Mar 05, 2025 at 02:54:07PM -0800, Darrick J. Wong wrote:
> On Thu, Mar 06, 2025 at 08:20:08AM +1100, Dave Chinner wrote:
> > On Wed, Mar 05, 2025 at 07:05:27AM -0700, Christoph Hellwig wrote:
> > > The fallback buffer allocation path currently open codes a suboptimal
> > > version of vmalloc to allocate pages that are then mapped into
> > > vmalloc space.  Switch to using vmalloc instead, which uses all the
> > > optimizations in the common vmalloc code, and removes the need to
> > > track the backing pages in the xfs_buf structure.
> > > 
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > .....
> > 
> > > @@ -1500,29 +1373,43 @@ static void
> > >  xfs_buf_submit_bio(
> > >  	struct xfs_buf		*bp)
> > >  {
> > > -	unsigned int		size = BBTOB(bp->b_length);
> > > -	unsigned int		map = 0, p;
> > > +	unsigned int		map = 0;
> > >  	struct blk_plug		plug;
> > >  	struct bio		*bio;
> > >  
> > > -	bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
> > > -			xfs_buf_bio_op(bp), GFP_NOIO);
> > > -	bio->bi_private = bp;
> > > -	bio->bi_end_io = xfs_buf_bio_end_io;
> > > +	if (is_vmalloc_addr(bp->b_addr)) {
> > > +		unsigned int	size = BBTOB(bp->b_length);
> > > +		unsigned int	alloc_size = roundup(size, PAGE_SIZE);
> > > +		void		*data = bp->b_addr;
> > >  
> > > -	if (bp->b_page_count == 1) {
> > > -		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
> > > -				offset_in_page(bp->b_addr));
> > > -	} else {
> > > -		for (p = 0; p < bp->b_page_count; p++)
> > > -			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
> > > -		bio->bi_iter.bi_size = size; /* limit to the actual size used */
> > > +		bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
> > > +				xfs_buf_bio_op(bp), GFP_NOIO);
> > > +
> > > +		do {
> > > +			unsigned int	len = min(size, PAGE_SIZE);
> > >  
> > > -		if (is_vmalloc_addr(bp->b_addr))
> > > -			flush_kernel_vmap_range(bp->b_addr,
> > > -					xfs_buf_vmap_len(bp));
> > > +			ASSERT(offset_in_page(data) == 0);
> > > +			__bio_add_page(bio, vmalloc_to_page(data), len, 0);
> > > +			data += len;
> > > +			size -= len;
> > > +		} while (size);
> > > +
> > > +		flush_kernel_vmap_range(bp->b_addr, alloc_size);
> > > +	} else {
> > > +		/*
> > > +		 * Single folio or slab allocation.  Must be contiguous and thus
> > > +		 * only a single bvec is needed.
> > > +		 */
> > > +		bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
> > > +				GFP_NOIO);
> > > +		__bio_add_page(bio, virt_to_page(bp->b_addr),
> > > +				BBTOB(bp->b_length),
> > > +				offset_in_page(bp->b_addr));
> > >  	}
> > 
> > How does offset_in_page() work with a high order folio? It can only
> > return a value between 0 and (PAGE_SIZE - 1). i.e. shouldn't this
> > be:
> > 
> > 		folio = kmem_to_folio(bp->b_addr);
> > 
> > 		bio_add_folio_nofail(bio, folio, BBTOB(bp->b_length),
> > 				offset_in_folio(folio, bp->b_addr));
> 
> I think offset_in_folio() returns 0 in the !kmem && !vmalloc case
> because we allocate the folio and set b_addr to folio_address(folio);
> and we never call the kmem alloc code for sizes greater than PAGE_SIZE.

Yes, but that misses my point: this is a folio conversion, whilst
this treats a folio as a page. We're trying to get rid of this sort
of page/folio type confusion (i.e. stuff like "does offset_in_page()
work correctly on large folios"). New code shouldn't be adding
new issues like these, especially when there are existing
folio-based APIs that are guaranteed to work correctly and won't
need fixing in future before pages and folios can be fully
separated.

-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 06/12] xfs: remove the kmalloc to page allocator fallback
  2025-03-05 18:18   ` Darrick J. Wong
@ 2025-03-05 23:32     ` Christoph Hellwig
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 23:32 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, Carlos Maiolino, Dave Chinner, linux-xfs

On Wed, Mar 05, 2025 at 10:18:12AM -0800, Darrick J. Wong wrote:
> > +	/*
> > +	 * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that
> > +	 * is properly aligned.  The slab allocator now guarantees an aligned
> > +	 * allocation for all power of two sizes, we matches most of the smaller
> 
> Same suggestion:
> 
> "...which matches most of the smaller than PAGE_SIZE buffers..."

Hmm, I thought I had fixed that, but for sure I have now.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 07/12] xfs: convert buffer cache to use high order folios
  2025-03-05 20:50   ` Dave Chinner
@ 2025-03-05 23:33     ` Christoph Hellwig
  2025-03-10 13:18     ` Christoph Hellwig
  1 sibling, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 23:33 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Christoph Hellwig, Carlos Maiolino, Darrick J. Wong, Dave Chinner,
	linux-xfs

On Thu, Mar 06, 2025 at 07:50:37AM +1100, Dave Chinner wrote:
> This may be one of the reasons why you don't see any change in real
> performance with 64kB directory blocks - we spend more time in
> folio allocation because of compaction overhead than we gain back
> from avoiding the use of vmapped buffers....
> 
> i.e.
> 	if (size > PAGE_SIZE) {
> 		if (!is_power_of_2(size))
> 			goto fallback;
> 		gfp_mask ~= __GFP_DIRECT_RECLAIM;
> 		gfp_mask |= __GFP_NORETRY;
> 	}
> 	folio = folio_alloc(gfp_mask, get_order(size));

I'll give it a try.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-03-05 21:20   ` Dave Chinner
  2025-03-05 22:54     ` Darrick J. Wong
@ 2025-03-05 23:35     ` Christoph Hellwig
  2025-03-06  0:57       ` Dave Chinner
  1 sibling, 1 reply; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 23:35 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Christoph Hellwig, Carlos Maiolino, Darrick J. Wong, Dave Chinner,
	linux-xfs

On Thu, Mar 06, 2025 at 08:20:08AM +1100, Dave Chinner wrote:
> > +		__bio_add_page(bio, virt_to_page(bp->b_addr),
> > +				BBTOB(bp->b_length),
> > +				offset_in_page(bp->b_addr));
> >  	}
> 
> How does offset_in_page() work with a high order folio? It can only
> return a value between 0 and (PAGE_SIZE - 1).

Yes.

> i.e. shouldn't this
> be:
> 
> 		folio = kmem_to_folio(bp->b_addr);
> 
> 		bio_add_folio_nofail(bio, folio, BBTOB(bp->b_length),
> 				offset_in_folio(folio, bp->b_addr));
> 

That is also correct, but does a lot more work underneath as the
bio_vecs work in terms of pages.  In the long run this should use
a bio_add_virt that hides all that (and the bio_vecs should move to
store physical addresses).  For now the above is the simplest and
most efficient version.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 06/12] xfs: remove the kmalloc to page allocator fallback
  2025-03-05 21:02   ` Dave Chinner
@ 2025-03-05 23:38     ` Christoph Hellwig
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 23:38 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Christoph Hellwig, Carlos Maiolino, Darrick J. Wong, Dave Chinner,
	linux-xfs, Bill O'Donnell

On Thu, Mar 06, 2025 at 08:02:33AM +1100, Dave Chinner wrote:
> > +	if (!(flags & XBF_READ))
> > +		gfp_mask |= __GFP_ZERO;
> 
> We should probably drop this zeroing altogether.

Maybe.  Not in this patch or series, though and whoever wants to submit
it needs to explain why the rationale used for adding it in commit
3219e8cf0dade9884d3c6cb432d433b4ca56875d doesn't apply any more.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-03-05 23:28       ` Dave Chinner
@ 2025-03-05 23:45         ` Christoph Hellwig
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-05 23:45 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Darrick J. Wong, Christoph Hellwig, Carlos Maiolino, Dave Chinner,
	linux-xfs

On Thu, Mar 06, 2025 at 10:28:30AM +1100, Dave Chinner wrote:
> Yes, but that misses my point: this is a folio conversion, whilst
> this treats a folio as a page.

The code covers both folios and slab.  willy has explicitly NAKed using
folio helpers for slab.  So if we'd really want to use folio helpers
here we'd need to special case them vs slab.  That's why I keep using
the existing API for now until the block layer grows better helpers,
on which I'm waiting.

And this is not new code, it's relatively minor refactoring of the
existing code.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-03-05 23:35     ` Christoph Hellwig
@ 2025-03-06  0:57       ` Dave Chinner
  2025-03-06  1:40         ` Christoph Hellwig
  0 siblings, 1 reply; 34+ messages in thread
From: Dave Chinner @ 2025-03-06  0:57 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Carlos Maiolino, Darrick J. Wong, Dave Chinner, linux-xfs

On Thu, Mar 06, 2025 at 12:35:36AM +0100, Christoph Hellwig wrote:
> On Thu, Mar 06, 2025 at 08:20:08AM +1100, Dave Chinner wrote:
> > > +		__bio_add_page(bio, virt_to_page(bp->b_addr),
> > > +				BBTOB(bp->b_length),
> > > +				offset_in_page(bp->b_addr));
> > >  	}
> > 
> > How does offset_in_page() work with a high order folio? It can only
> > return a value between 0 and (PAGE_SIZE - 1).
> 
> Yes.
> 
> > i.e. shouldn't this
> > be:
> > 
> > 		folio = kmem_to_folio(bp->b_addr);
> > 
> > 		bio_add_folio_nofail(bio, folio, BBTOB(bp->b_length),
> > 				offset_in_folio(folio, bp->b_addr));
> > 
> 
> That is also correct, but does a lot more work underneath as the
> bio_vecs work in terms of pages.  In the long run this should use
> a bio_add_virt that hides all that (and the bio_vecs should move to
> store physical addresses).  For now the above is the simplest and
> most efficient version.

Can you add a comment that the code is done this way because
of the mismatch between block layer API and mm object (folio/slab)
handling APIs? Otherwise this code is going to look wrong every time
I look at in future....

-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-03-06  0:57       ` Dave Chinner
@ 2025-03-06  1:40         ` Christoph Hellwig
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-06  1:40 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Christoph Hellwig, Carlos Maiolino, Darrick J. Wong, Dave Chinner,
	linux-xfs

On Thu, Mar 06, 2025 at 11:57:29AM +1100, Dave Chinner wrote:
> Can you add a comment that the code is done this way because
> of the mismatch between block layer API and mm object (folio/slab)
> handling APIs? Otherwise this code is going to look wrong every time
> I look at in future....

Sure.  Although I hope we'll have the bio_add_virt helper by the
following merge window.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 07/12] xfs: convert buffer cache to use high order folios
  2025-03-05 20:50   ` Dave Chinner
  2025-03-05 23:33     ` Christoph Hellwig
@ 2025-03-10 13:18     ` Christoph Hellwig
  1 sibling, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-10 13:18 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Christoph Hellwig, Carlos Maiolino, Darrick J. Wong, Dave Chinner,
	linux-xfs

On Thu, Mar 06, 2025 at 07:50:37AM +1100, Dave Chinner wrote:
> The only thing extra that I would do here is take a leaf from the
> kmalloc() call in xlog_kvmalloc() and turn off direct reclaim for
> this allocation because >= 32kB allocations are considered "costly"
> and so will enter the compaction code if direct reclaim is enabled.
> 
> Given that we fall back to vmalloc, clearing __GFP_DIRECT_RECLAIM
> and setting __GFP_NORETRY here means that we don't burn lots of CPU
> on memory compaction if there is no high order folios available for
> immediate allocation. And on a busy machine, compaction is likely to
> fail frequently and so this is all wasted CPU time.
> 
> This may be one of the reasons why you don't see any change in real
> performance with 64kB directory blocks - we spend more time in
> folio allocation because of compaction overhead than we gain back
> from avoiding the use of vmapped buffers....

FYI, this did not make any difference in my testing.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory
  2025-03-10 13:19 use folios and vmalloc for buffer cache backing memory v3 Christoph Hellwig
@ 2025-03-10 13:19 ` Christoph Hellwig
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2025-03-10 13:19 UTC (permalink / raw)
  To: Carlos Maiolino; +Cc: Darrick J. Wong, Dave Chinner, linux-xfs

The fallback buffer allocation path currently open codes a suboptimal
version of vmalloc to allocate pages that are then mapped into
vmalloc space.  Switch to using vmalloc instead, which uses all the
optimizations in the common vmalloc code, and removes the need to
track the backing pages in the xfs_buf structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/xfs_buf.c     | 212 +++++++++++--------------------------------
 fs/xfs/xfs_buf.h     |   7 --
 fs/xfs/xfs_buf_mem.c |  11 +--
 3 files changed, 53 insertions(+), 177 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b5ec7d83210f..4aaa588330e4 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -55,13 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
 	return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
 }
 
-static inline int
-xfs_buf_vmap_len(
-	struct xfs_buf	*bp)
-{
-	return (bp->b_page_count * PAGE_SIZE);
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -190,29 +183,6 @@ _xfs_buf_alloc(
 	return 0;
 }
 
-static void
-xfs_buf_free_pages(
-	struct xfs_buf	*bp)
-{
-	uint		i;
-
-	ASSERT(bp->b_flags & _XBF_PAGES);
-
-	if (is_vmalloc_addr(bp->b_addr))
-		vm_unmap_ram(bp->b_addr, bp->b_page_count);
-
-	for (i = 0; i < bp->b_page_count; i++) {
-		if (bp->b_pages[i])
-			folio_put(page_folio(bp->b_pages[i]));
-	}
-	mm_account_reclaimed_pages(howmany(BBTOB(bp->b_length), PAGE_SIZE));
-
-	if (bp->b_pages != bp->b_page_array)
-		kfree(bp->b_pages);
-	bp->b_pages = NULL;
-	bp->b_flags &= ~_XBF_PAGES;
-}
-
 static void
 xfs_buf_free_callback(
 	struct callback_head	*cb)
@@ -227,16 +197,23 @@ static void
 xfs_buf_free(
 	struct xfs_buf		*bp)
 {
+	unsigned int		size = BBTOB(bp->b_length);
+
 	trace_xfs_buf_free(bp, _RET_IP_);
 
 	ASSERT(list_empty(&bp->b_lru));
 
+	if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
+		mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
+
 	if (xfs_buftarg_is_mem(bp->b_target))
 		xmbuf_unmap_page(bp);
-	else if (bp->b_flags & _XBF_PAGES)
-		xfs_buf_free_pages(bp);
+	else if (is_vmalloc_addr(bp->b_addr))
+		vfree(bp->b_addr);
 	else if (bp->b_flags & _XBF_KMEM)
 		kfree(bp->b_addr);
+	else
+		folio_put(virt_to_folio(bp->b_addr));
 
 	call_rcu(&bp->b_rcu, xfs_buf_free_callback);
 }
@@ -264,9 +241,6 @@ xfs_buf_alloc_kmem(
 		bp->b_addr = NULL;
 		return -ENOMEM;
 	}
-	bp->b_pages = bp->b_page_array;
-	bp->b_pages[0] = kmem_to_page(bp->b_addr);
-	bp->b_page_count = 1;
 	bp->b_flags |= _XBF_KMEM;
 	return 0;
 }
@@ -287,9 +261,9 @@ xfs_buf_alloc_kmem(
  * by the rest of the code - the buffer memory spans a single contiguous memory
  * region that we don't have to map and unmap to access the data directly.
  *
- * The third type of buffer is the multi-page buffer. These are always made
- * up of single pages so that they can be fed to vmap_ram() to return a
- * contiguous memory region we can access the data through.
+ * The third type of buffer is the vmalloc()d buffer. This provides the buffer
+ * with the required contiguous memory region but backed by discontiguous
+ * physical pages.
  */
 static int
 xfs_buf_alloc_backing_mem(
@@ -299,7 +273,6 @@ xfs_buf_alloc_backing_mem(
 	size_t		size = BBTOB(bp->b_length);
 	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
 	struct folio	*folio;
-	long		filled = 0;
 
 	if (xfs_buftarg_is_mem(bp->b_target))
 		return xmbuf_map_page(bp);
@@ -351,98 +324,18 @@ xfs_buf_alloc_backing_mem(
 		goto fallback;
 	}
 	bp->b_addr = folio_address(folio);
-	bp->b_page_array[0] = &folio->page;
-	bp->b_pages = bp->b_page_array;
-	bp->b_page_count = 1;
-	bp->b_flags |= _XBF_PAGES;
 	return 0;
 
 fallback:
-	/* Fall back to allocating an array of single page folios. */
-	bp->b_page_count = DIV_ROUND_UP(size, PAGE_SIZE);
-	if (bp->b_page_count <= XB_PAGES) {
-		bp->b_pages = bp->b_page_array;
-	} else {
-		bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
-					gfp_mask);
-		if (!bp->b_pages)
-			return -ENOMEM;
-	}
-	bp->b_flags |= _XBF_PAGES;
-
-	/*
-	 * Bulk filling of pages can take multiple calls. Not filling the entire
-	 * array is not an allocation failure, so don't back off if we get at
-	 * least one extra page.
-	 */
 	for (;;) {
-		long	last = filled;
-
-		filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
-					  bp->b_pages);
-		if (filled == bp->b_page_count) {
-			XFS_STATS_INC(bp->b_mount, xb_page_found);
+		bp->b_addr = __vmalloc(size, gfp_mask);
+		if (bp->b_addr)
 			break;
-		}
-
-		if (filled != last)
-			continue;
-
-		if (flags & XBF_READ_AHEAD) {
-			xfs_buf_free_pages(bp);
+		if (flags & XBF_READ_AHEAD)
 			return -ENOMEM;
-		}
-
 		XFS_STATS_INC(bp->b_mount, xb_page_retries);
 		memalloc_retry_wait(gfp_mask);
 	}
-	return 0;
-}
-
-/*
- *	Map buffer into kernel address-space if necessary.
- */
-STATIC int
-_xfs_buf_map_pages(
-	struct xfs_buf		*bp,
-	xfs_buf_flags_t		flags)
-{
-	ASSERT(bp->b_flags & _XBF_PAGES);
-	if (bp->b_page_count == 1) {
-		/* A single page buffer is always mappable */
-		bp->b_addr = page_address(bp->b_pages[0]);
-	} else {
-		int retried = 0;
-		unsigned nofs_flag;
-
-		/*
-		 * vm_map_ram() will allocate auxiliary structures (e.g.
-		 * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
-		 * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
-		 * from the same call site that can be run from both above and
-		 * below memory reclaim causes lockdep false positives. Hence we
-		 * always need to force this allocation to nofs context because
-		 * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
-		 * prevent false positive lockdep reports.
-		 *
-		 * XXX(dgc): I think dquot reclaim is the only place we can get
-		 * to this function from memory reclaim context now. If we fix
-		 * that like we've fixed inode reclaim to avoid writeback from
-		 * reclaim, this nofs wrapping can go away.
-		 */
-		nofs_flag = memalloc_nofs_save();
-		do {
-			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-						-1);
-			if (bp->b_addr)
-				break;
-			vm_unmap_aliases();
-		} while (retried++ <= 1);
-		memalloc_nofs_restore(nofs_flag);
-
-		if (!bp->b_addr)
-			return -ENOMEM;
-	}
 
 	return 0;
 }
@@ -562,7 +455,7 @@ xfs_buf_find_lock(
 			return -ENOENT;
 		}
 		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+		bp->b_flags &= _XBF_KMEM;
 		bp->b_ops = NULL;
 	}
 	return 0;
@@ -748,18 +641,6 @@ xfs_buf_get_map(
 			xfs_perag_put(pag);
 	}
 
-	/* We do not hold a perag reference anymore. */
-	if (!bp->b_addr) {
-		error = _xfs_buf_map_pages(bp, flags);
-		if (unlikely(error)) {
-			xfs_warn_ratelimited(btp->bt_mount,
-				"%s: failed to map %u pages", __func__,
-				bp->b_page_count);
-			xfs_buf_relse(bp);
-			return error;
-		}
-	}
-
 	/*
 	 * Clear b_error if this is a lookup from a caller that doesn't expect
 	 * valid data to be found in the buffer.
@@ -1002,14 +883,6 @@ xfs_buf_get_uncached(
 	if (error)
 		goto fail_free_buf;
 
-	if (!bp->b_addr)
-		error = _xfs_buf_map_pages(bp, 0);
-	if (unlikely(error)) {
-		xfs_warn(target->bt_mount,
-			"%s: failed to map pages", __func__);
-		goto fail_free_buf;
-	}
-
 	trace_xfs_buf_get_uncached(bp, _RET_IP_);
 	*bpp = bp;
 	return 0;
@@ -1343,7 +1216,7 @@ __xfs_buf_ioend(
 	if (bp->b_flags & XBF_READ) {
 		if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
 			invalidate_kernel_vmap_range(bp->b_addr,
-					xfs_buf_vmap_len(bp));
+				roundup(BBTOB(bp->b_length), PAGE_SIZE));
 		if (!bp->b_error && bp->b_ops)
 			bp->b_ops->verify_read(bp);
 		if (!bp->b_error)
@@ -1504,29 +1377,48 @@ static void
 xfs_buf_submit_bio(
 	struct xfs_buf		*bp)
 {
-	unsigned int		size = BBTOB(bp->b_length);
-	unsigned int		map = 0, p;
+	unsigned int		map = 0;
 	struct blk_plug		plug;
 	struct bio		*bio;
 
-	bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
-			xfs_buf_bio_op(bp), GFP_NOIO);
-	bio->bi_private = bp;
-	bio->bi_end_io = xfs_buf_bio_end_io;
+	if (is_vmalloc_addr(bp->b_addr)) {
+		unsigned int	size = BBTOB(bp->b_length);
+		unsigned int	alloc_size = roundup(size, PAGE_SIZE);
+		void		*data = bp->b_addr;
 
-	if (bp->b_page_count == 1) {
-		__bio_add_page(bio, virt_to_page(bp->b_addr), size,
-				offset_in_page(bp->b_addr));
-	} else {
-		for (p = 0; p < bp->b_page_count; p++)
-			__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
-		bio->bi_iter.bi_size = size; /* limit to the actual size used */
+		bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
+				xfs_buf_bio_op(bp), GFP_NOIO);
+
+		do {
+			unsigned int	len = min(size, PAGE_SIZE);
 
-		if (is_vmalloc_addr(bp->b_addr))
-			flush_kernel_vmap_range(bp->b_addr,
-					xfs_buf_vmap_len(bp));
+			ASSERT(offset_in_page(data) == 0);
+			__bio_add_page(bio, vmalloc_to_page(data), len, 0);
+			data += len;
+			size -= len;
+		} while (size);
+
+		flush_kernel_vmap_range(bp->b_addr, alloc_size);
+	} else {
+		/*
+		 * Single folio or slab allocation.  Must be contiguous and thus
+		 * only a single bvec is needed.
+		 *
+		 * This uses the page based bio add helper for now as that is
+		 * the lowest common denominator between folios and slab
+		 * allocations.  To be replaced with a better block layer
+		 * helper soon (hopefully).
+		 */
+		bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
+				GFP_NOIO);
+		__bio_add_page(bio, virt_to_page(bp->b_addr),
+				BBTOB(bp->b_length),
+				offset_in_page(bp->b_addr));
 	}
 
+	bio->bi_private = bp;
+	bio->bi_end_io = xfs_buf_bio_end_io;
+
 	/*
 	 * If there is more than one map segment, split out a new bio for each
 	 * map except of the last one.  The last map is handled by the
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8db522f19b0c..db43bdc17f55 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -36,7 +36,6 @@ struct xfs_buf;
 #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
 
 /* flags used only internally */
-#define _XBF_PAGES	 (1u << 20)/* backed by refcounted pages */
 #define _XBF_KMEM	 (1u << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q	 (1u << 22)/* buffer on a delwri queue */
 
@@ -61,7 +60,6 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_STALE,		"STALE" }, \
 	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
 	{ _XBF_LOGRECOVERY,	"LOG_RECOVERY" }, \
-	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_KMEM,		"KMEM" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
 	/* The following interface flags should never be set */ \
@@ -122,8 +120,6 @@ struct xfs_buftarg {
 	struct xfs_buf_cache	bt_cache[];
 };
 
-#define XB_PAGES	2
-
 struct xfs_buf_map {
 	xfs_daddr_t		bm_bn;	/* block number for I/O */
 	int			bm_len;	/* size of I/O */
@@ -185,13 +181,10 @@ struct xfs_buf {
 	struct xfs_buf_log_item	*b_log_item;
 	struct list_head	b_li_list;	/* Log items list head */
 	struct xfs_trans	*b_transp;
-	struct page		**b_pages;	/* array of page pointers */
-	struct page		*b_page_array[XB_PAGES]; /* inline pages */
 	struct xfs_buf_map	*b_maps;	/* compound buffer map */
 	struct xfs_buf_map	__b_map;	/* inline compound buffer map */
 	int			b_map_count;
 	atomic_t		b_pin_count;	/* pin count */
-	unsigned int		b_page_count;	/* size of page array */
 	int			b_error;	/* error code on I/O */
 	void			(*b_iodone)(struct xfs_buf *bp);
 
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 5b64a2b3b113..b207754d2ee0 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -169,9 +169,6 @@ xmbuf_map_page(
 	unlock_page(page);
 
 	bp->b_addr = page_address(page);
-	bp->b_pages = bp->b_page_array;
-	bp->b_pages[0] = page;
-	bp->b_page_count = 1;
 	return 0;
 }
 
@@ -180,16 +177,10 @@ void
 xmbuf_unmap_page(
 	struct xfs_buf		*bp)
 {
-	struct page		*page = bp->b_pages[0];
-
 	ASSERT(xfs_buftarg_is_mem(bp->b_target));
 
-	put_page(page);
-
+	put_page(virt_to_page(bp->b_addr));
 	bp->b_addr = NULL;
-	bp->b_pages[0] = NULL;
-	bp->b_pages = NULL;
-	bp->b_page_count = 0;
 }
 
 /* Is this a valid daddr within the buftarg? */
-- 
2.45.2


^ permalink raw reply related	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2025-03-10 13:20 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-05 14:05 use folios and vmalloc for buffer cache backing memory v2 Christoph Hellwig
2025-03-05 14:05 ` [PATCH 01/12] xfs: unmapped buffer item size straddling mismatch Christoph Hellwig
2025-03-05 14:05 ` [PATCH 02/12] xfs: add a fast path to xfs_buf_zero when b_addr is set Christoph Hellwig
2025-03-05 14:05 ` [PATCH 03/12] xfs: remove xfs_buf.b_offset Christoph Hellwig
2025-03-05 14:05 ` [PATCH 04/12] xfs: remove xfs_buf_is_vmapped Christoph Hellwig
2025-03-05 14:05 ` [PATCH 05/12] xfs: refactor backing memory allocations for buffers Christoph Hellwig
2025-03-05 14:05 ` [PATCH 06/12] xfs: remove the kmalloc to page allocator fallback Christoph Hellwig
2025-03-05 18:18   ` Darrick J. Wong
2025-03-05 23:32     ` Christoph Hellwig
2025-03-05 21:02   ` Dave Chinner
2025-03-05 23:38     ` Christoph Hellwig
2025-03-05 14:05 ` [PATCH 07/12] xfs: convert buffer cache to use high order folios Christoph Hellwig
2025-03-05 18:20   ` Darrick J. Wong
2025-03-05 20:50   ` Dave Chinner
2025-03-05 23:33     ` Christoph Hellwig
2025-03-10 13:18     ` Christoph Hellwig
2025-03-05 14:05 ` [PATCH 08/12] xfs: kill XBF_UNMAPPED Christoph Hellwig
2025-03-05 14:05 ` [PATCH 09/12] xfs: buffer items don't straddle pages anymore Christoph Hellwig
2025-03-05 14:05 ` [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory Christoph Hellwig
2025-03-05 18:22   ` Darrick J. Wong
2025-03-05 21:20   ` Dave Chinner
2025-03-05 22:54     ` Darrick J. Wong
2025-03-05 23:28       ` Dave Chinner
2025-03-05 23:45         ` Christoph Hellwig
2025-03-05 23:35     ` Christoph Hellwig
2025-03-06  0:57       ` Dave Chinner
2025-03-06  1:40         ` Christoph Hellwig
2025-03-05 14:05 ` [PATCH 11/12] xfs: cleanup mapping tmpfs folios into the buffer cache Christoph Hellwig
2025-03-05 18:34   ` Darrick J. Wong
2025-03-05 14:05 ` [PATCH 12/12] xfs: trace what memory backs a buffer Christoph Hellwig
  -- strict thread matches above, loose matches on Subject: below --
2025-03-10 13:19 use folios and vmalloc for buffer cache backing memory v3 Christoph Hellwig
2025-03-10 13:19 ` [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer backing memory Christoph Hellwig
2025-02-26 15:51 use folios and vmalloc for buffer cache " Christoph Hellwig
2025-02-26 15:51 ` [PATCH 10/12] xfs: use vmalloc instead of vm_map_area for buffer " Christoph Hellwig
2025-02-26 18:02   ` Darrick J. Wong
2025-03-04 14:10     ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox