* [PATCH] xfs: export buffer cache usage via stats
@ 2025-04-28 18:11 Wengang Wang
2025-04-30 2:38 ` Dave Chinner
0 siblings, 1 reply; 3+ messages in thread
From: Wengang Wang @ 2025-04-28 18:11 UTC (permalink / raw)
To: linux-xfs; +Cc: wen.gang.wang
This patch introduces new fields to per-mount and global stats,
and export them to user space.
@page_alloc -- number of pages allocated from buddy to buffer cache
@page_free -- number of pages freed to buddy from buffer cache
@kbb_alloc -- number of BBs allocated from kmalloc slab to buffer cache
@kbb_free -- number of BBs freed to kmalloc slab from buffer cache
@vbb_alloc -- number of BBs allocated from vmalloc system to buffer cache
@vbb_free -- number of BBs freed to vmalloc system from buffer cache
By looking at above stats fields, user space can easily know the buffer
cache usage.
Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
---
fs/xfs/xfs_buf.c | 15 +++++++++++----
fs/xfs/xfs_stats.c | 16 ++++++++++++++++
fs/xfs/xfs_stats.h | 8 ++++++++
3 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1a2b3f06fa71..db3cb94eabee 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -113,12 +113,17 @@ xfs_buf_free(
if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
- if (is_vmalloc_addr(bp->b_addr))
+ if (is_vmalloc_addr(bp->b_addr)) {
vfree(bp->b_addr);
- else if (bp->b_flags & _XBF_KMEM)
+ XFS_STATS_ADD(bp->b_mount, xs_buf_vbb_free, bp->b_length);
+ } else if (bp->b_flags & _XBF_KMEM) {
kfree(bp->b_addr);
- else
+ XFS_STATS_ADD(bp->b_mount, xs_buf_kbb_free, bp->b_length);
+ } else {
folio_put(virt_to_folio(bp->b_addr));
+ XFS_STATS_ADD(bp->b_mount, xs_buf_page_free,
+ BBTOB(bp->b_length) >> PAGE_SHIFT);
+ }
call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}
@@ -147,6 +152,7 @@ xfs_buf_alloc_kmem(
return -ENOMEM;
}
bp->b_flags |= _XBF_KMEM;
+ XFS_STATS_ADD(bp->b_mount, xs_buf_kbb_alloc, bp->b_length);
trace_xfs_buf_backing_kmem(bp, _RET_IP_);
return 0;
}
@@ -232,6 +238,7 @@ xfs_buf_alloc_backing_mem(
}
bp->b_addr = folio_address(folio);
trace_xfs_buf_backing_folio(bp, _RET_IP_);
+ XFS_STATS_ADD(bp->b_mount, xs_buf_page_alloc, size >> PAGE_SHIFT);
return 0;
fallback:
@@ -244,7 +251,7 @@ xfs_buf_alloc_backing_mem(
XFS_STATS_INC(bp->b_mount, xb_page_retries);
memalloc_retry_wait(gfp_mask);
}
-
+ XFS_STATS_ADD(bp->b_mount, xs_buf_vbb_alloc, bp->b_length);
trace_xfs_buf_backing_vmalloc(bp, _RET_IP_);
return 0;
}
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 35c7fb3ba324..a0f6813dc782 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -24,6 +24,12 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
uint64_t xs_write_bytes = 0;
uint64_t xs_read_bytes = 0;
uint64_t defer_relog = 0;
+ uint64_t pg_alloc = 0;
+ uint64_t pg_free = 0;
+ uint64_t kbb_alloc = 0;
+ uint64_t kbb_free = 0;
+ uint64_t vbb_alloc = 0;
+ uint64_t vbb_free = 0;
static const struct xstats_entry {
char *desc;
@@ -77,6 +83,12 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes;
xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes;
defer_relog += per_cpu_ptr(stats, i)->s.defer_relog;
+ pg_alloc += per_cpu_ptr(stats, i)->s.xs_buf_page_alloc;
+ pg_free += per_cpu_ptr(stats, i)->s.xs_buf_page_free;
+ kbb_alloc += per_cpu_ptr(stats, i)->s.xs_buf_kbb_alloc;
+ kbb_free += per_cpu_ptr(stats, i)->s.xs_buf_kbb_free;
+ vbb_alloc += per_cpu_ptr(stats, i)->s.xs_buf_vbb_alloc;
+ vbb_free += per_cpu_ptr(stats, i)->s.xs_buf_vbb_free;
}
len += scnprintf(buf + len, PATH_MAX-len, "xpc %llu %llu %llu\n",
@@ -89,6 +101,10 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
#else
0);
#endif
+ len += scnprintf(buf + len, PATH_MAX-len,
+ "cache %llu %llu %llu %llu %llu %llu\n",
+ pg_alloc, pg_free, kbb_alloc, kbb_free,
+ vbb_alloc, vbb_free);
return len;
}
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 15ba1abcf253..5e186880d8d0 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -143,6 +143,14 @@ struct __xfsstats {
uint64_t xs_write_bytes;
uint64_t xs_read_bytes;
uint64_t defer_relog;
+
+ /* number of pages/bbs allocated/freed in buffer cache */
+ uint64_t xs_buf_page_alloc;
+ uint64_t xs_buf_page_free;
+ uint64_t xs_buf_kbb_alloc;
+ uint64_t xs_buf_kbb_free;
+ uint64_t xs_buf_vbb_alloc;
+ uint64_t xs_buf_vbb_free;
};
#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t))
--
2.39.5 (Apple Git-154)
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] xfs: export buffer cache usage via stats
2025-04-28 18:11 [PATCH] xfs: export buffer cache usage via stats Wengang Wang
@ 2025-04-30 2:38 ` Dave Chinner
2025-04-30 15:25 ` Wengang Wang
0 siblings, 1 reply; 3+ messages in thread
From: Dave Chinner @ 2025-04-30 2:38 UTC (permalink / raw)
To: Wengang Wang; +Cc: linux-xfs
On Mon, Apr 28, 2025 at 11:11:35AM -0700, Wengang Wang wrote:
> This patch introduces new fields to per-mount and global stats,
> and export them to user space.
>
> @page_alloc -- number of pages allocated from buddy to buffer cache
> @page_free -- number of pages freed to buddy from buffer cache
> @kbb_alloc -- number of BBs allocated from kmalloc slab to buffer cache
> @kbb_free -- number of BBs freed to kmalloc slab from buffer cache
> @vbb_alloc -- number of BBs allocated from vmalloc system to buffer cache
> @vbb_free -- number of BBs freed to vmalloc system from buffer cache
This forms a permanent user API once created, so exposing internal
implementation details like this doesn't make me feel good. We've
changed how we allocate memory for buffers quite a bit recently
to do things like support large folios and minimise vmap usage,
then to use vmalloc instead of vmap, etc. e.g. we don't use pages
at all in the buffer cache anymore..
I'm actually looking at further simplifying the implementation - I
think the custom folio/vmalloc stuff can be replaced entirely by a
single call to kvmalloc() now, which means some stuff will come from
slabs, some from the buddy and some from vmalloc. We won't know
where it comes from at all, and if this stats interface already
existed then such a change would render it completely useless.
> By looking at above stats fields, user space can easily know the buffer
> cache usage.
Not easily - the implementation only aggregates alloc/free values so
the user has to manually do the (alloc - free) calculation to
determine how much memory is currenlty in use. And then we don't
really know what size buffers are actually using that memory...
i.e. buffers for everything other than xattrs are fixed sizes (single
sector, single block, directory block, inode cluster), so it makes
more sense to me to dump a buffer size histogram for memory
usage. We can infer things like inode cluster memory usage from such
output, so not only would we get memory usage we also get some
insight into what is consuming the memory.
Hence I think it would be better to track a set of buffer size based
buckets so we get output something like:
buffer size count Total Bytes
----------- ----- -----------
< 4kB <n> <aggregate count of b_length>
4kB
<= 8kB
<= 16kB
<= 32kB
<= 64kB
I also think that it might be better to dump this in a separate
sysfs file rather than add it to the existing stats file.
With this information on any given system, we can infer what
allocated from slab based on the buffer sizes and system PAGE_SIZE.
However, my main point is that for the general case of "how much
memory is in use by the buffer cache", we really don't want to tie
it to the internal allocation implementation. A histogram output like the
above is not tied to the internal implementation, whilst giving
additional insight into what size allocations are generating all the
memory usage...
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] xfs: export buffer cache usage via stats
2025-04-30 2:38 ` Dave Chinner
@ 2025-04-30 15:25 ` Wengang Wang
0 siblings, 0 replies; 3+ messages in thread
From: Wengang Wang @ 2025-04-30 15:25 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs@vger.kernel.org
Hi Dave,
Thanks for advising. I will try to dump a size histogram in the next drop.
Wengang
> On Apr 29, 2025, at 7:38 PM, Dave Chinner <david@fromorbit.com> wrote:
>
> On Mon, Apr 28, 2025 at 11:11:35AM -0700, Wengang Wang wrote:
>> This patch introduces new fields to per-mount and global stats,
>> and export them to user space.
>>
>> @page_alloc -- number of pages allocated from buddy to buffer cache
>> @page_free -- number of pages freed to buddy from buffer cache
>> @kbb_alloc -- number of BBs allocated from kmalloc slab to buffer cache
>> @kbb_free -- number of BBs freed to kmalloc slab from buffer cache
>> @vbb_alloc -- number of BBs allocated from vmalloc system to buffer cache
>> @vbb_free -- number of BBs freed to vmalloc system from buffer cache
>
> This forms a permanent user API once created, so exposing internal
> implementation details like this doesn't make me feel good. We've
> changed how we allocate memory for buffers quite a bit recently
> to do things like support large folios and minimise vmap usage,
> then to use vmalloc instead of vmap, etc. e.g. we don't use pages
> at all in the buffer cache anymore..
>
> I'm actually looking at further simplifying the implementation - I
> think the custom folio/vmalloc stuff can be replaced entirely by a
> single call to kvmalloc() now, which means some stuff will come from
> slabs, some from the buddy and some from vmalloc. We won't know
> where it comes from at all, and if this stats interface already
> existed then such a change would render it completely useless.
>
>> By looking at above stats fields, user space can easily know the buffer
>> cache usage.
>
> Not easily - the implementation only aggregates alloc/free values so
> the user has to manually do the (alloc - free) calculation to
> determine how much memory is currently in use. And then we don't
> really know what size buffers are actually using that memory...
>
> i.e. buffers for everything other than xattrs are fixed sizes (single
> sector, single block, directory block, inode cluster), so it makes
> more sense to me to dump a buffer size histogram for memory
> usage. We can infer things like inode cluster memory usage from such
> output, so not only would we get memory usage we also get some
> insight into what is consuming the memory.
>
> Hence I think it would be better to track a set of buffer size based
> buckets so we get output something like:
>
> buffer size count Total Bytes
> ----------- ----- -----------
> < 4kB <n> <aggregate count of b_length>
> 4kB
> <= 8kB
> <= 16kB
> <= 32kB
> <= 64kB
>
> I also think that it might be better to dump this in a separate
> sysfs file rather than add it to the existing stats file.
>
> With this information on any given system, we can infer what
> allocated from slab based on the buffer sizes and system PAGE_SIZE.
>
> However, my main point is that for the general case of "how much
> memory is in use by the buffer cache", we really don't want to tie
> it to the internal allocation implementation. A histogram output like the
> above is not tied to the internal implementation, whilst giving
> additional insight into what size allocations are generating all the
> memory usage...
>
> -Dave.
> --
> Dave Chinner
> david@fromorbit.com
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2025-04-30 15:25 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-28 18:11 [PATCH] xfs: export buffer cache usage via stats Wengang Wang
2025-04-30 2:38 ` Dave Chinner
2025-04-30 15:25 ` Wengang Wang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox