* [PATCH v5 01/15] ring-buffer: Refactor ring buffer implementation
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-20 9:48 ` Masami Hiramatsu
2023-12-19 18:54 ` [PATCH v5 02/15] ring-buffer: Page size per ring buffer Steven Rostedt
` (14 subsequent siblings)
15 siblings, 1 reply; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
In order to introduce sub-buffer size per ring buffer, some internal
refactoring is needed. As ring_buffer_print_page_header() will depend on
the trace_buffer structure, it is moved after the structure definition.
Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-2-tz.stoyanov@gmail.com
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/ring_buffer.c | 60 +++++++++++++++++++-------------------
1 file changed, 30 insertions(+), 30 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f7dc74e45ebf..2400c8e68fd3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -379,36 +379,6 @@ static inline bool test_time_stamp(u64 delta)
/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-int ring_buffer_print_page_header(struct trace_seq *s)
-{
- struct buffer_data_page field;
-
- trace_seq_printf(s, "\tfield: u64 timestamp;\t"
- "offset:0;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)sizeof(field.time_stamp),
- (unsigned int)is_signed_type(u64));
-
- trace_seq_printf(s, "\tfield: local_t commit;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- (unsigned int)sizeof(field.commit),
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: int overwrite;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- 1,
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: char data;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE,
- (unsigned int)is_signed_type(char));
-
- return !trace_seq_has_overflowed(s);
-}
-
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
@@ -556,6 +526,36 @@ struct ring_buffer_iter {
int missed_events;
};
+int ring_buffer_print_page_header(struct trace_seq *s)
+{
+ struct buffer_data_page field;
+
+ trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
+
+ trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)BUF_PAGE_SIZE,
+ (unsigned int)is_signed_type(char));
+
+ return !trace_seq_has_overflowed(s);
+}
+
static inline void rb_time_read(rb_time_t *t, u64 *ret)
{
*ret = local64_read(&t->time);
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread

* Re: [PATCH v5 01/15] ring-buffer: Refactor ring buffer implementation
2023-12-19 18:54 ` [PATCH v5 01/15] ring-buffer: Refactor ring buffer implementation Steven Rostedt
@ 2023-12-20 9:48 ` Masami Hiramatsu
2023-12-20 12:53 ` Steven Rostedt
0 siblings, 1 reply; 36+ messages in thread
From: Masami Hiramatsu @ 2023-12-20 9:48 UTC (permalink / raw)
To: Steven Rostedt
Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
Mathieu Desnoyers, Andrew Morton, Tzvetomir Stoyanov,
Vincent Donnefort, Kent Overstreet
On Tue, 19 Dec 2023 13:54:15 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
>
> In order to introduce sub-buffer size per ring buffer, some internal
> refactoring is needed. As ring_buffer_print_page_header() will depend on
> the trace_buffer structure, it is moved after the structure definition.
>
> Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-2-tz.stoyanov@gmail.com
>
OK, but the title is too generic. Something like
"Move ring_buffer_print_page_header() after ring_buffer_iter"
will be preferable.
Thank you,
> Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> kernel/trace/ring_buffer.c | 60 +++++++++++++++++++-------------------
> 1 file changed, 30 insertions(+), 30 deletions(-)
>
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index f7dc74e45ebf..2400c8e68fd3 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -379,36 +379,6 @@ static inline bool test_time_stamp(u64 delta)
> /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
> #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
>
> -int ring_buffer_print_page_header(struct trace_seq *s)
> -{
> - struct buffer_data_page field;
> -
> - trace_seq_printf(s, "\tfield: u64 timestamp;\t"
> - "offset:0;\tsize:%u;\tsigned:%u;\n",
> - (unsigned int)sizeof(field.time_stamp),
> - (unsigned int)is_signed_type(u64));
> -
> - trace_seq_printf(s, "\tfield: local_t commit;\t"
> - "offset:%u;\tsize:%u;\tsigned:%u;\n",
> - (unsigned int)offsetof(typeof(field), commit),
> - (unsigned int)sizeof(field.commit),
> - (unsigned int)is_signed_type(long));
> -
> - trace_seq_printf(s, "\tfield: int overwrite;\t"
> - "offset:%u;\tsize:%u;\tsigned:%u;\n",
> - (unsigned int)offsetof(typeof(field), commit),
> - 1,
> - (unsigned int)is_signed_type(long));
> -
> - trace_seq_printf(s, "\tfield: char data;\t"
> - "offset:%u;\tsize:%u;\tsigned:%u;\n",
> - (unsigned int)offsetof(typeof(field), data),
> - (unsigned int)BUF_PAGE_SIZE,
> - (unsigned int)is_signed_type(char));
> -
> - return !trace_seq_has_overflowed(s);
> -}
> -
> struct rb_irq_work {
> struct irq_work work;
> wait_queue_head_t waiters;
> @@ -556,6 +526,36 @@ struct ring_buffer_iter {
> int missed_events;
> };
>
> +int ring_buffer_print_page_header(struct trace_seq *s)
> +{
> + struct buffer_data_page field;
> +
> + trace_seq_printf(s, "\tfield: u64 timestamp;\t"
> + "offset:0;\tsize:%u;\tsigned:%u;\n",
> + (unsigned int)sizeof(field.time_stamp),
> + (unsigned int)is_signed_type(u64));
> +
> + trace_seq_printf(s, "\tfield: local_t commit;\t"
> + "offset:%u;\tsize:%u;\tsigned:%u;\n",
> + (unsigned int)offsetof(typeof(field), commit),
> + (unsigned int)sizeof(field.commit),
> + (unsigned int)is_signed_type(long));
> +
> + trace_seq_printf(s, "\tfield: int overwrite;\t"
> + "offset:%u;\tsize:%u;\tsigned:%u;\n",
> + (unsigned int)offsetof(typeof(field), commit),
> + 1,
> + (unsigned int)is_signed_type(long));
> +
> + trace_seq_printf(s, "\tfield: char data;\t"
> + "offset:%u;\tsize:%u;\tsigned:%u;\n",
> + (unsigned int)offsetof(typeof(field), data),
> + (unsigned int)BUF_PAGE_SIZE,
> + (unsigned int)is_signed_type(char));
> +
> + return !trace_seq_has_overflowed(s);
> +}
> +
> static inline void rb_time_read(rb_time_t *t, u64 *ret)
> {
> *ret = local64_read(&t->time);
> --
> 2.42.0
>
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 36+ messages in thread

* Re: [PATCH v5 01/15] ring-buffer: Refactor ring buffer implementation
2023-12-20 9:48 ` Masami Hiramatsu
@ 2023-12-20 12:53 ` Steven Rostedt
0 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-20 12:53 UTC (permalink / raw)
To: Masami Hiramatsu (Google)
Cc: linux-kernel, linux-trace-kernel, Mark Rutland, Mathieu Desnoyers,
Andrew Morton, Tzvetomir Stoyanov, Vincent Donnefort,
Kent Overstreet
On Wed, 20 Dec 2023 18:48:43 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
> >
> > In order to introduce sub-buffer size per ring buffer, some internal
> > refactoring is needed. As ring_buffer_print_page_header() will depend on
> > the trace_buffer structure, it is moved after the structure definition.
> >
> > Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-2-tz.stoyanov@gmail.com
> >
>
> OK, but the title is too generic. Something like
> "Move ring_buffer_print_page_header() after ring_buffer_iter"
> will be preferable.
>
Yeah, I can update the subject. Thanks!
-- Steve
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v5 02/15] ring-buffer: Page size per ring buffer
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
2023-12-19 18:54 ` [PATCH v5 01/15] ring-buffer: Refactor ring buffer implementation Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-20 8:48 ` David Laight
2023-12-19 18:54 ` [PATCH v5 03/15] ring-buffer: Add interface for configuring trace sub buffer size Steven Rostedt
` (13 subsequent siblings)
15 siblings, 1 reply; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Currently the size of one sub buffer page is global for all buffers and
it is hard coded to one system page. In order to introduce configurable
ring buffer sub page size, the internal logic should be refactored to
work with sub page size per ring buffer.
Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-3-tz.stoyanov@gmail.com
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/linux/ring_buffer.h | 2 +-
kernel/trace/ring_buffer.c | 68 +++++++++++++++++++++----------------
kernel/trace/trace.c | 2 +-
kernel/trace/trace.h | 1 +
kernel/trace/trace_events.c | 59 ++++++++++++++++++++++++--------
5 files changed, 86 insertions(+), 46 deletions(-)
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index b1b03b2c0f08..ce46218ce46d 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -200,7 +200,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
struct trace_seq;
int ring_buffer_print_entry_header(struct trace_seq *s);
-int ring_buffer_print_page_header(struct trace_seq *s);
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
enum ring_buffer_flags {
RB_FL_OVERWRITE = 1 << 0,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2400c8e68fd3..d9f656502400 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -374,11 +374,6 @@ static inline bool test_time_stamp(u64 delta)
return !!(delta & TS_DELTA_TEST);
}
-#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
-
-/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
-#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
@@ -510,6 +505,9 @@ struct trace_buffer {
struct rb_irq_work irq_work;
bool time_stamp_abs;
+
+ unsigned int subbuf_size;
+ unsigned int max_data_size;
};
struct ring_buffer_iter {
@@ -523,10 +521,11 @@ struct ring_buffer_iter {
u64 read_stamp;
u64 page_stamp;
struct ring_buffer_event *event;
+ size_t event_size;
int missed_events;
};
-int ring_buffer_print_page_header(struct trace_seq *s)
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
{
struct buffer_data_page field;
@@ -550,7 +549,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
trace_seq_printf(s, "\tfield: char data;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE,
+ (unsigned int)buffer->subbuf_size,
(unsigned int)is_signed_type(char));
return !trace_seq_has_overflowed(s);
@@ -1625,7 +1624,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ /* Default buffer page size - one system page */
+ buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+
+ /* Max payload is buffer page size - header (8bytes) */
+ buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
+
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -1944,7 +1949,7 @@ static void update_pages_handler(struct work_struct *work)
* @size: the new size.
* @cpu_id: the cpu buffer to resize
*
- * Minimum size is 2 * BUF_PAGE_SIZE.
+ * Minimum size is 2 * buffer->subbuf_size.
*
* Returns 0 on success and < 0 on failure.
*/
@@ -1966,7 +1971,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
!cpumask_test_cpu(cpu_id, buffer->cpumask))
return 0;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
/* we need a minimum of two pages */
if (nr_pages < 2)
@@ -2213,7 +2218,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
*/
barrier();
- if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
+ if ((iter->head + length) > commit || length > iter->event_size)
/* Writer corrupted the read? */
goto reset;
@@ -2446,6 +2451,7 @@ static inline void
rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long tail, struct rb_event_info *info)
{
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
struct buffer_page *tail_page = info->tail_page;
struct ring_buffer_event *event;
unsigned long length = info->length;
@@ -2454,13 +2460,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* Only the event that crossed the page boundary
* must fill the old tail_page with padding.
*/
- if (tail >= BUF_PAGE_SIZE) {
+ if (tail >= bsize) {
/*
* If the page was filled, then we still need
* to update the real_end. Reset it to zero
* and the reader will ignore it.
*/
- if (tail == BUF_PAGE_SIZE)
+ if (tail == bsize)
tail_page->real_end = 0;
local_sub(length, &tail_page->write);
@@ -2488,7 +2494,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* If we are less than the minimum size, we don't need to
* worry about it.
*/
- if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+ if (tail > (bsize - RB_EVNT_MIN_SIZE)) {
/* No room for any events */
/* Mark the rest of the page with padding */
@@ -2503,19 +2509,19 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
}
/* Put in a discarded event */
- event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+ event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
/* time delta must be non zero */
event->time_delta = 1;
/* account for padding bytes */
- local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+ local_add(bsize - tail, &cpu_buffer->entries_bytes);
/* Make sure the padding is visible before the tail_page->write update */
smp_wmb();
/* Set write to end of buffer */
- length = (tail + length) - BUF_PAGE_SIZE;
+ length = (tail + length) - bsize;
local_sub(length, &tail_page->write);
}
@@ -3469,7 +3475,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
tail = write - info->length;
/* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE)) {
+ if (unlikely(write > cpu_buffer->buffer->subbuf_size)) {
check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
return rb_move_tail(cpu_buffer, tail, info);
}
@@ -3600,7 +3606,7 @@ rb_reserve_next_event(struct trace_buffer *buffer,
if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
add_ts_default = RB_ADD_STAMP_ABSOLUTE;
info.length += RB_LEN_TIME_EXTEND;
- if (info.length > BUF_MAX_DATA_SIZE)
+ if (info.length > cpu_buffer->buffer->max_data_size)
goto out_fail;
} else {
add_ts_default = RB_ADD_STAMP_NONE;
@@ -3675,7 +3681,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
goto out;
- if (unlikely(length > BUF_MAX_DATA_SIZE))
+ if (unlikely(length > buffer->max_data_size))
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -3825,7 +3831,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
- if (length > BUF_MAX_DATA_SIZE)
+ if (length > buffer->max_data_size)
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -4405,6 +4411,7 @@ static struct buffer_page *
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
@@ -4540,7 +4547,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
#define USECS_WAIT 1000000
for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
/* If the write is past the end of page, a writer is still updating it */
- if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+ if (likely(!reader || rb_page_write(reader) <= bsize))
break;
udelay(1);
@@ -4984,7 +4991,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
return NULL;
/* Holds the entire event: data and meta data */
- iter->event = kmalloc(BUF_PAGE_SIZE, flags);
+ iter->event_size = buffer->subbuf_size;
+ iter->event = kmalloc(iter->event_size, flags);
if (!iter->event) {
kfree(iter);
return NULL;
@@ -5102,14 +5110,14 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
{
/*
* Earlier, this method returned
- * BUF_PAGE_SIZE * buffer->nr_pages
+ * buffer->subbuf_size * buffer->nr_pages
* Since the nr_pages field is now removed, we have converted this to
* return the per cpu buffer value.
*/
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
- return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
+ return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages;
}
EXPORT_SYMBOL_GPL(ring_buffer_size);
@@ -5123,8 +5131,8 @@ unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
{
/* If abs timestamp is requested, events have a timestamp too */
if (ring_buffer_time_stamp_abs(buffer))
- return BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND;
- return BUF_MAX_DATA_SIZE;
+ return buffer->max_data_size - RB_LEN_TIME_EXTEND;
+ return buffer->max_data_size;
}
EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
@@ -5730,7 +5738,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* If there is room at the end of the page to save the
* missed events, then record it there.
*/
- if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+ if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
memcpy(&bpage->data[commit], &missed_events,
sizeof(missed_events));
local_add(RB_MISSED_STORED, &bpage->commit);
@@ -5742,8 +5750,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/*
* This page may be off to user land. Zero it out here.
*/
- if (commit < BUF_PAGE_SIZE)
- memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+ if (commit < buffer->subbuf_size)
+ memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
out_unlock:
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 55dabee4c78b..76dd0a4c8cb5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5018,7 +5018,7 @@ static int tracing_release(struct inode *inode, struct file *file)
return 0;
}
-static int tracing_release_generic_tr(struct inode *inode, struct file *file)
+int tracing_release_generic_tr(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 79180aed13ee..00f873910c5d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -616,6 +616,7 @@ void tracing_reset_all_online_cpus(void);
void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
+int tracing_release_generic_tr(struct inode *inode, struct file *file);
int tracing_open_file_tr(struct inode *inode, struct file *filp);
int tracing_release_file_tr(struct inode *inode, struct file *filp);
int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b70d03818038..7c364b87352e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1893,9 +1893,9 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
static ssize_t
-show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- int (*func)(struct trace_seq *s) = filp->private_data;
+ struct trace_array *tr = filp->private_data;
struct trace_seq *s;
int r;
@@ -1908,7 +1908,31 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
trace_seq_init(s);
- func(s);
+ ring_buffer_print_page_header(tr->array_buffer.buffer, s);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
+
+ kfree(s);
+
+ return r;
+}
+
+static ssize_t
+show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ ring_buffer_print_entry_header(s);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, trace_seq_used(s));
@@ -2165,10 +2189,18 @@ static const struct file_operations ftrace_tr_enable_fops = {
.release = subsystem_release,
};
-static const struct file_operations ftrace_show_header_fops = {
- .open = tracing_open_generic,
- .read = show_header,
+static const struct file_operations ftrace_show_header_page_fops = {
+ .open = tracing_open_generic_tr,
+ .read = show_header_page_file,
.llseek = default_llseek,
+ .release = tracing_release_generic_tr,
+};
+
+static const struct file_operations ftrace_show_header_event_fops = {
+ .open = tracing_open_generic_tr,
+ .read = show_header_event_file,
+ .llseek = default_llseek,
+ .release = tracing_release_generic_tr,
};
static int
@@ -3794,17 +3826,16 @@ static int events_callback(const char *name, umode_t *mode, void **data,
return 1;
}
- if (strcmp(name, "header_page") == 0)
- *data = ring_buffer_print_page_header;
-
- else if (strcmp(name, "header_event") == 0)
- *data = ring_buffer_print_entry_header;
+ if (strcmp(name, "header_page") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_page_fops;
- else
+ } else if (strcmp(name, "header_event") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_event_fops;
+ } else
return 0;
- *mode = TRACE_MODE_READ;
- *fops = &ftrace_show_header_fops;
return 1;
}
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread

* RE: [PATCH v5 02/15] ring-buffer: Page size per ring buffer
2023-12-19 18:54 ` [PATCH v5 02/15] ring-buffer: Page size per ring buffer Steven Rostedt
@ 2023-12-20 8:48 ` David Laight
2023-12-20 13:01 ` Steven Rostedt
0 siblings, 1 reply; 36+ messages in thread
From: David Laight @ 2023-12-20 8:48 UTC (permalink / raw)
To: 'Steven Rostedt', linux-kernel@vger.kernel.org,
linux-trace-kernel@vger.kernel.org
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: Steven Rostedt
> Sent: 19 December 2023 18:54
> From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
>
> Currently the size of one sub buffer page is global for all buffers and
> it is hard coded to one system page. In order to introduce configurable
> ring buffer sub page size, the internal logic should be refactored to
> work with sub page size per ring buffer.
>
...
> - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
> + /* Default buffer page size - one system page */
> + buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
> +
> + /* Max payload is buffer page size - header (8bytes) */
> + buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
> +
> + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
While not new, does this really make any sense for systems with 64k pages?
Wouldn't it be better to have units of 4k?
...
> @@ -5102,14 +5110,14 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
> {
> /*
> * Earlier, this method returned
> - * BUF_PAGE_SIZE * buffer->nr_pages
> + * buffer->subbuf_size * buffer->nr_pages
> * Since the nr_pages field is now removed, we have converted this to
> * return the per cpu buffer value.
Overenthusiastic global replace...
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
^ permalink raw reply [flat|nested] 36+ messages in thread

* Re: [PATCH v5 02/15] ring-buffer: Page size per ring buffer
2023-12-20 8:48 ` David Laight
@ 2023-12-20 13:01 ` Steven Rostedt
2023-12-21 9:17 ` David Laight
0 siblings, 1 reply; 36+ messages in thread
From: Steven Rostedt @ 2023-12-20 13:01 UTC (permalink / raw)
To: David Laight
Cc: linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
On Wed, 20 Dec 2023 08:48:02 +0000
David Laight <David.Laight@ACULAB.COM> wrote:
> From: Steven Rostedt
> > Sent: 19 December 2023 18:54
> > From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
> >
> > Currently the size of one sub buffer page is global for all buffers and
> > it is hard coded to one system page. In order to introduce configurable
> > ring buffer sub page size, the internal logic should be refactored to
> > work with sub page size per ring buffer.
> >
> ...
> > - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
> > + /* Default buffer page size - one system page */
> > + buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
> > +
> > + /* Max payload is buffer page size - header (8bytes) */
> > + buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
> > +
> > + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
>
> While not new, does this really make any sense for systems with 64k pages?
> Wouldn't it be better to have units of 4k?
Unfortunately, it has to be PAGE_SIZE (and for now it's a power of 2 to
make masking easy). It's used for splice and will also be used for memory
mapping with user space.
>
> ...
> > @@ -5102,14 +5110,14 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
> > {
> > /*
> > * Earlier, this method returned
> > - * BUF_PAGE_SIZE * buffer->nr_pages
> > + * buffer->subbuf_size * buffer->nr_pages
> > * Since the nr_pages field is now removed, we have converted this to
> > * return the per cpu buffer value.
>
> Overenthusiastic global replace...
Possibly, but the comment still applies, and should probably be removed, as
it's rather old (2012). It's basically just saying that the size use to be
calculated from buffer->nr_pages and now it's calculated by
buffer->buffers[cpu]->nr_pages.
I think I'll just add a patch to remove that comment.
Thanks,
-- Steve
^ permalink raw reply [flat|nested] 36+ messages in thread

* RE: [PATCH v5 02/15] ring-buffer: Page size per ring buffer
2023-12-20 13:01 ` Steven Rostedt
@ 2023-12-21 9:17 ` David Laight
2023-12-21 14:19 ` Steven Rostedt
0 siblings, 1 reply; 36+ messages in thread
From: David Laight @ 2023-12-21 9:17 UTC (permalink / raw)
To: 'Steven Rostedt'
Cc: linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: Steven Rostedt
> Sent: 20 December 2023 13:01
>
> On Wed, 20 Dec 2023 08:48:02 +0000
> David Laight <David.Laight@ACULAB.COM> wrote:
>
> > From: Steven Rostedt
> > > Sent: 19 December 2023 18:54
> > > From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
> > >
> > > Currently the size of one sub buffer page is global for all buffers and
> > > it is hard coded to one system page. In order to introduce configurable
> > > ring buffer sub page size, the internal logic should be refactored to
> > > work with sub page size per ring buffer.
> > >
> > ...
> > > - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
> > > + /* Default buffer page size - one system page */
> > > + buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
> > > +
> > > + /* Max payload is buffer page size - header (8bytes) */
> > > + buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
> > > +
> > > + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
> >
> > While not new, does this really make any sense for systems with 64k pages?
> > Wouldn't it be better to have units of 4k?
>
> Unfortunately, it has to be PAGE_SIZE (and for now it's a power of 2 to
> make masking easy). It's used for splice and will also be used for memory
> mapping with user space.
Perhaps then the sysctl to set the size should be powers of 4k
with a minimum size of PAGE_SIZE.
Then you don't have to know the page size when setting things up.
I'm also guessing that no Linux kernels have a PAGE_SIZE of 2k?
IIRC some old mmu (maybe 68020 era) used 2k pages.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v5 02/15] ring-buffer: Page size per ring buffer
2023-12-21 9:17 ` David Laight
@ 2023-12-21 14:19 ` Steven Rostedt
2023-12-21 14:51 ` David Laight
0 siblings, 1 reply; 36+ messages in thread
From: Steven Rostedt @ 2023-12-21 14:19 UTC (permalink / raw)
To: David Laight
Cc: linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
On Thu, 21 Dec 2023 09:17:55 +0000
David Laight <David.Laight@ACULAB.COM> wrote:
> > Unfortunately, it has to be PAGE_SIZE (and for now it's a power of 2 to
> > make masking easy). It's used for splice and will also be used for memory
> > mapping with user space.
>
> Perhaps then the sysctl to set the size should be powers of 4k
It's not a sysctl but a file in tracefs
> with a minimum size of PAGE_SIZE.
> Then you don't have to know the page size when setting things up.
The user shouldn't need to know either. But the size of the sub-buffer
limits the biggest size of an event, so the user only needs to make sure
the sub-buffer is bigger than their biggest event.
>
> I'm also guessing that no Linux kernels have a PAGE_SIZE of 2k?
> IIRC some old mmu (maybe 68020 era) used 2k pages.
I think 1kb units is perfectly fine (patch 15 changes to kb units). The
interface says its to define the minimal size of the sub-buffer, not the
actual size.
-- Steve
^ permalink raw reply [flat|nested] 36+ messages in thread
* RE: [PATCH v5 02/15] ring-buffer: Page size per ring buffer
2023-12-21 14:19 ` Steven Rostedt
@ 2023-12-21 14:51 ` David Laight
2023-12-21 14:57 ` Steven Rostedt
0 siblings, 1 reply; 36+ messages in thread
From: David Laight @ 2023-12-21 14:51 UTC (permalink / raw)
To: 'Steven Rostedt'
Cc: linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
> I think 1kb units is perfectly fine (patch 15 changes to kb units). The
> interface says its to define the minimal size of the sub-buffer, not the
> actual size.
I didn't read that far through :-(
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v5 02/15] ring-buffer: Page size per ring buffer
2023-12-21 14:51 ` David Laight
@ 2023-12-21 14:57 ` Steven Rostedt
0 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-21 14:57 UTC (permalink / raw)
To: David Laight
Cc: linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
On Thu, 21 Dec 2023 14:51:29 +0000
David Laight <David.Laight@ACULAB.COM> wrote:
> > I think 1kb units is perfectly fine (patch 15 changes to kb units). The
> > interface says its to define the minimal size of the sub-buffer, not the
> > actual size.
>
> I didn't read that far through :-(
>
Well, this isn't a normal patch series as I took the work from Tzvetomir
back from 2021, and started massaging them. I wanted to keep Tzvetomir's
original authorship even though he's not working on it, so I just applied
his patches as-is and then added patches on top of them, to denote what I
did and what he did.
This is why I never modified the first 5 patches of this series, except for
subject lines and change logs.
-- Steve
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v5 03/15] ring-buffer: Add interface for configuring trace sub buffer size
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
2023-12-19 18:54 ` [PATCH v5 01/15] ring-buffer: Refactor ring buffer implementation Steven Rostedt
2023-12-19 18:54 ` [PATCH v5 02/15] ring-buffer: Page size per ring buffer Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-20 14:26 ` Masami Hiramatsu
2023-12-19 18:54 ` [PATCH v5 04/15] ring-buffer: Set new size of the ring buffer sub page Steven Rostedt
` (12 subsequent siblings)
15 siblings, 1 reply; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
The trace ring buffer sub page size can be configured, per trace
instance. A new ftrace file "buffer_subbuf_order" is added to get and
set the size of the ring buffer sub page for current trace instance.
The size must be an order of system page size, that's why the new
interface works with system page order, instead of absolute page size:
0 means the ring buffer sub page is equal to 1 system page and so
forth:
0 - 1 system page
1 - 2 system pages
2 - 4 system pages
...
The ring buffer sub page size is limited between 1 and 128 system
pages. The default value is 1 system page.
New ring buffer APIs are introduced:
ring_buffer_subbuf_order_set()
ring_buffer_subbuf_order_get()
ring_buffer_subbuf_size_get()
Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-4-tz.stoyanov@gmail.com
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/linux/ring_buffer.h | 4 ++
kernel/trace/ring_buffer.c | 73 +++++++++++++++++++++++++++++++++++++
kernel/trace/trace.c | 48 ++++++++++++++++++++++++
3 files changed, 125 insertions(+)
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index ce46218ce46d..12573306b889 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -202,6 +202,10 @@ struct trace_seq;
int ring_buffer_print_entry_header(struct trace_seq *s);
int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer);
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order);
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer);
+
enum ring_buffer_flags {
RB_FL_OVERWRITE = 1 << 0,
};
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d9f656502400..20fc0121735d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -507,6 +507,7 @@ struct trace_buffer {
bool time_stamp_abs;
unsigned int subbuf_size;
+ unsigned int subbuf_order;
unsigned int max_data_size;
};
@@ -5761,6 +5762,78 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+/**
+ * ring_buffer_subbuf_size_get - get size of the sub buffer.
+ * @buffer: the buffer to get the sub buffer size from
+ *
+ * Returns size of the sub buffer, in bytes.
+ */
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer)
+{
+ return buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get);
+
+/**
+ * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page.
+ * @buffer: The ring_buffer to get the system sub page order from
+ *
+ * By default, one ring buffer sub page equals to one system page. This parameter
+ * is configurable, per ring buffer. The size of the ring buffer sub page can be
+ * extended, but must be an order of system page size.
+ *
+ * Returns the order of buffer sub page size, in system pages:
+ * 0 means the sub buffer size is 1 system page and so forth.
+ * In case of an error < 0 is returned.
+ */
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer)
+{
+ if (!buffer)
+ return -EINVAL;
+
+ return buffer->subbuf_order;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
+
+/**
+ * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
+ * @buffer: The ring_buffer to set the new page size.
+ * @order: Order of the system pages in one sub buffer page
+ *
+ * By default, one ring buffer pages equals to one system page. This API can be
+ * used to set new size of the ring buffer page. The size must be order of
+ * system page size, that's why the input parameter @order is the order of
+ * system pages that are allocated for one ring buffer page:
+ * 0 - 1 system page
+ * 1 - 2 system pages
+ * 3 - 4 system pages
+ * ...
+ *
+ * Returns 0 on success or < 0 in case of an error.
+ */
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
+{
+ int psize;
+
+ if (!buffer || order < 0)
+ return -EINVAL;
+
+ if (buffer->subbuf_order == order)
+ return 0;
+
+ psize = (1 << order) * PAGE_SIZE;
+ if (psize <= BUF_PAGE_HDR_SIZE)
+ return -EINVAL;
+
+ buffer->subbuf_order = order;
+ buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
+
+ /* Todo: reset the buffer with the new page size */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 76dd0a4c8cb5..a010aba4c4a4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9358,6 +9358,51 @@ static const struct file_operations buffer_percent_fops = {
.llseek = default_llseek,
};
+static ssize_t
+buffer_order_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ char buf[64];
+ int r;
+
+ r = sprintf(buf, "%d\n", ring_buffer_subbuf_order_get(tr->array_buffer.buffer));
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+buffer_order_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ /* limit between 1 and 128 system pages */
+ if (val < 0 || val > 7)
+ return -EINVAL;
+
+ ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
+ if (ret)
+ return ret;
+
+ (*ppos)++;
+
+ return cnt;
+}
+
+static const struct file_operations buffer_order_fops = {
+ .open = tracing_open_generic_tr,
+ .read = buffer_order_read,
+ .write = buffer_order_write,
+ .release = tracing_release_generic_tr,
+ .llseek = default_llseek,
+};
+
static struct dentry *trace_instance_dir;
static void
@@ -9824,6 +9869,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
tr, &buffer_percent_fops);
+ trace_create_file("buffer_subbuf_order", TRACE_MODE_WRITE, d_tracer,
+ tr, &buffer_order_fops);
+
create_trace_options_dir(tr);
#ifdef CONFIG_TRACER_MAX_TRACE
--
2.42.0
^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH v5 03/15] ring-buffer: Add interface for configuring trace sub buffer size
2023-12-19 18:54 ` [PATCH v5 03/15] ring-buffer: Add interface for configuring trace sub buffer size Steven Rostedt
@ 2023-12-20 14:26 ` Masami Hiramatsu
2023-12-20 14:40 ` Steven Rostedt
0 siblings, 1 reply; 36+ messages in thread
From: Masami Hiramatsu @ 2023-12-20 14:26 UTC (permalink / raw)
To: Steven Rostedt
Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
Mathieu Desnoyers, Andrew Morton, Tzvetomir Stoyanov,
Vincent Donnefort, Kent Overstreet
On Tue, 19 Dec 2023 13:54:17 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> +/**
> + * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
> + * @buffer: The ring_buffer to set the new page size.
> + * @order: Order of the system pages in one sub buffer page
> + *
> + * By default, one ring buffer pages equals to one system page. This API can be
> + * used to set new size of the ring buffer page. The size must be order of
> + * system page size, that's why the input parameter @order is the order of
> + * system pages that are allocated for one ring buffer page:
> + * 0 - 1 system page
> + * 1 - 2 system pages
> + * 3 - 4 system pages
> + * ...
Don't we have the max order of the pages?
> + *
> + * Returns 0 on success or < 0 in case of an error.
> + */
> +int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
> +{
> + int psize;
> +
> + if (!buffer || order < 0)
> + return -EINVAL;
> +
> + if (buffer->subbuf_order == order)
> + return 0;
> +
> + psize = (1 << order) * PAGE_SIZE;
> + if (psize <= BUF_PAGE_HDR_SIZE)
> + return -EINVAL;
> +
> + buffer->subbuf_order = order;
> + buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
> +
> + /* Todo: reset the buffer with the new page size */
> +
I just wonder why there is no reallocate the sub buffers here.
Is it OK to change the sub buffer page size and order while
using the ring buffer?
Thank you,
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v5 03/15] ring-buffer: Add interface for configuring trace sub buffer size
2023-12-20 14:26 ` Masami Hiramatsu
@ 2023-12-20 14:40 ` Steven Rostedt
2023-12-21 0:10 ` Masami Hiramatsu
0 siblings, 1 reply; 36+ messages in thread
From: Steven Rostedt @ 2023-12-20 14:40 UTC (permalink / raw)
To: Masami Hiramatsu (Google)
Cc: linux-kernel, linux-trace-kernel, Mark Rutland, Mathieu Desnoyers,
Andrew Morton, Tzvetomir Stoyanov, Vincent Donnefort,
Kent Overstreet
On Wed, 20 Dec 2023 23:26:19 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> On Tue, 19 Dec 2023 13:54:17 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > +/**
> > + * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
> > + * @buffer: The ring_buffer to set the new page size.
> > + * @order: Order of the system pages in one sub buffer page
> > + *
> > + * By default, one ring buffer pages equals to one system page. This API can be
> > + * used to set new size of the ring buffer page. The size must be order of
> > + * system page size, that's why the input parameter @order is the order of
> > + * system pages that are allocated for one ring buffer page:
> > + * 0 - 1 system page
> > + * 1 - 2 system pages
> > + * 3 - 4 system pages
> > + * ...
>
> Don't we have the max order of the pages?
Actually there is. I think it's 7?
Honestly, anything over 5 is probably too much. But hey.
>
> > + *
> > + * Returns 0 on success or < 0 in case of an error.
> > + */
> > +int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
> > +{
> > + int psize;
> > +
> > + if (!buffer || order < 0)
> > + return -EINVAL;
> > +
> > + if (buffer->subbuf_order == order)
> > + return 0;
> > +
> > + psize = (1 << order) * PAGE_SIZE;
> > + if (psize <= BUF_PAGE_HDR_SIZE)
> > + return -EINVAL;
> > +
> > + buffer->subbuf_order = order;
> > + buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
> > +
> > + /* Todo: reset the buffer with the new page size */
> > +
>
> I just wonder why there is no reallocate the sub buffers here.
> Is it OK to change the sub buffer page size and order while
> using the ring buffer?
Currently we disable the ring buffer to do the update.
-- Steve
^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v5 03/15] ring-buffer: Add interface for configuring trace sub buffer size
2023-12-20 14:40 ` Steven Rostedt
@ 2023-12-21 0:10 ` Masami Hiramatsu
0 siblings, 0 replies; 36+ messages in thread
From: Masami Hiramatsu @ 2023-12-21 0:10 UTC (permalink / raw)
To: Steven Rostedt
Cc: linux-kernel, linux-trace-kernel, Mark Rutland, Mathieu Desnoyers,
Andrew Morton, Tzvetomir Stoyanov, Vincent Donnefort,
Kent Overstreet
On Wed, 20 Dec 2023 09:40:30 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> On Wed, 20 Dec 2023 23:26:19 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
>
> > On Tue, 19 Dec 2023 13:54:17 -0500
> > Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > > +/**
> > > + * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
> > > + * @buffer: The ring_buffer to set the new page size.
> > > + * @order: Order of the system pages in one sub buffer page
> > > + *
> > > + * By default, one ring buffer pages equals to one system page. This API can be
> > > + * used to set new size of the ring buffer page. The size must be order of
> > > + * system page size, that's why the input parameter @order is the order of
> > > + * system pages that are allocated for one ring buffer page:
> > > + * 0 - 1 system page
> > > + * 1 - 2 system pages
> > > + * 3 - 4 system pages
> > > + * ...
> >
> > Don't we have the max order of the pages?
>
> Actually there is. I think it's 7?
>
> Honestly, anything over 5 is probably too much. But hey.
Ah, I see. It is checked in subbuf_order_write() method (and it is embedded
directly). I think that 7 should be replaced with a macro, something like
RB_SUBBUF_ORDER_MAX and check it in this exposed function instead of write
method.
Thank you,
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v5 04/15] ring-buffer: Set new size of the ring buffer sub page
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (2 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 03/15] ring-buffer: Add interface for configuring trace sub buffer size Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-20 16:34 ` Masami Hiramatsu
2023-12-19 18:54 ` [PATCH v5 05/15] ring-buffer: Read and write to ring buffers with custom sub buffer size Steven Rostedt
` (11 subsequent siblings)
15 siblings, 1 reply; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
There are two approaches when changing the size of the ring buffer
sub page:
1. Destroying all pages and allocating new pages with the new size.
2. Allocating new pages, copying the content of the old pages before
destroying them.
The first approach is easier, it is selected in the proposed
implementation. Changing the ring buffer sub page size is supposed to
not happen frequently. Usually, that size should be set only once,
when the buffer is not in use yet and is supposed to be empty.
Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-5-tz.stoyanov@gmail.com
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/ring_buffer.c | 80 ++++++++++++++++++++++++++++++++++----
1 file changed, 73 insertions(+), 7 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 20fc0121735d..b928bd03dd0e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -332,6 +332,7 @@ struct buffer_page {
unsigned read; /* index for next read */
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
+ unsigned order; /* order of the page */
struct buffer_data_page *page; /* Actual data page */
};
@@ -362,7 +363,7 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
static void free_buffer_page(struct buffer_page *bpage)
{
- free_page((unsigned long)bpage->page);
+ free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
}
@@ -1460,10 +1461,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
+ cpu_buffer->buffer->subbuf_order);
if (!page)
goto free_pages;
bpage->page = page_address(page);
+ bpage->order = cpu_buffer->buffer->subbuf_order;
rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
@@ -1542,7 +1545,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
rb_check_bpage(cpu_buffer, bpage);
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
if (!page)
goto fail_free_reader;
bpage->page = page_address(page);
@@ -1626,6 +1630,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
goto fail_free_buffer;
/* Default buffer page size - one system page */
+ buffer->subbuf_order = 0;
buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
/* Max payload is buffer page size - header (8bytes) */
@@ -5503,8 +5508,8 @@ void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
if (bpage)
goto out;
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+ cpu_buffer->buffer->subbuf_order);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -5553,7 +5558,7 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
local_irq_restore(flags);
out:
- free_page((unsigned long)bpage);
+ free_pages((unsigned long)bpage, buffer->subbuf_order);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
@@ -5813,7 +5818,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
*/
int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
{
+ struct ring_buffer_per_cpu **cpu_buffers;
+ int old_order, old_size;
+ int nr_pages;
int psize;
+ int bsize;
+ int err;
+ int cpu;
if (!buffer || order < 0)
return -EINVAL;
@@ -5825,12 +5836,67 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
if (psize <= BUF_PAGE_HDR_SIZE)
return -EINVAL;
+ bsize = sizeof(void *) * buffer->cpus;
+ cpu_buffers = kzalloc(bsize, GFP_KERNEL);
+ if (!cpu_buffers)
+ return -ENOMEM;
+
+ old_order = buffer->subbuf_order;
+ old_size = buffer->subbuf_size;
+
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+ atomic_inc(&buffer->record_disabled);
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
buffer->subbuf_order = order;
buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
- /* Todo: reset the buffer with the new page size */
+ /* Make sure all new buffers are allocated, before deleting the old ones */
+ for_each_buffer_cpu(buffer, cpu) {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ nr_pages = buffer->buffers[cpu]->nr_pages;
+ cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
+ if (!cpu_buffers[cpu]) {
+ err = -ENOMEM;
+ goto error;
+ }
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+ buffer->buffers[cpu] = cpu_buffers[cpu];
+ }
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ kfree(cpu_buffers);
return 0;
+
+error:
+ buffer->subbuf_order = old_order;
+ buffer->subbuf_size = old_size;
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ if (!cpu_buffers[cpu])
+ continue;
+ kfree(cpu_buffers[cpu]);
+ }
+ kfree(cpu_buffers);
+
+ return err;
}
EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
--
2.42.0
^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH v5 04/15] ring-buffer: Set new size of the ring buffer sub page
2023-12-19 18:54 ` [PATCH v5 04/15] ring-buffer: Set new size of the ring buffer sub page Steven Rostedt
@ 2023-12-20 16:34 ` Masami Hiramatsu
2023-12-20 16:56 ` Steven Rostedt
0 siblings, 1 reply; 36+ messages in thread
From: Masami Hiramatsu @ 2023-12-20 16:34 UTC (permalink / raw)
To: Steven Rostedt
Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
Mathieu Desnoyers, Andrew Morton, Tzvetomir Stoyanov,
Vincent Donnefort, Kent Overstreet
On Tue, 19 Dec 2023 13:54:18 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
>
> There are two approaches when changing the size of the ring buffer
> sub page:
> 1. Destroying all pages and allocating new pages with the new size.
> 2. Allocating new pages, copying the content of the old pages before
> destroying them.
> The first approach is easier, it is selected in the proposed
> implementation. Changing the ring buffer sub page size is supposed to
> not happen frequently. Usually, that size should be set only once,
> when the buffer is not in use yet and is supposed to be empty.
>
> Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-5-tz.stoyanov@gmail.com
>
OK, this actually reallocate the sub buffers when a new order is set.
BTW, with this change, if we set a new order, the total buffer size will be
changed too? Or reserve the total size? I think either is OK but it should
be described in the document. (e.g. if it is changed, user should set the
order first and set the total size later.)
Thank you,
> Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> kernel/trace/ring_buffer.c | 80 ++++++++++++++++++++++++++++++++++----
> 1 file changed, 73 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 20fc0121735d..b928bd03dd0e 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -332,6 +332,7 @@ struct buffer_page {
> unsigned read; /* index for next read */
> local_t entries; /* entries on this page */
> unsigned long real_end; /* real end of data */
> + unsigned order; /* order of the page */
> struct buffer_data_page *page; /* Actual data page */
> };
>
> @@ -362,7 +363,7 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
>
> static void free_buffer_page(struct buffer_page *bpage)
> {
> - free_page((unsigned long)bpage->page);
> + free_pages((unsigned long)bpage->page, bpage->order);
> kfree(bpage);
> }
>
> @@ -1460,10 +1461,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
>
> list_add(&bpage->list, pages);
>
> - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
> + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
> + cpu_buffer->buffer->subbuf_order);
> if (!page)
> goto free_pages;
> bpage->page = page_address(page);
> + bpage->order = cpu_buffer->buffer->subbuf_order;
> rb_init_page(bpage->page);
>
> if (user_thread && fatal_signal_pending(current))
> @@ -1542,7 +1545,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
> rb_check_bpage(cpu_buffer, bpage);
>
> cpu_buffer->reader_page = bpage;
> - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
> +
> + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
> if (!page)
> goto fail_free_reader;
> bpage->page = page_address(page);
> @@ -1626,6 +1630,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
> goto fail_free_buffer;
>
> /* Default buffer page size - one system page */
> + buffer->subbuf_order = 0;
> buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
>
> /* Max payload is buffer page size - header (8bytes) */
> @@ -5503,8 +5508,8 @@ void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
> if (bpage)
> goto out;
>
> - page = alloc_pages_node(cpu_to_node(cpu),
> - GFP_KERNEL | __GFP_NORETRY, 0);
> + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
> + cpu_buffer->buffer->subbuf_order);
> if (!page)
> return ERR_PTR(-ENOMEM);
>
> @@ -5553,7 +5558,7 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
> local_irq_restore(flags);
>
> out:
> - free_page((unsigned long)bpage);
> + free_pages((unsigned long)bpage, buffer->subbuf_order);
> }
> EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
>
> @@ -5813,7 +5818,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
> */
> int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
> {
> + struct ring_buffer_per_cpu **cpu_buffers;
> + int old_order, old_size;
> + int nr_pages;
> int psize;
> + int bsize;
> + int err;
> + int cpu;
>
> if (!buffer || order < 0)
> return -EINVAL;
> @@ -5825,12 +5836,67 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
> if (psize <= BUF_PAGE_HDR_SIZE)
> return -EINVAL;
>
> + bsize = sizeof(void *) * buffer->cpus;
> + cpu_buffers = kzalloc(bsize, GFP_KERNEL);
> + if (!cpu_buffers)
> + return -ENOMEM;
> +
> + old_order = buffer->subbuf_order;
> + old_size = buffer->subbuf_size;
> +
> + /* prevent another thread from changing buffer sizes */
> + mutex_lock(&buffer->mutex);
> + atomic_inc(&buffer->record_disabled);
> +
> + /* Make sure all commits have finished */
> + synchronize_rcu();
> +
> buffer->subbuf_order = order;
> buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
>
> - /* Todo: reset the buffer with the new page size */
> + /* Make sure all new buffers are allocated, before deleting the old ones */
> + for_each_buffer_cpu(buffer, cpu) {
> + if (!cpumask_test_cpu(cpu, buffer->cpumask))
> + continue;
> +
> + nr_pages = buffer->buffers[cpu]->nr_pages;
> + cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
> + if (!cpu_buffers[cpu]) {
> + err = -ENOMEM;
> + goto error;
> + }
> + }
> +
> + for_each_buffer_cpu(buffer, cpu) {
> + if (!cpumask_test_cpu(cpu, buffer->cpumask))
> + continue;
> +
> + rb_free_cpu_buffer(buffer->buffers[cpu]);
> + buffer->buffers[cpu] = cpu_buffers[cpu];
> + }
> +
> + atomic_dec(&buffer->record_disabled);
> + mutex_unlock(&buffer->mutex);
> +
> + kfree(cpu_buffers);
>
> return 0;
> +
> +error:
> + buffer->subbuf_order = old_order;
> + buffer->subbuf_size = old_size;
> +
> + atomic_dec(&buffer->record_disabled);
> + mutex_unlock(&buffer->mutex);
> +
> + for_each_buffer_cpu(buffer, cpu) {
> + if (!cpu_buffers[cpu])
> + continue;
> + kfree(cpu_buffers[cpu]);
> + }
> + kfree(cpu_buffers);
> +
> + return err;
> }
> EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
>
> --
> 2.42.0
>
>
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v5 04/15] ring-buffer: Set new size of the ring buffer sub page
2023-12-20 16:34 ` Masami Hiramatsu
@ 2023-12-20 16:56 ` Steven Rostedt
2023-12-21 0:11 ` Masami Hiramatsu
0 siblings, 1 reply; 36+ messages in thread
From: Steven Rostedt @ 2023-12-20 16:56 UTC (permalink / raw)
To: Masami Hiramatsu (Google)
Cc: linux-kernel, linux-trace-kernel, Mark Rutland, Mathieu Desnoyers,
Andrew Morton, Tzvetomir Stoyanov, Vincent Donnefort,
Kent Overstreet
On Thu, 21 Dec 2023 01:34:56 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> On Tue, 19 Dec 2023 13:54:18 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
> >
> > There are two approaches when changing the size of the ring buffer
> > sub page:
> > 1. Destroying all pages and allocating new pages with the new size.
> > 2. Allocating new pages, copying the content of the old pages before
> > destroying them.
> > The first approach is easier, it is selected in the proposed
> > implementation. Changing the ring buffer sub page size is supposed to
> > not happen frequently. Usually, that size should be set only once,
> > when the buffer is not in use yet and is supposed to be empty.
> >
> > Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-5-tz.stoyanov@gmail.com
> >
>
> OK, this actually reallocate the sub buffers when a new order is set.
> BTW, with this change, if we set a new order, the total buffer size will be
> changed too? Or reserve the total size? I think either is OK but it should
> be described in the document. (e.g. if it is changed, user should set the
> order first and set the total size later.)
>
Patch 11 keeps the same size of the buffer. As I would think that would be
what the user would expect. And not only that, it breaks the latency
tracers if it doesn't keep the same size.
-- Steve
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v5 04/15] ring-buffer: Set new size of the ring buffer sub page
2023-12-20 16:56 ` Steven Rostedt
@ 2023-12-21 0:11 ` Masami Hiramatsu
0 siblings, 0 replies; 36+ messages in thread
From: Masami Hiramatsu @ 2023-12-21 0:11 UTC (permalink / raw)
To: Steven Rostedt
Cc: linux-kernel, linux-trace-kernel, Mark Rutland, Mathieu Desnoyers,
Andrew Morton, Tzvetomir Stoyanov, Vincent Donnefort,
Kent Overstreet
On Wed, 20 Dec 2023 11:56:02 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> On Thu, 21 Dec 2023 01:34:56 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
>
> > On Tue, 19 Dec 2023 13:54:18 -0500
> > Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > > From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
> > >
> > > There are two approaches when changing the size of the ring buffer
> > > sub page:
> > > 1. Destroying all pages and allocating new pages with the new size.
> > > 2. Allocating new pages, copying the content of the old pages before
> > > destroying them.
> > > The first approach is easier, it is selected in the proposed
> > > implementation. Changing the ring buffer sub page size is supposed to
> > > not happen frequently. Usually, that size should be set only once,
> > > when the buffer is not in use yet and is supposed to be empty.
> > >
> > > Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-5-tz.stoyanov@gmail.com
> > >
> >
> > OK, this actually reallocate the sub buffers when a new order is set.
> > BTW, with this change, if we set a new order, the total buffer size will be
> > changed too? Or reserve the total size? I think either is OK but it should
> > be described in the document. (e.g. if it is changed, user should set the
> > order first and set the total size later.)
> >
>
> Patch 11 keeps the same size of the buffer. As I would think that would be
> what the user would expect. And not only that, it breaks the latency
> tracers if it doesn't keep the same size.
Got it!
Thanks!
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v5 05/15] ring-buffer: Read and write to ring buffers with custom sub buffer size
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (3 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 04/15] ring-buffer: Set new size of the ring buffer sub page Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-19 18:54 ` [PATCH v5 06/15] ring-buffer: Clear pages on error in ring_buffer_subbuf_order_set() failure Steven Rostedt
` (10 subsequent siblings)
15 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
As the size of the ring sub buffer page can be changed dynamically,
the logic that reads and writes to the buffer should be fixed to take
that into account. Some internal ring buffer APIs are changed:
ring_buffer_alloc_read_page()
ring_buffer_free_read_page()
ring_buffer_read_page()
A new API is introduced:
ring_buffer_read_page_data()
Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-6-tz.stoyanov@gmail.com
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
[ Fixed kerneldoc on data_page parameter in ring_buffer_free_read_page() ]
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/linux/ring_buffer.h | 11 ++--
kernel/trace/ring_buffer.c | 75 ++++++++++++++++++++--------
kernel/trace/ring_buffer_benchmark.c | 10 ++--
kernel/trace/trace.c | 34 +++++++------
4 files changed, 89 insertions(+), 41 deletions(-)
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 12573306b889..fa802db216f9 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -192,10 +192,15 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer);
size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu);
size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu);
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu);
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data);
-int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
+struct buffer_data_read_page;
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu);
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+ struct buffer_data_read_page *page);
+int ring_buffer_read_page(struct trace_buffer *buffer,
+ struct buffer_data_read_page *data_page,
size_t len, int cpu, int full);
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page);
struct trace_seq;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b928bd03dd0e..c2afcf98ea91 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -318,6 +318,11 @@ struct buffer_data_page {
unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
};
+struct buffer_data_read_page {
+ unsigned order; /* order of the page */
+ struct buffer_data_page *data; /* actual data, stored in this page */
+};
+
/*
* Note, the buffer_page list must be first. The buffer pages
* are allocated in cache lines, which means that each buffer
@@ -5483,40 +5488,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* Returns:
* The page allocated, or ERR_PTR
*/
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = NULL;
+ struct buffer_data_read_page *bpage = NULL;
unsigned long flags;
struct page *page;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
+ bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
+ if (!bpage)
+ return ERR_PTR(-ENOMEM);
+
+ bpage->order = buffer->subbuf_order;
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
if (cpu_buffer->free_page) {
- bpage = cpu_buffer->free_page;
+ bpage->data = cpu_buffer->free_page;
cpu_buffer->free_page = NULL;
}
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
- if (bpage)
+ if (bpage->data)
goto out;
page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
cpu_buffer->buffer->subbuf_order);
- if (!page)
+ if (!page) {
+ kfree(bpage);
return ERR_PTR(-ENOMEM);
+ }
- bpage = page_address(page);
+ bpage->data = page_address(page);
out:
- rb_init_page(bpage);
+ rb_init_page(bpage->data);
return bpage;
}
@@ -5526,14 +5539,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
* ring_buffer_free_read_page - free an allocated read page
* @buffer: the buffer the page was allocate for
* @cpu: the cpu buffer the page came from
- * @data: the page to free
+ * @data_page: the page to free
*
* Free a page allocated from ring_buffer_alloc_read_page.
*/
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+ struct buffer_data_read_page *data_page)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = data;
+ struct buffer_data_page *bpage = data_page->data;
struct page *page = virt_to_page(bpage);
unsigned long flags;
@@ -5542,8 +5556,12 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
cpu_buffer = buffer->buffers[cpu];
- /* If the page is still in use someplace else, we can't reuse it */
- if (page_ref_count(page) > 1)
+ /*
+ * If the page is still in use someplace else, or order of the page
+ * is different from the subbuffer order of the buffer -
+ * we can't reuse it
+ */
+ if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order)
goto out;
local_irq_save(flags);
@@ -5558,7 +5576,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
local_irq_restore(flags);
out:
- free_pages((unsigned long)bpage, buffer->subbuf_order);
+ free_pages((unsigned long)bpage, data_page->order);
+ kfree(data_page);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
@@ -5579,9 +5598,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
* if (IS_ERR(rpage))
* return PTR_ERR(rpage);
- * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
+ * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
* if (ret >= 0)
- * process_page(rpage, ret);
+ * process_page(ring_buffer_read_page_data(rpage), ret);
+ * ring_buffer_free_read_page(buffer, cpu, rpage);
*
* When @full is set, the function will not return true unless
* the writer is off the reader page.
@@ -5596,7 +5616,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* <0 if no data has been transferred.
*/
int ring_buffer_read_page(struct trace_buffer *buffer,
- void **data_page, size_t len, int cpu, int full)
+ struct buffer_data_read_page *data_page,
+ size_t len, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
@@ -5621,10 +5642,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
len -= BUF_PAGE_HDR_SIZE;
- if (!data_page)
+ if (!data_page || !data_page->data)
+ goto out;
+ if (data_page->order != buffer->subbuf_order)
goto out;
- bpage = *data_page;
+ bpage = data_page->data;
if (!bpage)
goto out;
@@ -5718,11 +5741,11 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* swap the pages */
rb_init_page(bpage);
bpage = reader->page;
- reader->page = *data_page;
+ reader->page = data_page->data;
local_set(&reader->write, 0);
local_set(&reader->entries, 0);
reader->read = 0;
- *data_page = bpage;
+ data_page->data = bpage;
/*
* Use the real_end for the data size,
@@ -5767,6 +5790,18 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+/**
+ * ring_buffer_read_page_data - get pointer to the data in the page.
+ * @page: the page to get the data from
+ *
+ * Returns pointer to the actual data in this page.
+ */
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page)
+{
+ return page->data;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_page_data);
+
/**
* ring_buffer_subbuf_size_get - get size of the sub buffer.
* @buffer: the buffer to get the sub buffer size from
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index aef34673d79d..008187ebd7fe 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -104,10 +104,11 @@ static enum event_status read_event(int cpu)
static enum event_status read_page(int cpu)
{
+ struct buffer_data_read_page *bpage;
struct ring_buffer_event *event;
struct rb_page *rpage;
unsigned long commit;
- void *bpage;
+ int page_size;
int *entry;
int ret;
int inc;
@@ -117,14 +118,15 @@ static enum event_status read_page(int cpu)
if (IS_ERR(bpage))
return EVENT_DROPPED;
- ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+ page_size = ring_buffer_subbuf_size_get(buffer);
+ ret = ring_buffer_read_page(buffer, bpage, page_size, cpu, 1);
if (ret >= 0) {
- rpage = bpage;
+ rpage = ring_buffer_read_page_data(bpage);
/* The commit may have missed event flags set, clear them */
commit = local_read(&rpage->commit) & 0xfffff;
for (i = 0; i < commit && !test_error ; i += inc) {
- if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+ if (i >= (page_size - offsetof(struct rb_page, data))) {
TEST_ERROR();
break;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a010aba4c4a4..711095aa731d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8286,6 +8286,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
+ void *trace_data;
+ int page_size;
ssize_t ret = 0;
ssize_t size;
@@ -8297,6 +8299,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return -EBUSY;
#endif
+ page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
iter->cpu_file);
@@ -8311,13 +8315,13 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return ret;
/* Do we have previous read data to read? */
- if (info->read < PAGE_SIZE)
+ if (info->read < page_size)
goto read;
again:
trace_access_lock(iter->cpu_file);
ret = ring_buffer_read_page(iter->array_buffer->buffer,
- &info->spare,
+ info->spare,
count,
iter->cpu_file, 0);
trace_access_unlock(iter->cpu_file);
@@ -8338,11 +8342,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->read = 0;
read:
- size = PAGE_SIZE - info->read;
+ size = page_size - info->read;
if (size > count)
size = count;
-
- ret = copy_to_user(ubuf, info->spare + info->read, size);
+ trace_data = ring_buffer_read_page_data(info->spare);
+ ret = copy_to_user(ubuf, trace_data + info->read, size);
if (ret == size)
return -EFAULT;
@@ -8453,6 +8457,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
+ int page_size;
int entries, i;
ssize_t ret = 0;
@@ -8461,13 +8466,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -EBUSY;
#endif
- if (*ppos & (PAGE_SIZE - 1))
+ page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+ if (*ppos & (page_size - 1))
return -EINVAL;
- if (len & (PAGE_SIZE - 1)) {
- if (len < PAGE_SIZE)
+ if (len & (page_size - 1)) {
+ if (len < page_size)
return -EINVAL;
- len &= PAGE_MASK;
+ len &= (~(page_size - 1));
}
if (splice_grow_spd(pipe, &spd))
@@ -8477,7 +8483,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
- for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
+ for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) {
struct page *page;
int r;
@@ -8498,7 +8504,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
}
ref->cpu = iter->cpu_file;
- r = ring_buffer_read_page(ref->buffer, &ref->page,
+ r = ring_buffer_read_page(ref->buffer, ref->page,
len, iter->cpu_file, 1);
if (r < 0) {
ring_buffer_free_read_page(ref->buffer, ref->cpu,
@@ -8507,14 +8513,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- page = virt_to_page(ref->page);
+ page = virt_to_page(ring_buffer_read_page_data(ref->page));
spd.pages[i] = page;
- spd.partial[i].len = PAGE_SIZE;
+ spd.partial[i].len = page_size;
spd.partial[i].offset = 0;
spd.partial[i].private = (unsigned long)ref;
spd.nr_pages++;
- *ppos += PAGE_SIZE;
+ *ppos += page_size;
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
}
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH v5 06/15] ring-buffer: Clear pages on error in ring_buffer_subbuf_order_set() failure
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (4 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 05/15] ring-buffer: Read and write to ring buffers with custom sub buffer size Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-20 16:23 ` Masami Hiramatsu
2023-12-19 18:54 ` [PATCH v5 07/15] ring-buffer: Do not swap cpu buffers if order is different Steven Rostedt
` (9 subsequent siblings)
15 siblings, 1 reply; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
On failure to allocate ring buffer pages, the pointer to the CPU buffer
pages is freed, but the pages that were allocated previously were not.
Make sure they are freed too.
Fixes: TBD ("tracing: Set new size of the ring buffer sub page")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/ring_buffer.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c2afcf98ea91..3c11e8e811ed 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5927,6 +5927,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
for_each_buffer_cpu(buffer, cpu) {
if (!cpu_buffers[cpu])
continue;
+ rb_free_cpu_buffer(cpu_buffers[cpu]);
kfree(cpu_buffers[cpu]);
}
kfree(cpu_buffers);
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH v5 06/15] ring-buffer: Clear pages on error in ring_buffer_subbuf_order_set() failure
2023-12-19 18:54 ` [PATCH v5 06/15] ring-buffer: Clear pages on error in ring_buffer_subbuf_order_set() failure Steven Rostedt
@ 2023-12-20 16:23 ` Masami Hiramatsu
2023-12-20 16:30 ` Steven Rostedt
0 siblings, 1 reply; 36+ messages in thread
From: Masami Hiramatsu @ 2023-12-20 16:23 UTC (permalink / raw)
To: Steven Rostedt
Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
Mathieu Desnoyers, Andrew Morton, Tzvetomir Stoyanov,
Vincent Donnefort, Kent Overstreet
On Tue, 19 Dec 2023 13:54:20 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
>
> On failure to allocate ring buffer pages, the pointer to the CPU buffer
> pages is freed, but the pages that were allocated previously were not.
> Make sure they are freed too.
>
> Fixes: TBD ("tracing: Set new size of the ring buffer sub page")
Do you merge this fix to the original one in the same series later?
I think it is better to merge it for git bisect.
Thank you,
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> kernel/trace/ring_buffer.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index c2afcf98ea91..3c11e8e811ed 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -5927,6 +5927,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
> for_each_buffer_cpu(buffer, cpu) {
> if (!cpu_buffers[cpu])
> continue;
> + rb_free_cpu_buffer(cpu_buffers[cpu]);
> kfree(cpu_buffers[cpu]);
> }
> kfree(cpu_buffers);
> --
> 2.42.0
>
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH v5 06/15] ring-buffer: Clear pages on error in ring_buffer_subbuf_order_set() failure
2023-12-20 16:23 ` Masami Hiramatsu
@ 2023-12-20 16:30 ` Steven Rostedt
0 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-20 16:30 UTC (permalink / raw)
To: Masami Hiramatsu (Google)
Cc: linux-kernel, linux-trace-kernel, Mark Rutland, Mathieu Desnoyers,
Andrew Morton, Tzvetomir Stoyanov, Vincent Donnefort,
Kent Overstreet
On Thu, 21 Dec 2023 01:23:14 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> On Tue, 19 Dec 2023 13:54:20 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
> >
> > On failure to allocate ring buffer pages, the pointer to the CPU buffer
> > pages is freed, but the pages that were allocated previously were not.
> > Make sure they are freed too.
> >
> > Fixes: TBD ("tracing: Set new size of the ring buffer sub page")
>
> Do you merge this fix to the original one in the same series later?
> I think it is better to merge it for git bisect.
It only fixes the new functionality. If a git bisect gets here for that,
then we know the issue already. But it shouldn't break bisect for things
that happened before this change.
The reason I'm not merging this with the other patch is because the
original patch is from Tzvetomir, who isn't working on this anymore. I want
to keep his patches untouched, and anything I do is on top of it.
-- Steve
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v5 07/15] ring-buffer: Do not swap cpu buffers if order is different
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (5 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 06/15] ring-buffer: Clear pages on error in ring_buffer_subbuf_order_set() failure Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-19 18:54 ` [PATCH v5 08/15] ring-buffer: Make sure the spare sub buffer used for reads has same size Steven Rostedt
` (8 subsequent siblings)
15 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
As all the subbuffer order (subbuffer sizes) must be the same throughout
the ring buffer, check the order of the buffers that are doing a CPU
buffer swap in ring_buffer_swap_cpu() to make sure they are the same.
If they are not the same, then fail to do the swap, otherwise the ring
buffer will think the CPU buffer has a specific subbuffer size when it
does not.
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/ring_buffer.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3c11e8e811ed..fdcd171b09b5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5417,6 +5417,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
+ if (buffer_a->subbuf_order != buffer_b->subbuf_order)
+ goto out;
+
ret = -EAGAIN;
if (atomic_read(&buffer_a->record_disabled))
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH v5 08/15] ring-buffer: Make sure the spare sub buffer used for reads has same size
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (6 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 07/15] ring-buffer: Do not swap cpu buffers if order is different Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-19 18:54 ` [PATCH v5 09/15] tracing: Update snapshot order along with main buffer order Steven Rostedt
` (7 subsequent siblings)
15 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Now that the ring buffer specifies the size of its sub buffers, they all
need to be the same size. When doing a read, a swap is done with a spare
page. Make sure they are the same size before doing the swap, otherwise
the read will fail.
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/trace.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 711095aa731d..4dcdc30aa110 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7582,6 +7582,7 @@ struct ftrace_buffer_info {
struct trace_iterator iter;
void *spare;
unsigned int spare_cpu;
+ unsigned int spare_size;
unsigned int read;
};
@@ -8301,6 +8302,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+ /* Make sure the spare matches the current sub buffer size */
+ if (info->spare) {
+ if (page_size != info->spare_size) {
+ ring_buffer_free_read_page(iter->array_buffer->buffer,
+ info->spare_cpu, info->spare);
+ info->spare = NULL;
+ }
+ }
+
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
iter->cpu_file);
@@ -8309,6 +8319,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->spare = NULL;
} else {
info->spare_cpu = iter->cpu_file;
+ info->spare_size = page_size;
}
}
if (!info->spare)
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH v5 09/15] tracing: Update snapshot order along with main buffer order
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (7 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 08/15] ring-buffer: Make sure the spare sub buffer used for reads has same size Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-19 18:54 ` [PATCH v5 10/15] tracing: Stop the tracing while changing the ring buffer subbuf size Steven Rostedt
` (6 subsequent siblings)
15 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
When updating the order of the sub buffers for the main buffer, make sure
that if the snapshot buffer exists, that it gets its order updated as
well.
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/trace.c | 45 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 43 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4dcdc30aa110..2439e00aa4ce 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1263,10 +1263,17 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
+ int order;
int ret;
if (!tr->allocated_snapshot) {
+ /* Make the snapshot buffer have the same order as main buffer */
+ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+ if (ret < 0)
+ return ret;
+
/* allocate spare buffer */
ret = resize_buffer_duplicate_size(&tr->max_buffer,
&tr->array_buffer, RING_BUFFER_ALL_CPUS);
@@ -1286,6 +1293,7 @@ static void free_snapshot(struct trace_array *tr)
* The max_tr ring buffer has some state (e.g. ring->clock) and
* we want preserve it.
*/
+ ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0);
ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
set_buffer_entries(&tr->max_buffer, 1);
tracing_reset_online_cpus(&tr->max_buffer);
@@ -9393,6 +9401,7 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
{
struct trace_array *tr = filp->private_data;
unsigned long val;
+ int old_order;
int ret;
ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
@@ -9403,12 +9412,44 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
if (val < 0 || val > 7)
return -EINVAL;
+ old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ if (old_order == val)
+ return 0;
+
ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
if (ret)
- return ret;
+ return 0;
- (*ppos)++;
+#ifdef CONFIG_TRACER_MAX_TRACE
+
+ if (!tr->allocated_snapshot)
+ goto out_max;
+ ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, val);
+ if (ret) {
+ /* Put back the old order */
+ cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
+ if (WARN_ON_ONCE(cnt)) {
+ /*
+ * AARGH! We are left with different orders!
+ * The max buffer is our "snapshot" buffer.
+ * When a tracer needs a snapshot (one of the
+ * latency tracers), it swaps the max buffer
+ * with the saved snap shot. We succeeded to
+ * update the order of the main buffer, but failed to
+ * update the order of the max buffer. But when we tried
+ * to reset the main buffer to the original size, we
+ * failed there too. This is very unlikely to
+ * happen, but if it does, warn and kill all
+ * tracing.
+ */
+ tracing_disabled = 1;
+ }
+ return ret;
+ }
+ out_max:
+#endif
+ (*ppos)++;
return cnt;
}
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH v5 10/15] tracing: Stop the tracing while changing the ring buffer subbuf size
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (8 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 09/15] tracing: Update snapshot order along with main buffer order Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-19 18:54 ` [PATCH v5 11/15] ring-buffer: Keep the same size when updating the order Steven Rostedt
` (5 subsequent siblings)
15 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Because the main buffer and the snapshot buffer need to be the same for
some tracers, otherwise it will fail and disable all tracing, the tracers
need to be stopped while updating the sub buffer sizes so that the tracers
see the main and snapshot buffers with the same sub buffer size.
Fixes: TBD ("ring-buffer: Add interface for configuring trace sub buffer size")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/trace.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2439e00aa4ce..82303bd2bba1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9412,13 +9412,16 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
if (val < 0 || val > 7)
return -EINVAL;
+ /* Do not allow tracing while changing the order of the ring buffer */
+ tracing_stop_tr(tr);
+
old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
if (old_order == val)
- return 0;
+ goto out;
ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
if (ret)
- return 0;
+ goto out;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -9445,11 +9448,15 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
*/
tracing_disabled = 1;
}
- return ret;
+ goto out;
}
out_max:
#endif
(*ppos)++;
+ out:
+ if (ret)
+ cnt = ret;
+ tracing_start_tr(tr);
return cnt;
}
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH v5 11/15] ring-buffer: Keep the same size when updating the order
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (9 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 10/15] tracing: Stop the tracing while changing the ring buffer subbuf size Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-19 18:54 ` [PATCH v5 12/15] ring-buffer: Just update the subbuffers when changing their allocation order Steven Rostedt
` (4 subsequent siblings)
15 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
The function ring_buffer_subbuf_order_set() just updated the sub-buffers
to the new size, but this also changes the size of the buffer in doing so.
As the size is determined by nr_pages * subbuf_size. If the subbuf_size is
increased without decreasing the nr_pages, this causes the total size of
the buffer to increase.
This broke the latency tracers as the snapshot needs to be the same size
as the main buffer. The size of the snapshot buffer is only expanded when
needed, and because the order is still the same, the size becomes out of
sync with the main buffer, as the main buffer increased in size without
the tracing system knowing.
Calculate the nr_pages to allocate with the new subbuf_size to be
buffer_size / new_subbuf_size.
Fixes: TBD ("ring-buffer: Set new size of the ring buffer sub page")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/ring_buffer.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fdcd171b09b5..23ead7602da0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5897,7 +5897,10 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
continue;
- nr_pages = buffer->buffers[cpu]->nr_pages;
+ /* Update the number of pages to match the new size */
+ nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
+ nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
+
cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!cpu_buffers[cpu]) {
err = -ENOMEM;
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread
* [PATCH v5 12/15] ring-buffer: Just update the subbuffers when changing their allocation order
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (10 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 11/15] ring-buffer: Keep the same size when updating the order Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-19 18:54 ` [PATCH v5 13/15] ring-buffer: Add documentation on the buffer_subbuf_order file Steven Rostedt
` (3 subsequent siblings)
15 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
The ring_buffer_subbuf_order_set() was creating ring_buffer_per_cpu
cpu_buffers with the new subbuffers with the updated order, and if they
were all successfully created, then the ring_buffer's per_cpu buffers
would be freed and replaced by them.
The problem is that the freed per_cpu buffers contain state that would be
lost. Running the following commands:
1. # echo 3 > /sys/kernel/tracing/buffer_subbuf_order
2. # echo 0 > /sys/kernel/tracing/tracing_cpumask
3. # echo 1 > /sys/kernel/tracing/snapshot
4. # echo ff > /sys/kernel/tracing/tracing_cpumask
5. # echo test > /sys/kernel/tracing/trace_marker
Would result in:
-bash: echo: write error: Bad file descriptor
That's because the state of the per_cpu buffers of the snapshot buffer is
lost when the order is changed (the order of a freed snapshot buffer goes
to 0 to save memory, and when the snapshot buffer is allocated again, it
goes back to what the main buffer is).
In operation 2, the snapshot buffers were set to "disable" (as all the
ring buffers CPUs were disabled).
In operation 3, the snapshot is allocated and a call to
ring_buffer_subbuf_order_set() replaced the per_cpu buffers losing the
"record_disable" count.
When it was enabled again, the atomic_dec(&cpu_buffer->record_disable) was
decrementing a zero, setting it to -1. Writing 1 into the snapshot would
swap the snapshot buffer with the main buffer, so now the main buffer is
"disabled", and nothing can write to the ring buffer anymore.
Instead of creating new per_cpu buffers and losing the state of the old
buffers, basically do what the resize does and just allocate new subbuf
pages into the new_pages link list of the per_cpu buffer and if they all
succeed, then replace the old sub buffers with the new ones. This keeps
the per_cpu buffer descriptor intact and by doing so, keeps its state.
Fixes: TBD ("ring-buffer: Set new size of the ring buffer sub page")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/ring_buffer.c | 88 ++++++++++++++++++++++++++++++--------
1 file changed, 71 insertions(+), 17 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 23ead7602da0..7ee6779bf292 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5856,11 +5856,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
*/
int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
{
- struct ring_buffer_per_cpu **cpu_buffers;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *bpage, *tmp;
int old_order, old_size;
int nr_pages;
int psize;
- int bsize;
int err;
int cpu;
@@ -5874,11 +5874,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
if (psize <= BUF_PAGE_HDR_SIZE)
return -EINVAL;
- bsize = sizeof(void *) * buffer->cpus;
- cpu_buffers = kzalloc(bsize, GFP_KERNEL);
- if (!cpu_buffers)
- return -ENOMEM;
-
old_order = buffer->subbuf_order;
old_size = buffer->subbuf_size;
@@ -5894,33 +5889,88 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
/* Make sure all new buffers are allocated, before deleting the old ones */
for_each_buffer_cpu(buffer, cpu) {
+
if (!cpumask_test_cpu(cpu, buffer->cpumask))
continue;
+ cpu_buffer = buffer->buffers[cpu];
+
/* Update the number of pages to match the new size */
nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
- cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
- if (!cpu_buffers[cpu]) {
+ /* we need a minimum of two pages */
+ if (nr_pages < 2)
+ nr_pages = 2;
+
+ cpu_buffer->nr_pages_to_update = nr_pages;
+
+ /* Include the reader page */
+ nr_pages++;
+
+ /* Allocate the new size buffer */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (__rb_allocate_pages(cpu_buffer, nr_pages,
+ &cpu_buffer->new_pages)) {
+ /* not enough memory for new pages */
err = -ENOMEM;
goto error;
}
}
for_each_buffer_cpu(buffer, cpu) {
+
if (!cpumask_test_cpu(cpu, buffer->cpumask))
continue;
- rb_free_cpu_buffer(buffer->buffers[cpu]);
- buffer->buffers[cpu] = cpu_buffers[cpu];
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Clear the head bit to make the link list normal to read */
+ rb_head_page_deactivate(cpu_buffer);
+
+ /* Now walk the list and free all the old sub buffers */
+ list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ /* The above loop stopped at the last page needing to be freed */
+ bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ free_buffer_page(bpage);
+
+ /* Free the current reader page */
+ free_buffer_page(cpu_buffer->reader_page);
+
+ /* One page was allocated for the reader page */
+ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next,
+ struct buffer_page, list);
+ list_del_init(&cpu_buffer->reader_page->list);
+
+ /* The cpu_buffer pages are a link list with no head */
+ cpu_buffer->pages = cpu_buffer->new_pages.next;
+ cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
+ cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
+
+ /* Clear the new_pages list */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
+ cpu_buffer->nr_pages_to_update = 0;
+
+ free_pages((unsigned long)cpu_buffer->free_page, old_order);
+ cpu_buffer->free_page = NULL;
+
+ rb_head_page_activate(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
}
atomic_dec(&buffer->record_disabled);
mutex_unlock(&buffer->mutex);
- kfree(cpu_buffers);
-
return 0;
error:
@@ -5931,12 +5981,16 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
mutex_unlock(&buffer->mutex);
for_each_buffer_cpu(buffer, cpu) {
- if (!cpu_buffers[cpu])
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (!cpu_buffer->nr_pages_to_update)
continue;
- rb_free_cpu_buffer(cpu_buffers[cpu]);
- kfree(cpu_buffers[cpu]);
+
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
}
- kfree(cpu_buffers);
return err;
}
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread* [PATCH v5 13/15] ring-buffer: Add documentation on the buffer_subbuf_order file
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (11 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 12/15] ring-buffer: Just update the subbuffers when changing their allocation order Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-19 18:54 ` [PATCH v5 14/15] ringbuffer/selftest: Add basic selftest to test changing subbuf order Steven Rostedt
` (2 subsequent siblings)
15 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Add to the documentation how to use the buffer_subbuf_order file to change
the size and how it affects what events can be added to the ring buffer.
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Documentation/trace/ftrace.rst | 27 +++++++++++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 23572f6697c0..231d26ceedb0 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -203,6 +203,33 @@ of ftrace. Here is a list of some of the key files:
This displays the total combined size of all the trace buffers.
+ buffer_subbuf_order:
+
+ This sets or displays the sub buffer page size order. The ring buffer
+ is broken up into several same size "sub buffers". An event can not be
+ bigger than the size of the sub buffer. Normally, the sub buffer is
+ the size of the architecture's page (4K on x86). The sub buffer also
+ contains meta data at the start which also limits the size of an event.
+ That means when the sub buffer is a page size, no event can be larger
+ than the page size minus the sub buffer meta data.
+
+ The buffer_subbuf_order allows the user to change the size of the sub
+ buffer. As the sub buffer is a set of pages by the power of 2, thus
+ the sub buffer total size is defined by the order:
+
+ order size
+ ---- ----
+ 0 PAGE_SIZE
+ 1 PAGE_SIZE * 2
+ 2 PAGE_SIZE * 4
+ 3 PAGE_SIZE * 8
+
+ Changing the order will change the sub buffer size allowing for events
+ to be larger than the page size.
+
+ Note: When changing the order, tracing is stopped and any data in the
+ ring buffer and the snapshot buffer will be discarded.
+
free_buffer:
If a process is performing tracing, and the ring buffer should be
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread* [PATCH v5 14/15] ringbuffer/selftest: Add basic selftest to test changing subbuf order
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (12 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 13/15] ring-buffer: Add documentation on the buffer_subbuf_order file Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-21 0:23 ` Masami Hiramatsu
2023-12-19 18:54 ` [PATCH v5 15/15] tracing: Update subbuffer with kilobytes not page order Steven Rostedt
2023-12-19 22:38 ` [PATCH v5 16/15] ring-buffer: Use subbuf_order for buffer page masking Steven Rostedt
15 siblings, 1 reply; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Add a self test that will write into the trace buffer with different trace
sub buffer order sizes.
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
.../ftrace/test.d/00basic/ringbuffer_order.tc | 95 +++++++++++++++++++
1 file changed, 95 insertions(+)
create mode 100644 tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
new file mode 100644
index 000000000000..ecbcc810e6c1
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
@@ -0,0 +1,95 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Change the ringbuffer sub-buffer order
+# requires: buffer_subbuf_order
+# flags: instance
+
+get_buffer_data_size() {
+ sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+get_buffer_data_offset() {
+ sed -ne 's/^.*data.*offset:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+get_event_header_size() {
+ type_len=`sed -ne 's/^.*type_len.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
+ time_len=`sed -ne 's/^.*time_delta.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
+ array_len=`sed -ne 's/^.*array.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
+ total_bits=$((type_len+time_len+array_len))
+ total_bits=$((total_bits+7))
+ echo $((total_bits/8))
+}
+
+get_print_event_buf_offset() {
+ sed -ne 's/^.*buf.*offset:\([0-9][0-9]*\).*/\1/p' events/ftrace/print/format
+}
+
+event_header_size=`get_event_header_size`
+print_header_size=`get_print_event_buf_offset`
+
+data_offset=`get_buffer_data_offset`
+
+marker_meta=$((event_header_size+print_header_size))
+
+make_str() {
+ cnt=$1
+ printf -- 'X%.0s' $(seq $cnt)
+}
+
+write_buffer() {
+ size=$1
+
+ str=`make_str $size`
+
+ # clear the buffer
+ echo > trace
+
+ # write the string into the marker
+ echo $str > trace_marker
+
+ echo $str
+}
+
+test_buffer() {
+ orde=$1
+ page_size=$((4096<<order))
+
+ size=`get_buffer_data_size`
+
+ # the size must be greater than or equal to page_size - data_offset
+ page_size=$((page_size-data_offset))
+ if [ $size -lt $page_size ]; then
+ exit fail
+ fi
+
+ # Now add a little more so the meta data overhead will overflow
+
+ str=`write_buffer $size`
+
+ # Make sure the line was broken
+ new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; exit}' trace`
+
+ if [ "$new_str" = "$str" ]; then
+ exit fail;
+ fi
+
+ # Make sure the entire line can be found
+ new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; }' trace`
+
+ if [ "$new_str" != "$str" ]; then
+ exit fail;
+ fi
+}
+
+ORIG=`cat buffer_subbuf_order`
+
+# Could test bigger orders than 3, but then creating the string
+# to write into the ring buffer takes too long
+for a in 0 1 2 3 ; do
+ echo $a > buffer_subbuf_order
+ test_buffer $a
+done
+
+echo $ORIG > buffer_subbuf_order
+
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread* Re: [PATCH v5 14/15] ringbuffer/selftest: Add basic selftest to test changing subbuf order
2023-12-19 18:54 ` [PATCH v5 14/15] ringbuffer/selftest: Add basic selftest to test changing subbuf order Steven Rostedt
@ 2023-12-21 0:23 ` Masami Hiramatsu
0 siblings, 0 replies; 36+ messages in thread
From: Masami Hiramatsu @ 2023-12-21 0:23 UTC (permalink / raw)
To: Steven Rostedt
Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
Mathieu Desnoyers, Andrew Morton, Tzvetomir Stoyanov,
Vincent Donnefort, Kent Overstreet
On Tue, 19 Dec 2023 13:54:28 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
>
> Add a self test that will write into the trace buffer with differ trace
> sub buffer order sizes.
>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> .../ftrace/test.d/00basic/ringbuffer_order.tc | 95 +++++++++++++++++++
> 1 file changed, 95 insertions(+)
> create mode 100644 tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
>
> diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
> new file mode 100644
> index 000000000000..ecbcc810e6c1
> --- /dev/null
> +++ b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
> @@ -0,0 +1,95 @@
> +#!/bin/sh
> +# SPDX-License-Identifier: GPL-2.0
> +# description: Change the ringbuffer sub-buffer order
> +# requires: buffer_subbuf_order
> +# flags: instance
> +
> +get_buffer_data_size() {
> + sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page
> +}
> +
> +get_buffer_data_offset() {
> + sed -ne 's/^.*data.*offset:\([0-9][0-9]*\).*/\1/p' events/header_page
> +}
> +
> +get_event_header_size() {
> + type_len=`sed -ne 's/^.*type_len.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
> + time_len=`sed -ne 's/^.*time_delta.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
> + array_len=`sed -ne 's/^.*array.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
> + total_bits=$((type_len+time_len+array_len))
> + total_bits=$((total_bits+7))
> + echo $((total_bits/8))
> +}
> +
> +get_print_event_buf_offset() {
> + sed -ne 's/^.*buf.*offset:\([0-9][0-9]*\).*/\1/p' events/ftrace/print/format
> +}
> +
> +event_header_size=`get_event_header_size`
> +print_header_size=`get_print_event_buf_offset`
> +
> +data_offset=`get_buffer_data_offset`
> +
> +marker_meta=$((event_header_size+print_header_size))
> +
> +make_str() {
> + cnt=$1
> + printf -- 'X%.0s' $(seq $cnt)
> +}
> +
> +write_buffer() {
> + size=$1
> +
> + str=`make_str $size`
> +
> + # clear the buffer
> + echo > trace
> +
> + # write the string into the marker
> + echo $str > trace_marker
> +
> + echo $str
> +}
> +
> +test_buffer() {
> + orde=$1
> + page_size=$((4096<<order))
> +
> + size=`get_buffer_data_size`
> +
> + # the size must be greater than or equal to page_size - data_offset
> + page_size=$((page_size-data_offset))
> + if [ $size -lt $page_size ]; then
> + exit fail
> + fi
> +
> + # Now add a little more the meta data overhead will overflow
> +
> + str=`write_buffer $size`
> +
> + # Make sure the line was broken
> + new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; exit}' trace`
> +
> + if [ "$new_str" = "$str" ]; then
> + exit fail;
> + fi
> +
> + # Make sure the entire line can be found
> + new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; }' trace`
> +
> + if [ "$new_str" != "$str" ]; then
> + exit fail;
> + fi
> +}
> +
> +ORIG=`cat buffer_subbuf_order`
> +
> +# Could test bigger orders than 3, but then creating the string
> +# to write into the ring buffer takes too long
For testing purpose, I think it is better to check whether the maximum order
can pass or not. Or shrink down the max order to support. (I think order=3 is
big enough at this moment)
Thank you,
> +for a in 0 1 2 3 ; do
> + echo $a > buffer_subbuf_order
> + test_buffer $a
> +done
> +
> +echo $ORIG > buffer_subbuf_order
> +
> --
> 2.42.0
>
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v5 15/15] tracing: Update subbuffer with kilobytes not page order
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (13 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 14/15] ringbuffer/selftest: Add basic selftest to test changing subbuf order Steven Rostedt
@ 2023-12-19 18:54 ` Steven Rostedt
2023-12-21 0:26 ` Masami Hiramatsu
2023-12-19 22:38 ` [PATCH v5 16/15] ring-buffer: Use subbuf_order for buffer page masking Steven Rostedt
15 siblings, 1 reply; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 18:54 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Using page order for deciding what the size of the ring buffer sub buffers
are is exposing a bit too much of the implementation. Although the sub
buffers are only allocated in orders of pages, allow the user to specify
the minimum size of each sub-buffer via kilobytes like they can with the
buffer size itself.
If the user specifies 3 via:
echo 3 > buffer_subbuf_size_kb
Then the sub-buffer size will round up to 4kb (on a 4kb page size system).
If they specify:
echo 6 > buffer_subbuf_size_kb
The sub-buffer size will become 8kb.
and so on.
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Documentation/trace/ftrace.rst | 46 ++++++++-----------
kernel/trace/trace.c | 38 +++++++++------
...fer_order.tc => ringbuffer_subbuf_size.tc} | 18 ++++----
3 files changed, 54 insertions(+), 48 deletions(-)
rename tools/testing/selftests/ftrace/test.d/00basic/{ringbuffer_order.tc => ringbuffer_subbuf_size.tc} (85%)
diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 231d26ceedb0..933e7efb9f1b 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -203,32 +203,26 @@ of ftrace. Here is a list of some of the key files:
This displays the total combined size of all the trace buffers.
- buffer_subbuf_order:
-
- This sets or displays the sub buffer page size order. The ring buffer
- is broken up into several same size "sub buffers". An event can not be
- bigger than the size of the sub buffer. Normally, the sub buffer is
- the size of the architecture's page (4K on x86). The sub buffer also
- contains meta data at the start which also limits the size of an event.
- That means when the sub buffer is a page size, no event can be larger
- than the page size minus the sub buffer meta data.
-
- The buffer_subbuf_order allows the user to change the size of the sub
- buffer. As the sub buffer is a set of pages by the power of 2, thus
- the sub buffer total size is defined by the order:
-
- order size
- ---- ----
- 0 PAGE_SIZE
- 1 PAGE_SIZE * 2
- 2 PAGE_SIZE * 4
- 3 PAGE_SIZE * 8
-
- Changing the order will change the sub buffer size allowing for events
- to be larger than the page size.
-
- Note: When changing the order, tracing is stopped and any data in the
- ring buffer and the snapshot buffer will be discarded.
+ buffer_subbuf_size_kb:
+
+ This sets or displays the sub buffer size. The ring buffer is broken up
+ into several same size "sub buffers". An event can not be bigger than
+ the size of the sub buffer. Normally, the sub buffer is the size of the
+ architecture's page (4K on x86). The sub buffer also contains meta data
+ at the start which also limits the size of an event. That means when
+ the sub buffer is a page size, no event can be larger than the page
+ size minus the sub buffer meta data.
+
+ Note, the buffer_subbuf_size_kb is a way for the user to specify the
+ minimum size of the subbuffer. The kernel may make it bigger due to the
+ implementation details, or simply fail the operation if the kernel can
+ not handle the request.
+
+ Changing the sub buffer size allows for events to be larger than the
+ page size.
+
+ Note: When changing the sub-buffer size, tracing is stopped and any
+ data in the ring buffer and the snapshot buffer will be discarded.
free_buffer:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 82303bd2bba1..46dbe22121e9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9384,42 +9384,54 @@ static const struct file_operations buffer_percent_fops = {
};
static ssize_t
-buffer_order_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
struct trace_array *tr = filp->private_data;
+ size_t size;
char buf[64];
+ int order;
int r;
- r = sprintf(buf, "%d\n", ring_buffer_subbuf_order_get(tr->array_buffer.buffer));
+ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ size = (PAGE_SIZE << order) / 1024;
+
+ r = sprintf(buf, "%zd\n", size);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
static ssize_t
-buffer_order_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+buffer_subbuf_size_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
struct trace_array *tr = filp->private_data;
unsigned long val;
int old_order;
+ int order;
+ int pages;
int ret;
ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
if (ret)
return ret;
+ val *= 1024; /* value passed in is in KB */
+
+ pages = DIV_ROUND_UP(val, PAGE_SIZE);
+ order = fls(pages - 1);
+
/* limit between 1 and 128 system pages */
- if (val < 0 || val > 7)
+ if (order < 0 || order > 7)
return -EINVAL;
/* Do not allow tracing while changing the order of the ring buffer */
tracing_stop_tr(tr);
old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
- if (old_order == val)
+ if (old_order == order)
goto out;
- ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
+ ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order);
if (ret)
goto out;
@@ -9428,7 +9440,7 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
if (!tr->allocated_snapshot)
goto out_max;
- ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, val);
+ ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
if (ret) {
/* Put back the old order */
cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
@@ -9460,10 +9472,10 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
return cnt;
}
-static const struct file_operations buffer_order_fops = {
+static const struct file_operations buffer_subbuf_size_fops = {
.open = tracing_open_generic_tr,
- .read = buffer_order_read,
- .write = buffer_order_write,
+ .read = buffer_subbuf_size_read,
+ .write = buffer_subbuf_size_write,
.release = tracing_release_generic_tr,
.llseek = default_llseek,
};
@@ -9934,8 +9946,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
tr, &buffer_percent_fops);
- trace_create_file("buffer_subbuf_order", TRACE_MODE_WRITE, d_tracer,
- tr, &buffer_order_fops);
+ trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
+ tr, &buffer_subbuf_size_fops);
create_trace_options_dir(tr);
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc
similarity index 85%
rename from tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
rename to tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc
index ecbcc810e6c1..d44d09a33a74 100644
--- a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
+++ b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc
@@ -1,7 +1,7 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
-# description: Change the ringbuffer sub-buffer order
-# requires: buffer_subbuf_order
+# description: Change the ringbuffer sub-buffer size
+# requires: buffer_subbuf_size_kb
# flags: instance
get_buffer_data_size() {
@@ -52,8 +52,8 @@ write_buffer() {
}
test_buffer() {
- orde=$1
- page_size=$((4096<<order))
+ size_kb=$1
+ page_size=$((size_kb*1024))
size=`get_buffer_data_size`
@@ -82,14 +82,14 @@ test_buffer() {
fi
}
-ORIG=`cat buffer_subbuf_order`
+ORIG=`cat buffer_subbuf_size_kb`
-# Could test bigger orders than 3, but then creating the string
+# Could test bigger sizes than 32K, but then creating the string
# to write into the ring buffer takes too long
-for a in 0 1 2 3 ; do
- echo $a > buffer_subbuf_order
+for a in 4 8 16 32 ; do
+ echo $a > buffer_subbuf_size_kb
test_buffer $a
done
-echo $ORIG > buffer_subbuf_order
+echo $ORIG > buffer_subbuf_size_kb
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread* Re: [PATCH v5 15/15] tracing: Update subbuffer with kilobytes not page order
2023-12-19 18:54 ` [PATCH v5 15/15] tracing: Update subbuffer with kilobytes not page order Steven Rostedt
@ 2023-12-21 0:26 ` Masami Hiramatsu
2023-12-21 1:57 ` Steven Rostedt
0 siblings, 1 reply; 36+ messages in thread
From: Masami Hiramatsu @ 2023-12-21 0:26 UTC (permalink / raw)
To: Steven Rostedt
Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
Mathieu Desnoyers, Andrew Morton, Tzvetomir Stoyanov,
Vincent Donnefort, Kent Overstreet
On Tue, 19 Dec 2023 13:54:29 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
>
> Using page order for deciding what the size of the ring buffer sub buffers
> are is exposing a bit too much of the implementation. Although the sub
> buffers are only allocated in orders of pages, allow the user to specify
> the minimum size of each sub-buffer via kilobytes like they can with the
> buffer size itself.
>
> If the user specifies 3 via:
>
> echo 3 > buffer_subbuf_size_kb
>
> Then the sub-buffer size will round up to 4kb (on a 4kb page size system).
>
> If they specify:
>
> echo 6 > buffer_subbuf_size_kb
>
> The sub-buffer size will become 8kb.
I think this is better interface. Can we apply this earlier in the series
to avoid rewriting the document and test code?
Thank you,
>
> and so on.
>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> Documentation/trace/ftrace.rst | 46 ++++++++-----------
> kernel/trace/trace.c | 38 +++++++++------
> ...fer_order.tc => ringbuffer_subbuf_size.tc} | 18 ++++----
> 3 files changed, 54 insertions(+), 48 deletions(-)
> rename tools/testing/selftests/ftrace/test.d/00basic/{ringbuffer_order.tc => ringbuffer_subbuf_size.tc} (85%)
>
> diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
> index 231d26ceedb0..933e7efb9f1b 100644
> --- a/Documentation/trace/ftrace.rst
> +++ b/Documentation/trace/ftrace.rst
> @@ -203,32 +203,26 @@ of ftrace. Here is a list of some of the key files:
>
> This displays the total combined size of all the trace buffers.
>
> - buffer_subbuf_order:
> -
> - This sets or displays the sub buffer page size order. The ring buffer
> - is broken up into several same size "sub buffers". An event can not be
> - bigger than the size of the sub buffer. Normally, the sub buffer is
> - the size of the architecture's page (4K on x86). The sub buffer also
> - contains meta data at the start which also limits the size of an event.
> - That means when the sub buffer is a page size, no event can be larger
> - than the page size minus the sub buffer meta data.
> -
> - The buffer_subbuf_order allows the user to change the size of the sub
> - buffer. As the sub buffer is a set of pages by the power of 2, thus
> - the sub buffer total size is defined by the order:
> -
> - order size
> - ---- ----
> - 0 PAGE_SIZE
> - 1 PAGE_SIZE * 2
> - 2 PAGE_SIZE * 4
> - 3 PAGE_SIZE * 8
> -
> - Changing the order will change the sub buffer size allowing for events
> - to be larger than the page size.
> -
> - Note: When changing the order, tracing is stopped and any data in the
> - ring buffer and the snapshot buffer will be discarded.
> + buffer_subbuf_size_kb:
> +
> + This sets or displays the sub buffer size. The ring buffer is broken up
> + into several same size "sub buffers". An event can not be bigger than
> + the size of the sub buffer. Normally, the sub buffer is the size of the
> + architecture's page (4K on x86). The sub buffer also contains meta data
> + at the start which also limits the size of an event. That means when
> + the sub buffer is a page size, no event can be larger than the page
> + size minus the sub buffer meta data.
> +
> + Note, the buffer_subbuf_size_kb is a way for the user to specify the
> + minimum size of the subbuffer. The kernel may make it bigger due to the
> + implementation details, or simply fail the operation if the kernel can
> + not handle the request.
> +
> + Changing the sub buffer size allows for events to be larger than the
> + page size.
> +
> + Note: When changing the sub-buffer size, tracing is stopped and any
> + data in the ring buffer and the snapshot buffer will be discarded.
>
> free_buffer:
>
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 82303bd2bba1..46dbe22121e9 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -9384,42 +9384,54 @@ static const struct file_operations buffer_percent_fops = {
> };
>
> static ssize_t
> -buffer_order_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
> +buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
> {
> struct trace_array *tr = filp->private_data;
> + size_t size;
> char buf[64];
> + int order;
> int r;
>
> - r = sprintf(buf, "%d\n", ring_buffer_subbuf_order_get(tr->array_buffer.buffer));
> + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
> + size = (PAGE_SIZE << order) / 1024;
> +
> + r = sprintf(buf, "%zd\n", size);
>
> return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
> }
>
> static ssize_t
> -buffer_order_write(struct file *filp, const char __user *ubuf,
> - size_t cnt, loff_t *ppos)
> +buffer_subbuf_size_write(struct file *filp, const char __user *ubuf,
> + size_t cnt, loff_t *ppos)
> {
> struct trace_array *tr = filp->private_data;
> unsigned long val;
> int old_order;
> + int order;
> + int pages;
> int ret;
>
> ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
> if (ret)
> return ret;
>
> + val *= 1024; /* value passed in is in KB */
> +
> + pages = DIV_ROUND_UP(val, PAGE_SIZE);
> + order = fls(pages - 1);
> +
> /* limit between 1 and 128 system pages */
> - if (val < 0 || val > 7)
> + if (order < 0 || order > 7)
> return -EINVAL;
>
> /* Do not allow tracing while changing the order of the ring buffer */
> tracing_stop_tr(tr);
>
> old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
> - if (old_order == val)
> + if (old_order == order)
> goto out;
>
> - ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
> + ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order);
> if (ret)
> goto out;
>
> @@ -9428,7 +9440,7 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
> if (!tr->allocated_snapshot)
> goto out_max;
>
> - ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, val);
> + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
> if (ret) {
> /* Put back the old order */
> cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
> @@ -9460,10 +9472,10 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
> return cnt;
> }
>
> -static const struct file_operations buffer_order_fops = {
> +static const struct file_operations buffer_subbuf_size_fops = {
> .open = tracing_open_generic_tr,
> - .read = buffer_order_read,
> - .write = buffer_order_write,
> + .read = buffer_subbuf_size_read,
> + .write = buffer_subbuf_size_write,
> .release = tracing_release_generic_tr,
> .llseek = default_llseek,
> };
> @@ -9934,8 +9946,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
> trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
> tr, &buffer_percent_fops);
>
> - trace_create_file("buffer_subbuf_order", TRACE_MODE_WRITE, d_tracer,
> - tr, &buffer_order_fops);
> + trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
> + tr, &buffer_subbuf_size_fops);
>
> create_trace_options_dir(tr);
>
> diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc
> similarity index 85%
> rename from tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
> rename to tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc
> index ecbcc810e6c1..d44d09a33a74 100644
> --- a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
> +++ b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc
> @@ -1,7 +1,7 @@
> #!/bin/sh
> # SPDX-License-Identifier: GPL-2.0
> -# description: Change the ringbuffer sub-buffer order
> -# requires: buffer_subbuf_order
> +# description: Change the ringbuffer sub-buffer size
> +# requires: buffer_subbuf_size_kb
> # flags: instance
>
> get_buffer_data_size() {
> @@ -52,8 +52,8 @@ write_buffer() {
> }
>
> test_buffer() {
> - orde=$1
> - page_size=$((4096<<order))
> + size_kb=$1
> + page_size=$((size_kb*1024))
>
> size=`get_buffer_data_size`
>
> @@ -82,14 +82,14 @@ test_buffer() {
> fi
> }
>
> -ORIG=`cat buffer_subbuf_order`
> +ORIG=`cat buffer_subbuf_size_kb`
>
> -# Could test bigger orders than 3, but then creating the string
> +# Could test bigger sizes than 32K, but then creating the string
> # to write into the ring buffer takes too long
> -for a in 0 1 2 3 ; do
> - echo $a > buffer_subbuf_order
> +for a in 4 8 16 32 ; do
> + echo $a > buffer_subbuf_size_kb
> test_buffer $a
> done
>
> -echo $ORIG > buffer_subbuf_order
> +echo $ORIG > buffer_subbuf_size_kb
>
> --
> 2.42.0
>
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 36+ messages in thread* Re: [PATCH v5 15/15] tracing: Update subbuffer with kilobytes not page order
2023-12-21 0:26 ` Masami Hiramatsu
@ 2023-12-21 1:57 ` Steven Rostedt
0 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-21 1:57 UTC (permalink / raw)
To: Masami Hiramatsu (Google)
Cc: linux-kernel, linux-trace-kernel, Mark Rutland, Mathieu Desnoyers,
Andrew Morton, Tzvetomir Stoyanov, Vincent Donnefort,
Kent Overstreet
On Thu, 21 Dec 2023 09:26:21 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > If the user specifies 3 via:
> >
> > echo 3 > buffer_subbuf_size_kb
> >
> > Then the sub-buffer size will round up to 4kb (on a 4kb page size system).
> >
> > If they specify:
> >
> > echo 6 > buffer_subbuf_size_kb
> >
> > The sub-buffer size will become 8kb.
>
> I think this is better interface. Can we apply this earlier in the series
> to avoid rewriting the document and test code?
I kept it separate for testing purposes.
Throughout all this, it was a good way to make sure the two approaches
were compatible. I still like to keep them separate as that's the way it
was developed. It's good to keep that history.
-- Steve
^ permalink raw reply [flat|nested] 36+ messages in thread
* [PATCH v5 16/15] ring-buffer: Use subbuf_order for buffer page masking
2023-12-19 18:54 [PATCH v5 00/15] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers Steven Rostedt
` (14 preceding siblings ...)
2023-12-19 18:54 ` [PATCH v5 15/15] tracing: Update subbuffer with kilobytes not page order Steven Rostedt
@ 2023-12-19 22:38 ` Steven Rostedt
15 siblings, 0 replies; 36+ messages in thread
From: Steven Rostedt @ 2023-12-19 22:38 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Tzvetomir Stoyanov, Vincent Donnefort, Kent Overstreet
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
The comparisons to PAGE_SIZE were all converted to use the
buffer->subbuf_order, but the use of PAGE_MASK was missed.
Convert all the PAGE_MASK usages over to:
(PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1
Fixes: TBD ("ring-buffer: Page size per ring buffer")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/ring_buffer.c | 19 +++++++++++--------
1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7ee6779bf292..173d2595ce2d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2269,11 +2269,13 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
}
static __always_inline unsigned
-rb_event_index(struct ring_buffer_event *event)
+rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
- return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
+ addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1;
+
+ return addr - BUF_PAGE_HDR_SIZE;
}
static void rb_inc_iter(struct ring_buffer_iter *iter)
@@ -2646,7 +2648,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* Slow path */
static struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
+rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event, u64 delta, bool abs)
{
if (abs)
event->type_len = RINGBUF_TYPE_TIME_STAMP;
@@ -2654,7 +2657,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
event->type_len = RINGBUF_TYPE_TIME_EXTEND;
/* Not the first event on the page, or not delta? */
- if (abs || rb_event_index(event)) {
+ if (abs || rb_event_index(cpu_buffer, event)) {
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
@@ -2728,7 +2731,7 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
if (!abs)
info->delta = 0;
}
- *event = rb_add_time_stamp(*event, info->delta, abs);
+ *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs);
*length -= RB_LEN_TIME_EXTEND;
*delta = 0;
}
@@ -2812,10 +2815,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage;
unsigned long addr;
- new_index = rb_event_index(event);
+ new_index = rb_event_index(cpu_buffer, event);
old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
bpage = READ_ONCE(cpu_buffer->tail_page);
@@ -3726,7 +3729,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage = cpu_buffer->commit_page;
struct buffer_page *start;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
/* Do the likely case first */
if (likely(bpage->page == (void *)addr)) {
--
2.42.0
^ permalink raw reply related [flat|nested] 36+ messages in thread