linux-trace-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot
@ 2025-05-15 11:15 Masami Hiramatsu (Google)
  2025-05-20  6:13 ` [PATCH] tracing: Reset last-boot buffers when reading out all cpu buffers Masami Hiramatsu (Google)
  2025-05-21 18:51 ` [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot Steven Rostedt
  0 siblings, 2 replies; 6+ messages in thread
From: Masami Hiramatsu (Google) @ 2025-05-15 11:15 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Rewind persistent ring buffer pages which have been read in the
previous boot. Those pages are very likely to be lost before
being written to disk if the previous kernel crashed. In this
case, the trace data is kept on the persistent ring buffer, but
it can not be read because its commit size has been reset after
read.
This skips clearing the commit size of each sub-buffer and
recover it after reboot.

Note: If you read the previous boot data via trace_pipe, it is
not accessible at that time. But if you reboot without clearing
(or reusing) the read data, the read data is recovered again in
the next boot.
Thus, once you have read the previous boot data, clear it with
`echo > trace`.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Changes in v2:
  - Stop rewind if timestamp is not older.
  - Rewind reader page and reset all indexes.
  - Make ring_buffer_read_page() not clear the commit size.
---
 kernel/trace/ring_buffer.c |   99 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 95 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6859008ca34d..48f5f248eb4c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1358,6 +1358,13 @@ static inline void rb_inc_page(struct buffer_page **bpage)
 	*bpage = list_entry(p, struct buffer_page, list);
 }
 
+static inline void rb_dec_page(struct buffer_page **bpage)
+{
+	struct list_head *p = rb_list_head((*bpage)->list.prev);
+
+	*bpage = list_entry(p, struct buffer_page, list);
+}
+
 static struct buffer_page *
 rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
@@ -1866,10 +1873,11 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
 static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
-	struct buffer_page *head_page;
+	struct buffer_page *head_page, *orig_head;
 	unsigned long entry_bytes = 0;
 	unsigned long entries = 0;
 	int ret;
+	u64 ts;
 	int i;
 
 	if (!meta || !meta->head_buffer)
@@ -1885,8 +1893,93 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
 	entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
 	local_set(&cpu_buffer->reader_page->entries, ret);
 
-	head_page = cpu_buffer->head_page;
+	orig_head = head_page = cpu_buffer->head_page;
+	ts = head_page->page->time_stamp;
+
+	/*
+	 * Try to rewind the head so that we can read the pages which already
+	 * read in the previous boot.
+	 */
+	if (head_page == cpu_buffer->tail_page)
+		goto rewound;
+
+	rb_dec_page(&head_page);
+	for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) {
+
+		/* Rewind until tail (writer) page. */
+		if (head_page == cpu_buffer->tail_page)
+			break;
+
+		/* Ensure the page has older data than head. */
+		if (ts < head_page->page->time_stamp)
+			break;
+
+		ts = head_page->page->time_stamp;
+		/* Ensure the page has correct timestamp and some data. */
+		if (!ts || rb_page_commit(head_page) == 0)
+			break;
+
+		/* Stop rewind if the page is invalid. */
+		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+		if (ret < 0)
+			break;
+
+		/* Recover the number of entries and update stats. */
+		local_set(&head_page->entries, ret);
+		if (ret)
+			local_inc(&cpu_buffer->pages_touched);
+		entries += ret;
+		entry_bytes += rb_page_commit(head_page);
+	}
+	pr_info("Rewound %d pages on cpu%d\n", i, cpu_buffer->cpu);
+
+	/* The last rewound page must be skipped. */
+	if (head_page != orig_head)
+		rb_inc_page(&head_page);
 
+	/* If there are rewound pages, rewind the reader page too. */
+	if (head_page != orig_head) {
+		struct buffer_page *bpage = orig_head;
+
+		rb_dec_page(&bpage);
+		/*
+		 * Insert the reader_page before the original head page.
+		 * Since the list encode RB_PAGE flags, general list
+		 * operations should be avoided.
+		 */
+		cpu_buffer->reader_page->list.next = &orig_head->list;
+		cpu_buffer->reader_page->list.prev = orig_head->list.prev;
+		orig_head->list.prev = &cpu_buffer->reader_page->list;
+		bpage->list.next = &cpu_buffer->reader_page->list;
+
+		/* Make the head_page tthe new read page */
+		cpu_buffer->reader_page = head_page;
+		bpage = head_page;
+		rb_inc_page(&head_page);
+		head_page->list.prev = bpage->list.prev;
+		rb_dec_page(&bpage);
+		bpage->list.next = &head_page->list;
+		rb_set_list_to_head(&bpage->list);
+
+		cpu_buffer->head_page = head_page;
+		meta->head_buffer = (unsigned long)head_page->page;
+
+		/* Reset all the indexes */
+		bpage = cpu_buffer->reader_page;
+		meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
+		bpage->id = 0;
+
+		for (i = 0, bpage = head_page; i < meta->nr_subbufs;
+		     i++, rb_inc_page(&bpage)) {
+			meta->buffers[i + 1] = rb_meta_subbuf_idx(meta, bpage->page);
+			bpage->id = i + 1;
+		}
+
+		/* We'll restart verifying from orig_head */
+		head_page = orig_head;
+	}
+
+ rewound:
 	/* If the commit_buffer is the reader page, update the commit page */
 	if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
 		cpu_buffer->commit_page = cpu_buffer->reader_page;
@@ -5348,7 +5441,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	 */
 	local_set(&cpu_buffer->reader_page->write, 0);
 	local_set(&cpu_buffer->reader_page->entries, 0);
-	local_set(&cpu_buffer->reader_page->page->commit, 0);
 	cpu_buffer->reader_page->real_end = 0;
 
  spin:
@@ -6642,7 +6734,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		cpu_buffer->read_bytes += rb_page_size(reader);
 
 		/* swap the pages */
-		rb_init_page(bpage);
 		bpage = reader->page;
 		reader->page = data_page->data;
 		local_set(&reader->write, 0);


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH] tracing: Reset last-boot buffers when reading out all cpu buffers
  2025-05-15 11:15 [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot Masami Hiramatsu (Google)
@ 2025-05-20  6:13 ` Masami Hiramatsu (Google)
  2025-05-21 18:51 ` [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot Steven Rostedt
  1 sibling, 0 replies; 6+ messages in thread
From: Masami Hiramatsu (Google) @ 2025-05-20  6:13 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Reset the last-boot ring buffers when read() reads out all cpu
buffers through trace_pipe/trace_pipe_raw. This prevents ftrace
from rewinding the ring buffer read pointer on the next boot.

Note that this resets the buffers only when all per-cpu buffers
are empty and are read via the read(2) syscall. For example, if
you read only one of the per-cpu trace_pipe files, it does not
reset them. Also, reading the buffer via the splice(2) syscall
does not reset it because there will still be data in the
reader page.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace.c |   22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2c1764ed87b0..433671d3aa43 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6677,6 +6677,22 @@ static int tracing_wait_pipe(struct file *filp)
 	return 1;
 }
 
+static bool update_last_data_if_empty(struct trace_array *tr)
+{
+	if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+		return false;
+
+	if (!ring_buffer_empty(tr->array_buffer.buffer))
+		return false;
+
+	/*
+	 * If the buffer contains the last boot data and all per-cpu
+	 * buffers are empty, reset it from the kernel side.
+	 */
+	update_last_data(tr);
+	return true;
+}
+
 /*
  * Consumer reader.
  */
@@ -6708,6 +6724,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	}
 
 waitagain:
+	if (update_last_data_if_empty(iter->tr))
+		return 0;
+
 	sret = tracing_wait_pipe(filp);
 	if (sret <= 0)
 		return sret;
@@ -8286,6 +8305,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 
 	if (ret < 0) {
 		if (trace_empty(iter) && !iter->closed) {
+			if (update_last_data_if_empty(iter->tr))
+				return 0;
+
 			if ((filp->f_flags & O_NONBLOCK))
 				return -EAGAIN;
 


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot
  2025-05-15 11:15 [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot Masami Hiramatsu (Google)
  2025-05-20  6:13 ` [PATCH] tracing: Reset last-boot buffers when reading out all cpu buffers Masami Hiramatsu (Google)
@ 2025-05-21 18:51 ` Steven Rostedt
  2025-05-21 23:19   ` Steven Rostedt
  2025-05-22 15:09   ` Masami Hiramatsu
  1 sibling, 2 replies; 6+ messages in thread
From: Steven Rostedt @ 2025-05-21 18:51 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel

On Thu, 15 May 2025 20:15:56 +0900
"Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:

> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 6859008ca34d..48f5f248eb4c 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -1358,6 +1358,13 @@ static inline void rb_inc_page(struct buffer_page **bpage)
>  	*bpage = list_entry(p, struct buffer_page, list);
>  }
>  
> +static inline void rb_dec_page(struct buffer_page **bpage)
> +{
> +	struct list_head *p = rb_list_head((*bpage)->list.prev);
> +
> +	*bpage = list_entry(p, struct buffer_page, list);
> +}
> +
>  static struct buffer_page *
>  rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
>  {
> @@ -1866,10 +1873,11 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
>  static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
>  {
>  	struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
> -	struct buffer_page *head_page;
> +	struct buffer_page *head_page, *orig_head;
>  	unsigned long entry_bytes = 0;
>  	unsigned long entries = 0;
>  	int ret;
> +	u64 ts;
>  	int i;
>  
>  	if (!meta || !meta->head_buffer)
> @@ -1885,8 +1893,93 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
>  	entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
>  	local_set(&cpu_buffer->reader_page->entries, ret);
>  
> -	head_page = cpu_buffer->head_page;
> +	orig_head = head_page = cpu_buffer->head_page;
> +	ts = head_page->page->time_stamp;
> +
> +	/*
> +	 * Try to rewind the head so that we can read the pages which already
> +	 * read in the previous boot.
> +	 */
> +	if (head_page == cpu_buffer->tail_page)
> +		goto rewound;

Hmm, jumping to a label called "rewound" when you didn't do a rewind seems
confusing.

Perhaps call the label "skip_rewind"?

> +
> +	rb_dec_page(&head_page);
> +	for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) {
> +
> +		/* Rewind until tail (writer) page. */
> +		if (head_page == cpu_buffer->tail_page)
> +			break;
> +
> +		/* Ensure the page has older data than head. */
> +		if (ts < head_page->page->time_stamp)
> +			break;
> +
> +		ts = head_page->page->time_stamp;
> +		/* Ensure the page has correct timestamp and some data. */
> +		if (!ts || rb_page_commit(head_page) == 0)
> +			break;
> +
> +		/* Stop rewind if the page is invalid. */
> +		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
> +		if (ret < 0)
> +			break;
> +
> +		/* Recover the number of entries and update stats. */
> +		local_set(&head_page->entries, ret);
> +		if (ret)
> +			local_inc(&cpu_buffer->pages_touched);
> +		entries += ret;
> +		entry_bytes += rb_page_commit(head_page);
> +	}
> +	pr_info("Rewound %d pages on cpu%d\n", i, cpu_buffer->cpu);

Should state this is coming from the ring buffer and use "[%d]" for cpu
number as the other pr_info()'s do. Also only print if it did a rewind:

	if (i)
		pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);


> +
> +	/* The last rewound page must be skipped. */
> +	if (head_page != orig_head)
> +		rb_inc_page(&head_page);
>  
> +	/* If there are rewound pages, rewind the reader page too. */

I would change the comment to:

	/*
	 * If the ring buffer was rewound, then inject the reader page
	 * into the location just before the original head page.
	 */

> +	if (head_page != orig_head) {
> +		struct buffer_page *bpage = orig_head;
> +
> +		rb_dec_page(&bpage);
> +		/*
> +		 * Insert the reader_page before the original head page.
> +		 * Since the list encode RB_PAGE flags, general list
> +		 * operations should be avoided.
> +		 */
> +		cpu_buffer->reader_page->list.next = &orig_head->list;
> +		cpu_buffer->reader_page->list.prev = orig_head->list.prev;
> +		orig_head->list.prev = &cpu_buffer->reader_page->list;
> +		bpage->list.next = &cpu_buffer->reader_page->list;
> +
> +		/* Make the head_page tthe new read page */

Typo "tthe" and call it "new reader page", not "read page".

> +		cpu_buffer->reader_page = head_page;
> +		bpage = head_page;
> +		rb_inc_page(&head_page);
> +		head_page->list.prev = bpage->list.prev;
> +		rb_dec_page(&bpage);
> +		bpage->list.next = &head_page->list;
> +		rb_set_list_to_head(&bpage->list);
> +
> +		cpu_buffer->head_page = head_page;
> +		meta->head_buffer = (unsigned long)head_page->page;
> +
> +		/* Reset all the indexes */
> +		bpage = cpu_buffer->reader_page;
> +		meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
> +		bpage->id = 0;
> +
> +		for (i = 0, bpage = head_page; i < meta->nr_subbufs;
> +		     i++, rb_inc_page(&bpage)) {
> +			meta->buffers[i + 1] = rb_meta_subbuf_idx(meta, bpage->page);
> +			bpage->id = i + 1;
> +		}
> +
> +		/* We'll restart verifying from orig_head */
> +		head_page = orig_head;
> +	}
> +
> + rewound:

 skip_rewind:

Also, I know other's don't like to do this, but I do add a space before
labels. It makes patch diffs easier to see which functions they are,
otherwise the patch shows the label and not the function.

-- Steve


>  	/* If the commit_buffer is the reader page, update the commit page */
>  	if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
>  		cpu_buffer->commit_page = cpu_buffer->reader_page;
> @@ -5348,7 +5441,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
>  	 */
>  	local_set(&cpu_buffer->reader_page->write, 0);
>  	local_set(&cpu_buffer->reader_page->entries, 0);
> -	local_set(&cpu_buffer->reader_page->page->commit, 0);
>  	cpu_buffer->reader_page->real_end = 0;
>  
>   spin:
> @@ -6642,7 +6734,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
>  		cpu_buffer->read_bytes += rb_page_size(reader);
>  
>  		/* swap the pages */
> -		rb_init_page(bpage);
>  		bpage = reader->page;
>  		reader->page = data_page->data;
>  		local_set(&reader->write, 0);


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot
  2025-05-21 18:51 ` [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot Steven Rostedt
@ 2025-05-21 23:19   ` Steven Rostedt
  2025-05-22 15:23     ` Masami Hiramatsu
  2025-05-22 15:09   ` Masami Hiramatsu
  1 sibling, 1 reply; 6+ messages in thread
From: Steven Rostedt @ 2025-05-21 23:19 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel

On Wed, 21 May 2025 14:51:28 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Thu, 15 May 2025 20:15:56 +0900
> "Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:
> 
> > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> > index 6859008ca34d..48f5f248eb4c 100644
> > --- a/kernel/trace/ring_buffer.c
> > +++ b/kernel/trace/ring_buffer.c
> > @@ -1358,6 +1358,13 @@ static inline void rb_inc_page(struct buffer_page **bpage)
> >  	*bpage = list_entry(p, struct buffer_page, list);
> >  }
> >  
> > +static inline void rb_dec_page(struct buffer_page **bpage)
> > +{
> > +	struct list_head *p = rb_list_head((*bpage)->list.prev);
> > +
> > +	*bpage = list_entry(p, struct buffer_page, list);
> > +}
> > +
> >  static struct buffer_page *
> >  rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
> >  {
> > @@ -1866,10 +1873,11 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
> >  static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
> >  {
> >  	struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
> > -	struct buffer_page *head_page;
> > +	struct buffer_page *head_page, *orig_head;
> >  	unsigned long entry_bytes = 0;
> >  	unsigned long entries = 0;
> >  	int ret;
> > +	u64 ts;
> >  	int i;
> >  
> >  	if (!meta || !meta->head_buffer)
> > @@ -1885,8 +1893,93 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
> >  	entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
> >  	local_set(&cpu_buffer->reader_page->entries, ret);
> >  
> > -	head_page = cpu_buffer->head_page;
> > +	orig_head = head_page = cpu_buffer->head_page;
> > +	ts = head_page->page->time_stamp;
> > +
> > +	/*
> > +	 * Try to rewind the head so that we can read the pages which already
> > +	 * read in the previous boot.
> > +	 */
> > +	if (head_page == cpu_buffer->tail_page)
> > +		goto rewound;  
> 
> Hmm, jumping to a label called "rewound" when you didn't do a rewind seems
> confusing.
> 
> Perhaps call the label "skip_rewind"?
> 
> > +
> > +	rb_dec_page(&head_page);
> > +	for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) {
> > +
> > +		/* Rewind until tail (writer) page. */
> > +		if (head_page == cpu_buffer->tail_page)
> > +			break;
> > +
> > +		/* Ensure the page has older data than head. */
> > +		if (ts < head_page->page->time_stamp)
> > +			break;
> > +
> > +		ts = head_page->page->time_stamp;
> > +		/* Ensure the page has correct timestamp and some data. */
> > +		if (!ts || rb_page_commit(head_page) == 0)
> > +			break;
> > +
> > +		/* Stop rewind if the page is invalid. */
> > +		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
> > +		if (ret < 0)
> > +			break;
> > +
> > +		/* Recover the number of entries and update stats. */
> > +		local_set(&head_page->entries, ret);
> > +		if (ret)
> > +			local_inc(&cpu_buffer->pages_touched);
> > +		entries += ret;
> > +		entry_bytes += rb_page_commit(head_page);
> > +	}
> > +	pr_info("Rewound %d pages on cpu%d\n", i, cpu_buffer->cpu);  
> 
> Should state this is coming from the ring buffer and use "[%d]" for cpu
> number as the other pr_info()'s do. Also only print if it did a rewind:
> 
> 	if (i)
> 		pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
> 
> 
> > +
> > +	/* The last rewound page must be skipped. */
> > +	if (head_page != orig_head)
> > +		rb_inc_page(&head_page);
> >  
> > +	/* If there are rewound pages, rewind the reader page too. */  
> 
> I would change the comment to:
> 
> 	/*
> 	 * If the ring buffer was rewound, then inject the reader page
> 	 * into the location just before the original head page.
> 	 */
> 
> > +	if (head_page != orig_head) {
> > +		struct buffer_page *bpage = orig_head;
> > +
> > +		rb_dec_page(&bpage);
> > +		/*
> > +		 * Insert the reader_page before the original head page.
> > +		 * Since the list encode RB_PAGE flags, general list
> > +		 * operations should be avoided.
> > +		 */
> > +		cpu_buffer->reader_page->list.next = &orig_head->list;
> > +		cpu_buffer->reader_page->list.prev = orig_head->list.prev;
> > +		orig_head->list.prev = &cpu_buffer->reader_page->list;
> > +		bpage->list.next = &cpu_buffer->reader_page->list;
> > +
> > +		/* Make the head_page tthe new read page */  
> 
> Typo "tthe" and call it "new reader page", not "read page".
> 
> > +		cpu_buffer->reader_page = head_page;
> > +		bpage = head_page;
> > +		rb_inc_page(&head_page);
> > +		head_page->list.prev = bpage->list.prev;
> > +		rb_dec_page(&bpage);
> > +		bpage->list.next = &head_page->list;
> > +		rb_set_list_to_head(&bpage->list);

When testing this patch, it kept crashing. I had to add this here:

		cpu_buffer->pages = &head_page->list;

That's because my test would end up having cpu_buffer->pages pointing to
the reader page, and that will cause issues later. It has to point into the
writing portion of the buffer.

> > +
> > +		cpu_buffer->head_page = head_page;
> > +		meta->head_buffer = (unsigned long)head_page->page;
> > +
> > +		/* Reset all the indexes */
> > +		bpage = cpu_buffer->reader_page;
> > +		meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
> > +		bpage->id = 0;
> > +
> > +		for (i = 0, bpage = head_page; i < meta->nr_subbufs;
> > +		     i++, rb_inc_page(&bpage)) {
> > +			meta->buffers[i + 1] = rb_meta_subbuf_idx(meta, bpage->page);
> > +			bpage->id = i + 1;
> > +		}

Can we convert the above to:

		for (i = 1, bpage = head_page; i < meta->nr_subbufs;
		     i++, rb_inc_page(&bpage)) {
			meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
			bpage->id = i;
		}

By starting i at one, we can remove the "+ 1" inside the loop. It's a bit
cleaner that way.

-- Steve


> > +
> > +		/* We'll restart verifying from orig_head */
> > +		head_page = orig_head;
> > +	}
> > +
> > + rewound:  
> 
>  skip_rewind:
> 
> Also, I know other's don't like to do this, but I do add a space before
> labels. It makes patch diffs easier to see which functions they are,
> otherwise the patch shows the label and not the function.
> 
> -- Steve
> 
> 
> >  	/* If the commit_buffer is the reader page, update the commit page */
> >  	if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
> >  		cpu_buffer->commit_page = cpu_buffer->reader_page;
> > @@ -5348,7 +5441,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
> >  	 */
> >  	local_set(&cpu_buffer->reader_page->write, 0);
> >  	local_set(&cpu_buffer->reader_page->entries, 0);
> > -	local_set(&cpu_buffer->reader_page->page->commit, 0);
> >  	cpu_buffer->reader_page->real_end = 0;
> >  
> >   spin:
> > @@ -6642,7 +6734,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
> >  		cpu_buffer->read_bytes += rb_page_size(reader);
> >  
> >  		/* swap the pages */
> > -		rb_init_page(bpage);
> >  		bpage = reader->page;
> >  		reader->page = data_page->data;
> >  		local_set(&reader->write, 0);  
> 


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot
  2025-05-21 18:51 ` [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot Steven Rostedt
  2025-05-21 23:19   ` Steven Rostedt
@ 2025-05-22 15:09   ` Masami Hiramatsu
  1 sibling, 0 replies; 6+ messages in thread
From: Masami Hiramatsu @ 2025-05-22 15:09 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel

On Wed, 21 May 2025 14:51:28 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Thu, 15 May 2025 20:15:56 +0900
> "Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:
> 
> > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> > index 6859008ca34d..48f5f248eb4c 100644
> > --- a/kernel/trace/ring_buffer.c
> > +++ b/kernel/trace/ring_buffer.c
> > @@ -1358,6 +1358,13 @@ static inline void rb_inc_page(struct buffer_page **bpage)
> >  	*bpage = list_entry(p, struct buffer_page, list);
> >  }
> >  
> > +static inline void rb_dec_page(struct buffer_page **bpage)
> > +{
> > +	struct list_head *p = rb_list_head((*bpage)->list.prev);
> > +
> > +	*bpage = list_entry(p, struct buffer_page, list);
> > +}
> > +
> >  static struct buffer_page *
> >  rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
> >  {
> > @@ -1866,10 +1873,11 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
> >  static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
> >  {
> >  	struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
> > -	struct buffer_page *head_page;
> > +	struct buffer_page *head_page, *orig_head;
> >  	unsigned long entry_bytes = 0;
> >  	unsigned long entries = 0;
> >  	int ret;
> > +	u64 ts;
> >  	int i;
> >  
> >  	if (!meta || !meta->head_buffer)
> > @@ -1885,8 +1893,93 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
> >  	entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
> >  	local_set(&cpu_buffer->reader_page->entries, ret);
> >  
> > -	head_page = cpu_buffer->head_page;
> > +	orig_head = head_page = cpu_buffer->head_page;
> > +	ts = head_page->page->time_stamp;
> > +
> > +	/*
> > +	 * Try to rewind the head so that we can read the pages which already
> > +	 * read in the previous boot.
> > +	 */
> > +	if (head_page == cpu_buffer->tail_page)
> > +		goto rewound;
> 
> Hmm, jumping to a label called "rewound" when you didn't do a rewind seems
> confusing.
> 
> Perhaps call the label "skip_rewind"?

Ah, indeed. :-)

> 
> > +
> > +	rb_dec_page(&head_page);
> > +	for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) {
> > +
> > +		/* Rewind until tail (writer) page. */
> > +		if (head_page == cpu_buffer->tail_page)
> > +			break;
> > +
> > +		/* Ensure the page has older data than head. */
> > +		if (ts < head_page->page->time_stamp)
> > +			break;
> > +
> > +		ts = head_page->page->time_stamp;
> > +		/* Ensure the page has correct timestamp and some data. */
> > +		if (!ts || rb_page_commit(head_page) == 0)
> > +			break;
> > +
> > +		/* Stop rewind if the page is invalid. */
> > +		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
> > +		if (ret < 0)
> > +			break;
> > +
> > +		/* Recover the number of entries and update stats. */
> > +		local_set(&head_page->entries, ret);
> > +		if (ret)
> > +			local_inc(&cpu_buffer->pages_touched);
> > +		entries += ret;
> > +		entry_bytes += rb_page_commit(head_page);
> > +	}
> > +	pr_info("Rewound %d pages on cpu%d\n", i, cpu_buffer->cpu);
> 
> Should state this is coming from the ring buffer and use "[%d]" for cpu
> number as the other pr_info()'s do. Also only print if it did a rewind:
> 
> 	if (i)
> 		pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);

OK.

> 
> 
> > +
> > +	/* The last rewound page must be skipped. */
> > +	if (head_page != orig_head)
> > +		rb_inc_page(&head_page);
> >  
> > +	/* If there are rewound pages, rewind the reader page too. */
> 
> I would change the comment to:
> 
> 	/*
> 	 * If the ring buffer was rewound, then inject the reader page
> 	 * into the location just before the original head page.
> 	 */

OK.

> 
> > +	if (head_page != orig_head) {
> > +		struct buffer_page *bpage = orig_head;
> > +
> > +		rb_dec_page(&bpage);
> > +		/*
> > +		 * Insert the reader_page before the original head page.
> > +		 * Since the list encode RB_PAGE flags, general list
> > +		 * operations should be avoided.
> > +		 */
> > +		cpu_buffer->reader_page->list.next = &orig_head->list;
> > +		cpu_buffer->reader_page->list.prev = orig_head->list.prev;
> > +		orig_head->list.prev = &cpu_buffer->reader_page->list;
> > +		bpage->list.next = &cpu_buffer->reader_page->list;
> > +
> > +		/* Make the head_page tthe new read page */
> 
> Typo "tthe" and call it "new reader page", not "read page".

Oops, thanks.

> 
> > +		cpu_buffer->reader_page = head_page;
> > +		bpage = head_page;
> > +		rb_inc_page(&head_page);
> > +		head_page->list.prev = bpage->list.prev;
> > +		rb_dec_page(&bpage);
> > +		bpage->list.next = &head_page->list;
> > +		rb_set_list_to_head(&bpage->list);
> > +
> > +		cpu_buffer->head_page = head_page;
> > +		meta->head_buffer = (unsigned long)head_page->page;
> > +
> > +		/* Reset all the indexes */
> > +		bpage = cpu_buffer->reader_page;
> > +		meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
> > +		bpage->id = 0;
> > +
> > +		for (i = 0, bpage = head_page; i < meta->nr_subbufs;
> > +		     i++, rb_inc_page(&bpage)) {
> > +			meta->buffers[i + 1] = rb_meta_subbuf_idx(meta, bpage->page);
> > +			bpage->id = i + 1;
> > +		}
> > +
> > +		/* We'll restart verifying from orig_head */
> > +		head_page = orig_head;
> > +	}
> > +
> > + rewound:
> 
>  skip_rewind:
> 
> Also, I know other's don't like to do this, but I do add a space before
> labels. It makes patch diffs easier to see which functions they are,
> otherwise the patch shows the label and not the function.

Ah, that's the reason. Let me fix that.

Thank you,

> 
> -- Steve
> 
> 
> >  	/* If the commit_buffer is the reader page, update the commit page */
> >  	if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
> >  		cpu_buffer->commit_page = cpu_buffer->reader_page;
> > @@ -5348,7 +5441,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
> >  	 */
> >  	local_set(&cpu_buffer->reader_page->write, 0);
> >  	local_set(&cpu_buffer->reader_page->entries, 0);
> > -	local_set(&cpu_buffer->reader_page->page->commit, 0);
> >  	cpu_buffer->reader_page->real_end = 0;
> >  
> >   spin:
> > @@ -6642,7 +6734,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
> >  		cpu_buffer->read_bytes += rb_page_size(reader);
> >  
> >  		/* swap the pages */
> > -		rb_init_page(bpage);
> >  		bpage = reader->page;
> >  		reader->page = data_page->data;
> >  		local_set(&reader->write, 0);
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot
  2025-05-21 23:19   ` Steven Rostedt
@ 2025-05-22 15:23     ` Masami Hiramatsu
  0 siblings, 0 replies; 6+ messages in thread
From: Masami Hiramatsu @ 2025-05-22 15:23 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel

On Wed, 21 May 2025 19:19:47 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Wed, 21 May 2025 14:51:28 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Thu, 15 May 2025 20:15:56 +0900
> > "Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:
> > 
> > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> > > index 6859008ca34d..48f5f248eb4c 100644
> > > --- a/kernel/trace/ring_buffer.c
> > > +++ b/kernel/trace/ring_buffer.c
> > > @@ -1358,6 +1358,13 @@ static inline void rb_inc_page(struct buffer_page **bpage)
> > >  	*bpage = list_entry(p, struct buffer_page, list);
> > >  }
> > >  
> > > +static inline void rb_dec_page(struct buffer_page **bpage)
> > > +{
> > > +	struct list_head *p = rb_list_head((*bpage)->list.prev);
> > > +
> > > +	*bpage = list_entry(p, struct buffer_page, list);
> > > +}
> > > +
> > >  static struct buffer_page *
> > >  rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
> > >  {
> > > @@ -1866,10 +1873,11 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
> > >  static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
> > >  {
> > >  	struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
> > > -	struct buffer_page *head_page;
> > > +	struct buffer_page *head_page, *orig_head;
> > >  	unsigned long entry_bytes = 0;
> > >  	unsigned long entries = 0;
> > >  	int ret;
> > > +	u64 ts;
> > >  	int i;
> > >  
> > >  	if (!meta || !meta->head_buffer)
> > > @@ -1885,8 +1893,93 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
> > >  	entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
> > >  	local_set(&cpu_buffer->reader_page->entries, ret);
> > >  
> > > -	head_page = cpu_buffer->head_page;
> > > +	orig_head = head_page = cpu_buffer->head_page;
> > > +	ts = head_page->page->time_stamp;
> > > +
> > > +	/*
> > > +	 * Try to rewind the head so that we can read the pages which already
> > > +	 * read in the previous boot.
> > > +	 */
> > > +	if (head_page == cpu_buffer->tail_page)
> > > +		goto rewound;  
> > 
> > Hmm, jumping to a label called "rewound" when you didn't do a rewind seems
> > confusing.
> > 
> > Perhaps call the label "skip_rewind"?
> > 
> > > +
> > > +	rb_dec_page(&head_page);
> > > +	for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) {
> > > +
> > > +		/* Rewind until tail (writer) page. */
> > > +		if (head_page == cpu_buffer->tail_page)
> > > +			break;
> > > +
> > > +		/* Ensure the page has older data than head. */
> > > +		if (ts < head_page->page->time_stamp)
> > > +			break;
> > > +
> > > +		ts = head_page->page->time_stamp;
> > > +		/* Ensure the page has correct timestamp and some data. */
> > > +		if (!ts || rb_page_commit(head_page) == 0)
> > > +			break;
> > > +
> > > +		/* Stop rewind if the page is invalid. */
> > > +		ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
> > > +		if (ret < 0)
> > > +			break;
> > > +
> > > +		/* Recover the number of entries and update stats. */
> > > +		local_set(&head_page->entries, ret);
> > > +		if (ret)
> > > +			local_inc(&cpu_buffer->pages_touched);
> > > +		entries += ret;
> > > +		entry_bytes += rb_page_commit(head_page);
> > > +	}
> > > +	pr_info("Rewound %d pages on cpu%d\n", i, cpu_buffer->cpu);  
> > 
> > Should state this is coming from the ring buffer and use "[%d]" for cpu
> > number as the other pr_info()'s do. Also only print if it did a rewind:
> > 
> > 	if (i)
> > 		pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
> > 
> > 
> > > +
> > > +	/* The last rewound page must be skipped. */
> > > +	if (head_page != orig_head)
> > > +		rb_inc_page(&head_page);
> > >  
> > > +	/* If there are rewound pages, rewind the reader page too. */  
> > 
> > I would change the comment to:
> > 
> > 	/*
> > 	 * If the ring buffer was rewound, then inject the reader page
> > 	 * into the location just before the original head page.
> > 	 */
> > 
> > > +	if (head_page != orig_head) {
> > > +		struct buffer_page *bpage = orig_head;
> > > +
> > > +		rb_dec_page(&bpage);
> > > +		/*
> > > +		 * Insert the reader_page before the original head page.
> > > +		 * Since the list encode RB_PAGE flags, general list
> > > +		 * operations should be avoided.
> > > +		 */
> > > +		cpu_buffer->reader_page->list.next = &orig_head->list;
> > > +		cpu_buffer->reader_page->list.prev = orig_head->list.prev;
> > > +		orig_head->list.prev = &cpu_buffer->reader_page->list;
> > > +		bpage->list.next = &cpu_buffer->reader_page->list;
> > > +
> > > +		/* Make the head_page tthe new read page */  
> > 
> > Typo "tthe" and call it "new reader page", not "read page".
> > 
> > > +		cpu_buffer->reader_page = head_page;
> > > +		bpage = head_page;
> > > +		rb_inc_page(&head_page);
> > > +		head_page->list.prev = bpage->list.prev;
> > > +		rb_dec_page(&bpage);
> > > +		bpage->list.next = &head_page->list;
> > > +		rb_set_list_to_head(&bpage->list);
> 
> When testing this patch, it kept crashing. I had to add this here:
> 
> 		cpu_buffer->pages = &head_page->list;
> 
> That's because my test would end up having cpu_buffer->pages pointing to
> the reader page, and that will cause issues later. It has to point into the
> writing portion of the buffer.

Ah, good catch!

> 
> > > +
> > > +		cpu_buffer->head_page = head_page;
> > > +		meta->head_buffer = (unsigned long)head_page->page;
> > > +
> > > +		/* Reset all the indexes */
> > > +		bpage = cpu_buffer->reader_page;
> > > +		meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
> > > +		bpage->id = 0;
> > > +
> > > +		for (i = 0, bpage = head_page; i < meta->nr_subbufs;
> > > +		     i++, rb_inc_page(&bpage)) {
> > > +			meta->buffers[i + 1] = rb_meta_subbuf_idx(meta, bpage->page);
> > > +			bpage->id = i + 1;
> > > +		}
> 
> Can we convert the above to:
> 
> 		for (i = 1, bpage = head_page; i < meta->nr_subbufs;
> 		     i++, rb_inc_page(&bpage)) {
> 			meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
> 			bpage->id = i;
> 		}
> 
> By starting i at one, we can remove the "+ 1" inside the loop. It's a bit
> cleaner that way.

Agreed. And the previous version may also have been wrong, because it writes
'meta->buffers[meta->nr_subbufs]' (a buffer overflow) and sets the wrong
head_page->id.

Thanks!

> 
> -- Steve
> 
> 
> > > +
> > > +		/* We'll restart verifying from orig_head */
> > > +		head_page = orig_head;
> > > +	}
> > > +
> > > + rewound:  
> > 
> >  skip_rewind:
> > 
> > Also, I know other's don't like to do this, but I do add a space before
> > labels. It makes patch diffs easier to see which functions they are,
> > otherwise the patch shows the label and not the function.
> > 
> > -- Steve
> > 
> > 
> > >  	/* If the commit_buffer is the reader page, update the commit page */
> > >  	if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
> > >  		cpu_buffer->commit_page = cpu_buffer->reader_page;
> > > @@ -5348,7 +5441,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
> > >  	 */
> > >  	local_set(&cpu_buffer->reader_page->write, 0);
> > >  	local_set(&cpu_buffer->reader_page->entries, 0);
> > > -	local_set(&cpu_buffer->reader_page->page->commit, 0);
> > >  	cpu_buffer->reader_page->real_end = 0;
> > >  
> > >   spin:
> > > @@ -6642,7 +6734,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
> > >  		cpu_buffer->read_bytes += rb_page_size(reader);
> > >  
> > >  		/* swap the pages */
> > > -		rb_init_page(bpage);
> > >  		bpage = reader->page;
> > >  		reader->page = data_page->data;
> > >  		local_set(&reader->write, 0);  
> > 
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-05-22 15:23 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-05-15 11:15 [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot Masami Hiramatsu (Google)
2025-05-20  6:13 ` [PATCH] tracing: Reset last-boot buffers when reading out all cpu buffers Masami Hiramatsu (Google)
2025-05-21 18:51 ` [PATCH v2] tracing: ring_buffer: Rewind persistent ring buffer when reboot Steven Rostedt
2025-05-21 23:19   ` Steven Rostedt
2025-05-22 15:23     ` Masami Hiramatsu
2025-05-22 15:09   ` Masami Hiramatsu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).