* [RFC PATCH 0/3] Introduce configurable ring buffer page size
@ 2021-11-17 15:40 Tzvetomir Stoyanov (VMware)
  2021-11-17 15:40 ` [PATCH 1/3] [RFC] trace: Page size per ring buffer Tzvetomir Stoyanov (VMware)
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Tzvetomir Stoyanov (VMware) @ 2021-11-17 15:40 UTC (permalink / raw)
  To: rostedt; +Cc: linux-trace-devel

From: "Tzvetomir Stoyanov (VMware)" <tstoyanov@vmware.com>

Currently the size of one buffer page is global for all buffers and it
is hard-coded to one system page. This patch set introduces a configurable
ring buffer page size, per ring buffer. A new user space interface is
introduced, which allows changing the page size of the ftrace buffer per
ftrace instance.
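
As a rough sketch of the intended usage (the instance name "foo" and the
value written are hypothetical; the file itself is added in patch 2), the
new interface can be driven from user space like this:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Hypothetical instance; create it under instances/ first. */
		const char *path =
			"/sys/kernel/tracing/instances/foo/buffer_page_size";
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Ask for ring buffer pages of 4 system pages each. */
		if (write(fd, "4", 1) != 1)
			perror("write");
		close(fd);
		return 0;
	}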

Tzvetomir Stoyanov (VMware) (3):
  [RFC] trace: Page size per ring buffer
  [RFC] trace: Add interface for configuring trace ring buffer size
  [WiP] trace: Set new size of the ring buffer page

 include/linux/ring_buffer.h |   5 +-
 kernel/trace/ring_buffer.c  | 251 +++++++++++++++++++++++++++---------
 kernel/trace/trace.c        |  47 +++++++
 kernel/trace/trace_events.c |  71 ++++++++--
 4 files changed, 304 insertions(+), 70 deletions(-)

-- 
2.31.1



* [PATCH 1/3] [RFC] trace: Page size per ring buffer
  2021-11-17 15:40 [RFC PATCH 0/3] Introduce configurable ring buffer page size Tzvetomir Stoyanov (VMware)
@ 2021-11-17 15:40 ` Tzvetomir Stoyanov (VMware)
  2021-11-17 18:17   ` Steven Rostedt
  2021-11-17 15:41 ` [PATCH 2/3] [RFC] trace: Add interface for configuring trace ring buffer size Tzvetomir Stoyanov (VMware)
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 10+ messages in thread
From: Tzvetomir Stoyanov (VMware) @ 2021-11-17 15:40 UTC (permalink / raw)
  To: rostedt; +Cc: linux-trace-devel

Currently the size of one buffer page is global for all buffers and it
is hard-coded to one system page. In order to introduce a configurable
ring buffer page size, the internal logic is refactored to work with a
page size per ring buffer.

Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
---
 include/linux/ring_buffer.h |   2 +-
 kernel/trace/ring_buffer.c  | 117 +++++++++++++++++++-----------------
 kernel/trace/trace_events.c |  71 +++++++++++++++++++---
 3 files changed, 123 insertions(+), 67 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index dac53fd3afea..d9a2e6e8fb79 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -200,7 +200,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
 struct trace_seq;
 
 int ring_buffer_print_entry_header(struct trace_seq *s);
-int ring_buffer_print_page_header(struct trace_seq *s);
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
 
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2699e9e562b1..6bca2977ca1a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -366,40 +366,8 @@ static inline int test_time_stamp(u64 delta)
 	return 0;
 }
 
-#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
-
-/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
-#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-
-int ring_buffer_print_page_header(struct trace_seq *s)
-{
-	struct buffer_data_page field;
-
-	trace_seq_printf(s, "\tfield: u64 timestamp;\t"
-			 "offset:0;\tsize:%u;\tsigned:%u;\n",
-			 (unsigned int)sizeof(field.time_stamp),
-			 (unsigned int)is_signed_type(u64));
-
-	trace_seq_printf(s, "\tfield: local_t commit;\t"
-			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
-			 (unsigned int)offsetof(typeof(field), commit),
-			 (unsigned int)sizeof(field.commit),
-			 (unsigned int)is_signed_type(long));
-
-	trace_seq_printf(s, "\tfield: int overwrite;\t"
-			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
-			 (unsigned int)offsetof(typeof(field), commit),
-			 1,
-			 (unsigned int)is_signed_type(long));
-
-	trace_seq_printf(s, "\tfield: char data;\t"
-			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
-			 (unsigned int)offsetof(typeof(field), data),
-			 (unsigned int)BUF_PAGE_SIZE,
-			 (unsigned int)is_signed_type(char));
-
-	return !trace_seq_has_overflowed(s);
-}
+/* Max payload is buffer page size - header (8bytes) */
+#define BUF_MAX_DATA_SIZE(B) ((B)->page_size - (sizeof(u32) * 2))
 
 struct rb_irq_work {
 	struct irq_work			work;
@@ -544,6 +512,8 @@ struct trace_buffer {
 
 	struct rb_irq_work		irq_work;
 	bool				time_stamp_abs;
+
+	unsigned int			page_size;
 };
 
 struct ring_buffer_iter {
@@ -559,6 +529,36 @@ struct ring_buffer_iter {
 	int				missed_events;
 };
 
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
+{
+	struct buffer_data_page field;
+
+	trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+			 "offset:0;\tsize:%u;\tsigned:%u;\n",
+			 (unsigned int)sizeof(field.time_stamp),
+			 (unsigned int)is_signed_type(u64));
+
+	trace_seq_printf(s, "\tfield: local_t commit;\t"
+			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
+			 (unsigned int)offsetof(typeof(field), commit),
+			 (unsigned int)sizeof(field.commit),
+			 (unsigned int)is_signed_type(long));
+
+	trace_seq_printf(s, "\tfield: int overwrite;\t"
+			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
+			 (unsigned int)offsetof(typeof(field), commit),
+			 1,
+			 (unsigned int)is_signed_type(long));
+
+	trace_seq_printf(s, "\tfield: char data;\t"
+			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
+			 (unsigned int)offsetof(typeof(field), data),
+			 (unsigned int)buffer->page_size,
+			 (unsigned int)is_signed_type(char));
+
+	return !trace_seq_has_overflowed(s);
+}
+
 #ifdef RB_TIME_32
 
 /*
@@ -1725,7 +1725,9 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
 	if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
 		goto fail_free_buffer;
 
-	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+	/* Default buffer page size - one system page */
+	buffer->page_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+	nr_pages = DIV_ROUND_UP(size, buffer->page_size);
 	buffer->flags = flags;
 	buffer->clock = trace_clock_local;
 	buffer->reader_lock_key = key;
@@ -1919,7 +1921,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
 			 * Increment overrun to account for the lost events.
 			 */
 			local_add(page_entries, &cpu_buffer->overrun);
-			local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+			local_sub(cpu_buffer->buffer->page_size,
+				  &cpu_buffer->entries_bytes);
 		}
 
 		/*
@@ -2041,7 +2044,7 @@ static void update_pages_handler(struct work_struct *work)
  * @size: the new size.
  * @cpu_id: the cpu buffer to resize
  *
- * Minimum size is 2 * BUF_PAGE_SIZE.
+ * Minimum size is 2 * buffer->page_size.
  *
  * Returns 0 on success and < 0 on failure.
  */
@@ -2063,7 +2066,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 	    !cpumask_test_cpu(cpu_id, buffer->cpumask))
 		return 0;
 
-	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+	nr_pages = DIV_ROUND_UP(size, buffer->page_size);
 
 	/* we need a minimum of two pages */
 	if (nr_pages < 2)
@@ -2290,7 +2293,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	 */
 	barrier();
 
-	if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
+	if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE(iter->cpu_buffer->buffer))
 		/* Writer corrupted the read? */
 		goto reset;
 
@@ -2403,7 +2406,8 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
 		 * the counters.
 		 */
 		local_add(entries, &cpu_buffer->overrun);
-		local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+		local_sub(cpu_buffer->buffer->page_size,
+			  &cpu_buffer->entries_bytes);
 
 		/*
 		 * The entries will be zeroed out when we move the
@@ -2530,13 +2534,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	 * Only the event that crossed the page boundary
 	 * must fill the old tail_page with padding.
 	 */
-	if (tail >= BUF_PAGE_SIZE) {
+	if (tail >= cpu_buffer->buffer->page_size) {
 		/*
 		 * If the page was filled, then we still need
 		 * to update the real_end. Reset it to zero
 		 * and the reader will ignore it.
 		 */
-		if (tail == BUF_PAGE_SIZE)
+		if (tail == cpu_buffer->buffer->page_size)
 			tail_page->real_end = 0;
 
 		local_sub(length, &tail_page->write);
@@ -2546,7 +2550,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	event = __rb_page_index(tail_page, tail);
 
 	/* account for padding bytes */
-	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+	local_add(cpu_buffer->buffer->page_size - tail,
+		  &cpu_buffer->entries_bytes);
 
 	/*
 	 * Save the original length to the meta data.
@@ -2566,7 +2571,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	 * If we are less than the minimum size, we don't need to
 	 * worry about it.
 	 */
-	if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+	if (tail > (cpu_buffer->buffer->page_size - RB_EVNT_MIN_SIZE)) {
 		/* No room for any events */
 
 		/* Mark the rest of the page with padding */
@@ -2578,13 +2583,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	}
 
 	/* Put in a discarded event */
-	event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+	event->array[0] = (cpu_buffer->buffer->page_size - tail) - RB_EVNT_HDR_SIZE;
 	event->type_len = RINGBUF_TYPE_PADDING;
 	/* time delta must be non zero */
 	event->time_delta = 1;
 
 	/* Set write to end of buffer */
-	length = (tail + length) - BUF_PAGE_SIZE;
+	length = (tail + length) - cpu_buffer->buffer->page_size;
 	local_sub(length, &tail_page->write);
 }
 
@@ -3476,7 +3481,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	tail = write - info->length;
 
 	/* See if we shot pass the end of this buffer page */
-	if (unlikely(write > BUF_PAGE_SIZE)) {
+	if (unlikely(write > cpu_buffer->buffer->page_size)) {
 		/* before and after may now different, fix it up*/
 		b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
 		a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
@@ -3685,7 +3690,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
 	if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
 		goto out;
 
-	if (unlikely(length > BUF_MAX_DATA_SIZE))
+	if (unlikely(length > BUF_MAX_DATA_SIZE(buffer)))
 		goto out;
 
 	if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -3835,7 +3840,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
 	if (atomic_read(&cpu_buffer->record_disabled))
 		goto out;
 
-	if (length > BUF_MAX_DATA_SIZE)
+	if (length > BUF_MAX_DATA_SIZE(buffer))
 		goto out;
 
 	if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -4957,7 +4962,7 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 	if (!iter)
 		return NULL;
 
-	iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
+	iter->event = kmalloc(BUF_MAX_DATA_SIZE(buffer), flags);
 	if (!iter->event) {
 		kfree(iter);
 		return NULL;
@@ -5075,14 +5080,14 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
 {
 	/*
 	 * Earlier, this method returned
-	 *	BUF_PAGE_SIZE * buffer->nr_pages
+	 *	buffer->page_size * buffer->nr_pages
 	 * Since the nr_pages field is now removed, we have converted this to
 	 * return the per cpu buffer value.
 	 */
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return 0;
 
-	return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
+	return buffer->page_size * buffer->buffers[cpu]->nr_pages;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_size);
 
@@ -5618,7 +5623,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	} else {
 		/* update the entry counter */
 		cpu_buffer->read += rb_page_entries(reader);
-		cpu_buffer->read_bytes += BUF_PAGE_SIZE;
+		cpu_buffer->read_bytes += buffer->page_size;
 
 		/* swap the pages */
 		rb_init_page(bpage);
@@ -5649,7 +5654,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		/* If there is room at the end of the page to save the
 		 * missed events, then record it there.
 		 */
-		if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+		if (buffer->page_size - commit >= sizeof(missed_events)) {
 			memcpy(&bpage->data[commit], &missed_events,
 			       sizeof(missed_events));
 			local_add(RB_MISSED_STORED, &bpage->commit);
@@ -5661,8 +5666,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	/*
 	 * This page may be off to user land. Zero it out here.
 	 */
-	if (commit < BUF_PAGE_SIZE)
-		memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+	if (commit < buffer->page_size)
+		memset(&bpage->data[commit], 0, buffer->page_size - commit);
 
  out_unlock:
 	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4021b9a79f93..a2bac76e73d3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1846,10 +1846,31 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+static int open_header_file(struct inode *inode, struct file *filp)
+{
+	struct trace_array *tr = inode->i_private;
+	int ret;
+
+	ret = tracing_check_open_get_tr(tr);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int release_header_file(struct inode *inode, struct file *file)
+{
+	struct trace_array *tr = inode->i_private;
+
+	trace_array_put(tr);
+
+	return 0;
+}
+
 static ssize_t
-show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 {
-	int (*func)(struct trace_seq *s) = filp->private_data;
+	struct trace_array *tr = file_inode(filp)->i_private;
 	struct trace_seq *s;
 	int r;
 
@@ -1862,7 +1883,31 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 
 	trace_seq_init(s);
 
-	func(s);
+	ring_buffer_print_page_header(tr->array_buffer.buffer, s);
+	r = simple_read_from_buffer(ubuf, cnt, ppos,
+				    s->buffer, trace_seq_used(s));
+
+	kfree(s);
+
+	return r;
+}
+
+static ssize_t
+show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	ring_buffer_print_entry_header(s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos,
 				    s->buffer, trace_seq_used(s));
 
@@ -2117,10 +2162,18 @@ static const struct file_operations ftrace_tr_enable_fops = {
 	.release = subsystem_release,
 };
 
-static const struct file_operations ftrace_show_header_fops = {
-	.open = tracing_open_generic,
-	.read = show_header,
+static const struct file_operations ftrace_show_header_page_fops = {
+	.open = open_header_file,
+	.read = show_header_page_file,
+	.llseek = default_llseek,
+	.release = release_header_file,
+};
+
+static const struct file_operations ftrace_show_header_event_fops = {
+	.open = open_header_file,
+	.read = show_header_event_file,
 	.llseek = default_llseek,
+	.release = release_header_file,
 };
 
 static int
@@ -3469,14 +3522,12 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 
 	/* ring buffer internal formats */
 	entry = trace_create_file("header_page", TRACE_MODE_READ, d_events,
-				  ring_buffer_print_page_header,
-				  &ftrace_show_header_fops);
+				  tr, &ftrace_show_header_page_fops);
 	if (!entry)
 		pr_warn("Could not create tracefs 'header_page' entry\n");
 
 	entry = trace_create_file("header_event", TRACE_MODE_READ, d_events,
-				  ring_buffer_print_entry_header,
-				  &ftrace_show_header_fops);
+				  tr, &ftrace_show_header_event_fops);
 	if (!entry)
 		pr_warn("Could not create tracefs 'header_event' entry\n");
 
-- 
2.31.1



* [PATCH 2/3] [RFC] trace: Add interface for configuring trace ring buffer size
  2021-11-17 15:40 [RFC PATCH 0/3] Introduce configurable ring buffer page size Tzvetomir Stoyanov (VMware)
  2021-11-17 15:40 ` [PATCH 1/3] [RFC] trace: Page size per ring buffer Tzvetomir Stoyanov (VMware)
@ 2021-11-17 15:41 ` Tzvetomir Stoyanov (VMware)
  2021-11-17 18:39   ` Steven Rostedt
  2021-11-17 15:41 ` [PATCH 3/3] [WiP] trace: Set new size of the ring buffer page Tzvetomir Stoyanov (VMware)
  2021-11-17 17:10 ` [RFC PATCH 0/3] Introduce configurable ring buffer page size Steven Rostedt
  3 siblings, 1 reply; 10+ messages in thread
From: Tzvetomir Stoyanov (VMware) @ 2021-11-17 15:41 UTC (permalink / raw)
  To: rostedt; +Cc: linux-trace-devel

The trace ring buffer page size can be configured per trace instance. A
new ftrace file, "buffer_page_size", is added to get and set the size of
the ring buffer page for the current trace instance. The size must be a
multiple of the system page size, which is why the new interface works
with a system page count instead of an absolute page size: 1 means the
ring buffer page is equal to one system page, and so forth. The ring
buffer page is limited to between 1 and 100 system pages.
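
To make the unit handling concrete, this is the round trip the two new
helpers below perform (a sketch distilled from the code, not extra API):

	/* Writing 4 stores a payload size of 4 pages minus the header: */
	buffer->page_size = 4 * PAGE_SIZE - BUF_PAGE_HDR_SIZE;

	/* Reading recovers the count of system pages: */
	pcount = (buffer->page_size + BUF_PAGE_HDR_SIZE) / PAGE_SIZE; /* 4 */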

Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
---
 include/linux/ring_buffer.h |  3 +++
 kernel/trace/ring_buffer.c  | 51 +++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c        | 47 ++++++++++++++++++++++++++++++++++
 3 files changed, 101 insertions(+)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index d9a2e6e8fb79..53cd7a38b717 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -202,6 +202,9 @@ struct trace_seq;
 int ring_buffer_print_entry_header(struct trace_seq *s);
 int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
 
+int ring_buffer_page_size_get(struct trace_buffer *buffer);
+int ring_buffer_page_size_set(struct trace_buffer *buffer, int psize);
+
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
 };
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6bca2977ca1a..9aa245795c3d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5677,6 +5677,57 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
+/**
+ * ring_buffer_page_size_get - get count of system pages in one buffer page.
+ * @buffer: The ring_buffer to get the system page count from
+ *
+ * By default, one ring buffer page equals one system page. This parameter
+ * is configurable, per ring buffer. The size of the ring buffer page can be
+ * extended, but must be a multiple of the system page size.
+ *
+ * Returns the size of the buffer page, in system pages: 1 means the buffer
+ * page is one system page and so forth. In case of an error, < 0 is returned.
+ */
+int ring_buffer_page_size_get(struct trace_buffer *buffer)
+{
+	if (!buffer)
+		return -EINVAL;
+
+	return (buffer->page_size + BUF_PAGE_HDR_SIZE) / PAGE_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_page_size_get);
+
+/**
+ * ring_buffer_page_size_set - set the size of ring buffer page.
+ * @buffer: The ring_buffer to set the new page size.
+ * @pcount: Number of system pages.
+ *
+ * By default, one ring buffer page equals one system page. This API can be
+ * used to set a new size of the ring buffer page. The size must be a multiple
+ * of the system page size, which is why the input parameter @pcount is the
+ * count of system pages that are allocated for one ring buffer page.
+ *
+ * Returns 0 on success or < 0 in case of an error.
+ */
+int ring_buffer_page_size_set(struct trace_buffer *buffer, int pcount)
+{
+	int psize;
+
+	if (!buffer)
+		return -EINVAL;
+
+	psize = pcount * PAGE_SIZE;
+	if (psize <= BUF_PAGE_HDR_SIZE)
+		return -EINVAL;
+
+	buffer->page_size = psize - BUF_PAGE_HDR_SIZE;
+
+	/* Todo: reset the buffer with the new page size */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_page_size_set);
+
 /*
  * We only allocate new buffers, never free them if the CPU goes down.
  * If we were to free the buffer, then the user would lose any trace that was in
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f9139dc1262c..05fc2712fdbd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9005,6 +9005,50 @@ static const struct file_operations buffer_percent_fops = {
 	.llseek		= default_llseek,
 };
 
+static ssize_t
+buffer_psize_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%d\n", ring_buffer_page_size_get(tr->array_buffer.buffer));
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+buffer_psize_write(struct file *filp, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	if (val < 1 || val > 100)
+		return -EINVAL;
+
+	ret = ring_buffer_page_size_set(tr->array_buffer.buffer, val);
+	if (ret)
+		return ret;
+
+	(*ppos)++;
+
+	return cnt;
+}
+
+static const struct file_operations buffer_psize_fops = {
+	.open		= tracing_open_generic_tr,
+	.read		= buffer_psize_read,
+	.write		= buffer_psize_write,
+	.release	= tracing_release_generic_tr,
+	.llseek		= default_llseek,
+};
+
 static struct dentry *trace_instance_dir;
 
 static void
@@ -9458,6 +9502,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("buffer_percent", TRACE_MODE_READ, d_tracer,
 			tr, &buffer_percent_fops);
 
+	trace_create_file("buffer_page_size", TRACE_MODE_WRITE, d_tracer,
+			tr, &buffer_psize_fops);
+
 	create_trace_options_dir(tr);
 
 	trace_create_maxlat_file(tr, d_tracer);
-- 
2.31.1



* [PATCH 3/3] [WiP] trace: Set new size of the ring buffer page
  2021-11-17 15:40 [RFC PATCH 0/3] Introduce configurable ring buffer page size Tzvetomir Stoyanov (VMware)
  2021-11-17 15:40 ` [PATCH 1/3] [RFC] trace: Page size per ring buffer Tzvetomir Stoyanov (VMware)
  2021-11-17 15:41 ` [PATCH 2/3] [RFC] trace: Add interface for configuring trace ring buffer size Tzvetomir Stoyanov (VMware)
@ 2021-11-17 15:41 ` Tzvetomir Stoyanov (VMware)
  2021-11-17 19:50   ` Steven Rostedt
  2021-11-17 17:10 ` [RFC PATCH 0/3] Introduce configurable ring buffer page size Steven Rostedt
  3 siblings, 1 reply; 10+ messages in thread
From: Tzvetomir Stoyanov (VMware) @ 2021-11-17 15:41 UTC (permalink / raw)
  To: rostedt; +Cc: linux-trace-devel

There are two approaches when changing the size of the ring buffer page:
 1. Destroying all pages and allocating new pages with the new size.
 2. Allocating new pages and copying the content of the old pages before
    destroying them.
The first approach is simpler and is the one selected in the proposed
implementation. Changing the ring buffer page size is not expected to
happen frequently. Usually, that size should be set only once, when the
buffer is not yet in use and is still empty.

Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
---
 kernel/trace/ring_buffer.c | 95 ++++++++++++++++++++++++++++++++++----
 1 file changed, 86 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9aa245795c3d..39be9b1cf6e0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -368,6 +368,7 @@ static inline int test_time_stamp(u64 delta)
 
 /* Max payload is buffer page size - header (8bytes) */
 #define BUF_MAX_DATA_SIZE(B) ((B)->page_size - (sizeof(u32) * 2))
+#define BUF_SYS_PAGE_COUNT(B) (((B)->page_size + BUF_PAGE_HDR_SIZE) / PAGE_SIZE)
 
 struct rb_irq_work {
 	struct irq_work			work;
@@ -1521,6 +1522,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 	struct buffer_page *bpage, *tmp;
 	bool user_thread = current->mm != NULL;
 	gfp_t mflags;
+	int psize;
 	long i;
 
 	/*
@@ -1552,6 +1554,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 	 */
 	if (user_thread)
 		set_current_oom_origin();
+
+	/* Buffer page size must be at least one system page */
+	psize = BUF_SYS_PAGE_COUNT(cpu_buffer->buffer) - 1;
+	if (psize < 0)
+		psize = 0;
+
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
 
@@ -1564,7 +1572,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 
 		list_add(&bpage->list, pages);
 
-		page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
+		page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, psize);
 		if (!page)
 			goto free_pages;
 		bpage->page = page_address(page);
@@ -1620,6 +1628,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct buffer_page *bpage;
 	struct page *page;
+	int psize;
 	int ret;
 
 	cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
@@ -1646,7 +1655,13 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 	rb_check_bpage(cpu_buffer, bpage);
 
 	cpu_buffer->reader_page = bpage;
-	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+
+	/* Buffer page size must be at least one system page */
+	psize = BUF_SYS_PAGE_COUNT(cpu_buffer->buffer) - 1;
+	if (psize < 0)
+		psize = 0;
+
+	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, psize);
 	if (!page)
 		goto fail_free_reader;
 	bpage->page = page_address(page);
@@ -5412,6 +5427,7 @@ void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
 	struct buffer_data_page *bpage = NULL;
 	unsigned long flags;
 	struct page *page;
+	int psize;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return ERR_PTR(-ENODEV);
@@ -5431,8 +5447,13 @@ void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
 	if (bpage)
 		goto out;
 
+	/* Buffer page size must be at least one system page */
+	psize = BUF_SYS_PAGE_COUNT(cpu_buffer->buffer) - 1;
+	if (psize < 0)
+		psize = 0;
+
 	page = alloc_pages_node(cpu_to_node(cpu),
-				GFP_KERNEL | __GFP_NORETRY, 0);
+				GFP_KERNEL | __GFP_NORETRY, psize);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
@@ -5693,10 +5714,70 @@ int ring_buffer_page_size_get(struct trace_buffer *buffer)
 	if (!buffer)
 		return -EINVAL;
 
-	return (buffer->page_size + BUF_PAGE_HDR_SIZE) / PAGE_SIZE;
+	return BUF_SYS_PAGE_COUNT(buffer);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_page_size_get);
 
+static int page_size_set(struct trace_buffer *buffer, int size)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int old_size = buffer->page_size;
+	int nr_pages;
+	int ret = 0;
+	int err;
+	int cpu;
+
+	if (buffer->page_size == size)
+		return 0;
+
+	/* prevent another thread from changing buffer sizes */
+	mutex_lock(&buffer->mutex);
+	atomic_inc(&buffer->record_disabled);
+
+	/* Make sure all commits have finished */
+	synchronize_rcu();
+
+	buffer->page_size = size;
+
+	for_each_buffer_cpu(buffer, cpu) {
+
+		if (!cpumask_test_cpu(cpu, buffer->cpumask))
+			continue;
+
+		nr_pages = buffer->buffers[cpu]->nr_pages;
+		rb_free_cpu_buffer(buffer->buffers[cpu]);
+		buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
+	}
+
+	atomic_dec(&buffer->record_disabled);
+	mutex_unlock(&buffer->mutex);
+
+	return 0;
+
+out_err:
+	buffer->page_size = old_size;
+
+	for_each_buffer_cpu(buffer, cpu) {
+		struct buffer_page *bpage, *tmp;
+
+		cpu_buffer = buffer->buffers[cpu];
+
+		if (list_empty(&cpu_buffer->new_pages))
+			continue;
+
+		list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) {
+			list_del_init(&bpage->list);
+			free_buffer_page(bpage);
+		}
+		atomic_dec(&cpu_buffer->record_disabled);
+		atomic_dec(&cpu_buffer->resize_disabled);
+	}
+
+	mutex_unlock(&buffer->mutex);
+
+	return err;
+}
+
 /**
  * ring_buffer_page_size_set - set the size of ring buffer page.
  * @buffer: The ring_buffer to set the new page size.
@@ -5720,11 +5801,7 @@ int ring_buffer_page_size_set(struct trace_buffer *buffer, int pcount)
 	if (psize <= BUF_PAGE_HDR_SIZE)
 		return -EINVAL;
 
-	buffer->page_size = psize - BUF_PAGE_HDR_SIZE;
-
-	/* Todo: reset the buffer with the new page size */
-
-	return 0;
+	return page_size_set(buffer, psize - BUF_PAGE_HDR_SIZE);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_page_size_set);
 
-- 
2.31.1



* Re: [RFC PATCH 0/3] Introduce configurable ring buffer page size
  2021-11-17 15:40 [RFC PATCH 0/3] Introduce configurable ring buffer page size Tzvetomir Stoyanov (VMware)
                   ` (2 preceding siblings ...)
  2021-11-17 15:41 ` [PATCH 3/3] [WiP] trace: Set new size of the ring buffer page Tzvetomir Stoyanov (VMware)
@ 2021-11-17 17:10 ` Steven Rostedt
  2021-11-17 17:20   ` Tzvetomir Stoyanov
  3 siblings, 1 reply; 10+ messages in thread
From: Steven Rostedt @ 2021-11-17 17:10 UTC (permalink / raw)
  To: Tzvetomir Stoyanov (VMware); +Cc: linux-trace-devel

On Wed, 17 Nov 2021 17:40:58 +0200
"Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com> wrote:

> From: "Tzvetomir Stoyanov (VMware)" <tstoyanov@vmware.com>
> 
> Currently the size of one buffer page is global for all buffers and it
> is hard-coded to one system page. This patch set introduces a configurable
> ring buffer page size, per ring buffer. A new user space interface is
> introduced, which allows changing the page size of the ftrace buffer per
> ftrace instance.

A few nits.

1. Even the cover letter should have a category ("tracing:").
2. The category is "tracing:" not "trace:"
3. These are kernel changes and should be sent to lkml and not
   linux-trace-devel, as that mailing list is for tooling.

-- Steve

> 
> Tzvetomir Stoyanov (VMware) (3):
>   [RFC] trace: Page size per ring buffer
>   [RFC] trace: Add interface for configuring trace ring buffer size
>   [WiP] trace: Set new size of the ring buffer page
> 
>  include/linux/ring_buffer.h |   5 +-
>  kernel/trace/ring_buffer.c  | 251 +++++++++++++++++++++++++++---------
>  kernel/trace/trace.c        |  47 +++++++
>  kernel/trace/trace_events.c |  71 ++++++++--
>  4 files changed, 304 insertions(+), 70 deletions(-)
> 



* Re: [RFC PATCH 0/3] Introduce configurable ring buffer page size
  2021-11-17 17:10 ` [RFC PATCH 0/3] Introduce configurable ring buffer page size Steven Rostedt
@ 2021-11-17 17:20   ` Tzvetomir Stoyanov
  0 siblings, 0 replies; 10+ messages in thread
From: Tzvetomir Stoyanov @ 2021-11-17 17:20 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: Linux Trace Devel

On Wed, Nov 17, 2021 at 7:10 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Wed, 17 Nov 2021 17:40:58 +0200
> "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com> wrote:
>
> > From: "Tzvetomir Stoyanov (VMware)" <tstoyanov@vmware.com>
> >
> > Currently the size of one buffer page is global for all buffers and it
> > is hard-coded to one system page. This patch set introduces a configurable
> > ring buffer page size, per ring buffer. A new user space interface is
> > introduced, which allows changing the page size of the ftrace buffer per
> > ftrace instance.
>
> A few nits.
>
> 1. Even the cover letter should have a category ("tracing:").
> 2. The category is "tracing:" not "trace:"
> 3. These are kernel changes and should be sent to lkml and not
>    linux-trace-devel, as that mailing list is for tooling.
>

Thanks Steven,
I'll send the patch set to lkml when it is ready; it is still a work
in progress. I just noticed that I forgot to remove some leftovers
from my previous implementation in the third patch. It is still not
stable, as the kernel freezes in some cases.

> -- Steve
>
> >
> > Tzvetomir Stoyanov (VMware) (3):
> >   [RFC] trace: Page size per ring buffer
> >   [RFC] trace: Add interface for configuring trace ring buffer size
> >   [WiP] trace: Set new size of the ring buffer page
> >
> >  include/linux/ring_buffer.h |   5 +-
> >  kernel/trace/ring_buffer.c  | 251 +++++++++++++++++++++++++++---------
> >  kernel/trace/trace.c        |  47 +++++++
> >  kernel/trace/trace_events.c |  71 ++++++++--
> >  4 files changed, 304 insertions(+), 70 deletions(-)
> >
>


-- 
Tzvetomir (Ceco) Stoyanov
VMware Open Source Technology Center


* Re: [PATCH 1/3] [RFC] trace: Page size per ring buffer
  2021-11-17 15:40 ` [PATCH 1/3] [RFC] trace: Page size per ring buffer Tzvetomir Stoyanov (VMware)
@ 2021-11-17 18:17   ` Steven Rostedt
  2021-11-18  4:32     ` Tzvetomir Stoyanov
  0 siblings, 1 reply; 10+ messages in thread
From: Steven Rostedt @ 2021-11-17 18:17 UTC (permalink / raw)
  To: Tzvetomir Stoyanov (VMware); +Cc: linux-trace-devel

On Wed, 17 Nov 2021 17:40:59 +0200
"Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com> wrote:

> Currently the size of one buffer page is global for all buffers and it
> is hard-coded to one system page. In order to introduce a configurable
> ring buffer page size, the internal logic is refactored to work with a
> page size per ring buffer.
> 
> Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
> ---
>  include/linux/ring_buffer.h |   2 +-
>  kernel/trace/ring_buffer.c  | 117 +++++++++++++++++++-----------------
>  kernel/trace/trace_events.c |  71 +++++++++++++++++++---
>  3 files changed, 123 insertions(+), 67 deletions(-)
> 
> diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
> index dac53fd3afea..d9a2e6e8fb79 100644
> --- a/include/linux/ring_buffer.h
> +++ b/include/linux/ring_buffer.h
> @@ -200,7 +200,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
>  struct trace_seq;
>  
>  int ring_buffer_print_entry_header(struct trace_seq *s);
> -int ring_buffer_print_page_header(struct trace_seq *s);
> +int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
>  
>  enum ring_buffer_flags {
>  	RB_FL_OVERWRITE		= 1 << 0,
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 2699e9e562b1..6bca2977ca1a 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -366,40 +366,8 @@ static inline int test_time_stamp(u64 delta)
>  	return 0;
>  }
>  
> -#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
> -
> -/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
> -#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
> -
> -int ring_buffer_print_page_header(struct trace_seq *s)
> -{
> -	struct buffer_data_page field;
> -
> -	trace_seq_printf(s, "\tfield: u64 timestamp;\t"
> -			 "offset:0;\tsize:%u;\tsigned:%u;\n",
> -			 (unsigned int)sizeof(field.time_stamp),
> -			 (unsigned int)is_signed_type(u64));
> -
> -	trace_seq_printf(s, "\tfield: local_t commit;\t"
> -			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
> -			 (unsigned int)offsetof(typeof(field), commit),
> -			 (unsigned int)sizeof(field.commit),
> -			 (unsigned int)is_signed_type(long));
> -
> -	trace_seq_printf(s, "\tfield: int overwrite;\t"
> -			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
> -			 (unsigned int)offsetof(typeof(field), commit),
> -			 1,
> -			 (unsigned int)is_signed_type(long));
> -
> -	trace_seq_printf(s, "\tfield: char data;\t"
> -			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
> -			 (unsigned int)offsetof(typeof(field), data),
> -			 (unsigned int)BUF_PAGE_SIZE,
> -			 (unsigned int)is_signed_type(char));
> -
> -	return !trace_seq_has_overflowed(s);
> -}
> +/* Max payload is buffer page size - header (8bytes) */
> +#define BUF_MAX_DATA_SIZE(B) ((B)->page_size - (sizeof(u32) * 2))

This is used in a lot of hot paths, so it should be calculated at the time
it is changed, and that saved value used instead:

	(B)->buf_data_size
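
That is, something along these lines (a sketch only; the update would go
wherever page_size is set):

	/* In struct trace_buffer, next to page_size: */
	unsigned int		buf_data_size;

	/* Recomputed whenever page_size changes: */
	buffer->buf_data_size = buffer->page_size - (sizeof(u32) * 2);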


>  
>  struct rb_irq_work {
>  	struct irq_work			work;
> @@ -544,6 +512,8 @@ struct trace_buffer {
>  
>  	struct rb_irq_work		irq_work;
>  	bool				time_stamp_abs;
> +
> +	unsigned int			page_size;

Should call it subbuf_size, as it's not a page size.

>  };
>  
>  struct ring_buffer_iter {
> @@ -559,6 +529,36 @@ struct ring_buffer_iter {
>  	int				missed_events;
>  };
>  
> +int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
> +{
> +	struct buffer_data_page field;
> +
> +	trace_seq_printf(s, "\tfield: u64 timestamp;\t"
> +			 "offset:0;\tsize:%u;\tsigned:%u;\n",
> +			 (unsigned int)sizeof(field.time_stamp),
> +			 (unsigned int)is_signed_type(u64));
> +
> +	trace_seq_printf(s, "\tfield: local_t commit;\t"
> +			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
> +			 (unsigned int)offsetof(typeof(field), commit),
> +			 (unsigned int)sizeof(field.commit),
> +			 (unsigned int)is_signed_type(long));
> +
> +	trace_seq_printf(s, "\tfield: int overwrite;\t"
> +			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
> +			 (unsigned int)offsetof(typeof(field), commit),
> +			 1,
> +			 (unsigned int)is_signed_type(long));
> +
> +	trace_seq_printf(s, "\tfield: char data;\t"
> +			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
> +			 (unsigned int)offsetof(typeof(field), data),
> +			 (unsigned int)buffer->page_size,
> +			 (unsigned int)is_signed_type(char));
> +
> +	return !trace_seq_has_overflowed(s);

When moving and changing blocks of code, it is best to do that in
separate changes, so that the reviewer can see what changed.

The above just looks like you moved it, but it has changed as well, and
there's no diff showing what those changes were.

> +}
> +
>  #ifdef RB_TIME_32
>  
>  /*
> @@ -1725,7 +1725,9 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
>  	if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
>  		goto fail_free_buffer;
>  
> -	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
> +	/* Default buffer page size - one system page */
> +	buffer->page_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
> +	nr_pages = DIV_ROUND_UP(size, buffer->page_size);
>  	buffer->flags = flags;
>  	buffer->clock = trace_clock_local;
>  	buffer->reader_lock_key = key;
> @@ -1919,7 +1921,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
>  			 * Increment overrun to account for the lost events.
>  			 */
>  			local_add(page_entries, &cpu_buffer->overrun);
> -			local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
> +			local_sub(cpu_buffer->buffer->page_size,
> +				  &cpu_buffer->entries_bytes);
>  		}
>  
>  		/*
> @@ -2041,7 +2044,7 @@ static void update_pages_handler(struct work_struct *work)
>   * @size: the new size.
>   * @cpu_id: the cpu buffer to resize
>   *
> - * Minimum size is 2 * BUF_PAGE_SIZE.
> + * Minimum size is 2 * buffer->page_size.
>   *
>   * Returns 0 on success and < 0 on failure.
>   */
> @@ -2063,7 +2066,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
>  	    !cpumask_test_cpu(cpu_id, buffer->cpumask))
>  		return 0;
>  
> -	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
> +	nr_pages = DIV_ROUND_UP(size, buffer->page_size);
>  
>  	/* we need a minimum of two pages */
>  	if (nr_pages < 2)
> @@ -2290,7 +2293,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
>  	 */
>  	barrier();
>  
> -	if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
> +	if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE(iter->cpu_buffer->buffer))
>  		/* Writer corrupted the read? */
>  		goto reset;
>  
> @@ -2403,7 +2406,8 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
>  		 * the counters.
>  		 */
>  		local_add(entries, &cpu_buffer->overrun);
> -		local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
> +		local_sub(cpu_buffer->buffer->page_size,
> +			  &cpu_buffer->entries_bytes);
>  
>  		/*
>  		 * The entries will be zeroed out when we move the
> @@ -2530,13 +2534,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
>  	 * Only the event that crossed the page boundary
>  	 * must fill the old tail_page with padding.
>  	 */
> -	if (tail >= BUF_PAGE_SIZE) {
> +	if (tail >= cpu_buffer->buffer->page_size) {

rb_reset_tail() is a fast path, so we should read this once into a local
variable and use that:

	unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);

Then use bsize for the rest.
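
E.g. (a sketch of the substitution):

	unsigned long bsize = READ_ONCE(cpu_buffer->buffer->page_size);

	if (tail >= bsize) {
		if (tail == bsize)
			tail_page->real_end = 0;
		local_sub(length, &tail_page->write);
		return;
	}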

>  		/*
>  		 * If the page was filled, then we still need
>  		 * to update the real_end. Reset it to zero
>  		 * and the reader will ignore it.
>  		 */
> -		if (tail == BUF_PAGE_SIZE)
> +		if (tail == cpu_buffer->buffer->page_size)
>  			tail_page->real_end = 0;
>  
>  		local_sub(length, &tail_page->write);
> @@ -2546,7 +2550,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
>  	event = __rb_page_index(tail_page, tail);
>  
>  	/* account for padding bytes */
> -	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
> +	local_add(cpu_buffer->buffer->page_size - tail,
> +		  &cpu_buffer->entries_bytes);
>  
>  	/*
>  	 * Save the original length to the meta data.
> @@ -2566,7 +2571,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
>  	 * If we are less than the minimum size, we don't need to
>  	 * worry about it.
>  	 */
> -	if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
> +	if (tail > (cpu_buffer->buffer->page_size - RB_EVNT_MIN_SIZE)) {
>  		/* No room for any events */
>  
>  		/* Mark the rest of the page with padding */
> @@ -2578,13 +2583,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
>  	}
>  
>  	/* Put in a discarded event */
> -	event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
> +	event->array[0] = (cpu_buffer->buffer->page_size - tail) - RB_EVNT_HDR_SIZE;
>  	event->type_len = RINGBUF_TYPE_PADDING;
>  	/* time delta must be non zero */
>  	event->time_delta = 1;
>  
>  	/* Set write to end of buffer */
> -	length = (tail + length) - BUF_PAGE_SIZE;
> +	length = (tail + length) - cpu_buffer->buffer->page_size;
>  	local_sub(length, &tail_page->write);
>  }
>  
> @@ -3476,7 +3481,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
>  	tail = write - info->length;
>  
>  	/* See if we shot pass the end of this buffer page */
> -	if (unlikely(write > BUF_PAGE_SIZE)) {
> +	if (unlikely(write > cpu_buffer->buffer->page_size)) {
>  		/* before and after may now different, fix it up*/
>  		b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
>  		a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
> @@ -3685,7 +3690,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
>  	if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
>  		goto out;
>  
> -	if (unlikely(length > BUF_MAX_DATA_SIZE))
> +	if (unlikely(length > BUF_MAX_DATA_SIZE(buffer)))
>  		goto out;
>  
>  	if (unlikely(trace_recursive_lock(cpu_buffer)))
> @@ -3835,7 +3840,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
>  	if (atomic_read(&cpu_buffer->record_disabled))
>  		goto out;
>  
> -	if (length > BUF_MAX_DATA_SIZE)
> +	if (length > BUF_MAX_DATA_SIZE(buffer))
>  		goto out;
>  
>  	if (unlikely(trace_recursive_lock(cpu_buffer)))
> @@ -4957,7 +4962,7 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
>  	if (!iter)
>  		return NULL;
>  
> -	iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
> +	iter->event = kmalloc(BUF_MAX_DATA_SIZE(buffer), flags);
>  	if (!iter->event) {
>  		kfree(iter);
>  		return NULL;
> @@ -5075,14 +5080,14 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
>  {
>  	/*
>  	 * Earlier, this method returned
> -	 *	BUF_PAGE_SIZE * buffer->nr_pages
> +	 *	buffer->page_size * buffer->nr_pages
>  	 * Since the nr_pages field is now removed, we have converted this to
>  	 * return the per cpu buffer value.
>  	 */
>  	if (!cpumask_test_cpu(cpu, buffer->cpumask))
>  		return 0;
>  
> -	return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
> +	return buffer->page_size * buffer->buffers[cpu]->nr_pages;

We may want to add nr_subbufs and keep that separate from nr_pages, as
nr_pages should probably remain the number of system pages that we
actually allocated.

I'll have to look at the entire patch series to get a better idea of what
we need to keep track of.

>  }
>  EXPORT_SYMBOL_GPL(ring_buffer_size);
>  
> @@ -5618,7 +5623,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
>  	} else {

I just realized we are going to have to analyze the logic of
ring_buffer_read_page(), because the caller can pass in a buffer to use
that may not be big enough.

I'll have to think about this some more.

>  		/* update the entry counter */
>  		cpu_buffer->read += rb_page_entries(reader);
> -		cpu_buffer->read_bytes += BUF_PAGE_SIZE;
> +		cpu_buffer->read_bytes += buffer->page_size;
>  
>  		/* swap the pages */
>  		rb_init_page(bpage);
> @@ -5649,7 +5654,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
>  		/* If there is room at the end of the page to save the
>  		 * missed events, then record it there.
>  		 */
> -		if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
> +		if (buffer->page_size - commit >= sizeof(missed_events)) {
>  			memcpy(&bpage->data[commit], &missed_events,
>  			       sizeof(missed_events));
>  			local_add(RB_MISSED_STORED, &bpage->commit);
> @@ -5661,8 +5666,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
>  	/*
>  	 * This page may be off to user land. Zero it out here.
>  	 */
> -	if (commit < BUF_PAGE_SIZE)
> -		memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
> +	if (commit < buffer->page_size)
> +		memset(&bpage->data[commit], 0, buffer->page_size - commit);
>  
>   out_unlock:
>  	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 4021b9a79f93..a2bac76e73d3 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -1846,10 +1846,31 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
>  	return cnt;
>  }
>  
> +static int open_header_file(struct inode *inode, struct file *filp)
> +{
> +	struct trace_array *tr = inode->i_private;
> +	int ret;
> +
> +	ret = tracing_check_open_get_tr(tr);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}
> +
> +static int release_header_file(struct inode *inode, struct file *file)
> +{
> +	struct trace_array *tr = inode->i_private;
> +
> +	trace_array_put(tr);
> +
> +	return 0;
> +}
> +

Why add these and not just use:

tracing_open_generic_tr() and tracing_release_generic()?

The latter would need to be made global to use it.
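
I.e., something like (a sketch, with tracing_release_generic made
non-static):

	static const struct file_operations ftrace_show_header_page_fops = {
		.open		= tracing_open_generic_tr,
		.read		= show_header_page_file,
		.llseek		= default_llseek,
		.release	= tracing_release_generic,
	};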


>  static ssize_t
> -show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
> +show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
>  {
> -	int (*func)(struct trace_seq *s) = filp->private_data;
> +	struct trace_array *tr = file_inode(filp)->i_private;

Then you also do not need to use the file_inode() macro, because
tracing_open_generic_tr() will set filp->private_data to the trace_array.

-- Steve

>  	struct trace_seq *s;
>  	int r;
>  
> @@ -1862,7 +1883,31 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
>  
>  	trace_seq_init(s);
>  
> -	func(s);
> +	ring_buffer_print_page_header(tr->array_buffer.buffer, s);
> +	r = simple_read_from_buffer(ubuf, cnt, ppos,
> +				    s->buffer, trace_seq_used(s));
> +
> +	kfree(s);
> +
> +	return r;
> +}
> +
> +static ssize_t
> +show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
> +{
> +	struct trace_seq *s;
> +	int r;
> +
> +	if (*ppos)
> +		return 0;
> +
> +	s = kmalloc(sizeof(*s), GFP_KERNEL);
> +	if (!s)
> +		return -ENOMEM;
> +
> +	trace_seq_init(s);
> +
> +	ring_buffer_print_entry_header(s);
>  	r = simple_read_from_buffer(ubuf, cnt, ppos,
>  				    s->buffer, trace_seq_used(s));
>  
> @@ -2117,10 +2162,18 @@ static const struct file_operations ftrace_tr_enable_fops = {
>  	.release = subsystem_release,
>  };
>  
> -static const struct file_operations ftrace_show_header_fops = {
> -	.open = tracing_open_generic,
> -	.read = show_header,
> +static const struct file_operations ftrace_show_header_page_fops = {
> +	.open = open_header_file,
> +	.read = show_header_page_file,
> +	.llseek = default_llseek,
> +	.release = release_header_file,
> +};
> +
> +static const struct file_operations ftrace_show_header_event_fops = {
> +	.open = open_header_file,
> +	.read = show_header_event_file,
>  	.llseek = default_llseek,
> +	.release = release_header_file,
>  };
>  
>  static int
> @@ -3469,14 +3522,12 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
>  
>  	/* ring buffer internal formats */
>  	entry = trace_create_file("header_page", TRACE_MODE_READ, d_events,
> -				  ring_buffer_print_page_header,
> -				  &ftrace_show_header_fops);
> +				  tr, &ftrace_show_header_page_fops);
>  	if (!entry)
>  		pr_warn("Could not create tracefs 'header_page' entry\n");
>  
>  	entry = trace_create_file("header_event", TRACE_MODE_READ, d_events,
> -				  ring_buffer_print_entry_header,
> -				  &ftrace_show_header_fops);
> +				  tr, &ftrace_show_header_event_fops);
>  	if (!entry)
>  		pr_warn("Could not create tracefs 'header_event' entry\n");
>  



* Re: [PATCH 2/3] [RFC] trace: Add interface for configuring trace ring buffer size
  2021-11-17 15:41 ` [PATCH 2/3] [RFC] trace: Add interface for configuring trace ring buffer size Tzvetomir Stoyanov (VMware)
@ 2021-11-17 18:39   ` Steven Rostedt
  0 siblings, 0 replies; 10+ messages in thread
From: Steven Rostedt @ 2021-11-17 18:39 UTC (permalink / raw)
  To: Tzvetomir Stoyanov (VMware); +Cc: linux-trace-devel

On Wed, 17 Nov 2021 17:41:00 +0200
"Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com> wrote:

> The trace ring buffer page size can be configured per trace instance. A
> new ftrace file, "buffer_page_size", is added to get and set the size of
> the ring buffer page for the current trace instance. The size must be a
> multiple of the system page size, which is why the new interface works
> with a system page count instead of an absolute page size: 1 means the
> ring buffer page is equal to one system page, and so forth. The ring
> buffer page is limited to between 1 and 100 system pages.

It should be an order of pages, not a size (or a multiple).

0 - 1 page
1 - 2 pages
2 - 4 pages
3 - 8 pages
4 - 16 pages
[..]
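
I.e. (sketch):

	nr_pages = 1 << order;		/* order 4 -> 16 system pages */
	size = PAGE_SIZE << order;	/* bytes in one sub buffer */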


> 
> Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
> ---
>  include/linux/ring_buffer.h |  3 +++
>  kernel/trace/ring_buffer.c  | 51 +++++++++++++++++++++++++++++++++++++
>  kernel/trace/trace.c        | 47 ++++++++++++++++++++++++++++++++++
>  3 files changed, 101 insertions(+)
> 
> diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
> index d9a2e6e8fb79..53cd7a38b717 100644
> --- a/include/linux/ring_buffer.h
> +++ b/include/linux/ring_buffer.h
> @@ -202,6 +202,9 @@ struct trace_seq;
>  int ring_buffer_print_entry_header(struct trace_seq *s);
>  int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
>  
> +int ring_buffer_page_size_get(struct trace_buffer *buffer);
> +int ring_buffer_page_size_set(struct trace_buffer *buffer, int psize);

  ring_buffer_subbuf_order_get/set()

> +
>  enum ring_buffer_flags {
>  	RB_FL_OVERWRITE		= 1 << 0,
>  };
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 6bca2977ca1a..9aa245795c3d 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -5677,6 +5677,57 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
>  }
>  EXPORT_SYMBOL_GPL(ring_buffer_read_page);
>  
> +/**
> + * ring_buffer_page_size_get - get count of system pages in one buffer page.
> + * @buffer: The ring_buffer to get the system page count from
> + *
> + * By default, one ring buffer page equals one system page. This parameter
> + * is configurable, per ring buffer. The size of the ring buffer page can be
> + * extended, but must be a multiple of the system page size.
> + *
> + * Returns the size of the buffer page, in system pages: 1 means the buffer
> + * page is one system page and so forth. In case of an error, < 0 is returned.
> + */
> +int ring_buffer_page_size_get(struct trace_buffer *buffer)
> +{
> +	if (!buffer)
> +		return -EINVAL;
> +
> +	return (buffer->page_size + BUF_PAGE_HDR_SIZE) / PAGE_SIZE;

And save it in another field of the structure, rather than calculating it
here.
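
E.g. (sketch; the field name is illustrative, updated wherever the size
is set):

	/* Cached when the size is set, instead of computed on each call: */
	return buffer->sys_page_count;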

> +}
> +EXPORT_SYMBOL_GPL(ring_buffer_page_size_get);
> +
> +/**
> + * ring_buffer_page_size_set - set the size of ring buffer page.
> + * @buffer: The ring_buffer to set the new page size.
> + * @pcount: Number of system pages.
> + *
> + * By default, one ring buffer page equals one system page. This API can be
> + * used to set a new size of the ring buffer page. The size must be a multiple
> + * of the system page size, which is why the input parameter @pcount is the
> + * count of system pages that are allocated for one ring buffer page.
> + *
> + * Returns 0 on success or < 0 in case of an error.
> + */
> +int ring_buffer_page_size_set(struct trace_buffer *buffer, int pcount)
> +{
> +	int psize;
> +
> +	if (!buffer)
> +		return -EINVAL;
> +
> +	psize = pcount * PAGE_SIZE;
> +	if (psize <= BUF_PAGE_HDR_SIZE)
> +		return -EINVAL;
> +
> +	buffer->page_size = psize - BUF_PAGE_HDR_SIZE;
> +
> +	/* Todo: reset the buffer with the new page size */
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(ring_buffer_page_size_set);
> +
>  /*
>   * We only allocate new buffers, never free them if the CPU goes down.
>   * If we were to free the buffer, then the user would lose any trace that was in
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index f9139dc1262c..05fc2712fdbd 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -9005,6 +9005,50 @@ static const struct file_operations buffer_percent_fops = {
>  	.llseek		= default_llseek,
>  };
>  
> +static ssize_t
> +buffer_psize_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
> +{
> +	struct trace_array *tr = filp->private_data;
> +	char buf[64];
> +	int r;
> +
> +	r = sprintf(buf, "%d\n", ring_buffer_page_size_get(tr->array_buffer.buffer));
> +
> +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
> +}
> +
> +static ssize_t
> +buffer_psize_write(struct file *filp, const char __user *ubuf,
> +		   size_t cnt, loff_t *ppos)
> +{
> +	struct trace_array *tr = filp->private_data;
> +	unsigned long val;
> +	int ret;
> +
> +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
> +	if (ret)
> +		return ret;
> +
> +	if (val < 1 || val > 100)
> +		return -EINVAL;
> +
> +	ret = ring_buffer_page_size_set(tr->array_buffer.buffer, val);
> +	if (ret)
> +		return ret;
> +
> +	(*ppos)++;
> +
> +	return cnt;
> +}
> +
> +static const struct file_operations buffer_psize_fops = {
> +	.open		= tracing_open_generic_tr,
> +	.read		= buffer_psize_read,
> +	.write		= buffer_psize_write,
> +	.release	= tracing_release_generic_tr,
> +	.llseek		= default_llseek,
> +};
> +
>  static struct dentry *trace_instance_dir;
>  
>  static void
> @@ -9458,6 +9502,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
>  	trace_create_file("buffer_percent", TRACE_MODE_READ, d_tracer,
>  			tr, &buffer_percent_fops);
>  
> +	trace_create_file("buffer_page_size", TRACE_MODE_WRITE, d_tracer,

 "buffer_subbuf_order"

-- Steve

> +			tr, &buffer_psize_fops);
> +
>  	create_trace_options_dir(tr);
>  
>  	trace_create_maxlat_file(tr, d_tracer);



* Re: [PATCH 3/3] [WiP] trace: Set new size of the ring buffer page
  2021-11-17 15:41 ` [PATCH 3/3] [WiP] trace: Set new size of the ring buffer page Tzvetomir Stoyanov (VMware)
@ 2021-11-17 19:50   ` Steven Rostedt
  0 siblings, 0 replies; 10+ messages in thread
From: Steven Rostedt @ 2021-11-17 19:50 UTC (permalink / raw)
  To: Tzvetomir Stoyanov (VMware); +Cc: linux-trace-devel

On Wed, 17 Nov 2021 17:41:01 +0200
"Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com> wrote:

> There are two approaches when changing the size of the ring buffer page:
>  1. Destroying all pages and allocating new pages with the new size.
>  2. Allocating new pages and copying the content of the old pages before
>     destroying them.
> The first approach is simpler and is the one selected in the proposed
> implementation. Changing the ring buffer page size is not expected to
> happen frequently. Usually, that size should be set only once, when the
> buffer is not yet in use and is still empty.
> 
> Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
> ---
>  kernel/trace/ring_buffer.c | 95 ++++++++++++++++++++++++++++++++++----
>  1 file changed, 86 insertions(+), 9 deletions(-)
> 
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 9aa245795c3d..39be9b1cf6e0 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -368,6 +368,7 @@ static inline int test_time_stamp(u64 delta)
>  
>  /* Max payload is buffer page size - header (8bytes) */
>  #define BUF_MAX_DATA_SIZE(B) ((B)->page_size - (sizeof(u32) * 2))
> +#define BUF_SYS_PAGE_COUNT(B) (((B)->page_size + BUF_PAGE_HDR_SIZE) / PAGE_SIZE)
>  
>  struct rb_irq_work {
>  	struct irq_work			work;
> @@ -1521,6 +1522,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
>  	struct buffer_page *bpage, *tmp;
>  	bool user_thread = current->mm != NULL;
>  	gfp_t mflags;
> +	int psize;
>  	long i;
>  
>  	/*
> @@ -1552,6 +1554,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
>  	 */
>  	if (user_thread)
>  		set_current_oom_origin();
> +
> +	/* Buffer page size must be at least one system page */
> +	psize = BUF_SYS_PAGE_COUNT(cpu_buffer->buffer) - 1;
> +	if (psize < 0)
> +		psize = 0;
> +
>  	for (i = 0; i < nr_pages; i++) {
>  		struct page *page;
>  
> @@ -1564,7 +1572,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
>  
>  		list_add(&bpage->list, pages);
>  
> -		page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
> +		page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, psize);
>  		if (!page)
>  			goto free_pages;
>  		bpage->page = page_address(page);
> @@ -1620,6 +1628,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
>  	struct ring_buffer_per_cpu *cpu_buffer;
>  	struct buffer_page *bpage;
>  	struct page *page;
> +	int psize;
>  	int ret;
>  
>  	cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
> @@ -1646,7 +1655,13 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
>  	rb_check_bpage(cpu_buffer, bpage);
>  
>  	cpu_buffer->reader_page = bpage;
> -	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
> +
> +	/* Buffer page size must be at least one system page */
> +	psize = BUF_SYS_PAGE_COUNT(cpu_buffer->buffer) - 1;
> +	if (psize < 0)
> +		psize = 0;
> +
> +	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, psize);

Note, psize is an order, not a size. That is, if psize = 5, it
allocates 32 pages. And that order needs to be kept track of.

If we are shrinking the sub buffers, we could do some tricks too, and
not even have to free anything (although we would still need to clear
out the data).

That is, because the subbuf size is a power of two, we can split the
old sub buffers up nicely.

If the order is 3 (8 pages) and we shrink to order 1 (two pages), we
split each 8-page sub buffer into four 2-page sub buffers.
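
E.g., just to illustrate the arithmetic (not patch code; the variable
names are made up):

	/*
	 * Shrinking sub buffers from old_order to new_order
	 * (old_order > new_order): each old sub buffer can be
	 * carved into 2^(old_order - new_order) new sub buffers.
	 */
	unsigned int pieces = 1U << (old_order - new_order);
	unsigned long new_nr_subbufs = nr_subbufs * pieces;

	/* old_order = 3, new_order = 1  ->  pieces = 4 */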

Note, when freeing pages, you must pass in the order. Allocating pages
is not the same as allocating from the heap (like kmalloc()); you must
keep track of what you allocated.

Which brings up how to keep track of it. If a sub buffer is allocated
at a given size, and a reader reads that sub buffer, the reader must
know its size.
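
One way to handle both (a rough sketch only; the "order" field is
hypothetical) is to record the allocation order in the buffer_page and
use it when freeing and when handing the sub buffer to a reader:

	struct buffer_page {
		...
		unsigned int	order;	/* allocation order of ->page */
	};

	/* allocation */
	page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, order);
	if (!page)
		goto free_pages;
	bpage->order = order;
	bpage->page = page_address(page);

	/* freeing: pass back the same order that was allocated */
	free_pages((unsigned long)bpage->page, bpage->order);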

>  	if (!page)
>  		goto fail_free_reader;
>  	bpage->page = page_address(page);
> @@ -5412,6 +5427,7 @@ void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
>  	struct buffer_data_page *bpage = NULL;
>  	unsigned long flags;
>  	struct page *page;
> +	int psize;
>  
>  	if (!cpumask_test_cpu(cpu, buffer->cpumask))
>  		return ERR_PTR(-ENODEV);
> @@ -5431,8 +5447,13 @@ void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
>  	if (bpage)
>  		goto out;
>  
> +	/* Buffer page size must be at least one system page */
> +	psize = BUF_SYS_PAGE_COUNT(cpu_buffer->buffer) - 1;
> +	if (psize < 0)
> +		psize = 0;
> +
>  	page = alloc_pages_node(cpu_to_node(cpu),
> -				GFP_KERNEL | __GFP_NORETRY, 0);
> +				GFP_KERNEL | __GFP_NORETRY, psize);
>  	if (!page)
>  		return ERR_PTR(-ENOMEM);
>  
> @@ -5693,10 +5714,70 @@ int ring_buffer_page_size_get(struct trace_buffer *buffer)
>  	if (!buffer)
>  		return -EINVAL;
>  
> -	return (buffer->page_size + BUF_PAGE_HDR_SIZE) / PAGE_SIZE;
> +	return BUF_SYS_PAGE_COUNT(buffer);
>  }
>  EXPORT_SYMBOL_GPL(ring_buffer_page_size_get);
>  
> +static int page_size_set(struct trace_buffer *buffer, int size)
> +{
> +	struct ring_buffer_per_cpu *cpu_buffer;
> +	int old_size = buffer->page_size;
> +	int nr_pages;
> +	int ret = 0;
> +	int err;
> +	int cpu;
> +
> +	if (buffer->page_size == size)
> +		return 0;
> +
> +	/* prevent another thread from changing buffer sizes */
> +	mutex_lock(&buffer->mutex);
> +	atomic_inc(&buffer->record_disabled);
> +

We probably need to add a read_disable field as well, set it while
holding the reader_lock raw spinlock, and have all readers exit without
reading anything (as if the buffer were empty).
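
Roughly like this (hypothetical sketch, the field name is made up):

	/* in struct ring_buffer_per_cpu */
	int			read_disabled;

	/* resize side, per CPU */
	unsigned long flags;

	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
	cpu_buffer->read_disabled++;
	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);

	/* reader side, e.g. in rb_get_reader_page(), with reader_lock held */
	if (cpu_buffer->read_disabled)
		return NULL;	/* report the buffer as empty */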

> +	/* Make sure all commits have finished */
> +	synchronize_rcu();
> +
> +	buffer->page_size = size;
> +
> +	for_each_buffer_cpu(buffer, cpu) {
> +
> +		if (!cpumask_test_cpu(cpu, buffer->cpumask))
> +			continue;
> +
> +		nr_pages = buffer->buffers[cpu]->nr_pages;
> +		rb_free_cpu_buffer(buffer->buffers[cpu]);
> +		buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);

So are we treating nr_pages as the number of system pages, or as the
number of sub buffers?

> +	}
> +
> +	atomic_dec(&buffer->record_disabled);
> +	mutex_unlock(&buffer->mutex);
> +
> +	return 0;
> +
> +out_err:

Nothing jumps to out_err, so everything from here down is dead code
(and err is never set).
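
Presumably the reallocation failure was meant to jump there, something
like (hypothetical sketch):

	buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
	if (!buffer->buffers[cpu]) {
		err = -ENOMEM;
		goto out_err;
	}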

-- Steve


> +	buffer->page_size = old_size;
> +
> +	for_each_buffer_cpu(buffer, cpu) {
> +		struct buffer_page *bpage, *tmp;
> +
> +		cpu_buffer = buffer->buffers[cpu];
> +
> +		if (list_empty(&cpu_buffer->new_pages))
> +			continue;
> +
> +		list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) {
> +			list_del_init(&bpage->list);
> +			free_buffer_page(bpage);
> +		}
> +		atomic_dec(&cpu_buffer->record_disabled);
> +		atomic_dec(&cpu_buffer->resize_disabled);
> +	}
> +
> +	mutex_unlock(&buffer->mutex);
> +
> +	return err;
> +}
> +
>  /**
>   * ring_buffer_page_size_set - set the size of ring buffer page.
>   * @buffer: The ring_buffer to set the new page size.
> @@ -5720,11 +5801,7 @@ int ring_buffer_page_size_set(struct trace_buffer *buffer, int pcount)
>  	if (psize <= BUF_PAGE_HDR_SIZE)
>  		return -EINVAL;
>  
> -	buffer->page_size = psize - BUF_PAGE_HDR_SIZE;
> -
> -	/* Todo: reset the buffer with the new page size */
> -
> -	return 0;
> +	return page_size_set(buffer, psize - BUF_PAGE_HDR_SIZE);
>  }
>  EXPORT_SYMBOL_GPL(ring_buffer_page_size_set);
>  



* Re: [PATCH 1/3] [RFC] trace: Page size per ring buffer
  2021-11-17 18:17   ` Steven Rostedt
@ 2021-11-18  4:32     ` Tzvetomir Stoyanov
  0 siblings, 0 replies; 10+ messages in thread
From: Tzvetomir Stoyanov @ 2021-11-18  4:32 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: Linux Trace Devel

On Wed, Nov 17, 2021 at 8:18 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
[ ... ]
> > +static int release_header_file(struct inode *inode, struct file *file)
> > +{
> > +     struct trace_array *tr = inode->i_private;
> > +
> > +     trace_array_put(tr);
> > +
> > +     return 0;
> > +}
> > +
>
> Why add these and not just use:
>
> tracing_open_generic_tr() and tracing_release_generic()?
>
> Where the latter would need to become global to use.
>

This is another leftover from a previous version of the implementation.
I used inode->i_private to pass the trace array and filp->private_data
to pass the pointer to the function that should be called. That way the
same show_header_page_file() could serve both the "header_page" and
"header_event" files. But for that approach I had to change the
definition of ring_buffer_print_entry_header() to match
ring_buffer_print_page_header(): it would have gained an additional
trace_buffer input parameter that it never uses. That's why I decided
to have separate "show_header_..." functions and not to touch
ring_buffer_print_entry_header().
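
For reference, that earlier approach looked roughly like this
(reconstructed from memory, so the details are approximate):

	static ssize_t
	show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
	{
		struct trace_array *tr = file_inode(filp)->i_private;
		int (*func)(struct trace_buffer *b, struct trace_seq *s) = filp->private_data;
		struct trace_seq *s;
		int r;

		s = kmalloc(sizeof(*s), GFP_KERNEL);
		if (!s)
			return -ENOMEM;

		trace_seq_init(s);
		/* the same handler serves both header_page and header_event */
		func(tr->array_buffer.buffer, s);
		r = simple_read_from_buffer(ubuf, cnt, ppos,
					    s->buffer, trace_seq_used(s));
		kfree(s);

		return r;
	}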

>
> >  static ssize_t
> > -show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
> > +show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
> >  {
> > -     int (*func)(struct trace_seq *s) = filp->private_data;
> > +     struct trace_array *tr = file_inode(filp)->i_private;
>
> Then you also do not need to use the file_inode macro, because the
> tracing_open_generic_tr() will set filp->private_data to it.
>
> -- Steve
>


-- 
Tzvetomir (Ceco) Stoyanov
VMware Open Source Technology Center


end of thread [~2021-11-18  4:32 UTC]

Thread overview: 10+ messages
2021-11-17 15:40 [RFC PATCH 0/3] Introduce configurable ring buffer page size Tzvetomir Stoyanov (VMware)
2021-11-17 15:40 ` [PATCH 1/3] [RFC] trace: Page size per ring buffer Tzvetomir Stoyanov (VMware)
2021-11-17 18:17   ` Steven Rostedt
2021-11-18  4:32     ` Tzvetomir Stoyanov
2021-11-17 15:41 ` [PATCH 2/3] [RFC] trace: Add interface for configuring trace ring buffer size Tzvetomir Stoyanov (VMware)
2021-11-17 18:39   ` Steven Rostedt
2021-11-17 15:41 ` [PATCH 3/3] [WiP] trace: Set new size of the ring buffer page Tzvetomir Stoyanov (VMware)
2021-11-17 19:50   ` Steven Rostedt
2021-11-17 17:10 ` [RFC PATCH 0/3] Introduce configurable ring buffer page size Steven Rostedt
2021-11-17 17:20   ` Tzvetomir Stoyanov
