From: Alexander Graf <graf@amazon.com>
To: <linux-kernel@vger.kernel.org>
Cc: <linux-trace-kernel@vger.kernel.org>, <linux-mm@kvack.org>,
<devicetree@vger.kernel.org>,
<linux-arm-kernel@lists.infradead.org>,
<kexec@lists.infradead.org>, <linux-doc@vger.kernel.org>,
<x86@kernel.org>, Eric Biederman <ebiederm@xmission.com>,
"H. Peter Anvin" <hpa@zytor.com>,
Andy Lutomirski <luto@kernel.org>,
Peter Zijlstra <peterz@infradead.org>,
"Rob Herring" <robh+dt@kernel.org>,
Steven Rostedt <rostedt@goodmis.org>,
"Andrew Morton" <akpm@linux-foundation.org>,
Mark Rutland <mark.rutland@arm.com>,
"Tom Lendacky" <thomas.lendacky@amd.com>,
Ashish Kalra <ashish.kalra@amd.com>,
James Gowans <jgowans@amazon.com>,
Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>,
<arnd@arndb.de>, <pbonzini@redhat.com>,
<madvenka@linux.microsoft.com>,
Anthony Yznaga <anthony.yznaga@oracle.com>,
Usama Arif <usama.arif@bytedance.com>,
David Woodhouse <dwmw@amazon.co.uk>,
Benjamin Herrenschmidt <benh@kernel.crashing.org>
Subject: [PATCH 12/15] tracing: Recover trace buffers from kexec handover
Date: Wed, 13 Dec 2023 00:04:49 +0000
Message-ID: <20231213000452.88295-13-graf@amazon.com>
In-Reply-To: <20231213000452.88295-1-graf@amazon.com>
When kexec handover (KHO) is in place, the new kernel knows the location
of all ring buffer pages that the previous kernel used for ftrace. With
this patch applied, ftrace reassembles any newly created trace buffer
that carries the same name as a previous one from the data pages that
the previous buffer had. That way, a buffer that was in place before
kexec becomes readable again after kexec as soon as a buffer with the
same name is initialized.
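For reference, this is roughly the per-CPU layout that the recovery path
expects to find in the handover FDT. This is a sketch inferred from the
parsing code below; the actual nodes are emitted by the serialization
patch earlier in this series:

    /ftrace/<buffer name>/buffer/cpu0 {
        compatible = "ftrace,cpu-v1";
        /* flat array of struct kho_mem descriptors, in pairs:
           bpage 0, page 0, bpage 1, page 1, ... */
        mem = < ... >;
    };

Each pair describes one ring buffer page: first the struct buffer_page
metadata, then the data page it points to. The recovery code therefore
requires nr_mems to be even and derives nr_pages = nr_mems / 2.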
Signed-off-by: Alexander Graf <graf@amazon.com>
---
kernel/trace/ring_buffer.c | 173 ++++++++++++++++++++++++++++++++++++-
1 file changed, 171 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 691d1236eeb1..f3d07cb90762 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -575,6 +575,28 @@ struct ring_buffer_iter {
int missed_events;
};
+struct trace_kho_cpu {
+ const struct kho_mem *mem;
+ uint32_t nr_mems;
+};
+
+#ifdef CONFIG_FTRACE_KHO
+static int trace_kho_replace_buffers(struct ring_buffer_per_cpu *cpu_buffer,
+ struct trace_kho_cpu *kho);
+static int trace_kho_read_cpu(const char *name, int cpu, struct trace_kho_cpu *kho);
+#else
+static int trace_kho_replace_buffers(struct ring_buffer_per_cpu *cpu_buffer,
+ struct trace_kho_cpu *kho)
+{
+ return -EINVAL;
+}
+
+static int trace_kho_read_cpu(const char *name, int cpu, struct trace_kho_cpu *kho)
+{
+ return -EINVAL;
+}
+#endif
+
#ifdef RB_TIME_32
/*
@@ -1807,10 +1829,12 @@ struct trace_buffer *__ring_buffer_alloc(const char *name,
unsigned long size, unsigned flags,
struct lock_class_key *key)
{
+ int cpu = raw_smp_processor_id();
+ struct trace_kho_cpu kho = {};
struct trace_buffer *buffer;
+ bool use_kho = false;
long nr_pages;
int bsize;
- int cpu;
int ret;
/* keep it in its own cache line */
@@ -1823,6 +1847,12 @@ struct trace_buffer *__ring_buffer_alloc(const char *name,
goto fail_free_buffer;
nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ if (!trace_kho_read_cpu(name, cpu, &kho) && kho.nr_mems > 4) {
+ nr_pages = kho.nr_mems / 2;
+ use_kho = true;
+ pr_debug("Using kho for buffer '%s' on CPU [%03d]", name, cpu);
+ }
+
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -1843,12 +1873,14 @@ struct trace_buffer *__ring_buffer_alloc(const char *name,
if (!buffer->buffers)
goto fail_free_cpumask;
- cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu])
goto fail_free_buffers;
+ if (use_kho && trace_kho_replace_buffers(buffer->buffers[cpu], &kho))
+ pr_warn("Could not revive all previous trace data");
+
ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
if (ret < 0)
goto fail_free_buffers;
@@ -5886,7 +5918,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_page);
*/
int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
{
+ struct trace_kho_cpu kho = {};
struct trace_buffer *buffer;
+ bool use_kho = false;
long nr_pages_same;
int cpu_i;
unsigned long nr_pages;
@@ -5910,6 +5944,12 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
/* allocate minimum pages, user can later expand it */
if (!nr_pages_same)
nr_pages = 2;
+
+ if (!trace_kho_read_cpu(buffer->name, cpu, &kho) && kho.nr_mems > 4) {
+ nr_pages = kho.nr_mems / 2;
+ use_kho = true;
+ }
+
buffer->buffers[cpu] =
rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu]) {
@@ -5917,12 +5957,141 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
cpu);
return -ENOMEM;
}
+
+ if (use_kho && trace_kho_replace_buffers(buffer->buffers[cpu], &kho))
+ pr_warn("Could not revive all previous trace data");
+
smp_wmb();
cpumask_set_cpu(cpu, buffer->cpumask);
return 0;
}
#ifdef CONFIG_FTRACE_KHO
+static int trace_kho_replace_buffers(struct ring_buffer_per_cpu *cpu_buffer,
+ struct trace_kho_cpu *kho)
+{
+ bool first_loop = true;
+ struct list_head *tmp;
+ int err = 0;
+ int i = 0;
+
+ if (kho->nr_mems != cpu_buffer->nr_pages * 2)
+ return -EINVAL;
+
+ for (tmp = rb_list_head(cpu_buffer->pages);
+ tmp != rb_list_head(cpu_buffer->pages) || first_loop;
+ tmp = rb_list_head(tmp->next), first_loop = false) {
+ struct buffer_page *bpage = (struct buffer_page *)tmp;
+ const struct kho_mem *mem_bpage = &kho->mem[i++];
+ const struct kho_mem *mem_page = &kho->mem[i++];
+ const uint64_t rb_page_head = 1;
+ struct buffer_page *old_bpage;
+ void *old_page;
+
+ old_bpage = __va(mem_bpage->addr);
+ if (!bpage)
+ goto out;
+
+ if ((ulong)old_bpage->list.next & rb_page_head) {
+ struct list_head *new_lhead;
+ struct buffer_page *new_head;
+
+ new_lhead = rb_list_head(bpage->list.next);
+ new_head = (struct buffer_page *)new_lhead;
+
+ /* Assume the buffer is completely full */
+ cpu_buffer->tail_page = bpage;
+ cpu_buffer->commit_page = bpage;
+ /* Set the head pointers to what they were before */
+ cpu_buffer->head_page->list.prev->next = (struct list_head *)
+ ((ulong)cpu_buffer->head_page->list.prev->next & ~rb_page_head);
+ cpu_buffer->head_page = new_head;
+ bpage->list.next = (struct list_head *)((ulong)new_lhead | rb_page_head);
+ }
+
+ if (rb_page_entries(old_bpage) || rb_page_write(old_bpage)) {
+ /*
+ * We want to recycle the pre-kho page, it contains
+ * trace data. To do so, we unreserve it and swap the
+ * current data page with the pre-kho one
+ */
+ old_page = kho_claim_mem(mem_page);
+
+ /* Recycle the old page, it contains data */
+ free_page((ulong)bpage->page);
+ bpage->page = old_page;
+
+ bpage->write = old_bpage->write;
+ bpage->entries = old_bpage->entries;
+ bpage->real_end = old_bpage->real_end;
+
+ local_inc(&cpu_buffer->pages_touched);
+ } else {
+ kho_return_mem(mem_page);
+ }
+
+ kho_return_mem(mem_bpage);
+ }
+
+out:
+ return err;
+}
+
+static int trace_kho_read_cpu(const char *name, int cpu,
+ struct trace_kho_cpu *kho)
+{
+ void *fdt = kho_get_fdt();
+ int mem_len;
+ int err = 0;
+ char *path;
+ int off;
+
+ if (!fdt)
+ return -ENOENT;
+
+ if (!kho)
+ return -EINVAL;
+
+ path = kasprintf(GFP_KERNEL, "/ftrace/%s/buffer/cpu%x", name, cpu);
+ if (!path)
+ return -ENOMEM;
+
+ pr_debug("Trying to revive trace buffer '%s'", path);
+
+ off = fdt_path_offset(fdt, path);
+ if (off < 0) {
+ pr_debug("Could not find '%s' in DT", path);
+ err = -ENOENT;
+ goto out;
+ }
+
+ err = fdt_node_check_compatible(fdt, off, "ftrace,cpu-v1");
+ if (err) {
+ pr_warn("Node '%s' has invalid compatible", path);
+ err = -EINVAL;
+ goto out;
+ }
+
+ kho->mem = fdt_getprop(fdt, off, "mem", &mem_len);
+ if (!kho->mem) {
+ pr_warn("Node '%s' has invalid mem property", path);
+ err = -EINVAL;
+ goto out;
+ }
+
+ kho->nr_mems = mem_len / sizeof(*kho->mem);
+
+ /* Should follow "bpage 0, page 0, bpage 1, page 1, ..." pattern */
+ if (kho->nr_mems & 1) {
+ err = -EINVAL;
+ goto out;
+ }
+
+out:
+ kfree(path);
+ return err;
+}
+
static int trace_kho_write_cpu(void *fdt, struct trace_buffer *buffer, int cpu)
{
int i = 0;
--
2.40.1