[RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing
@ 2019-02-07 11:19 Adrian Hunter
  2019-02-07 20:02 ` Peter Zijlstra
  2019-02-08 23:29 ` Alexei Starovoitov
  0 siblings, 2 replies; 9+ messages in thread
From: Adrian Hunter @ 2019-02-07 11:19 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: Andi Kleen, Alexander Shishkin, Arnaldo Carvalho de Melo,
	Jiri Olsa, Song Liu, Daniel Borkmann, Alexei Starovoitov,
	linux-kernel

Subject to memory pressure and other limits, retain executable code, such
as JIT-compiled bpf, in memory instead of freeing it immediately it is no
longer needed for execution.

While perf is primarily aimed at statistical analysis, tools like Intel
PT can aim to provide a trace of exactly what happened. As such, corner
cases that can be overlooked statistically need to be addressed. For
example, there is a gap where JIT-compiled bpf can be freed from memory
before a tracer has a chance to read it out through the bpf syscall.
While that can be ignored statistically, it contributes to a death by
1000 cuts for tracers attempting to assemble exactly what happened. This is
a bit gratuitous given that retaining the executable code is relatively
simple, and the amount of memory involved relatively small. The retained
executable code is then available in memory images such as /proc/kcore.

This facility could perhaps be extended also to init sections.

Note that this patch is compile tested only and, at present, is missing
the ability to retain symbols.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
 arch/x86/Kconfig.cpu       |   1 +
 include/linux/filter.h     |   4 +
 include/linux/xc_retain.h  |  49 ++++++++++
 init/Kconfig               |   6 ++
 kernel/Makefile            |   1 +
 kernel/bpf/core.c          |  44 ++++++++-
 kernel/xc_retain.c         | 183 +++++++++++++++++++++++++++++++++++++
 net/core/sysctl_net_core.c |  62 +++++++++++++
 8 files changed, 349 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/xc_retain.h
 create mode 100644 kernel/xc_retain.c

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 6adce15268bd..21dcd064c272 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -389,6 +389,7 @@ menuconfig PROCESSOR_SELECT
 config CPU_SUP_INTEL
 	default y
 	bool "Support Intel processors" if PROCESSOR_SELECT
+	select XC_RETAIN if PERF_EVENTS && BPF_JIT
 	---help---
 	  This enables detection, tunings and quirks for Intel processors
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d531d4250bff..40b9f601e18f 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -851,6 +851,10 @@ extern int bpf_jit_enable;
 extern int bpf_jit_harden;
 extern int bpf_jit_kallsyms;
 extern long bpf_jit_limit;
+extern unsigned int bpf_jit_retain_min;
+extern unsigned int bpf_jit_retain_max;
+
+void bpf_jit_retain_update_sz(void);
 
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
diff --git a/include/linux/xc_retain.h b/include/linux/xc_retain.h
new file mode 100644
index 000000000000..e79dc138bab8
--- /dev/null
+++ b/include/linux/xc_retain.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 Intel Corporation.
+ */
+#ifndef _LINUX_XC_RETAIN_H
+#define _LINUX_XC_RETAIN_H
+
+#include <linux/list.h>
+#include <linux/shrinker.h>
+#include <linux/spinlock.h>
+
+struct xc_retain_ops {
+	void (*free)(void *addr);
+};
+
+struct xc_retain {
+	struct list_head list;
+	struct list_head items;
+	const struct xc_retain_ops ops;
+	unsigned int min_pages;
+	unsigned int max_pages;
+	unsigned int current_pages;
+	unsigned int item_cnt;
+	spinlock_t lock;
+	struct shrinker shrinker;
+};
+
+#ifdef CONFIG_XC_RETAIN
+int xc_retain_register(struct xc_retain *xr);
+void xc_retain_binary(struct xc_retain *xr, void *addr, unsigned int pages);
+void xc_retain_set_min_pages(struct xc_retain *xr, unsigned int min_pages);
+void xc_retain_set_max_pages(struct xc_retain *xr, unsigned int max_pages);
+#else
+static inline int xc_retain_register(struct xc_retain *xr)
+{
+	return 0;
+}
+static inline void xc_retain_binary(struct xc_retain *xr, void *addr,
+				    unsigned int pages)
+{
+	xr->ops.free(addr);
+}
+static inline void xc_retain_set_max_pages(struct xc_retain *xr,
+					   unsigned int max_pages)
+{
+}
+#endif
+
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index c9386a365eea..954c288cabdc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1550,6 +1550,12 @@ config EMBEDDED
 	  an embedded system so certain expert options are available
 	  for configuration.
 
+config XC_RETAIN
+	bool
+	help
+	  Retain kernel executable code (e.g. jitted BPF) in memory after it
+	  would normally be freed.
+
 config HAVE_PERF_EVENTS
 	bool
 	help
diff --git a/kernel/Makefile b/kernel/Makefile
index 6aa7543bcdb2..5df40e2a934e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -98,6 +98,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
 obj-$(CONFIG_BPF) += bpf/
+obj-$(CONFIG_XC_RETAIN) += xc_retain.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 19c49313c709..7fd235d235c2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -34,6 +34,7 @@
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
 #include <linux/perf_event.h>
+#include <linux/xc_retain.h>
 
 #include <asm/unaligned.h>
 
@@ -480,6 +481,10 @@ int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
 int bpf_jit_harden   __read_mostly;
 int bpf_jit_kallsyms __read_mostly;
 long bpf_jit_limit   __read_mostly;
+#define BPF_JIT_RETAIN_MIN 0
+#define BPF_JIT_RETAIN_MAX 16
+unsigned int bpf_jit_retain_min __read_mostly = BPF_JIT_RETAIN_MIN;
+unsigned int bpf_jit_retain_max __read_mostly = BPF_JIT_RETAIN_MAX;
 
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
@@ -795,6 +800,43 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 	bpf_jit_uncharge_modmem(pages);
 }
 
+#ifdef CONFIG_XC_RETAIN
+static struct xc_retain bpf_jit_retain = {
+	.min_pages = BPF_JIT_RETAIN_MIN,
+	.max_pages = BPF_JIT_RETAIN_MAX,
+	.ops = {
+		.free = module_memfree,
+	},
+};
+
+void bpf_jit_retain_update_sz(void)
+{
+	xc_retain_set_min_pages(&bpf_jit_retain, bpf_jit_retain_min);
+	xc_retain_set_max_pages(&bpf_jit_retain, bpf_jit_retain_max);
+}
+
+static int __init bpf_jit_retain_init(void)
+{
+	return xc_retain_register(&bpf_jit_retain);
+}
+subsys_initcall(bpf_jit_retain_init);
+
+static void bpf_jit_binary_retain(struct bpf_prog *fp,
+				  struct bpf_binary_header *hdr)
+{
+	u32 pages = hdr->pages;
+
+	xc_retain_binary(&bpf_jit_retain, hdr, pages);
+	bpf_jit_uncharge_modmem(pages);
+}
+#else
+static void bpf_jit_binary_retain(struct bpf_prog *fp,
+				  struct bpf_binary_header *hdr)
+{
+	return bpf_jit_binary_free(hdr);
+}
+#endif
+
 /* This symbol is only overridden by archs that have different
  * requirements than the usual eBPF JITs, f.e. when they only
  * implement cBPF JIT, do not set images read-only, etc.
@@ -805,7 +847,7 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 		struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
 
 		bpf_jit_binary_unlock_ro(hdr);
-		bpf_jit_binary_free(hdr);
+		bpf_jit_binary_retain(fp, hdr);
 
 		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
 	}
diff --git a/kernel/xc_retain.c b/kernel/xc_retain.c
new file mode 100644
index 000000000000..fcf987d443f2
--- /dev/null
+++ b/kernel/xc_retain.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Intel Corporation.
+ */
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/shrinker.h>
+#include <linux/xc_retain.h>
+
+struct xc_retain_item {
+	struct list_head list;
+	void *addr;
+	unsigned int pages;
+};
+
+#define XC_RETAIN_MAX_ITEMS 1024
+
+static struct xc_retain_item *xc_retain_item_alloc(void *addr,
+						   unsigned int pages)
+{
+	struct xc_retain_item *item;
+
+	item = kzalloc(sizeof(*item), GFP_KERNEL);
+	if (item) {
+		INIT_LIST_HEAD(&item->list);
+		item->addr = addr;
+		item->pages = pages;
+	}
+
+	return item;
+}
+
+static void xc_retain_item_free(struct xc_retain *xr,
+				struct xc_retain_item *item)
+{
+	xr->ops.free(item->addr);
+	kfree(item);
+}
+
+static void xc_retain_item_add(struct xc_retain *xr,
+			       struct xc_retain_item *item)
+{
+	list_add_tail(&item->list, &xr->items);
+	xr->current_pages += item->pages;
+	xr->item_cnt += 1;
+}
+
+static void xc_retain_item_remove(struct xc_retain *xr,
+				  struct xc_retain_item *item)
+{
+	list_del(&item->list);
+	xr->current_pages -= item->pages;
+	xr->item_cnt -= 1;
+}
+
+static inline bool xc_retain_size_ok(struct xc_retain *xr)
+{
+	return xr->current_pages <= xr->max_pages &&
+	       xr->item_cnt <= XC_RETAIN_MAX_ITEMS;
+}
+
+static void xc_retain_resize(struct xc_retain *xr, struct list_head *to_free)
+{
+	struct xc_retain_item *item, *next;
+
+	if (xc_retain_size_ok(xr))
+		return;
+
+	list_for_each_entry_safe(item, next, &xr->items, list) {
+		xc_retain_item_remove(xr, item);
+		list_add_tail(&item->list, to_free);
+		if (xc_retain_size_ok(xr))
+			break;
+	}
+}
+
+static void xc_retain_items_free(struct xc_retain *xr,
+				 struct list_head *to_free)
+{
+	struct xc_retain_item *item, *next;
+
+	list_for_each_entry_safe(item, next, to_free, list)
+		xc_retain_item_free(xr, item);
+}
+
+void xc_retain_binary(struct xc_retain *xr, void *addr, unsigned int pages)
+{
+	struct xc_retain_item *item;
+	LIST_HEAD(to_free);
+
+	if (pages > xr->max_pages)
+		goto out_not_cached;
+
+	item = xc_retain_item_alloc(addr, pages);
+	if (!item)
+		goto out_not_cached;
+
+	spin_lock(&xr->lock);
+	xc_retain_item_add(xr, item);
+	xc_retain_resize(xr, &to_free);
+	spin_unlock(&xr->lock);
+
+	xc_retain_items_free(xr, &to_free);
+
+	return;
+
+out_not_cached:
+	xr->ops.free(addr);
+}
+
+void xc_retain_set_min_pages(struct xc_retain *xr, unsigned int min_pages)
+{
+	spin_lock(&xr->lock);
+	xr->min_pages = min_pages;
+	spin_unlock(&xr->lock);
+}
+
+void xc_retain_set_max_pages(struct xc_retain *xr, unsigned int max_pages)
+{
+	LIST_HEAD(to_free);
+
+	spin_lock(&xr->lock);
+	xr->max_pages = max_pages;
+	xc_retain_resize(xr, &to_free);
+	spin_unlock(&xr->lock);
+
+	xc_retain_items_free(xr, &to_free);
+}
+
+static unsigned long xc_retain_shrink_count(struct shrinker *shrinker,
+					    struct shrink_control *sc)
+{
+	struct xc_retain *xr = container_of(shrinker, struct xc_retain, shrinker);
+	struct xc_retain_item *item;
+	unsigned long nr = 0;
+
+	spin_lock(&xr->lock);
+	list_for_each_entry(item, &xr->items, list) {
+		if (xr->current_pages - item->pages < xr->min_pages)
+			break;
+		nr += 1;
+	}
+	spin_unlock(&xr->lock);
+
+	return nr ?: SHRINK_EMPTY;
+}
+
+static unsigned long xc_retain_shrink_scan(struct shrinker *shrinker,
+					   struct shrink_control *sc)
+{
+	struct xc_retain *xr = container_of(shrinker, struct xc_retain, shrinker);
+	struct xc_retain_item *item;
+	unsigned long freed;
+	LIST_HEAD(to_free);
+
+	spin_lock(&xr->lock);
+	for (freed = 0; sc->nr_to_scan && xr->item_cnt; freed++, sc->nr_to_scan--) {
+		item = list_first_entry(&xr->items, struct xc_retain_item, list);
+		if (xr->current_pages - item->pages < xr->min_pages)
+			break;
+		xc_retain_item_remove(xr, item);
+		list_add_tail(&item->list, &to_free);
+	}
+	spin_unlock(&xr->lock);
+
+	xc_retain_items_free(xr, &to_free);
+
+	return freed;
+}
+
+int xc_retain_register(struct xc_retain *xr)
+{
+	INIT_LIST_HEAD(&xr->list);
+	INIT_LIST_HEAD(&xr->items);
+
+	spin_lock_init(&xr->lock);
+
+	xr->shrinker.count_objects = xc_retain_shrink_count;
+	xr->shrinker.scan_objects = xc_retain_shrink_scan;
+
+	return register_shrinker(&xr->shrinker);
+}
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index d67ec17f2cc8..144d440dd9cd 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -28,6 +28,7 @@ static int two __maybe_unused = 2;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
+static int int_max __maybe_unused = INT_MAX;
 static long long_one __maybe_unused = 1;
 static long long_max __maybe_unused = LONG_MAX;
 
@@ -302,6 +303,47 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
 
 	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
+
+# ifdef CONFIG_XC_RETAIN
+#define BPF_JIT_CACHE_LIMIT_SHIFT 4
+static int proc_bpf_jit_retain_sz(struct ctl_table *table, int write,
+				  void __user *buffer, size_t *lenp,
+				  loff_t *ppos, bool is_max)
+{
+	int ret, val = *(int *)table->data;
+	struct ctl_table tmp = *table;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	tmp.data = &val;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (write && !ret) {
+		if (is_max ||
+		    val <= (totalram_pages() >> BPF_JIT_CACHE_LIMIT_SHIFT)) {
+			*(int *)table->data = val;
+			bpf_jit_retain_update_sz();
+		} else {
+			ret = -EINVAL;
+		}
+	}
+	return ret;
+}
+
+static int proc_bpf_jit_retain_min(struct ctl_table *table, int write,
+				  void __user *buffer, size_t *lenp,
+				  loff_t *ppos)
+{
+	return proc_bpf_jit_retain_sz(table, write, buffer, lenp, ppos, false);
+}
+
+static int proc_bpf_jit_retain_max(struct ctl_table *table, int write,
+				  void __user *buffer, size_t *lenp,
+				  loff_t *ppos)
+{
+	return proc_bpf_jit_retain_sz(table, write, buffer, lenp, ppos, true);
+}
+# endif
 #endif
 
 static struct ctl_table net_core_table[] = {
@@ -417,6 +459,26 @@ static struct ctl_table net_core_table[] = {
 		.extra1		= &long_one,
 		.extra2		= &long_max,
 	},
+# ifdef CONFIG_XC_RETAIN
+	{
+		.procname	= "bpf_jit_retain_min",
+		.data		= &bpf_jit_retain_min,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_bpf_jit_retain_min,
+		.extra1		= &zero,
+		.extra2		= &int_max,
+	},
+	{
+		.procname	= "bpf_jit_retain_max",
+		.data		= &bpf_jit_retain_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_bpf_jit_retain_max,
+		.extra1		= &zero,
+		.extra2		= &int_max,
+	},
+# endif
 #endif
 	{
 		.procname	= "netdev_tstamp_prequeue",
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing
  2019-02-07 11:19 [RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing Adrian Hunter
@ 2019-02-07 20:02 ` Peter Zijlstra
  2019-02-08  8:53   ` Adrian Hunter
  2019-02-11  7:46   ` Ingo Molnar
  2019-02-08 23:29 ` Alexei Starovoitov
  1 sibling, 2 replies; 9+ messages in thread
From: Peter Zijlstra @ 2019-02-07 20:02 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Ingo Molnar, Andi Kleen, Alexander Shishkin,
	Arnaldo Carvalho de Melo, Jiri Olsa, Song Liu, Daniel Borkmann,
	Alexei Starovoitov, linux-kernel

On Thu, Feb 07, 2019 at 01:19:01PM +0200, Adrian Hunter wrote:
> Subject to memory pressure and other limits, retain executable code, such
> as JIT-compiled bpf, in memory instead of freeing it immediately it is no
> longer needed for execution.
> 
> While perf is primarily aimed at statistical analysis, tools like Intel
> PT can aim to provide a trace of exactly what happened. As such, corner
> cases that can be overlooked statistically need to be addressed. For
> example, there is a gap where JIT-compiled bpf can be freed from memory
> before a tracer has a chance to read it out through the bpf syscall.
> While that can be ignored statistically, it contributes to a death by
> 1000 cuts for tracers attempting to assemble exactly what happened. This is
> a bit gratuitous given that retaining the executable code is relatively
> simple, and the amount of memory involved relatively small. The retained
> executable code is then available in memory images such as /proc/kcore.
> 
> This facility could perhaps be extended also to init sections.
> 
> Note that this patch is compile tested only and, at present, is missing
> the ability to retain symbols.

You don't need the symbols; you already have them through
PERF_RECORD_KSYMBOL.

Also; afaict this patch guarantees exactly nothing. It registers a
shrinker which will (given enough memory pressure) happily free your
text before we get around to copying it out.

Did you read this proposal?

  https://lkml.kernel.org/r/20190109101808.GG1900@hirez.programming.kicks-ass.net

(also: s/KCORE_QC/KCORE_QS/ for quiescent state)

That would create an RCU like interface to /proc/kcore and give you the
guarantees you need, while also allowing the memory to get freed once
you've obtained a copy.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing
  2019-02-07 20:02 ` Peter Zijlstra
@ 2019-02-08  8:53   ` Adrian Hunter
  2019-02-08  9:12     ` Peter Zijlstra
  2019-02-11  7:46   ` Ingo Molnar
  1 sibling, 1 reply; 9+ messages in thread
From: Adrian Hunter @ 2019-02-08  8:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Andi Kleen, Alexander Shishkin,
	Arnaldo Carvalho de Melo, Jiri Olsa, Song Liu, Daniel Borkmann,
	Alexei Starovoitov, linux-kernel

On 7/02/19 10:02 PM, Peter Zijlstra wrote:
> On Thu, Feb 07, 2019 at 01:19:01PM +0200, Adrian Hunter wrote:
>> Subject to memory pressure and other limits, retain executable code, such
>> as JIT-compiled bpf, in memory instead of freeing it immediately it is no
>> longer needed for execution.
>>
>> While perf is primarily aimed at statistical analysis, tools like Intel
>> PT can aim to provide a trace of exactly what happened. As such, corner
>> cases that can be overlooked statistically need to be addressed. For
>> example, there is a gap where JIT-compiled bpf can be freed from memory
>> before a tracer has a chance to read it out through the bpf syscall.
>> While that can be ignored statistically, it contributes to a death by
>> 1000 cuts for tracers attempting to assemble exactly what happened. This is
>> a bit gratuitous given that retaining the executable code is relatively
>> simple, and the amount of memory involved relatively small. The retained
>> executable code is then available in memory images such as /proc/kcore.
>>
>> This facility could perhaps be extended also to init sections.
>>
>> Note that this patch is compile tested only and, at present, is missing
>> the ability to retain symbols.
> 
> You don't need the symbols; you already have them through
> PERF_RECORD_KSYMBOL.

And you intend to use that for module loading/unloading also?

> 
> Also; afaict this patch guarantees exactly nothing. It registers a
> shrinker which will (given enough memory pressure) happily free your
> text before we get around to copying it out.

No, there is a minimum size (default 0) which is not subject to the shrinker.

> 
> Did you read this proposal?

Please cc me on anything affecting Intel PT decoding.

> 
>   https://lkml.kernel.org/r/20190109101808.GG1900@hirez.programming.kicks-ass.net
> 
> (also: s/KCORE_QC/KCORE_QS/ for quiescent state)
> 
> That would create an RCU like interface to /proc/kcore and give you the
> guarantees you need, while also allowing the memory to get freed once
> you've obtained a copy.

So, open /proc/kcore and it pins all executable code in memory?

Do you intend to extend that to module / module init unloads?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing
  2019-02-08  8:53   ` Adrian Hunter
@ 2019-02-08  9:12     ` Peter Zijlstra
  0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2019-02-08  9:12 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Ingo Molnar, Andi Kleen, Alexander Shishkin,
	Arnaldo Carvalho de Melo, Jiri Olsa, Song Liu, Daniel Borkmann,
	Alexei Starovoitov, linux-kernel

On Fri, Feb 08, 2019 at 10:53:41AM +0200, Adrian Hunter wrote:
> On 7/02/19 10:02 PM, Peter Zijlstra wrote:
> > On Thu, Feb 07, 2019 at 01:19:01PM +0200, Adrian Hunter wrote:
> >> Subject to memory pressure and other limits, retain executable code, such
> >> as JIT-compiled bpf, in memory instead of freeing it immediately it is no
> >> longer needed for execution.
> >>
> >> While perf is primarily aimed at statistical analysis, tools like Intel
> >> PT can aim to provide a trace of exactly what happened. As such, corner
> >> cases that can be overlooked statistically need to be addressed. For
> >> example, there is a gap where JIT-compiled bpf can be freed from memory
> >> before a tracer has a chance to read it out through the bpf syscall.
> >> While that can be ignored statistically, it contributes to a death by
> >> 1000 cuts for tracers attempting to assemble exactly what happened. This is
> >> a bit gratuitous given that retaining the executable code is relatively
> >> simple, and the amount of memory involved relatively small. The retained
> >> executable code is then available in memory images such as /proc/kcore.
> >>
> >> This facility could perhaps be extended also to init sections.
> >>
> >> Note that this patch is compile tested only and, at present, is missing
> >> the ability to retain symbols.
> > 
> > You don't need the symbols; you already have them through
> > PERF_RECORD_KSYMBOL.
> 
> And you intend to use that for module loading/unloading also?
> 
> > 
> > Also; afaict this patch guarantees exactly nothing. It registers a
> > shrinker which will (given enough memory pressure) happily free your
> > text before we get around to copying it out.
> 
> No, there is a minimum size (default 0) which is not subject to the shrinker.
> 
> > 
> > Did you read this proposal?
> 
> Please cc me on anything affecting Intel PT decoding.
> 
> > 
> >   https://lkml.kernel.org/r/20190109101808.GG1900@hirez.programming.kicks-ass.net
> > 
> > (also: s/KCORE_QC/KCORE_QS/ for quiescent state)
> > 
> > That would create an RCU like interface to /proc/kcore and give you the
> > guarantees you need, while also allowing the memory to get freed once
> > you've obtained a copy.
> 
> So, open /proc/kcore and it pins all executable code in memory?
> 
> Do you intend to extend that to module / module init unloads?

I didn't intent to write it at all; but yes the intend was for this to
apply to all executable maps. Modules, BPF, ftrace, everything.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing
  2019-02-07 20:02 ` Peter Zijlstra
  2019-02-08  8:53   ` Adrian Hunter
@ 2019-02-11  7:46   ` Ingo Molnar
  1 sibling, 0 replies; 9+ messages in thread
From: Ingo Molnar @ 2019-02-11  7:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Adrian Hunter, Andi Kleen, Alexander Shishkin,
	Arnaldo Carvalho de Melo, Jiri Olsa, Song Liu, Daniel Borkmann,
	Alexei Starovoitov, linux-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, Feb 07, 2019 at 01:19:01PM +0200, Adrian Hunter wrote:
> > Subject to memory pressure and other limits, retain executable code, such
> > as JIT-compiled bpf, in memory instead of freeing it immediately it is no
> > longer needed for execution.
> > 
> > While perf is primarily aimed at statistical analysis, tools like Intel
> > PT can aim to provide a trace of exactly what happened. As such, corner
> > cases that can be overlooked statistically need to be addressed. For
> > example, there is a gap where JIT-compiled bpf can be freed from memory
> > before a tracer has a chance to read it out through the bpf syscall.
> > While that can be ignored statistically, it contributes to a death by
> > 1000 cuts for tracers attempting to assemble exactly what happened. This is
> > a bit gratuitous given that retaining the executable code is relatively
> > simple, and the amount of memory involved relatively small. The retained
> > executable code is then available in memory images such as /proc/kcore.
> > 
> > This facility could perhaps be extended also to init sections.
> > 
> > Note that this patch is compile tested only and, at present, is missing
> > the ability to retain symbols.
> 
> You don't need the symbols; you already have them through
> PERF_RECORD_KSYMBOL.
> 
> Also; afaict this patch guarantees exactly nothing. It registers a
> shrinker which will (given enough memory pressure) happily free your
> text before we get around to copying it out.
> 
> Did you read this proposal?
> 
>   https://lkml.kernel.org/r/20190109101808.GG1900@hirez.programming.kicks-ass.net
> 
> (also: s/KCORE_QC/KCORE_QS/ for quiescent state)
> 
> That would create an RCU like interface to /proc/kcore and give you the
> guarantees you need, while also allowing the memory to get freed once
> you've obtained a copy.

Yeah, adding a proper change-notification interface to /proc/kcore sounds 
like a superior solution to trying to shoehorn this down perf's throat.

It's not like any of this is useful without having opened /proc/kcore.

Also, /proc/kcore is privileged, so the indefinite resource allocation 
side effect in case user-space doesn't drain the lists is OK.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing
  2019-02-07 11:19 [RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing Adrian Hunter
  2019-02-07 20:02 ` Peter Zijlstra
@ 2019-02-08 23:29 ` Alexei Starovoitov
  2019-02-11  7:54   ` Adrian Hunter
  1 sibling, 1 reply; 9+ messages in thread
From: Alexei Starovoitov @ 2019-02-08 23:29 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Ingo Molnar, Peter Zijlstra, Andi Kleen, Alexander Shishkin,
	Arnaldo Carvalho de Melo, Jiri Olsa, Song Liu, Daniel Borkmann,
	Alexei Starovoitov, linux-kernel, netdev

On Thu, Feb 07, 2019 at 01:19:01PM +0200, Adrian Hunter wrote:
> Subject to memory pressure and other limits, retain executable code, such
> as JIT-compiled bpf, in memory instead of freeing it immediately it is no
> longer needed for execution.
> 
> While perf is primarily aimed at statistical analysis, tools like Intel
> PT can aim to provide a trace of exactly what happened. As such, corner
> cases that can be overlooked statistically need to be addressed. For
> example, there is a gap where JIT-compiled bpf can be freed from memory
> before a tracer has a chance to read it out through the bpf syscall.
> While that can be ignored statistically, it contributes to a death by
> 1000 cuts for tracers attempting to assemble exactly what happened. This is
> a bit gratuitous given that retaining the executable code is relatively
> simple, and the amount of memory involved relatively small. The retained
> executable code is then available in memory images such as /proc/kcore.
> 
> This facility could perhaps be extended also to init sections.
> 
> Note that this patch is compile tested only and, at present, is missing
> the ability to retain symbols.
> 
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> ---
>  arch/x86/Kconfig.cpu       |   1 +
>  include/linux/filter.h     |   4 +
>  include/linux/xc_retain.h  |  49 ++++++++++
>  init/Kconfig               |   6 ++
>  kernel/Makefile            |   1 +
>  kernel/bpf/core.c          |  44 ++++++++-
>  kernel/xc_retain.c         | 183 +++++++++++++++++++++++++++++++++++++
>  net/core/sysctl_net_core.c |  62 +++++++++++++
>  8 files changed, 349 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/xc_retain.h
>  create mode 100644 kernel/xc_retain.c
> 
> diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
> index 6adce15268bd..21dcd064c272 100644
> --- a/arch/x86/Kconfig.cpu
> +++ b/arch/x86/Kconfig.cpu
> @@ -389,6 +389,7 @@ menuconfig PROCESSOR_SELECT
>  config CPU_SUP_INTEL
>  	default y
>  	bool "Support Intel processors" if PROCESSOR_SELECT
> +	select XC_RETAIN if PERF_EVENTS && BPF_JIT
>  	---help---
>  	  This enables detection, tunings and quirks for Intel processors
>  
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index d531d4250bff..40b9f601e18f 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -851,6 +851,10 @@ extern int bpf_jit_enable;
>  extern int bpf_jit_harden;
>  extern int bpf_jit_kallsyms;
>  extern long bpf_jit_limit;
> +extern unsigned int bpf_jit_retain_min;
> +extern unsigned int bpf_jit_retain_max;
> +
> +void bpf_jit_retain_update_sz(void);
>  
>  typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
>  
> diff --git a/include/linux/xc_retain.h b/include/linux/xc_retain.h
> new file mode 100644
> index 000000000000..e79dc138bab8
> --- /dev/null
> +++ b/include/linux/xc_retain.h
> @@ -0,0 +1,49 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2019 Intel Corporation.
> + */
> +#ifndef _LINUX_XC_RETAIN_H
> +#define _LINUX_XC_RETAIN_H
> +
> +#include <linux/list.h>
> +#include <linux/shrinker.h>
> +#include <linux/spinlock.h>
> +
> +struct xc_retain_ops {
> +	void (*free)(void *addr);
> +};
> +
> +struct xc_retain {
> +	struct list_head list;
> +	struct list_head items;
> +	const struct xc_retain_ops ops;
> +	unsigned int min_pages;
> +	unsigned int max_pages;
> +	unsigned int current_pages;
> +	unsigned int item_cnt;
> +	spinlock_t lock;
> +	struct shrinker shrinker;
> +};
> +
> +#ifdef CONFIG_XC_RETAIN
> +int xc_retain_register(struct xc_retain *xr);
> +void xc_retain_binary(struct xc_retain *xr, void *addr, unsigned int pages);
> +void xc_retain_set_min_pages(struct xc_retain *xr, unsigned int min_pages);
> +void xc_retain_set_max_pages(struct xc_retain *xr, unsigned int max_pages);
> +#else
> +static inline int xc_retain_register(struct xc_retain *xr)
> +{
> +	return 0;
> +}
> +static inline void xc_retain_binary(struct xc_retain *xr, void *addr,
> +				    unsigned int pages)
> +{
> +	xr->ops.free(addr);
> +}
> +static inline void xc_retain_set_max_pages(struct xc_retain *xr,
> +					   unsigned int max_pages)
> +{
> +}
> +#endif
> +
> +#endif
> diff --git a/init/Kconfig b/init/Kconfig
> index c9386a365eea..954c288cabdc 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1550,6 +1550,12 @@ config EMBEDDED
>  	  an embedded system so certain expert options are available
>  	  for configuration.
>  
> +config XC_RETAIN
> +	bool
> +	help
> +	  Retain kernel executable code (e.g. jitted BPF) in memory after it
> +	  would normally be freed.
> +
>  config HAVE_PERF_EVENTS
>  	bool
>  	help
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 6aa7543bcdb2..5df40e2a934e 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -98,6 +98,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/
>  obj-$(CONFIG_IRQ_WORK) += irq_work.o
>  obj-$(CONFIG_CPU_PM) += cpu_pm.o
>  obj-$(CONFIG_BPF) += bpf/
> +obj-$(CONFIG_XC_RETAIN) += xc_retain.o
>  
>  obj-$(CONFIG_PERF_EVENTS) += events/
>  
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 19c49313c709..7fd235d235c2 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -34,6 +34,7 @@
>  #include <linux/kallsyms.h>
>  #include <linux/rcupdate.h>
>  #include <linux/perf_event.h>
> +#include <linux/xc_retain.h>
>  
>  #include <asm/unaligned.h>
>  
> @@ -480,6 +481,10 @@ int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
>  int bpf_jit_harden   __read_mostly;
>  int bpf_jit_kallsyms __read_mostly;
>  long bpf_jit_limit   __read_mostly;
> +#define BPF_JIT_RETAIN_MIN 0
> +#define BPF_JIT_RETAIN_MAX 16
> +unsigned int bpf_jit_retain_min __read_mostly = BPF_JIT_RETAIN_MIN;
> +unsigned int bpf_jit_retain_max __read_mostly = BPF_JIT_RETAIN_MAX;
>  
>  static __always_inline void
>  bpf_get_prog_addr_region(const struct bpf_prog *prog,
> @@ -795,6 +800,43 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
>  	bpf_jit_uncharge_modmem(pages);
>  }
>  
> +#ifdef CONFIG_XC_RETAIN
> +static struct xc_retain bpf_jit_retain = {
> +	.min_pages = BPF_JIT_RETAIN_MIN,
> +	.max_pages = BPF_JIT_RETAIN_MAX,
> +	.ops = {
> +		.free = module_memfree,
> +	},
> +};
> +
> +void bpf_jit_retain_update_sz(void)
> +{
> +	xc_retain_set_min_pages(&bpf_jit_retain, bpf_jit_retain_min);
> +	xc_retain_set_max_pages(&bpf_jit_retain, bpf_jit_retain_max);
> +}
> +
> +static int __init bpf_jit_retain_init(void)
> +{
> +	return xc_retain_register(&bpf_jit_retain);
> +}
> +subsys_initcall(bpf_jit_retain_init);
> +
> +static void bpf_jit_binary_retain(struct bpf_prog *fp,
> +				  struct bpf_binary_header *hdr)
> +{
> +	u32 pages = hdr->pages;
> +
> +	xc_retain_binary(&bpf_jit_retain, hdr, pages);
> +	bpf_jit_uncharge_modmem(pages);
> +}
> +#else
> +static void bpf_jit_binary_retain(struct bpf_prog *fp,
> +				  struct bpf_binary_header *hdr)
> +{
> +	return bpf_jit_binary_free(hdr);
> +}
> +#endif

I'm strongly against this approach.

I understand that it's under CONFIG, but changing kernel
into garbage collection nightmare even under config
or sysctl is not an option.
In many cases bpf progs are loaded/unloaded a lot.
Consider CI test system that runs tests 24/7.
bpf progs are loaded/unloaded in huge numbers.
Such system will suffer non deterministic test and
performance results due to shrinkers.
perf analysis with PT becomes inaccurate and main goal
of retaining accurate instruction info is not achieved.
bpf_jit_retain_min/max tunables is not an option either.
Please see how perf record is handling bpf prog/unload.
What stops you from doing the same for PT?


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing
  2019-02-08 23:29 ` Alexei Starovoitov
@ 2019-02-11  7:54   ` Adrian Hunter
  2019-02-11  8:18     ` Alexei Starovoitov
  0 siblings, 1 reply; 9+ messages in thread
From: Adrian Hunter @ 2019-02-11  7:54 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Peter Zijlstra, Andi Kleen, Alexander Shishkin,
	Arnaldo Carvalho de Melo, Jiri Olsa, Song Liu, Daniel Borkmann,
	Alexei Starovoitov, linux-kernel, netdev

On 9/02/19 1:29 AM, Alexei Starovoitov wrote:
> On Thu, Feb 07, 2019 at 01:19:01PM +0200, Adrian Hunter wrote:
>> Subject to memory pressure and other limits, retain executable code, such
>> as JIT-compiled bpf, in memory instead of freeing it immediately it is no
>> longer needed for execution.
>>
>> While perf is primarily aimed at statistical analysis, tools like Intel
>> PT can aim to provide a trace of exactly what happened. As such, corner
>> cases that can be overlooked statistically need to be addressed. For
>> example, there is a gap where JIT-compiled bpf can be freed from memory
>> before a tracer has a chance to read it out through the bpf syscall.
>> While that can be ignored statistically, it contributes to a death by
>> 1000 cuts for tracers attempting to assemble exactly what happened. This is
>> a bit gratuitous given that retaining the executable code is relatively
>> simple, and the amount of memory involved relatively small. The retained
>> executable code is then available in memory images such as /proc/kcore.
>>
>> This facility could perhaps be extended also to init sections.
>>
>> Note that this patch is compile tested only and, at present, is missing
>> the ability to retain symbols.
>>
>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
>> ---
>>  arch/x86/Kconfig.cpu       |   1 +
>>  include/linux/filter.h     |   4 +
>>  include/linux/xc_retain.h  |  49 ++++++++++
>>  init/Kconfig               |   6 ++
>>  kernel/Makefile            |   1 +
>>  kernel/bpf/core.c          |  44 ++++++++-
>>  kernel/xc_retain.c         | 183 +++++++++++++++++++++++++++++++++++++
>>  net/core/sysctl_net_core.c |  62 +++++++++++++
>>  8 files changed, 349 insertions(+), 1 deletion(-)
>>  create mode 100644 include/linux/xc_retain.h
>>  create mode 100644 kernel/xc_retain.c
>>
>> diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
>> index 6adce15268bd..21dcd064c272 100644
>> --- a/arch/x86/Kconfig.cpu
>> +++ b/arch/x86/Kconfig.cpu
>> @@ -389,6 +389,7 @@ menuconfig PROCESSOR_SELECT
>>  config CPU_SUP_INTEL
>>  	default y
>>  	bool "Support Intel processors" if PROCESSOR_SELECT
>> +	select XC_RETAIN if PERF_EVENTS && BPF_JIT
>>  	---help---
>>  	  This enables detection, tunings and quirks for Intel processors
>>  
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index d531d4250bff..40b9f601e18f 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -851,6 +851,10 @@ extern int bpf_jit_enable;
>>  extern int bpf_jit_harden;
>>  extern int bpf_jit_kallsyms;
>>  extern long bpf_jit_limit;
>> +extern unsigned int bpf_jit_retain_min;
>> +extern unsigned int bpf_jit_retain_max;
>> +
>> +void bpf_jit_retain_update_sz(void);
>>  
>>  typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
>>  
>> diff --git a/include/linux/xc_retain.h b/include/linux/xc_retain.h
>> new file mode 100644
>> index 000000000000..e79dc138bab8
>> --- /dev/null
>> +++ b/include/linux/xc_retain.h
>> @@ -0,0 +1,49 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * Copyright (C) 2019 Intel Corporation.
>> + */
>> +#ifndef _LINUX_XC_RETAIN_H
>> +#define _LINUX_XC_RETAIN_H
>> +
>> +#include <linux/list.h>
>> +#include <linux/shrinker.h>
>> +#include <linux/spinlock.h>
>> +
>> +struct xc_retain_ops {
>> +	void (*free)(void *addr);
>> +};
>> +
>> +struct xc_retain {
>> +	struct list_head list;
>> +	struct list_head items;
>> +	const struct xc_retain_ops ops;
>> +	unsigned int min_pages;
>> +	unsigned int max_pages;
>> +	unsigned int current_pages;
>> +	unsigned int item_cnt;
>> +	spinlock_t lock;
>> +	struct shrinker shrinker;
>> +};
>> +
>> +#ifdef CONFIG_XC_RETAIN
>> +int xc_retain_register(struct xc_retain *xr);
>> +void xc_retain_binary(struct xc_retain *xr, void *addr, unsigned int pages);
>> +void xc_retain_set_min_pages(struct xc_retain *xr, unsigned int min_pages);
>> +void xc_retain_set_max_pages(struct xc_retain *xr, unsigned int max_pages);
>> +#else
>> +static inline int xc_retain_register(struct xc_retain *xr)
>> +{
>> +	return 0;
>> +}
>> +static inline void xc_retain_binary(struct xc_retain *xr, void *addr,
>> +				    unsigned int pages)
>> +{
>> +	xr->ops.free(addr);
>> +}
>> +static inline void xc_retain_set_max_pages(struct xc_retain *xr,
>> +					   unsigned int max_pages)
>> +{
>> +}
>> +#endif
>> +
>> +#endif
>> diff --git a/init/Kconfig b/init/Kconfig
>> index c9386a365eea..954c288cabdc 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -1550,6 +1550,12 @@ config EMBEDDED
>>  	  an embedded system so certain expert options are available
>>  	  for configuration.
>>  
>> +config XC_RETAIN
>> +	bool
>> +	help
>> +	  Retain kernel executable code (e.g. jitted BPF) in memory after it
>> +	  would normally be freed.
>> +
>>  config HAVE_PERF_EVENTS
>>  	bool
>>  	help
>> diff --git a/kernel/Makefile b/kernel/Makefile
>> index 6aa7543bcdb2..5df40e2a934e 100644
>> --- a/kernel/Makefile
>> +++ b/kernel/Makefile
>> @@ -98,6 +98,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/
>>  obj-$(CONFIG_IRQ_WORK) += irq_work.o
>>  obj-$(CONFIG_CPU_PM) += cpu_pm.o
>>  obj-$(CONFIG_BPF) += bpf/
>> +obj-$(CONFIG_XC_RETAIN) += xc_retain.o
>>  
>>  obj-$(CONFIG_PERF_EVENTS) += events/
>>  
>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>> index 19c49313c709..7fd235d235c2 100644
>> --- a/kernel/bpf/core.c
>> +++ b/kernel/bpf/core.c
>> @@ -34,6 +34,7 @@
>>  #include <linux/kallsyms.h>
>>  #include <linux/rcupdate.h>
>>  #include <linux/perf_event.h>
>> +#include <linux/xc_retain.h>
>>  
>>  #include <asm/unaligned.h>
>>  
>> @@ -480,6 +481,10 @@ int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
>>  int bpf_jit_harden   __read_mostly;
>>  int bpf_jit_kallsyms __read_mostly;
>>  long bpf_jit_limit   __read_mostly;
>> +#define BPF_JIT_RETAIN_MIN 0
>> +#define BPF_JIT_RETAIN_MAX 16
>> +unsigned int bpf_jit_retain_min __read_mostly = BPF_JIT_RETAIN_MIN;
>> +unsigned int bpf_jit_retain_max __read_mostly = BPF_JIT_RETAIN_MAX;
>>  
>>  static __always_inline void
>>  bpf_get_prog_addr_region(const struct bpf_prog *prog,
>> @@ -795,6 +800,43 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
>>  	bpf_jit_uncharge_modmem(pages);
>>  }
>>  
>> +#ifdef CONFIG_XC_RETAIN
>> +static struct xc_retain bpf_jit_retain = {
>> +	.min_pages = BPF_JIT_RETAIN_MIN,
>> +	.max_pages = BPF_JIT_RETAIN_MAX,
>> +	.ops = {
>> +		.free = module_memfree,
>> +	},
>> +};
>> +
>> +void bpf_jit_retain_update_sz(void)
>> +{
>> +	xc_retain_set_min_pages(&bpf_jit_retain, bpf_jit_retain_min);
>> +	xc_retain_set_max_pages(&bpf_jit_retain, bpf_jit_retain_max);
>> +}
>> +
>> +static int __init bpf_jit_retain_init(void)
>> +{
>> +	return xc_retain_register(&bpf_jit_retain);
>> +}
>> +subsys_initcall(bpf_jit_retain_init);
>> +
>> +static void bpf_jit_binary_retain(struct bpf_prog *fp,
>> +				  struct bpf_binary_header *hdr)
>> +{
>> +	u32 pages = hdr->pages;
>> +
>> +	xc_retain_binary(&bpf_jit_retain, hdr, pages);
>> +	bpf_jit_uncharge_modmem(pages);
>> +}
>> +#else
>> +static void bpf_jit_binary_retain(struct bpf_prog *fp,
>> +				  struct bpf_binary_header *hdr)
>> +{
>> +	return bpf_jit_binary_free(hdr);
>> +}
>> +#endif
> 
> I'm strongly against this approach.

Thanks for commenting, but Peter wrote that he prefers this option:

	https://lkml.kernel.org/r/20190109101808.GG1900@hirez.programming.kicks-ass.net

> I understand that it's under CONFIG, but changing kernel
> into garbage collection nightmare even under config
> or sysctl is not an option.
> In many cases bpf progs are loaded/unloaded a lot.
> Consider CI test system that runs tests 24/7.
> bpf progs are loaded/unloaded in huge numbers.

Which is not really a real use-case.

For PT, the data recorded is generally too large (e.g. 100MB per cpu per second)
to record continuously.  But there is a "snapshot" mode that just captures
the PT data that is currently buffered.  In that case we want to have a
kernel image that reflects what was running for the last small period of
time.

> Such system will suffer non deterministic test and
> performance results due to shrinkers.

The default was 16 pages, after that everything gets freed immediately. 
That is assuming you don't turn it off for test purposes.  That would
not have an effect on your tests.

Also, why would copying it out of memory be less intrusive that keeping it
in memory.  Copying every bpf jit to disk for every load would be much more
intrusive.

> perf analysis with PT becomes inaccurate and main goal
> of retaining accurate instruction info is not achieved.

For the majority of real use-cases, yes it is.

> bpf_jit_retain_min/max tunables is not an option either.
> Please see how perf record is handling bpf prog/unload.
> What stops you from doing the same for PT?

Opens the door for the bpf program to be unloaded before it is copied out. 
All the corner cases where it is not possible to get accurate decoding start
to add up.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing
  2019-02-11  7:54   ` Adrian Hunter
@ 2019-02-11  8:18     ` Alexei Starovoitov
  2019-02-11  8:24       ` Adrian Hunter
  0 siblings, 1 reply; 9+ messages in thread
From: Alexei Starovoitov @ 2019-02-11  8:18 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: Ingo Molnar, Peter Zijlstra, Andi Kleen, Alexander Shishkin,
	Arnaldo Carvalho de Melo, Jiri Olsa, Song Liu, Daniel Borkmann,
	Alexei Starovoitov, linux-kernel, netdev

On Mon, Feb 11, 2019 at 09:54:01AM +0200, Adrian Hunter wrote:
> 
> Which is not really a real use-case.
..
> > perf analysis with PT becomes inaccurate and main goal
> > of retaining accurate instruction info is not achieved.
> 
> For the majority of real use-cases, yes it is.

In our fleet not a single server is using Intel PT, yet you're
proposing to penalize all of them with shrinker-based JIT freeing?
There is no negotiation here.
NACK


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing
  2019-02-11  8:18     ` Alexei Starovoitov
@ 2019-02-11  8:24       ` Adrian Hunter
  0 siblings, 0 replies; 9+ messages in thread
From: Adrian Hunter @ 2019-02-11  8:24 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Peter Zijlstra, Andi Kleen, Alexander Shishkin,
	Arnaldo Carvalho de Melo, Jiri Olsa, Song Liu, Daniel Borkmann,
	Alexei Starovoitov, linux-kernel, netdev

On 11/02/19 10:18 AM, Alexei Starovoitov wrote:
> On Mon, Feb 11, 2019 at 09:54:01AM +0200, Adrian Hunter wrote:
>>
>> Which is not really a real use-case.
> ..
>>> perf analysis with PT becomes inaccurate and main goal
>>> of retaining accurate instruction info is not achieved.
>>
>> For the majority of real use-cases, yes it is.
> 
> In our fleet not a single server is using Intel PT, yet you're
> proposing to penalize all of them with shrinker-based JIT freeing?

I already responded to that.

> There is no negotiation here.

Apart from Peter and Ingo already having indicated a different approach is
preferred, why not? Shouldn't maintainers provide technical reasons.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-02-11  8:26 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2019-02-07 11:19 [RFC PATCH] perf, bpf: Retain kernel executable code in memory to aid Intel PT tracing Adrian Hunter
2019-02-07 20:02 ` Peter Zijlstra
2019-02-08  8:53   ` Adrian Hunter
2019-02-08  9:12     ` Peter Zijlstra
2019-02-11  7:46   ` Ingo Molnar
2019-02-08 23:29 ` Alexei Starovoitov
2019-02-11  7:54   ` Adrian Hunter
2019-02-11  8:18     ` Alexei Starovoitov
2019-02-11  8:24       ` Adrian Hunter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox