* [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management)
@ 2006-09-21 16:00 Mathieu Desnoyers
2006-09-21 16:06 ` Ingo Molnar
2006-09-21 17:56 ` Frank Ch. Eigler
0 siblings, 2 replies; 27+ messages in thread
From: Mathieu Desnoyers @ 2006-09-21 16:00 UTC (permalink / raw)
To: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna,
Andrew Morton, Ingo Molnar, Mathieu Desnoyers, Paul Mundt,
linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore,
Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman,
Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox
Hello,
Yet, again, a new version. I integrated a full probe management mechanism. See
below.
Comments are welcome,
Mathieu
---BEGIN---
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1082,6 +1082,8 @@ config KPROBES
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".
+source "kernel/Kconfig.marker"
+
source "ltt/Kconfig"
endmenu
--- /dev/null
+++ b/include/linux/marker.h
@@ -0,0 +1,118 @@
+/*****************************************************************************
+ * marker.h
+ *
+ * Code markup for dynamic and static tracing.
+ *
+ * Example :
+ *
+ * MARK(subsystem_event, "%d %s", someint, somestring);
+ * Where :
+ * - Subsystem is the name of your subsystem.
+ * - event is the name of the event to mark.
+ * - "%d %s" is the formatted string for printk.
+ * - someint is an integer.
+ * - somestring is a char *.
+ * - subsystem_event must be unique thorough the kernel!
+ *
+ * Dynamically overridable function call based on marker mechanism
+ * from Frank Ch. Eigler <fche@redhat.com>.
+ *
+ * (C) Copyright 2006 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#define MARK_KPROBE_PREFIX "__mark_kprobe_"
+#define MARK_CALL_PREFIX "__mark_call_"
+#define MARK_JUMP_SELECT_PREFIX "__mark_jump_select_"
+#define MARK_JUMP_CALL_PREFIX "__mark_jump_call_"
+#define MARK_JUMP_INLINE_PREFIX "__mark_jump_inline_"
+#define MARK_JUMP_OVER_PREFIX "__mark_jump_over_"
+
+#ifdef CONFIG_MARK_SYMBOL
+#define MARK_SYM(name) \
+ do { \
+ __label__ here; \
+ here: asm volatile \
+ (MARK_KPROBE_PREFIX#name " = %0" : : "m" (*&&here)); \
+ } while(0)
+#else
+#define MARK_SYM(name)
+#endif
+
+#ifdef CONFIG_MARK_JUMP_CALL
+#define MARK_JUMP_CALL_PROTOTYPE(name) \
+ static void \
+ (*__mark_call_##name)(const char *fmt, ...) \
+ asm (MARK_CALL_PREFIX#name) = \
+ __mark_empty_function
+#define MARK_JUMP_CALL(name, format, args...) \
+ do { \
+ preempt_disable(); \
+ (*__mark_call_##name)(format, ## args); \
+ preempt_enable_no_resched(); \
+ } while(0)
+#else
+#define MARK_JUMP_CALL_PROTOTYPE(name)
+#define MARK_JUMP_CALL(name, format, args...)
+#endif
+
+#ifdef CONFIG_MARK_JUMP_INLINE
+#define MARK_JUMP_INLINE(name, format, args...) \
+ (void) (__mark_inline_##name(format, ## args))
+#else
+#define MARK_JUMP_INLINE(name, format, args...)
+#endif
+
+#define MARK_JUMP(name, format, args...) \
+ do { \
+ __label__ over_label, call_label, inline_label; \
+ volatile static void *__mark_jump_select_##name \
+ asm (MARK_JUMP_SELECT_PREFIX#name) = \
+ &&over_label; \
+ volatile static void *__mark_jump_call_##name \
+ asm (MARK_JUMP_CALL_PREFIX#name) \
+ __attribute__((unused)) = \
+ &&call_label; \
+ volatile static void *__mark_jump_inline_##name \
+ asm (MARK_JUMP_INLINE_PREFIX#name) \
+ __attribute__((unused)) = \
+ &&inline_label; \
+ volatile static void *__mark_jump_over_##name \
+ asm (MARK_JUMP_OVER_PREFIX#name) \
+ __attribute__((unused)) = \
+ &&over_label; \
+ MARK_JUMP_CALL_PROTOTYPE(name); \
+ goto *__mark_jump_select_##name; \
+call_label: \
+ MARK_JUMP_CALL(name, format, ## args); \
+ goto over_label; \
+inline_label: \
+ MARK_JUMP_INLINE(name, format, ## args); \
+over_label: \
+ do {} while(0); \
+ } while(0)
+
+#define MARK(name, format, args...) \
+ do { \
+ __mark_check_format(format, ## args); \
+ MARK_SYM(name); \
+ MARK_JUMP(name, format, ## args); \
+ } while(0)
+
+enum marker_type { MARKER_CALL, MARKER_INLINE };
+
+typedef void (*marker_probe)(const char *fmt, ...);
+
+static inline __attribute__ ((format (printf, 1, 2)))
+void __mark_check_format(const char *fmt, ...)
+{ }
+
+extern void __mark_empty_function(const char *fmt, ...);
+
+int marker_set_probe(const char *name, void (*probe)(const char *fmt, ...),
+ enum marker_type type);
+
+void marker_disable_probe(const char *name, void (*probe)(const char *fmt, ...),
+ enum marker_type type);
--- /dev/null
+++ b/kernel/Kconfig.marker
@@ -0,0 +1,31 @@
+# Code markers configuration
+
+menu "Marker configuration"
+
+
+config MARK_SYMBOL
+ bool "Replace markers with symbols"
+ default n
+ help
+ Put symbols in place of markers, useful for kprobe.
+
+config MARK_JUMP_CALL
+ bool "Replace markers with a jump over an inactive function call"
+ default n
+ help
+ Put a jump over a call in place of markers.
+
+config MARK_JUMP_INLINE
+ bool "Replace markers with a jump over an inline function"
+ default n
+ help
+ Put a jump over an inline function.
+
+config MARK_JUMP
+ bool "Jump marker probes set/disable infrastructure"
+ select KALLSYMS
+ default n
+ help
+ Install or remove probes from markers.
+
+endmenu
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_MARK_JUMP) += marker.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
--- /dev/null
+++ b/kernel/marker.c
@@ -0,0 +1,178 @@
+/*****************************************************************************
+ * marker.c
+ *
+ * Code markup for dynamic and static tracing. Marker control module.
+ *
+ * (C) Copyright 2006 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ *
+ * Design :
+ * kernel/marker.c deals with all marker activation from a centralized,
+ * coherent mechanism. The functions that will be called will simply sit in
+ * modules.
+ *
+ * Before activating a probe, the marker module :
+ * 1 - takes proper locking
+ * 2 - verifies that the function pointer and jmp target are at their default
+ * values, otherwise the "set" operation fails.
+ * 4 - does function pointer and jump setup.
+ *
+ * Setting them back to disabled is :
+ * 1 - setting back the default jmp and call values
+ * 2 - call synchronize_sched()
+ *
+ * A probe module must call marker disable on all its markers before module
+ * unload.
+ *
+ * The marker module will also deal with inline jump selection, which is
+ * the same case as presented here, but without the function pointer.
+ */
+
+
+#include <linux/marker.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/string.h>
+
+static DEFINE_SPINLOCK(marker_lock);
+
+struct marker_pointers {
+ void **call;
+ void **jmpselect;
+ void **jmpcall;
+ void **jmpinline;
+ void **jmpover;
+};
+
+void __mark_empty_function(const char *fmt, ...)
+{
+}
+EXPORT_SYMBOL(__mark_empty_function);
+
+/* Pointers can be used around preemption disabled */
+static int marker_get_pointers(const char *name,
+ struct marker_pointers *ptrs)
+{
+ char call_sym[KSYM_NAME_LEN] = MARK_CALL_PREFIX;
+ char jmpselect_sym[KSYM_NAME_LEN] = MARK_JUMP_SELECT_PREFIX;
+ char jmpcall_sym[KSYM_NAME_LEN] = MARK_JUMP_CALL_PREFIX;
+ char jmpinline_sym[KSYM_NAME_LEN] = MARK_JUMP_INLINE_PREFIX;
+ char jmpover_sym[KSYM_NAME_LEN] = MARK_JUMP_OVER_PREFIX;
+ unsigned int call_sym_len = sizeof(MARK_CALL_PREFIX);
+ unsigned int jmpselect_sym_len = sizeof(MARK_JUMP_SELECT_PREFIX);
+ unsigned int jmpcall_sym_len = sizeof(MARK_JUMP_CALL_PREFIX);
+ unsigned int jmpinline_sym_len = sizeof(MARK_JUMP_INLINE_PREFIX);
+ unsigned int jmpover_sym_len = sizeof(MARK_JUMP_OVER_PREFIX);
+
+ strncat(call_sym, name, KSYM_NAME_LEN-call_sym_len);
+ strncat(jmpselect_sym, name, KSYM_NAME_LEN-jmpselect_sym_len);
+ strncat(jmpcall_sym, name, KSYM_NAME_LEN-jmpcall_sym_len);
+ strncat(jmpinline_sym, name, KSYM_NAME_LEN-jmpinline_sym_len);
+ strncat(jmpover_sym, name, KSYM_NAME_LEN-jmpover_sym_len);
+
+ ptrs->call = (void**)kallsyms_lookup_name(call_sym);
+ ptrs->jmpselect = (void**)kallsyms_lookup_name(jmpselect_sym);
+ ptrs->jmpcall = (void**)kallsyms_lookup_name(jmpcall_sym);
+ ptrs->jmpinline = (void**)kallsyms_lookup_name(jmpinline_sym);
+ ptrs->jmpover = (void**)kallsyms_lookup_name(jmpover_sym);
+
+ if(!(ptrs->call && ptrs->jmpselect && ptrs->jmpcall
+ && ptrs->jmpinline && ptrs->jmpover)) {
+ return ENOENT;
+ }
+ return 0;
+}
+
+int marker_set_probe(const char *name, void (*probe)(const char *fmt, ...),
+ enum marker_type type)
+{
+ int result = 0;
+ struct marker_pointers ptrs;
+
+ spin_lock(&marker_lock);
+ result = marker_get_pointers(name, &ptrs);
+ if(result) {
+ printk(KERN_NOTICE
+ "Unable to find kallsyms for markers in %s\n",
+ name);
+ goto unlock;
+ }
+
+ switch(type) {
+ case MARKER_CALL:
+ if(*ptrs.call != __mark_empty_function) {
+ result = EBUSY;
+ printk(KERN_NOTICE
+ "Probe already installed on "
+ "marker in %s\n",
+ name);
+ goto unlock;
+ }
+ /* Setup the call pointer */
+ *ptrs.call = probe;
+ /* Setup the jump */
+ *ptrs.jmpselect = *ptrs.jmpcall;
+ break;
+ case MARKER_INLINE:
+ if(*ptrs.jmpover == *ptrs.jmpinline) {
+ result = ENODEV;
+ printk(KERN_NOTICE
+ "No inline probe exists "
+ "for marker in %s\n",
+ name);
+ goto unlock;
+ }
+ /* Setup the call pointer */
+ *ptrs.call = __mark_empty_function;
+ /* Setup the jump */
+ *ptrs.jmpselect = *ptrs.jmpinline;
+ break;
+ default:
+ result = ENOENT;
+ printk(KERN_ERR
+ "Unknown marker type\n");
+ break;
+ }
+unlock:
+ spin_unlock(&marker_lock);
+ return result;
+}
+EXPORT_SYMBOL_GPL(marker_set_probe);
+
+void marker_disable_probe(const char *name, void (*probe)(const char *fmt, ...),
+ enum marker_type type)
+{
+ int result = 0;
+ struct marker_pointers ptrs;
+
+ spin_lock(&marker_lock);
+ result = marker_get_pointers(name, &ptrs);
+ if(result)
+ goto unlock;
+
+ switch(type) {
+ case MARKER_CALL:
+ if(*ptrs.call == probe) {
+ *ptrs.jmpselect = *ptrs.jmpover;
+ *ptrs.call = __mark_empty_function;
+ }
+ break;
+ case MARKER_INLINE:
+ if(*ptrs.jmpselect == *ptrs.jmpinline)
+ *ptrs.jmpselect = *ptrs.jmpover;
+ break;
+ default:
+ result = ENOENT;
+ printk(KERN_ERR
+ "Unknown marker type\n");
+ break;
+ }
+unlock:
+ spin_unlock(&marker_lock);
+ if(!result && type == MARKER_CALL)
+ synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(marker_disable_probe);
---END---
-- probe module ---
---BEGIN---
/* probe.c
*
* Loads a function at a marker call site.
*
* (C) Copyright 2006 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
*
* This file is released under the GPLv2.
* See the file COPYING for more details.
*/
#include <linux/marker.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
/* function to install */
void do_mark1(const char *format, int value)
{
printk("value is %d\n", value);
}
int init_module(void)
{
return marker_set_probe("subsys_mark1", (marker_probe)do_mark1,
MARKER_CALL);
}
void cleanup_module(void)
{
marker_disable_probe("subsys_mark1", (marker_probe)do_mark1,
MARKER_CALL);
}
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Mathieu Desnoyers");
MODULE_DESCRIPTION("Probe");
---END---
-- sample test marked module ---
---BEGIN---
/* test-mark.c
*
*/
#include <linux/marker.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
int x=7;
struct proc_dir_entry *pentry = NULL;
static int my_open(struct inode *inode, struct file *file)
{
MARK(subsys_mark1, "%d", 1);
MARK(subsys_mark2, "%d %s", 2, "blah2");
MARK(subsys_mark3, "%d %s", x, "blah3");
return -EPERM;
}
static struct file_operations my_operations = {
.open = my_open,
};
int init_module(void)
{
pentry = create_proc_entry("testmark", 0444, NULL);
if(pentry)
pentry->proc_fops = &my_operations;
return 0;
}
void cleanup_module(void)
{
remove_proc_entry("testmark", NULL);
}
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Mathieu Desnoyers");
MODULE_DESCRIPTION("Marker Test");
---END---
OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg
Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 16:00 [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) Mathieu Desnoyers @ 2006-09-21 16:06 ` Ingo Molnar 2006-09-21 21:42 ` Mathieu Desnoyers 2006-09-23 16:51 ` Mathieu Desnoyers 2006-09-21 17:56 ` Frank Ch. Eigler 1 sibling, 2 replies; 27+ messages in thread From: Ingo Molnar @ 2006-09-21 16:06 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Mathieu Desnoyers, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Mathieu Desnoyers <compudj@krystal.dyndns.org> wrote: > +config MARK_SYMBOL > +config MARK_JUMP_CALL > +config MARK_JUMP_INLINE > +config MARK_JUMP same NACK over the proliferation of options as before: http://marc.theaimsgroup.com/?l=linux-kernel&m=115881457219562&w=2 Tap, tap, is this thing on? ;) found one related reply from you that i didnt answer yet: "As an example, LTTng traces the page fault handler, when kprobes just can't instrument it." but tracing a raw pagefault at the arch level is a bad idea anyway, we want to trace __handle_mm_fault(). That way you can avoid having to modify every architecture's pagefault handler ... but the other points remained unanswered as far as i can see. Ingo ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 16:06 ` Ingo Molnar @ 2006-09-21 21:42 ` Mathieu Desnoyers 2006-09-21 21:49 ` Mathieu Desnoyers ` (4 more replies) 2006-09-23 16:51 ` Mathieu Desnoyers 1 sibling, 5 replies; 27+ messages in thread From: Mathieu Desnoyers @ 2006-09-21 21:42 UTC (permalink / raw) To: Ingo Molnar Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Ingo Molnar (mingo@elte.hu) wrote: > "As an example, LTTng traces the page fault handler, when kprobes just > can't instrument it." > > but tracing a raw pagefault at the arch level is a bad idea anyway, we > want to trace __handle_mm_fault(). That way you can avoid having to > modify every architecture's pagefault handler ... > Then you lose the ability to trace in-kernel minor page faults. > but the other points remained unanswered as far as i can see. > Hi Ingo, I clearly expressed my position in the previous emails, so did you. You argued about a use of tracing that is not relevant to my vision of reality, which is : - Embedded systems developers won't want a breakpoint-based probe - High performance computing users won't want a breakpoint-based probe - djprobe is far away from being in an acceptable state on architectures with very inconvenient erratas (x86). - kprobe and djprobe cannot access local variables in every cases For those reasons, I prefer a jump-over-call approach which lets gcc give us the local variables. No need of DWARF or SystemTAP macro Kung Fu. Just C and a loadable module. By no means is it a replacement for a completely dynamic breakpoint-based instrumentation mechanism. I really think that both mechanism should coexist. This is my position : I let the distribution/user decide what is appropriate for their use. My goal is to provide them a flexible mechanism that takes the multiple variety of uses in account without performance impact if they are not willing to pay it to benefit from tracing. With all due respect, yes, there are Linux users different from the typical Redhat client. If your vision is still limited to this scope after a 500 emails debate, I am afraid that there is very little I can do about it in one more. Mathieu OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 21:42 ` Mathieu Desnoyers @ 2006-09-21 21:49 ` Mathieu Desnoyers 2006-09-22 6:29 ` Karim Yaghmour ` (3 subsequent siblings) 4 siblings, 0 replies; 27+ messages in thread From: Mathieu Desnoyers @ 2006-09-21 21:49 UTC (permalink / raw) To: Ingo Molnar Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Mathieu Desnoyers (compudj@krystal.dyndns.org) wrote: > * Ingo Molnar (mingo@elte.hu) wrote: > > > "As an example, LTTng traces the page fault handler, when kprobes just > > can't instrument it." > > > > but tracing a raw pagefault at the arch level is a bad idea anyway, we > > want to trace __handle_mm_fault(). That way you can avoid having to > > modify every architecture's pagefault handler ... > > > > Then you lose the ability to trace in-kernel minor page faults. > But I agree with you that an upstream MARKER makes more sense in __handle_mm_fault(). :-) Regards, Mathieu OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 21:42 ` Mathieu Desnoyers 2006-09-21 21:49 ` Mathieu Desnoyers @ 2006-09-22 6:29 ` Karim Yaghmour 2006-09-22 6:49 ` Ingo Molnar ` (2 subsequent siblings) 4 siblings, 0 replies; 27+ messages in thread From: Karim Yaghmour @ 2006-09-22 6:29 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Ingo Molnar, Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox And now for some friendly fire :) No just kidding, but I thought I'd pour some water in your wine here. Mathieu Desnoyers wrote: > With all due respect, yes, there are Linux users different from the typical > Redhat client. If your vision is still limited to this scope after a 500 > emails debate, I am afraid that there is very little I can do about it in > one more. Actually I think Ingo should be commended for having come around on a significant number of crucial and sensitive issues. He's accepted on principle the need for static markup, the fact that said markup needs to somewhat be contextually accurate (variables, etc.) and the fact that any form of markup made could be trivially used for fast-path direct calls (albeit he wants you to package that as a separate patch -- which you obviously don't like very much, but I think I've fixed this one to the satisfaction of both of you below.) Now, we should stop for a moment, look back and contrast Ingo's current position to this week's painful thread, and we should thank him for having come around on these things. I can't say any of it was enjoyable myself and I'm sure others would agree, but it remains that Ingo has taken a second look at things. He hasn't come to our specific position, but that, I think, can be diplomatically resolved, more below. Now with regards to the fact that Ingo works for RedHat, I think this should only be used to understand his position. For if the "gut feeling" of one of RedHat's top kernel engineers is that something is wrong, then maybe we should try to understand why. Now I don't want to judge or qualify whether he successfully conveyed the real essence of why he's got this "gut feeling", we've lost enough time already on non-essential banter. So let's stick to the facts. Just the facts ma'am. And to that end, I suggest you and I (and everyone for that matter that agrees with us) do some role-playing. Now what I say below could be off, but I'm convinced there's some element of truth here -- and the reason I say this is that I can imagine myself thinking the same as Ingo if I were in his position *and* the assumptions I make below were correct. I believe everyone at RedHat understands the value of tracing. In fact, I believe their customers have been asking for just that. Their investment in SystemTap can only be seen in this light. But why, oh why, if the customers are asking for it is nobody just stick the freeking thing in (all static) and be done with it? Well, it's unfortunately not that simple. There are tons of issues (and that's obviously why LTT has been held up for so many years -- the arguments against it were just a big ball of wax and nobody ever took the time to look at what that ball of wax was made of.) Some of them, indeed I think the most important ones, we've already covered, and I don't wish to get into those again. Suffice it to say, though, that we've taken enough wax off the ball that some things have settled in: some form of static markup being a major one. Ingo's continued insistence, though, seems to point to the fact that there's just a little bit more wax to remove. So on with my little thought experiment. Say you're RedHat -- and I'm just using RedHat as an example, but I'm sure much of this applies to any other distro or distro maintainer. Say you've been shipping LTT for a couple of versions (and, you'll have to admit Mathieu, that's what we'd like), and obviously yours being the most widely used distro out there, LTT gets a huge following and everybody and his little sister uses it for tracing. All is well and everybody's happy, right? Unfortunately not. Somewhere down the road somebody tries to push a change into mainline that breaks a tracepoint. Ooops, that's a problem for Ingo (and all other kernel developers from RedHat I'd say) you see. Because then he has two nasty choices. Either he opposes the change -- which may in context be totally nonsensical, like trying to preserve a variable that doesn't exist no more or worse a tracepoint that makes absolutely no sense (and I think that's what Ingo means when he compares those calls to "System calls" that will have to be maintained forever.) Or, and that's probably worse from RedHat's position and Ingo's of course, somebody has to go fix all of LTT's now complicated stack of analysis tools to get it to continue working for the next release. Judging from the patches Ingo has been pumping out for the last year, I'd say fixing up userspace analysis tools isn't on his radars. Of course as the LTT maintainer, in defense, you would say either: a) you don't need to fix LTT because it gracefully fails when events go missing (which is true, but doesn't help RedHat because now its customers don't get the exact same analysis they were used to), b) that you'd obviously fix LTT's analysis stack as a consequence (but this too is problematic because of the reasons that follow.) And then Ingo could say: 1) you don't work for RedHat 2) even if you did work for RedHat (and Ingo could just pull your ear to have you "fix" LTT), there's nobody that can predict two or three years down the road what will have been built on top of your tool by 3rd parties and which may actually be used by RedHat customers. This is as best I make of Ingo's continued insistence on keeping the direct call stuff out of tree. Because if it's dynamic, then the above issue *somewhat* goes away: If it's a statically-marked-up dynamic point, then even if he doesn't control the stack of tools that get built around those, he can certainly hack up the SystemTap scripts to continue showing the same "events" to the upper-layer stack, even if somebody down the line makes the event obsolete. With a tool that depends on direct non-dynamic instrumentation point, he wouldn't be able to do that. He'd then have to maintain a RedHat-specific patch for making that event continue showing up, and looking at the list of patches those distros end up including, I'm sure he's got zero interest in doing that. And with the temporary hack in the SystemTap scripts done, he can then hand that off to someone else and inform them that eventually that specific event has become obsolete. With a tool that depends on direct static calls he can't do that. The stack of analysis tools breaks right away. So when Ingo insists that you keep your direct call stuff out of tree, what he's really telling you is that he doesn't want to have to fix your stuff for your users. If your users depend on your external script to make it work, RedHat (and therefore Ingo in some way) is not accountable for that (though it could be some contract work), you are. Now bare with me, I'm not asking you to drop your position, I think there's a middle ground everybody will agree on -- of course if I got the above right. So below is kind of like a "review" of all the mechanisms that have been put forth and how they rank depending on perspective. The closer to "1" the better. And here are the mechanisms I'm reviewing: - kprobes - djprobes - bprobes (Martin's thing amended with a 5-byte filler per instrumented function.) - direct calls (i.e. "static" stuff.) I've deliberately included Martin's idea here because everything I have seen about it points to it working. Within the specific case of where a 5-byte filler "single instruction" is put at the entry of a function of interest, then djprobes can be safely used to get to an alternate function. And don't worry, I was one of the first to actually doubt that djprobes could work. See the SystemTap archives if you don't believe me. What I've always doubted, and continue to doubt actually, is its ability to replace bytes covering multiple instructions with a 5-byte jump. But that's a different topic. Very early on in my argumentation against djprobes on the SystemTap list, it became evident that it could be used to effectively replace anything that had 5 bytes or more. And I knew about the errata stuff, but this ability was confirmed by the folks at IBM who had to deal with this errata before. So if they said it works, I had no reason to doubt that it could safely do what it claimed it could. So, in sum, unless somebody can mount a solid case against Martin's slightly-amended proposed mechanism (with the 5-byte filler) in combination with djprobes, I think we can proceed. Note that the need for djprobes is x86-specific. On other archs errata limitations may not be present. And, if worse came to worse, the filler could be a simple static-jump to an outside replacement (without any parameters being passed); even with that crude a method the advantages of bprobes would remain unchanged: a) fast (both while dormant or active), b) flexible, c) easily used by distro kernel maintainers for trivially fixing up instrumentation. One more thing. I sometimes make a distinction below about djprobes. The "full djprobes" is djprobes that can replace instructions of any length -- this hasn't been figured out yet as others have stated. The "limited djprobes" is djprobes that can replace any instruction of 5 bytes or more. I've yet to see anything showing this can't work today. So here's the summary of this "review" with a comment on which of Ingo or Mathieu insists on which. There are, of course, many other people that may agree with either, but let's just keep this simple for now. In terms of runtime speed of event-to-log (this view of things is favored by Mathieu): 1- direct calls 2- bprobes 3- full djprobes 4- kprobes It could be argued that 1, 2, and 3 are the same here, and I won't disagree too much with that -- if 3 existed. Lowest cost in "dormant" state (this view of things is favored by Ingo): 1- kprobes/full djprobes 2- bprobes (single 5byte filler for each instrumented function, no matter how many events) 3- direct call (linearly incrementing cost with each event) Technical simplicity of implementation on x86 (this one is tricky) (this view of things is favored by Roman -- well, ok, he's not part of my two choice rule above): 1- direct calls 2- bprobes without binary fixup (filler is direct call or funnction pointer -- not elegant, but very simple) 3- kprobes 4- limited djprobes (can replace any instruction of 5 bytes or more) 5- bprobes with binary fixup (filler is 5-byte replaced using limited djprobes.) 6- full djprobes (can replace any length of instruction -- this has yet to be shown to work.) Technical simplicity of implementation on "non-errata" arch: 1- direct calls 2- bprobes without binary fixup 3- bprobes with binary fixup 4- kprobes 5- djprobes Least potential for maintainability nightmare for vendor kernel engineers (a.k.a. simplicity of fixup in case event doesn't exist no more or variables are rendered non-relevant) (this is inferred by me as being Ingo's point-of-view, but I could be wrong): 1- kprobes/full djprobes/bprobes 2- direct calls Note: I've put bprobes up there with kprobes and djprobes in this evaluation because I think that the above mind-experiment about how Ingo could do a simple fixup without imposing the requirement for a patch to the kernel shipped by RedHat works in the same way. IOW, what works for SystemTap scripts in this regards, works just fine for bprobes. I'll take Ingo's positive comments on Martin's idea as an indication that my conclusion here is correct. And indeed Martin himself would like to have this in order to avoid having to do with SystemTap scripts altogether. How closely does the fixup environment mimic normal kernel code hacking (this would be Martin's point-of-view): 1- direct calls / bprobes 2- kprobes / full djprobes (in the form of SystemTap of course) Ability to insert probe points on non-previously marked up regions (I think everybody would agree on this one): 1- kprobes / full djprobes 2- bprobes (even if the given event we're looking for isn't specifically marked up, if the encompassing function is, then we can still get our way.) 3- direct calls (of course this sucks, you've got to recompile, and that's why it comes strong third and would likely move down the list if other mechanisms were found that didn't require rebuilding.) And there may be other criteria which I missed, but I think the important ones are there. Now, if you tally this whole thing up, I think you would get that bprobes is the single thing that would make everyone content. Really, it does. It's very fast during active tracing, there is no correlation between the number of markers in a function and the cost on that function when no tracing is conducted, it should be relatively easy to implement (all right, I'm using the word "easy" quite liberally here), it's easy to fixup for distro kernel maintainers, it provides a work environment which kernel developers are already very familiar with (the kernel source code), etc. Now I've never been a fan of binary gunk work. But, gut feelings running rampant here, I was kind of naturally attracted to Martin's idea. The above more rational analysis seems to support such intuition. For all these reasons, I think bprobes warrants some very serious investigation. If made to work, and I have yet to see any reason why it shouldn't, I think it would make most everybody happy -- at least that's what I can make up of all this. Mathieu, in fact, if you look at it from the LTT perspective, this is just fine, it provides the events without perturbing the system. And for Ingo, he gets to be able to do very much what he would have been able to do with SystemTap for fixing up events without having to go fix some userspace analysis tool. And with regards to the fate of embedded systems, which trust me I know everything about, then I'm sure Ingo won't object to your markers generating direct static calls if that's tied to CONFIG_EMBEDDED. And if you talk to him on a sunny day, he might just agree that direct static calls would just be fine on m68k too ;) And if, and only if, bprobes can't be made to work, then you might just have to keep that static direct call outside the tree until you can prove to Ingo that on the long run you'll be able to manage keeping LTT in-sync with kernel changes -- or unless Ingo's point of view is ignored by everybody upstairs, which I have my doubts about. Which, of course, wouldn't preclude RedHat shipping LTT so long as it depended on a mechanism which allowed Ingo to fix up events for it and not have to delve into fixing the LTT analysis stack for RedHat customers to be happy. Hope this helps dissipate some confusion and focus attention where progress would help achieving a more universal consensus, Karim -- President / Opersys Inc. Embedded Linux Training and Expertise www.opersys.com / 1.866.677.4546 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 21:42 ` Mathieu Desnoyers 2006-09-21 21:49 ` Mathieu Desnoyers 2006-09-22 6:29 ` Karim Yaghmour @ 2006-09-22 6:49 ` Ingo Molnar 2006-09-22 14:03 ` Mathieu Desnoyers 2006-09-22 7:07 ` Ingo Molnar 2006-09-22 14:31 ` Christoph Hellwig 4 siblings, 1 reply; 27+ messages in thread From: Ingo Molnar @ 2006-09-22 6:49 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Mathieu Desnoyers <compudj@krystal.dyndns.org> wrote: > > "As an example, LTTng traces the page fault handler, when kprobes > > just can't instrument it." > > > > but tracing a raw pagefault at the arch level is a bad idea anyway, > > we want to trace __handle_mm_fault(). That way you can avoid having > > to modify every architecture's pagefault handler ... > > Then you lose the ability to trace in-kernel minor page faults. that's wrong, minor pagefaults go through __handle_mm_fault() just as much. Ingo ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 6:49 ` Ingo Molnar @ 2006-09-22 14:03 ` Mathieu Desnoyers 2006-09-22 16:53 ` Ingo Molnar 0 siblings, 1 reply; 27+ messages in thread From: Mathieu Desnoyers @ 2006-09-22 14:03 UTC (permalink / raw) To: Ingo Molnar Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Ingo Molnar (mingo@elte.hu) wrote: > > * Mathieu Desnoyers <compudj@krystal.dyndns.org> wrote: > > > > "As an example, LTTng traces the page fault handler, when kprobes > > > just can't instrument it." > > > > > > but tracing a raw pagefault at the arch level is a bad idea anyway, > > > we want to trace __handle_mm_fault(). That way you can avoid having > > > to modify every architecture's pagefault handler ... > > > > Then you lose the ability to trace in-kernel minor page faults. > > that's wrong, minor pagefaults go through __handle_mm_fault() just as > much. > Hi Ingo, On a 2.6.17 kernel tree : In do_page_fault() if (unlikely(address >= TASK_SIZE)) { if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) return; In vmalloc_fault() static inline int vmalloc_fault(unsigned long address) { unsigned long pgd_paddr; pmd_t *pmd_k; pte_t *pte_k; /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Do _not_ use "current" here. We might be inside * an interrupt in the middle of a task switch.. */ pgd_paddr = read_cr3(); pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; return 0; } It seems like a shortcut path that will never call __handle_mm_fault. This path is precisely used to handle vmalloc faults. Mathieu OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 14:03 ` Mathieu Desnoyers @ 2006-09-22 16:53 ` Ingo Molnar 2006-09-22 17:11 ` Mathieu Desnoyers 0 siblings, 1 reply; 27+ messages in thread From: Ingo Molnar @ 2006-09-22 16:53 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Mathieu Desnoyers <compudj@krystal.dyndns.org> wrote: > > > Then you lose the ability to trace in-kernel minor page faults. > > > > that's wrong, minor pagefaults go through __handle_mm_fault() just as > > much. > > > > Hi Ingo, > > On a 2.6.17 kernel tree : > It seems like a shortcut path that will never call __handle_mm_fault. > This path is precisely used to handle vmalloc faults. yes, but you said "minor fault", not "vmalloc fault". minor faults are the things that happen when a task does read-after-COW or read-mmap-ed-pagecache-page, and they very much go through __handle_mm_fault(). vmalloc faults are extremely rare, x86-specific and they are a pure kernel-internal matter. (I'd never want to trace them, especially if it pushes tracepoints into every architecture's page fault handler. I implemented the initial version of them IIRC, but my memory fails precisely why. I think it was 4:4 related, but i'm unsure.) (i now realize that above you said "in-kernel minor faults" - under that you meant vmalloc faults?) Ingo ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 16:53 ` Ingo Molnar @ 2006-09-22 17:11 ` Mathieu Desnoyers 2006-09-22 17:12 ` Ingo Molnar 0 siblings, 1 reply; 27+ messages in thread From: Mathieu Desnoyers @ 2006-09-22 17:11 UTC (permalink / raw) To: Ingo Molnar Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Ingo Molnar (mingo@elte.hu) wrote: > > * Mathieu Desnoyers <compudj@krystal.dyndns.org> wrote: > > > > > Then you lose the ability to trace in-kernel minor page faults. > > > > > > that's wrong, minor pagefaults go through __handle_mm_fault() just as > > > much. > > > > > > > Hi Ingo, > > > > On a 2.6.17 kernel tree : > > > It seems like a shortcut path that will never call __handle_mm_fault. > > This path is precisely used to handle vmalloc faults. > > yes, but you said "minor fault", not "vmalloc fault". > > minor faults are the things that happen when a task does read-after-COW > or read-mmap-ed-pagecache-page, and they very much go through > __handle_mm_fault(). > > vmalloc faults are extremely rare, x86-specific and they are a pure > kernel-internal matter. (I'd never want to trace them, especially if it > pushes tracepoints into every architecture's page fault handler. I > implemented the initial version of them IIRC, but my memory fails > precisely why. I think it was 4:4 related, but i'm unsure.) > > (i now realize that above you said "in-kernel minor faults" - under that > you meant vmalloc faults?) > Yes, sorry, my mistake. This kind of fault is not as infrequent as you may think, as every newly allocated vmalloc region will cause vmalloc faults on every processes on the system that are trying to access them. I agree that it should not be a standard event people would be interested in. Regards, Mathieu OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 17:11 ` Mathieu Desnoyers @ 2006-09-22 17:12 ` Ingo Molnar 2006-09-22 17:28 ` Mathieu Desnoyers 0 siblings, 1 reply; 27+ messages in thread From: Ingo Molnar @ 2006-09-22 17:12 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Mathieu Desnoyers <compudj@krystal.dyndns.org> wrote: > * Ingo Molnar (mingo@elte.hu) wrote: > > > > * Mathieu Desnoyers <compudj@krystal.dyndns.org> wrote: > > > > > > > Then you lose the ability to trace in-kernel minor page faults. > > > > > > > > that's wrong, minor pagefaults go through __handle_mm_fault() just as > > > > much. > > > > > > > > > > Hi Ingo, > > > > > > On a 2.6.17 kernel tree : > > > > > It seems like a shortcut path that will never call __handle_mm_fault. > > > This path is precisely used to handle vmalloc faults. > > > > yes, but you said "minor fault", not "vmalloc fault". > > > > minor faults are the things that happen when a task does read-after-COW > > or read-mmap-ed-pagecache-page, and they very much go through > > __handle_mm_fault(). > > > > vmalloc faults are extremely rare, x86-specific and they are a pure > > kernel-internal matter. (I'd never want to trace them, especially if it > > pushes tracepoints into every architecture's page fault handler. I > > implemented the initial version of them IIRC, but my memory fails > > precisely why. I think it was 4:4 related, but i'm unsure.) > > > > (i now realize that above you said "in-kernel minor faults" - under that > > you meant vmalloc faults?) > > > > Yes, sorry, my mistake. This kind of fault is not as infrequent as you > may think, as every newly allocated vmalloc region will cause vmalloc > faults on every processes on the system that are trying to access > them. I agree that it should not be a standard event people would be > interested in. most of the vmalloc area that is allocated on a typical system are modules - and they get loaded on bootup and rarely unloaded. Even for other vmalloc-ed areas like netfilter, the activation of them is during bootup. So from that point on the number of vmalloc faults is quite low. (zero on most systems) If you still want to trace it i'd suggest a separate type of event for it. (meanwhile i remember why i implemented vmalloc faults to begin with: during vmalloc() we used to have a for_each_process() over all kernel-pagetables of tasks to fix up their pagetables. This caused both high latencies and overhead back in the days when we still were frequent vmalloc()ers.) Ingo ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 17:12 ` Ingo Molnar @ 2006-09-22 17:28 ` Mathieu Desnoyers 0 siblings, 0 replies; 27+ messages in thread From: Mathieu Desnoyers @ 2006-09-22 17:28 UTC (permalink / raw) To: Ingo Molnar Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Ingo Molnar (mingo@elte.hu) wrote: > > > > Yes, sorry, my mistake. This kind of fault is not as infrequent as you > > may think, as every newly allocated vmalloc region will cause vmalloc > > faults on every processes on the system that are trying to access > > them. I agree that it should not be a standard event people would be > > interested in. > > most of the vmalloc area that is allocated on a typical system are > modules - and they get loaded on bootup and rarely unloaded. Even for > other vmalloc-ed areas like netfilter, the activation of them is during > bootup. So from that point on the number of vmalloc faults is quite low. > (zero on most systems) If you still want to trace it i'd suggest a > separate type of event for it. Yes, with this typical vmalloc usage pattern in perspective, we completely agree. We also agree on having a separate kind of event for this, as it requires the tracer code to be vmalloc-fault-reentrant (very tricky). Mathieu OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 21:42 ` Mathieu Desnoyers ` (2 preceding siblings ...) 2006-09-22 6:49 ` Ingo Molnar @ 2006-09-22 7:07 ` Ingo Molnar 2006-09-22 8:14 ` Karim Yaghmour 2006-09-22 15:08 ` Mathieu Desnoyers 2006-09-22 14:31 ` Christoph Hellwig 4 siblings, 2 replies; 27+ messages in thread From: Ingo Molnar @ 2006-09-22 7:07 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Mathieu Desnoyers <compudj@krystal.dyndns.org> wrote: > I clearly expressed my position in the previous emails, so did you. > You argued about a use of tracing that is not relevant to my vision of > reality, which is : > > - Embedded systems developers won't want a breakpoint-based probe are you arguing that i'm trying to force breakpoint-based probing on you? I dont. In fact i explicitly mentioned that i'd accept and support a 5-byte NOP in the body of the marker, in the following mails: "just go for [...] the 5-NOP variant" http://marc.theaimsgroup.com/?l=linux-kernel&m=115859771924187&w=2 (my reply to your second proposal) "or at most one NOP" http://marc.theaimsgroup.com/?l=linux-kernel&m=115865412332230&w=2 (my reply to your third proposal) "at most a NOP inserted" http://marc.theaimsgroup.com/?l=linux-kernel&m=115886524224874&w=2 (my reply to your fifth proposal) That enables the probe to be turned into a function call - not an INT3 breakpoint. Does it take some effort to implement that on your part? Yes, of course, but getting code upstream is never easy, /especially/ in cases where most of the users wont use a particular feature. > - High performance computing users won't want a breakpoint-based probe I am not forcing breakpoint-based probing, at all. I dont want _static, build-time function call based_ probing, and there is a big difference. And one reason why i want to avoid "static, build-time function call based probing" is because high-performance computing users dont want any overhead at all in the kernel fastpath. > - djprobe is far away from being in an acceptable state on > architectures with very inconvenient erratas (x86). djprobes over a NOP marker are perfectly usable and safe: just add a simple constraint to them to only allow a djprobes insertion if it replaces a 5-byte NOP. > - kprobe and djprobe cannot access local variables in every cases it is possible with the marker mechanism i outlined before: http://marc.theaimsgroup.com/?l=linux-kernel&m=115886524224874&w=2 have i missed to address any concern of yours? Ingo ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 7:07 ` Ingo Molnar @ 2006-09-22 8:14 ` Karim Yaghmour 2006-09-22 15:08 ` Mathieu Desnoyers 1 sibling, 0 replies; 27+ messages in thread From: Karim Yaghmour @ 2006-09-22 8:14 UTC (permalink / raw) To: Ingo Molnar Cc: Mathieu Desnoyers, Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox Ingo Molnar wrote: > are you arguing that i'm trying to force breakpoint-based probing on > you? I dont. In fact i explicitly mentioned that i'd accept and support > a 5-byte NOP in the body of the marker, in the following mails: Actually that won't work if the kernel runs directly from rom/flash simply because it's not possible to do any form of binary editing on the image, as is possible in the more common desktop or workstation case where the kernel runs from ram. Karim -- President / Opersys Inc. Embedded Linux Training and Expertise www.opersys.com / 1.866.677.4546 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 7:07 ` Ingo Molnar 2006-09-22 8:14 ` Karim Yaghmour @ 2006-09-22 15:08 ` Mathieu Desnoyers 2006-09-22 16:24 ` Karim Yaghmour 2006-09-22 16:45 ` Ingo Molnar 1 sibling, 2 replies; 27+ messages in thread From: Mathieu Desnoyers @ 2006-09-22 15:08 UTC (permalink / raw) To: Ingo Molnar Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox Good morning Ingo, * Ingo Molnar (mingo@elte.hu) wrote: > > * Mathieu Desnoyers <compudj@krystal.dyndns.org> wrote: > > > I clearly expressed my position in the previous emails, so did you. > > You argued about a use of tracing that is not relevant to my vision of > > reality, which is : > > > > - Embedded systems developers won't want a breakpoint-based probe > > are you arguing that i'm trying to force breakpoint-based probing on > you? I dont. In fact i explicitly mentioned that i'd accept and support > a 5-byte NOP in the body of the marker, in the following mails: > > "just go for [...] the 5-NOP variant" > http://marc.theaimsgroup.com/?l=linux-kernel&m=115859771924187&w=2 > (my reply to your second proposal) > > "or at most one NOP" > http://marc.theaimsgroup.com/?l=linux-kernel&m=115865412332230&w=2 > (my reply to your third proposal) > > "at most a NOP inserted" > http://marc.theaimsgroup.com/?l=linux-kernel&m=115886524224874&w=2 > (my reply to your fifth proposal) > > That enables the probe to be turned into a function call - not an INT3 > breakpoint. Does it take some effort to implement that on your part? > Yes, of course, but getting code upstream is never easy, /especially/ in > cases where most of the users wont use a particular feature. > Some details are worth to be mentioned : - The 5-NOP variant will imply a replacement of 5 1 bytes instructions with 1 5 bytes one, which is trickier. Masami Hiramatsu's proposal of 2 bytes near jump + 3 NOPS is nicer. - Patching such a 5-bytes instruction memory region doesn't turn markers into a complete function call, which includes argument passing. - The argument "most of the users wont use a particular feature" contradicts what you said earlier about every distribution wanting to enable a tracing mechanism for their users. > > - High performance computing users won't want a breakpoint-based probe > > I am not forcing breakpoint-based probing, at all. I dont want _static, > build-time function call based_ probing, and there is a big difference. > And one reason why i want to avoid "static, build-time function call > based probing" is because high-performance computing users dont want any > overhead at all in the kernel fastpath. > I think that the performance benefits gained by using tracing information for studying a system makes the overhead of a jump in the kernel fast path insignificant. Having a stack setup + function call already put there by the compiler has the following advantages : - It is very robust (I could think of using it on a live server, which is not true of the djprobe approach). - It is predictable on every architecture. - The information extracted is _always_ coherent with the marked variables, because the compiler itself created the full function call (stack setup included). > > - djprobe is far away from being in an acceptable state on > > architectures with very inconvenient erratas (x86). > > djprobes over a NOP marker are perfectly usable and safe: just add a > simple constraint to them to only allow a djprobes insertion if it > replaces a 5-byte NOP. > 2 bytes jump + 3 bytes nops.. Yes, it should modify it without causing an illegal instruction, but how ? Are you aware that their approach has to : - put an int3 - wait for _all_ the CPUs to execute this int3 - then change the 5 bytes instruction I can think of a lot of cases where the CPUs will never execute this int3. Probably that sending an IPI or launching a kernel thread on each CPU to make sure that this int3 is executed could give more guarantees there. But my point is not even there : I have seen very skillful teams work hard on those hardware-caused problems for years and the result is still not usable. It looks to me like a race between software developers and hardware manufacturers, where the software guy is always one step behind. This kind of scenario happens when you want to use an architecture in a way it was not designed and tested for. As long as CPU manufacturers won't design for live instruction patching (and why should they do that ? the in3 breakpoint is all what is needed from their perspective), this will be a race where software developers will lose. > > - kprobe and djprobe cannot access local variables in every cases > > it is possible with the marker mechanism i outlined before: > > http://marc.theaimsgroup.com/?l=linux-kernel&m=115886524224874&w=2 > > have i missed to address any concern of yours? > Interesting idea. That would make it possible to probe local variables at the marker site. That's very good for use of kprobes on low rate debug-type markers, but that doesn't solve my concern about the cat-and-mouse race expressed earlier about live kernel polymorphic code. I would be all in for this kind of combo : If you can find a way to make a kprobe-based probe extract the variables from such a variable-dependency marked site, that would be great for dynamic of low event rate code paths. For the high event rate, and while we wait for such a probe to exist, I think that the load+jump over a complete call is the lowest cost, most robust, coherent, predictable and portable mechanism I have seen so far. Mathieu OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 15:08 ` Mathieu Desnoyers @ 2006-09-22 16:24 ` Karim Yaghmour 2006-09-22 16:13 ` Mathieu Desnoyers 2006-09-22 16:45 ` Ingo Molnar 1 sibling, 1 reply; 27+ messages in thread From: Karim Yaghmour @ 2006-09-22 16:24 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Ingo Molnar, Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox Funny, I never thought I'd be defending djprobes ... Mathieu Desnoyers wrote: > 2 bytes jump + 3 bytes nops.. Yes, it should modify it without causing an > illegal instruction, but how ? Are you aware that their approach has to : > - put an int3 > - wait for _all_ the CPUs to execute this int3 > - then change the 5 bytes instruction > > I can think of a lot of cases where the CPUs will never execute this int3. > Probably that sending an IPI or launching a kernel thread on each CPU to make > sure that this int3 is executed could give more guarantees there. But my point > is not even there : I have seen very skillful teams work hard on those > hardware-caused problems for years and the result is still not usable. It looks > to me like a race between software developers and hardware manufacturers, where > the software guy is always one step behind. This kind of scenario happens when > you want to use an architecture in a way it was not designed and tested for. > > As long as CPU manufacturers won't design for live instruction patching (and why > should they do that ? the in3 breakpoint is all what is needed from their > perspective), this will be a race where software developers will lose. I'm with you all the way if we're talking about patching instructions which are less than 5 bytes. And I must fault djprobes backers for their insistence on trying to get it to work for all possible instruction lengths. But in the specific case discussed between Hiramatsu-san and myself (5 byte short jmp + nops) I have no reason to believe it doesn't work. Continuing to try to get it to work on any instruction length can be argued to be a waste of time, but not what has been talked of of late. With regards to all CPUs executing the int3, here's a rather savage, but effective way of making this work: - launch one thread per cpu (as you explained) - have each thread make a direct jump to the location of the int3 - catch the int3 and kill active thread if this is a forced jump This is deterministic. So if your proposal is to amend the markup to use the short-jmp+nops at every marker site instead of my earlier suggestion for the bprobes thing, I'm all with you. And I agree, 5 NOPs is *not* the right thing. 1 short jmp + 3 nops, this works. Karim -- President / Opersys Inc. Embedded Linux Training and Expertise www.opersys.com / 1.866.677.4546 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 16:24 ` Karim Yaghmour @ 2006-09-22 16:13 ` Mathieu Desnoyers 2006-09-22 17:03 ` Karim Yaghmour 0 siblings, 1 reply; 27+ messages in thread From: Mathieu Desnoyers @ 2006-09-22 16:13 UTC (permalink / raw) To: Karim Yaghmour Cc: Ingo Molnar, Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Karim Yaghmour (karim@opersys.com) wrote: > So if your proposal is to amend the markup to use the short-jmp+nops > at every marker site instead of my earlier suggestion for the bprobes > thing, I'm all with you. > First of all, I think that specific architecture-specific optimisations can and should be integrated in a more generic portable framework. Hrm, your comment makes me think of an interesting idea : .align jump_address: near jump to end setup_stack_address: setup stack call empty function end: So, instead of putting nops in the target area, we fill it with a useful function call. Near jump being 2 bytes, it might be much easier to modify. If necessary, making sure the instruction is aligned would help to change it atomically. If we mark the jump address, the setup stack address and the end tag address with symbols, we can easily calculate (portably) the offset of the near jump to activate either the setup_stack_address or end tags. Mathieu OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 16:13 ` Mathieu Desnoyers @ 2006-09-22 17:03 ` Karim Yaghmour 2006-09-22 18:06 ` Mathieu Desnoyers 0 siblings, 1 reply; 27+ messages in thread From: Karim Yaghmour @ 2006-09-22 17:03 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Ingo Molnar, Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox Mathieu Desnoyers wrote: > First of all, I think that specific architecture-specific optimisations can and > should be integrated in a more generic portable framework. No disagreement there. If Ingo would care to comment, I think it might be an acceptable compromise to have x86 fully use kprobes/djprobes immediately, and the other archs could walk there at their rate. Practically, some stuff in include/asm-i386/markers.h and include/asm-x86_64/markers.h would contain the binary modifiable stuff and include/asm-generic/markers.h could contain a platform-independent fallback. > Hrm, your comment makes me think of an interesting idea : > > .align > jump_address: > near jump to end > setup_stack_address: > setup stack > call empty function > end: > > So, instead of putting nops in the target area, we fill it with a useful > function call. Near jump being 2 bytes, it might be much easier to modify. > If necessary, making sure the instruction is aligned would help to change it > atomically. If we mark the jump address, the setup stack address and the end > tag address with symbols, we can easily calculate (portably) the offset of the > near jump to activate either the setup_stack_address or end tags. That's another possibility. It seems more C friendly than the simple short-jump+3bytes. Ingo? Karim -- President / Opersys Inc. Embedded Linux Training and Expertise www.opersys.com / 1.866.677.4546 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 17:03 ` Karim Yaghmour @ 2006-09-22 18:06 ` Mathieu Desnoyers 2006-09-22 19:24 ` Karim Yaghmour 0 siblings, 1 reply; 27+ messages in thread From: Mathieu Desnoyers @ 2006-09-22 18:06 UTC (permalink / raw) To: Karim Yaghmour Cc: Ingo Molnar, Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Karim Yaghmour (karim@opersys.com) wrote: > > > Hrm, your comment makes me think of an interesting idea : > > > > .align > > jump_address: > > near jump to end > > setup_stack_address: > > setup stack > > call empty function > > end: > > > > So, instead of putting nops in the target area, we fill it with a useful > > function call. Near jump being 2 bytes, it might be much easier to modify. > > If necessary, making sure the instruction is aligned would help to change it > > atomically. If we mark the jump address, the setup stack address and the end > > tag address with symbols, we can easily calculate (portably) the offset of the > > near jump to activate either the setup_stack_address or end tags. > > That's another possibility. It seems more C friendly than the simple > short-jump+3bytes. > Here is the implementation :-) Comments are welcome. /***************************************************************************** * marker.h * * Code markup for dynamic and static tracing. i386 architecture optimisations. * * (C) Copyright 2006 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> * * This file is released under the GPLv2. * See the file COPYING for more details. */ #define MARK_NEAR_JUMP_PREFIX "__mark_near_jump_" #define MARK_NEAR_JUMP_SELECT_PREFIX "__mark_near_jump_select_" /* Note : max 256 bytes between over_label and near_jump */ #define MARK_DO_JUMP(name) \ do { \ __label__ near_jump; \ volatile static void *__mark_near_jump_##name \ asm (MARK_NEAR_JUMP_PREFIX#name) \ __attribute__((unused)) = &&near_jump; \ volatile static void *__mark_near_jump_select_##name \ asm (MARK_NEAR_JUMP_SELECT_PREFIX#name) \ __attribute__((unused)) = &&near_jump_select; \ asm volatile ( ".align 16;\n\t" : : ); \ asm volatile ( ".byte 0xeb;\n\t" : : ); \ near_jump_select: \ asm volatile ( ".byte %0-%1;\n\t" : : \ "m" (*&&over_label), "m" (*&&near_jump)); \ near_jump: \ asm volatile ( "" : : ); \ } while(0) To change it, we can dynamically overwrite the __mark_near_jump_select_##name value (a byte) to (__mark_jump_call_##name - __mark_near_jump_##name). So we have one architecture specific optimisation within the architecture agnostic marking mechanism. Comments are welcome. Mathieu OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 18:06 ` Mathieu Desnoyers @ 2006-09-22 19:24 ` Karim Yaghmour 0 siblings, 0 replies; 27+ messages in thread From: Karim Yaghmour @ 2006-09-22 19:24 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Ingo Molnar, Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox Mathieu Desnoyers wrote: > Here is the implementation :-) Trigger happy :) > To change it, we can dynamically overwrite the __mark_near_jump_select_##name > value (a byte) to (__mark_jump_call_##name - __mark_near_jump_##name). Hmm... I don't know if you won't still need to resort to int3 and then overwrite the byte. From my understanding it sounds like you wouldn't but that's where Richard's insight on the errata stuff might come in handy. Have you actually tried to run this code by any chance? *If* it did work, I think this might just mean that you don't need either kprobes or djprobes. > So we have one architecture specific optimisation within the architecture > agnostic marking mechanism. That would seem reasonable, I think. You might want to test out your mechanism, get confirmation from Richard and then post an update to your patches. Karim ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-22 15:08 ` Mathieu Desnoyers 2006-09-22 16:24 ` Karim Yaghmour @ 2006-09-22 16:45 ` Ingo Molnar 1 sibling, 0 replies; 27+ messages in thread From: Ingo Molnar @ 2006-09-22 16:45 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Mathieu Desnoyers <compudj@krystal.dyndns.org> wrote: > Some details are worth to be mentioned : > > - The 5-NOP variant will imply a replacement of 5 1 bytes instructions > with 1 5 bytes one, which is trickier. Masami Hiramatsu's proposal > of 2 bytes near jump + 3 NOPS is nicer. yeah. > - Patching such a 5-bytes instruction memory region doesn't turn > markers into a complete function call, which includes argument > passing. please read: Message-ID: <20060921185029.GB12048@elte.hu> about how to turn it into a complete function call / jump. [ i came back to this point from the end of the mail, where you consider this an "interesting idea" - so i guess your above claim is moot. If it is not moot not then by all means i'm happy to further debate it. ] > > And one reason why i want to avoid "static, build-time function call > > based probing" is because high-performance computing users dont want > > any overhead at all in the kernel fastpath. > > - The argument "most of the users wont use a particular feature" > contradicts what you said earlier about every distribution wanting > to enable a tracing mechanism for their users. enabling a feature does not mean it's actually used by most users! For example, Fedora currently enables: CONFIG_NETPOLL=y but only a tiny, tiny minority of users actually make use of it. Then why is it still enabled? Because it has little to zero overhead to have it enabled but not utilized. In the fastpath it has zero overhead. (and yes, i originally designed and implemented netpoll/netconsole to be like that, and i intentionally shaped it to be zero-overhead because i knew it would be used as a debug feature.) the same goes for tracing. We want to have it enabled in distros, but only if it's near zero-cost. But because tracing wants to be there in every fastpath, we have to work /hard/ to achieve this desired end-result. I am "in your way" unfortunately /precisely/ because tracing we want to enable in Fedora, but we want to pay as little cycles for it on 99.998% of the Fedora boxes that wont be using any tracing at all. > > > - High performance computing users won't want a breakpoint-based probe > > > > I am not forcing breakpoint-based probing, at all. I dont want _static, > > build-time function call based_ probing, and there is a big difference. > > And one reason why i want to avoid "static, build-time function call > > based probing" is because high-performance computing users dont want any > > overhead at all in the kernel fastpath. > > > > I think that the performance benefits gained by using tracing > information for studying a system makes the overhead of a jump in the > kernel fast path insignificant. [...] sorry, but as Alan mentioned it before, this leads to "death by a million cuts". See above to understand what kind of feature is acceptable to a distro (and hence to the upstream kernel) and what not. > > > - djprobe is far away from being in an acceptable state on > > > architectures with very inconvenient erratas (x86). > > > > djprobes over a NOP marker are perfectly usable and safe: just add a > > simple constraint to them to only allow a djprobes insertion if it > > replaces a 5-byte NOP. > > > > 2 bytes jump + 3 bytes nops.. Yes, it should modify it without causing an > illegal instruction, but how ? Are you aware that their approach has to : > - put an int3 > - wait for _all_ the CPUs to execute this int3 > - then change the 5 bytes instruction > > I can think of a lot of cases where the CPUs will never execute this > int3. Probably that sending an IPI or launching a kernel thread on > each CPU to make sure that this int3 is executed could give more > guarantees there. [...] this is easy to solve, for example via the use of freeze_processes() and thaw_processes(). > > > - kprobe and djprobe cannot access local variables in every cases > > > > it is possible with the marker mechanism i outlined before: > > > > http://marc.theaimsgroup.com/?l=linux-kernel&m=115886524224874&w=2 > > > > have i missed to address any concern of yours? > > Interesting idea. That would make it possible to probe local variables > at the marker site. That's very good for use of kprobes on low rate > debug-type markers, but that doesn't solve my concern about the > cat-and-mouse race expressed earlier about live kernel polymorphic > code. i dont consider polymorphic code a problem at all. Java JITs (and the dot-NET runtime) are faced with it every day and they have no problem _dynamically rewriting high-performance code all the time_. In fact, in the kernel we have /more/ tools to solve such problems. We can disable interrupts, we can flush caches, we can send IRQs, we have total control over every task in the system, etc., etc. Furthermore, it is even more of a side issue in our case because unlike Java JITs, for us the polymorphic code is in the /absolute slowpath/. We could even turn off all caches while doing the code patching and nobody would notice. (but such drastic measures are not needed at all) Ingo ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 21:42 ` Mathieu Desnoyers ` (3 preceding siblings ...) 2006-09-22 7:07 ` Ingo Molnar @ 2006-09-22 14:31 ` Christoph Hellwig 4 siblings, 0 replies; 27+ messages in thread From: Christoph Hellwig @ 2006-09-22 14:31 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Ingo Molnar, Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox I hate AOL-style me-toos, but there's nothing to add to this mail. Thanks for this coherent writeup Mathieu. On Thu, Sep 21, 2006 at 05:42:48PM -0400, Mathieu Desnoyers wrote: > Hi Ingo, > > I clearly expressed my position in the previous emails, so did you. You argued > about a use of tracing that is not relevant to my vision of reality, which is : > > - Embedded systems developers won't want a breakpoint-based probe > - High performance computing users won't want a breakpoint-based probe > - djprobe is far away from being in an acceptable state on architectures with > very inconvenient erratas (x86). > - kprobe and djprobe cannot access local variables in every cases > > For those reasons, I prefer a jump-over-call approach which lets gcc give us the > local variables. No need of DWARF or SystemTAP macro Kung Fu. Just C and a > loadable module. > > By no means is it a replacement for a completely dynamic breakpoint-based > instrumentation mechanism. I really think that both mechanism should coexist. > > This is my position : I let the distribution/user decide what is appropriate for > their use. My goal is to provide them a flexible mechanism that takes the > multiple variety of uses in account without performance impact if they are not > willing to pay it to benefit from tracing. > > With all due respect, yes, there are Linux users different from the typical > Redhat client. If your vision is still limited to this scope after a 500 > emails debate, I am afraid that there is very little I can do about it in > one more. ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 16:06 ` Ingo Molnar 2006-09-21 21:42 ` Mathieu Desnoyers @ 2006-09-23 16:51 ` Mathieu Desnoyers 1 sibling, 0 replies; 27+ messages in thread From: Mathieu Desnoyers @ 2006-09-23 16:51 UTC (permalink / raw) To: Ingo Molnar Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Ingo Molnar (mingo@elte.hu) wrote: > but tracing a raw pagefault at the arch level is a bad idea anyway, we > want to trace __handle_mm_fault(). That way you can avoid having to > modify every architecture's pagefault handler ... > The problem with __handle_mm_fault() is that the struct pt_regs * is not passed to this function. An event containing both the address where the fault occurs and the instruction pointer that caused the fault is very useful. Mathieu OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 16:00 [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) Mathieu Desnoyers 2006-09-21 16:06 ` Ingo Molnar @ 2006-09-21 17:56 ` Frank Ch. Eigler 2006-09-21 18:50 ` Ingo Molnar 2006-09-21 20:59 ` Mathieu Desnoyers 1 sibling, 2 replies; 27+ messages in thread From: Frank Ch. Eigler @ 2006-09-21 17:56 UTC (permalink / raw) To: Mathieu Desnoyers Cc: Martin Bligh, Frank Ch. Eigler, Masami Hiramatsu, prasanna, Andrew Morton, Ingo Molnar, Mathieu Desnoyers, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox [-- Attachment #1: Type: text/plain, Size: 3079 bytes --] Hi - > Yet, again, a new version. I integrated a full probe management > mechanism. [...] Thank you for continuing on. > +#define MARK_SYM(name) \ > + here: asm volatile \ > + (MARK_KPROBE_PREFIX#name " = %0" : : "m" (*&&here)); \ Regarding MARK_SYM, if I read Ingo's message correctly, this is the only type of marker he believes is necessary, since it would put a place for kprobes to put a breakpoint. FWIW, this still appears to be applicable only if the int3 overheads are tolerable, and if parameter data extraction is unnecessary or sufficiently robust "by accident". Regarding: > +#define MARK_JUMP(name, format, args...) \ > [...] All this leap/jump/branch elaboration may be based upon scanty benchmarks. The plain conditional-indirect function call is already "fast", especially if its hosting compilation unit is compiled with -freorder-blocks. Direct jumps to instrumentation are unlikely to be of a great deal of use, in that there would have to be some assembly-level code in between to save/restore affected registers. Remember, ultimately the handler will be written in C. Regarding use of an empty_function() sentinel instead of NULLs, it is worth measuring carefully whether a unconditional indirect call to such a dummy function is faster than a conditional indirect call. It may have to be a per-architecture internal implementation option. Regarding varargs, I still believe that varargs have poorer type-safety. Remember, it's not just type-safety at the marker site (which gcc's printf-format-checker could validate), but the handler's too. I believe it is incorrect that a non-varargs function can always safely take a call from a varargs call - varargs changes the calling conventions. This would mean that the handler would itself have to be varargs, with all the attendant run-time overheads. (Plus the idea of using a build-time script to generate the non-verargs handlers from analyzing particular MARK() calls is itself probably a bit complex for its own good.) Regarding measurements in general, one must avoid being misled by microbenchmarks such as those that represent an extreme of caching friendliness. It may be necessary to run large system-level tests to meaningfully compare alternatives. > +int marker_set_probe(const char *name, void (*probe)(const char *fmt, ...), > + enum marker_type type); > +void marker_disable_probe(const char *name, void (*probe)(const char *fmt, ...), > + enum marker_type type); I'm unclear about what a merker_type value represents. Could you go over that again? > +static int marker_get_pointers(const char *name, > + [...] > + ptrs->call = (void**)kallsyms_lookup_name(call_sym); > + ptrs->jmpselect = (void**)kallsyms_lookup_name(jmpselect_sym); > + ptrs->jmpcall = (void**)kallsyms_lookup_name(jmpcall_sym); > + ptrs->jmpinline = (void**)kallsyms_lookup_name(jmpinline_sym); > + ptrs->jmpover = (void**)kallsyms_lookup_name(jmpover_sym); > [...] I wonder whether, as for kprobes, it will be necessary to lock into memory a module containing an active marker. - FChE [-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --] ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 17:56 ` Frank Ch. Eigler @ 2006-09-21 18:50 ` Ingo Molnar 2006-09-21 19:54 ` Jeremy Fitzhardinge 2006-09-25 17:45 ` Frank Ch. Eigler 2006-09-21 20:59 ` Mathieu Desnoyers 1 sibling, 2 replies; 27+ messages in thread From: Ingo Molnar @ 2006-09-21 18:50 UTC (permalink / raw) To: Frank Ch. Eigler Cc: Mathieu Desnoyers, Martin Bligh, Masami Hiramatsu, prasanna, Andrew Morton, Mathieu Desnoyers, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox * Frank Ch. Eigler <fche@redhat.com> wrote: > > +#define MARK_SYM(name) \ > > + here: asm volatile \ > > + (MARK_KPROBE_PREFIX#name " = %0" : : "m" (*&&here)); \ > > Regarding MARK_SYM, if I read Ingo's message correctly, this is the > only type of marker he believes is necessary, since it would put a > place for kprobes to put a breakpoint. FWIW, this still appears to be > applicable only if the int3 overheads are tolerable, and if parameter > data extraction is unnecessary or sufficiently robust "by accident". let me qualify that: parameters must be prepared there too - but no actual function call inserted. (at most a NOP inserted). The register filling doesnt even have to be function-calling-convention compliant - that makes the symbolic probe almost zero-impact to register allocation/scheduling, the only thing it should ensure is that the parameters that are annotated to be available in register, stack or memory _somewhere_. (i.e. not hidden or destroyed at that point by gcc) Does a simple asm() that takes read-only parameters but only adds a NOP achieve this result? Ingo ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 18:50 ` Ingo Molnar @ 2006-09-21 19:54 ` Jeremy Fitzhardinge 2006-09-25 17:45 ` Frank Ch. Eigler 1 sibling, 0 replies; 27+ messages in thread From: Jeremy Fitzhardinge @ 2006-09-21 19:54 UTC (permalink / raw) To: Ingo Molnar Cc: Frank Ch. Eigler, Mathieu Desnoyers, Martin Bligh, Masami Hiramatsu, prasanna, Andrew Morton, Mathieu Desnoyers, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox Ingo Molnar wrote: > let me qualify that: parameters must be prepared there too - but no > actual function call inserted. (at most a NOP inserted). The register > filling doesnt even have to be function-calling-convention compliant - > that makes the symbolic probe almost zero-impact to register > allocation/scheduling, the only thing it should ensure is that the > parameters that are annotated to be available in register, stack or > memory _somewhere_. (i.e. not hidden or destroyed at that point by gcc) > Does a simple asm() that takes read-only parameters but only adds a NOP > achieve this result? Do you mean using the asm to make sure gcc puts a reference to a variable into the DWARF info, or some other way of encoding the value locations? Hm, another problem. If the mark is in a loop, and gcc decides to unroll the loop, then you'll probably only get a mark in one iteration of the loop (or 1/Nth of the iterations). Or worse, assembler errors. The only way I can see to deal with this is to not use symbols, but put records in a special section. That way, if the asm() inserting them gets duplicated, you'll get duplicate records in the marker section. I guess you'd get a similar problem with markers inserted in inlined functions. (How does gdb deal with breakpoints in unrolled loops?) J ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 18:50 ` Ingo Molnar 2006-09-21 19:54 ` Jeremy Fitzhardinge @ 2006-09-25 17:45 ` Frank Ch. Eigler 1 sibling, 0 replies; 27+ messages in thread From: Frank Ch. Eigler @ 2006-09-25 17:45 UTC (permalink / raw) To: Ingo Molnar Cc: Frank Ch. Eigler, Mathieu Desnoyers, Martin Bligh, Masami Hiramatsu, prasanna, Andrew Morton, Mathieu Desnoyers, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox [-- Attachment #1: Type: text/plain, Size: 852 bytes --] Hi - On Thu, Sep 21, 2006 at 08:50:29PM +0200, Ingo Molnar wrote: > [...] let me qualify that: parameters must be prepared there too - > but no actual function call inserted. (at most a NOP > inserted). [...] Does a simple asm() that takes read-only > parameters but only adds a NOP achieve this result? You mean something like this? #define MARK(n,v1,v2,v3) asm ("__mark_" #n ": nop" :: \ "X" (v1), "X" (v2), "X" (v3)) I haven't been able to get gcc to emit any better debuginfo for parameters pseudo-passed like this. (I've tested such a marker inserted into an inner loop of dhrystone. It was compiled with "-ggdb -O3". Neither gdb nor systemtap could resolve the same values/symbols being passed as MARK() arguments, though at least the breakpoint address was nicely marked.) - FChE [-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --] ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) 2006-09-21 17:56 ` Frank Ch. Eigler 2006-09-21 18:50 ` Ingo Molnar @ 2006-09-21 20:59 ` Mathieu Desnoyers 1 sibling, 0 replies; 27+ messages in thread From: Mathieu Desnoyers @ 2006-09-21 20:59 UTC (permalink / raw) To: Frank Ch. Eigler Cc: Martin Bligh, Masami Hiramatsu, prasanna, Andrew Morton, Ingo Molnar, Paul Mundt, linux-kernel, Jes Sorensen, Tom Zanussi, Richard J Moore, Michel Dagenais, Christoph Hellwig, Greg Kroah-Hartman, Thomas Gleixner, William Cohen, ltt-dev, systemtap, Alan Cox [-- Attachment #1: Type: text/plain, Size: 6182 bytes --] * Frank Ch. Eigler (fche@redhat.com) wrote: > Thank you for continuing on. > Thanks for the comments, see below : > > +#define MARK_SYM(name) \ > > + here: asm volatile \ > > + (MARK_KPROBE_PREFIX#name " = %0" : : "m" (*&&here)); \ > > Regarding MARK_SYM, if I read Ingo's message correctly, this is the > only type of marker he believes is necessary, since it would put a > place for kprobes to put a breakpoint. FWIW, this still appears to be > applicable only if the int3 overheads are tolerable, and if parameter > data extraction is unnecessary or sufficiently robust "by accident". > I agree. And I don't see how the int3 approach fills the need of embedded systems and high performance computing developers, so I think that a "call" alternative, where the stack is set up by gcc itself, is worthy. > > Regarding: > > > +#define MARK_JUMP(name, format, args...) \ > > [...] > > All this leap/jump/branch elaboration may be based upon scanty > benchmarks. The plain conditional-indirect function call is already > "fast", especially if its hosting compilation unit is compiled with > -freorder-blocks. > Yes, I just wanted to show that there was some performance merits to this technique, more benchmarks will be needed. The main problem I have seen with your condition+call approach is that there is no safe way to remove the function pointer without causing a huge mess (null pointer dereference). This is why I thought of this alternative that has the abolity of keep a consistent execution flow at _all_ times. > Direct jumps to instrumentation are unlikely to be of a great deal of > use, in that there would have to be some assembly-level code in > between to save/restore affected registers. Remember, ultimately the > handler will be written in C. > Yup, I agree, but it's nice to have the ability to jump over inline functions :) > Regarding use of an empty_function() sentinel instead of NULLs, it is > worth measuring carefully whether a unconditional indirect call to > such a dummy function is faster than a conditional indirect call. It > may have to be a per-architecture internal implementation option. > Yes, but you also have to take in account the stack setup cost. Idoubt that it will be faster than a jump, but this is worthy to test. > Regarding varargs, I still believe that varargs have poorer > type-safety. Remember, it's not just type-safety at the marker site > (which gcc's printf-format-checker could validate), but the handler's > too. I believe it is incorrect that a non-varargs function can always > safely take a call from a varargs call - varargs changes the calling > conventions. This would mean that the handler would itself have to be > varargs, with all the attendant run-time overheads. (Plus the idea of > using a build-time script to generate the non-verargs handlers from > analyzing particular MARK() calls is itself probably a bit complex for > its own good.) > Absolutely, you are right. Good catch! I am putting a asmlinkage prefix to the probe function, which will make sure that every argument is passed on the stack. From http://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html "regparm (number) On the Intel 386, the regparm attribute causes the compiler to pass arguments number one to number if they are of integral type in registers EAX, EDX, and ECX instead of on the stack. Functions that take a variable number of arguments will continue to be passed all of their arguments on the stack. Beware that on some ELF systems this attribute is unsuitable for global functions in shared libraries with lazy binding (which is the default). Lazy binding will send the first call via resolving code in the loader, which might assume EAX, EDX and ECX can be clobbered, as per the standard calling conventions. Solaris 8 is affected by this. GNU systems with GLIBC 2.1 or higher, and FreeBSD, are believed to be safe since the loaders there save all registers. (Lazy binding can be disabled with the linker or the loader if desired, to avoid the problem.)" so a regparm(0) seems like a good solution there. > Regarding measurements in general, one must avoid being misled by > microbenchmarks such as those that represent an extreme of caching > friendliness. It may be necessary to run large system-level tests to > meaningfully compare alternatives. > Yes, the problem is that, after having ran such tests on a slower approach, the macrobenchmarks failed to show any difference. Yes, those tests should be done, but I don't expect any revelation from them. > > > +int marker_set_probe(const char *name, void (*probe)(const char *fmt, ...), > > + enum marker_type type); > > +void marker_disable_probe(const char *name, void (*probe)(const char *fmt, ...), > > + enum marker_type type); > > I'm unclear about what a marker_type value represents. Could you go > over that again? > marker_type MARKER_CALL Sets the jump to a function call marker_type MARKER_INLINE Sets the jump to the provided inline function > > > +static int marker_get_pointers(const char *name, > > + [...] > > + ptrs->call = (void**)kallsyms_lookup_name(call_sym); > > + ptrs->jmpselect = (void**)kallsyms_lookup_name(jmpselect_sym); > > + ptrs->jmpcall = (void**)kallsyms_lookup_name(jmpcall_sym); > > + ptrs->jmpinline = (void**)kallsyms_lookup_name(jmpinline_sym); > > + ptrs->jmpover = (void**)kallsyms_lookup_name(jmpover_sym); > > [...] > > I wonder whether, as for kprobes, it will be necessary to lock into > memory a module containing an active marker. > Nope, as the module_exit _has_ to call marker_disable_probe and that marker_disable_probe does a synchronize_sched(). Because the function call is surrounded by preempt_disable(), there will be no execution stream within the function when the module will be unloaded. Regards, Mathieu OpenPGP public key: http://krystal.dyndns.org:8080/key/compudj.gpg Key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68 [-- Attachment #2: Digital signature --] [-- Type: application/pgp-signature, Size: 189 bytes --] ^ permalink raw reply [flat|nested] 27+ messages in thread
end of thread, other threads:[~2006-09-25 17:48 UTC | newest] Thread overview: 27+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2006-09-21 16:00 [PATCH] Linux Kernel Markers 0.5 for Linux 2.6.17 (with probe management) Mathieu Desnoyers 2006-09-21 16:06 ` Ingo Molnar 2006-09-21 21:42 ` Mathieu Desnoyers 2006-09-21 21:49 ` Mathieu Desnoyers 2006-09-22 6:29 ` Karim Yaghmour 2006-09-22 6:49 ` Ingo Molnar 2006-09-22 14:03 ` Mathieu Desnoyers 2006-09-22 16:53 ` Ingo Molnar 2006-09-22 17:11 ` Mathieu Desnoyers 2006-09-22 17:12 ` Ingo Molnar 2006-09-22 17:28 ` Mathieu Desnoyers 2006-09-22 7:07 ` Ingo Molnar 2006-09-22 8:14 ` Karim Yaghmour 2006-09-22 15:08 ` Mathieu Desnoyers 2006-09-22 16:24 ` Karim Yaghmour 2006-09-22 16:13 ` Mathieu Desnoyers 2006-09-22 17:03 ` Karim Yaghmour 2006-09-22 18:06 ` Mathieu Desnoyers 2006-09-22 19:24 ` Karim Yaghmour 2006-09-22 16:45 ` Ingo Molnar 2006-09-22 14:31 ` Christoph Hellwig 2006-09-23 16:51 ` Mathieu Desnoyers 2006-09-21 17:56 ` Frank Ch. Eigler 2006-09-21 18:50 ` Ingo Molnar 2006-09-21 19:54 ` Jeremy Fitzhardinge 2006-09-25 17:45 ` Frank Ch. Eigler 2006-09-21 20:59 ` Mathieu Desnoyers
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox