* [patch] x86, ptrace: in-kernel BTS interface
@ 2008-04-30 11:54 Markus Metzger
2008-04-30 12:26 ` Andrew Morton
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Markus Metzger @ 2008-04-30 11:54 UTC (permalink / raw)
To: andi-suse, hpa, linux-kernel, mingo, tglx
Cc: markus.t.metzger, markus.t.metzger, suresh.b.siddha, roland, akpm,
mtk.manpages, eranian, juan.villacis
Provide an in-kernel interface to Branch Trace Store and implement the
ptrace user interface on top of it.
Fix a few bugs that were detected during the perfmon2 adaptation to
the DS interface by Stephane Eranian.
The ptrace implementation becomes a rather thin layer that only
forwards requests.
The BTS interface may later be morphed into a utrace interface for
execution trace, or it may be used to implement such a utrace
interface on top of it.
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
---
Index: gits.x86/arch/x86/kernel/bts.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ gits.x86/arch/x86/kernel/bts.c 2008-04-30 11:30:18.%N +0200
@@ -0,0 +1,505 @@
+/*
+ * Branch Trace Store (BTS) support
+ *
+ * This provides a low-level interface to the hardware's Branch Trace Store
+ * feature that is used for execution tracing.
+ *
+ * It manages:
+ * - per-thread and per-cpu BTS configuration
+ * - buffer memory allocation and overflow handling
+ *
+ * It assumes:
+ * - get_task_struct on all parameter tasks
+ * - current is allowed to trace parameter tasks
+ *
+ *
+ * Copyright (C) 2008 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2008
+ */
+
+#ifdef CONFIG_X86_BTS
+
+#include <asm/bts.h>
+#include <asm/ds.h>
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+
+/*
+ * The configuration for a particular BTS hardware implementation.
+ *
+ * Filled in at cpu detection time (bts_init_intel) from one of the
+ * per-model templates below. A zero sizeof_bts means BTS support has
+ * not been detected (see bts_request).
+ */
+struct bts_configuration {
+	/* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
+	unsigned char sizeof_bts;
+	/* the size of a field in the BTS record in bytes */
+	unsigned char sizeof_field;
+	/* a bitmask to enable/disable various parts of BTS in DEBUGCTL MSR */
+	unsigned long debugctl_tr;
+	unsigned long debugctl_btint;
+	unsigned long debugctl_user_off;
+	unsigned long debugctl_kernel_off;
+	/* the union of all the masks above; computed in bts_init() */
+	unsigned long debugctl_all;
+};
+static struct bts_configuration bts_cfg;
+
+/* an upper bound on the raw record size over all supported cpus */
+#define BTS_MAX_RECORD_SIZE (8 * 3)
+
+
+/*
+ * Branch Trace Store (BTS) uses the following format. Different
+ * architectures vary in the size of those fields.
+ * - source linear address
+ * - destination linear address
+ * - flags
+ *
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
+ *
+ * We compute the base address for the first 8 fields based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ *
+ * In order to store additional information in the BTS buffer, we use
+ * a special source address to indicate that the record requires
+ * special interpretation.
+ *
+ * Netburst indicated via a bit in the flags field whether the branch
+ * was predicted; this is ignored.
+ */
+
+enum bts_field {
+	bts_from = 0,
+	bts_to,
+	bts_flags,
+
+	/* marker in the from field of software-synthesized records */
+	bts_escape = (unsigned long)-1,
+	/* field reuse for escape records */
+	bts_qual = bts_to,
+	bts_jiffies = bts_flags
+};
+
+/*
+ * Read one field from a raw BTS record.
+ *
+ * base: start of the raw record
+ * field: the field to read; its byte offset is field * sizeof_field
+ */
+static inline unsigned long bts_get(const char *base, enum bts_field field)
+{
+	base += (bts_cfg.sizeof_field * field);
+	return *(unsigned long *)base;
+}
+
+/*
+ * Write one field of a raw BTS record.
+ *
+ * base: start of the raw record
+ * field: the field to write; its byte offset is field * sizeof_field
+ * val: the value to store
+ */
+static inline void bts_set(char *base, enum bts_field field, unsigned long val)
+{
+	base += (bts_cfg.sizeof_field * field);
+	(*(unsigned long *)base) = val;
+}
+
+/*
+ * Check the parameter raw BTS record.
+ * Return 1 if it is invalid; 0, otherwise
+ *
+ * A record with both the from and the to address zero marks an
+ * empty (never written) buffer slot.
+ *
+ * raw: the raw BTS record to check
+ */
+static int bts_invalid(const void *raw)
+{
+	if ((bts_get(raw, bts_from) == 0) &&
+	    (bts_get(raw, bts_to) == 0))
+		return 1;
+	return 0;
+}
+
+/*
+ * Translate a BTS record from the raw format into the bts_struct format
+ *
+ * Escape records (bts_escape in the from field) carry a qualifier and
+ * a jiffies timestamp instead of a branch; all-zero records translate
+ * to BTS_INVALID.
+ *
+ * out (out): bts_struct interpretation
+ * raw: raw BTS record
+ */
+static void bts_translate_record(struct bts_struct *out, const void *raw)
+{
+	memset(out, 0, sizeof(*out));
+	if (bts_get(raw, bts_from) == bts_escape) {
+		out->qualifier = bts_get(raw, bts_qual);
+		out->variant.jiffies = bts_get(raw, bts_jiffies);
+	} else if (bts_invalid(raw)) {
+		out->qualifier = BTS_INVALID;
+	} else {
+		out->qualifier = BTS_BRANCH;
+		out->variant.lbr.from = bts_get(raw, bts_from);
+		out->variant.lbr.to = bts_get(raw, bts_to);
+	}
+}
+
+
+/*
+ * Request branch tracing for the parameter task (NULL for the
+ * current cpu); forwards to ds_request_bts() once we know that this
+ * cpu's BTS configuration has been detected.
+ *
+ * Returns 0 on success; -Eerrno otherwise
+ */
+int bts_request(struct task_struct *task, void *base, size_t size,
+		bts_ovfl_callback_t ovfl)
+{
+	/* a zero record size means BTS is unsupported or undetected */
+	if (!bts_cfg.sizeof_bts)
+		return -EOPNOTSUPP;
+
+	return ds_request_bts(task, base, size, ovfl);
+}
+EXPORT_SYMBOL(bts_request);
+
+/*
+ * Stop branch recording and release the DS resources for the
+ * parameter task (NULL for the current cpu).
+ *
+ * Returns 0 on success; -Eerrno otherwise
+ */
+int bts_release(struct task_struct *task)
+{
+	int error;
+
+	/* We need to disable branch recording for the parameter task
+	 * (or current cpu) before we may release the DS resources.
+	 */
+	error = bts_configure(task, /* flags = */ 0);
+	if (error < 0)
+		return error;
+
+	return ds_release_bts(task);
+}
+EXPORT_SYMBOL(bts_release);
+
+/*
+ * (Re)configure branch recording for the parameter task (NULL for
+ * the current cpu) by rewriting the BTS-related DEBUGCTL MSR bits and
+ * the timestamp thread flag.
+ *
+ * Returns 0 on success; -Eerrno otherwise
+ */
+int bts_configure(struct task_struct *task, unsigned int flags)
+{
+	unsigned long debugctl_mask = 0;
+	int error;
+
+	/* Before we touch the configuration, let's validate that we
+	 * are allowed to.
+	 */
+	error = ds_validate_bts(task);
+	if (error < 0)
+		return error;
+
+	/* Translate the generic BTS_O_* flags into this cpu model's
+	 * DEBUGCTL bits; variants the model does not support have a
+	 * zero mask in bts_cfg and are thus silently ignored. */
+	if (flags & BTS_O_TRACE)
+		debugctl_mask |= bts_cfg.debugctl_tr;
+	if (flags & BTS_O_USER_OFF)
+		debugctl_mask |= bts_cfg.debugctl_user_off;
+	if (flags & BTS_O_KERNEL_OFF)
+		debugctl_mask |= bts_cfg.debugctl_kernel_off;
+
+	if (task) {
+		/* clear all BTS bits, then set the requested ones */
+		task->thread.debugctlmsr &= ~bts_cfg.debugctl_all;
+		task->thread.debugctlmsr |= debugctl_mask;
+
+		/* only write the MSR directly if it is our own;
+		 * presumably the context switch code loads it for
+		 * other tasks (via TIF_DEBUGCTLMSR) -- see below */
+		if (task == current)
+			update_debugctlmsr(task->thread.debugctlmsr);
+
+		if (flags & BTS_O_TIMESTAMP)
+			set_tsk_thread_flag(task, TIF_BTS_TRACE_TS);
+		else
+			clear_tsk_thread_flag(task, TIF_BTS_TRACE_TS);
+
+		/* keep TIF_DEBUGCTLMSR in sync with the msr image */
+		if (task->thread.debugctlmsr)
+			set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+		else
+			clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+	}
+
+	if (!task) {
+		/* per-cpu tracing: rewrite the current cpu's MSR directly */
+		update_debugctlmsr((get_debugctlmsr() & ~bts_cfg.debugctl_all)
+				   | debugctl_mask);
+	}
+
+	return error;
+}
+EXPORT_SYMBOL(bts_configure);
+
+/*
+ * Query the branch recording configuration.
+ *
+ * Returns a bit-mask of BTS_O_* flags on success; -Eerrno otherwise.
+ * If bsize is not NULL, the buffer size in bytes is stored there.
+ */
+int bts_status(struct task_struct *task, size_t *bsize)
+{
+	unsigned long debugctl = 0;
+	const void *base, *max;
+	size_t end;
+	int error, flags = 0;
+
+	error = ds_validate_bts(task);
+	if (error < 0)
+		return error;
+
+	if (task) {
+		if (test_tsk_thread_flag(task, TIF_BTS_TRACE_TS))
+			flags |= BTS_O_TIMESTAMP;
+		debugctl = task->thread.debugctlmsr;
+	} else
+		debugctl = get_debugctlmsr();
+
+	/* translate the DEBUGCTL bits back into BTS_O_* flags */
+	if (debugctl & bts_cfg.debugctl_tr)
+		flags |= BTS_O_TRACE;
+	if (debugctl & bts_cfg.debugctl_user_off)
+		flags |= BTS_O_USER_OFF;
+	if (debugctl & bts_cfg.debugctl_kernel_off)
+		flags |= BTS_O_KERNEL_OFF;
+
+	if (bsize) {
+		/* the buffer size is the distance between the address
+		 * of the first record and the one-past-the-end record */
+		error = ds_get_bts_end(task, &end);
+		if (error < 0)
+			return error;
+
+		error = ds_access_bts(task, /* index = */ 0, &base);
+		if (error < 0)
+			return error;
+
+		error = ds_access_bts(task, /* index = */ end, &max);
+		if (error < 0)
+			return error;
+
+		*bsize = (max - base);
+	}
+
+	return flags;
+}
+EXPORT_SYMBOL(bts_status);
+
+/*
+ * Return the number of valid trace records in the buffer (also
+ * stored at *pos, if pos is not NULL); -Eerrno on error.
+ */
+int bts_size(struct task_struct *task, size_t *pos)
+{
+	const void *raw;
+	size_t bts_index;
+	int error;
+
+	/* Depending on the buffer overflow policy and whether an
+	 * overflow actually occurred, the size is either the write
+	 * index or the total size of the array.
+	 * We try to read at the write index position. If we read a
+	 * valid record, this seems to be a cyclic buffer after the
+	 * first overflow and we return the total array size;
+	 * otherwise, we return the write index.
+	 */
+	error = ds_get_bts_index(task, &bts_index);
+	if (error < 0)
+		return error;
+
+	error = ds_access_bts(task, bts_index, &raw);
+	if ((error < 0) || bts_invalid(raw)) {
+		/* no record beyond the write index: not wrapped yet */
+		if (pos)
+			*pos = bts_index;
+		return bts_index;
+	}
+
+	/* wrapped cyclic buffer: the whole array holds valid records */
+	return ds_get_bts_end(task, pos);
+}
+EXPORT_SYMBOL(bts_size);
+
+/*
+ * Copy count records starting at index (newest record = index 0)
+ * into the optional kernel and/or user buffers; see the contract in
+ * asm-x86/bts.h.
+ *
+ * Returns the number of bytes copied; -Eerrno on error.
+ */
+int bts_read(struct task_struct *task,
+	     size_t index, size_t count,
+	     struct bts_struct *kbuf,
+	     struct bts_struct __user *ubuf,
+	     unsigned int flags)
+{
+	struct bts_struct bts;
+	const unsigned char *raw, *bob, *eob;
+	size_t bts_index, bts_end, size;
+	int error, bytes, inc;
+
+	if (!count)
+		return 0;
+
+	error = ds_get_bts_end(task, &bts_end);
+	if (error < 0)
+		return error;
+
+	if (bts_end <= index)
+		return -EINVAL;
+
+	/* the requested range must lie within the recorded trace */
+	error = bts_size(task, &size);
+	if (error < 0)
+		return error;
+
+	if (size < (index + count))
+		return -EINVAL;
+
+	error = ds_get_bts_index(task, &bts_index);
+	if (error < 0)
+		return error;
+
+	/* begin-of-buffer and end-of-buffer pointers for wrap-around */
+	error = ds_access_bts(task, 0, (const void **)&bob);
+	if (error < 0)
+		return error;
+
+	error = ds_access_bts(task, bts_end, (const void **)&eob);
+	if (error < 0)
+		return error;
+
+	/* translate the bts index into the ds bts index */
+	if (flags & BTS_READ_O_REVERSE) {
+		/* oldest requested record first, walking forward */
+		bts_index += bts_end - (index + count);
+		inc = bts_cfg.sizeof_bts;
+	} else {
+		/* newest requested record first, walking backward */
+		bts_index += bts_end - (index + 1);
+		inc = -bts_cfg.sizeof_bts;
+	}
+	if (bts_end <= bts_index)
+		bts_index -= bts_end;
+
+	error = ds_access_bts(task, bts_index, (const void **)&raw);
+	if (error < 0)
+		return error;
+
+	bytes = 0;
+	while (count-- > 0) {
+		if (kbuf)
+			bts_translate_record(kbuf++, raw);
+
+		if (ubuf) {
+			bts_translate_record(&bts, raw);
+
+			if (copy_to_user(ubuf++, &bts, sizeof(bts)))
+				return -EFAULT;
+		}
+
+		/* advance with wrap-around in either direction */
+		raw += inc;
+		if (raw < bob)
+			raw = eob + inc;
+		if (raw >= eob)
+			raw = bob;
+		bytes += sizeof(bts);
+	}
+
+	return bytes;
+}
+EXPORT_SYMBOL(bts_read);
+
+/*
+ * Reset the write pointer of the BTS buffer.
+ * Returns 0 on success; -Eerrno on error.
+ */
+int bts_reset(struct task_struct *task)
+{
+	return ds_reset_bts(task);
+}
+EXPORT_SYMBOL(bts_reset);
+
+/*
+ * Clear the BTS buffer and reset the write pointer.
+ * Returns 0 on success; -Eerrno on error.
+ */
+int bts_clear(struct task_struct *task)
+{
+	return ds_clear_bts(task);
+}
+EXPORT_SYMBOL(bts_clear);
+
+/*
+ * Write a single branch trace record into the branch trace buffer of
+ * the parameter task or the current cpu at the current position.
+ *
+ * Non-branch records are encoded as escape records: bts_escape in
+ * the from field, the qualifier and timestamp in the reused to and
+ * flags fields (see enum bts_field).
+ *
+ * task: the eventing task (NULL for current cpu)
+ * in: the record to write
+ */
+static int bts_write(struct task_struct *task,
+		     const struct bts_struct *in)
+{
+	unsigned char bts_record[BTS_MAX_RECORD_SIZE];
+
+	BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
+
+	memset(bts_record, 0, bts_cfg.sizeof_bts);
+	switch (in->qualifier) {
+	case BTS_INVALID:
+		break;
+
+	case BTS_BRANCH:
+		bts_set(bts_record, bts_from, in->variant.lbr.from);
+		bts_set(bts_record, bts_to, in->variant.lbr.to);
+		break;
+
+	case BTS_TASK_ARRIVES:
+	case BTS_TASK_DEPARTS:
+		bts_set(bts_record, bts_from, bts_escape);
+		bts_set(bts_record, bts_qual, in->qualifier);
+		bts_set(bts_record, bts_jiffies, in->variant.jiffies);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	/* The writing task will be the switched-to task on a context
+	 * switch. It needs to write into the switched-from task's BTS
+	 * buffer. */
+	return ds_unchecked_write_bts(task, bts_record, bts_cfg.sizeof_bts);
+}
+
+/*
+ * Record a scheduling event and its timestamp in the parameter task's
+ * branch trace buffer.
+ *
+ * If the parameter task is NULL, the timestamp is recorded in the
+ * current cpu's branch trace buffer.
+ *
+ * Called from the context switch path; any bts_write error is
+ * deliberately ignored here.
+ *
+ * task: the eventing task (NULL for current cpu)
+ * qualifier: the type of scheduling event
+ */
+void bts_take_timestamp(struct task_struct *task,
+			enum bts_qualifier qualifier)
+{
+	struct bts_struct rec = {
+		.qualifier = qualifier,
+		.variant.jiffies = jiffies_64
+	};
+
+	bts_write(task, &rec);
+}
+EXPORT_SYMBOL(bts_take_timestamp);
+
+
+/* Per-model DEBUGCTL layouts; selected in bts_init_intel().
+ * The numeric bit positions are model-specific -- see the Intel SDM
+ * DEBUGCTL MSR documentation for each family. */
+
+/* Netburst (family 0xF): pointer-sized fields, user/kernel
+ * qualification supported */
+static const struct bts_configuration bts_cfg_netburst = {
+	.sizeof_bts = sizeof(long) * 3,
+	.sizeof_field = sizeof(long),
+	.debugctl_tr = (1<<2)|(1<<3),
+	.debugctl_btint = (1<<4),
+	.debugctl_user_off = (1<<6),
+	.debugctl_kernel_off = (1<<5)
+};
+
+/* Pentium M: no user/kernel qualification (masks are zero, so the
+ * corresponding BTS_O_* flags are ignored) */
+static const struct bts_configuration bts_cfg_pentium_m = {
+	.sizeof_bts = sizeof(long) * 3,
+	.sizeof_field = sizeof(long),
+	.debugctl_tr = (1<<6)|(1<<7),
+	.debugctl_btint = (1<<8),
+	.debugctl_user_off = 0,
+	.debugctl_kernel_off = 0
+};
+
+/* Core2/Atom: fixed 64bit fields regardless of mode */
+static const struct bts_configuration bts_cfg_core2 = {
+	.sizeof_bts = 8 * 3,
+	.sizeof_field = 8,
+	.debugctl_tr = (1<<6)|(1<<7),
+	.debugctl_btint = (1<<8),
+	.debugctl_user_off = (1<<10),
+	.debugctl_kernel_off = (1<<9)
+};
+
+/*
+ * Install the model-specific configuration and pre-compute the
+ * union of all BTS-related DEBUGCTL bits (used for clearing).
+ */
+static inline void bts_init(const struct bts_configuration *cfg)
+{
+	bts_cfg = *cfg;
+
+	bts_cfg.debugctl_all =
+		(bts_cfg.debugctl_tr |
+		 bts_cfg.debugctl_btint |
+		 bts_cfg.debugctl_user_off |
+		 bts_cfg.debugctl_kernel_off);
+}
+
+/*
+ * Detect the cpu model and install the matching BTS configuration.
+ *
+ * For unknown models bts_cfg stays all-zero, which makes
+ * bts_request() fail with -EOPNOTSUPP.
+ */
+void __cpuinit bts_init_intel(struct cpuinfo_x86 *c)
+{
+	switch (c->x86) {
+	case 0x6:
+		switch (c->x86_model) {
+		case 0xD:
+		case 0xE: /* Pentium M */
+			bts_init(&bts_cfg_pentium_m);
+			break;
+		case 0xF: /* Core2 */
+		case 0x1C: /* Atom */
+			bts_init(&bts_cfg_core2);
+			break;
+		default:
+			/* sorry, don't know about them */
+			break;
+		}
+		break;
+	case 0xF:
+		switch (c->x86_model) {
+		case 0x0:
+		case 0x1:
+		case 0x2: /* Netburst */
+			bts_init(&bts_cfg_netburst);
+			break;
+		default:
+			/* sorry, don't know about them */
+			break;
+		}
+		break;
+	default:
+		/* sorry, don't know about them */
+		break;
+	}
+}
+
+#endif /* CONFIG_X86_BTS */
Index: gits.x86/include/asm-x86/bts.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ gits.x86/include/asm-x86/bts.h 2008-04-30 11:30:36.%N +0200
@@ -0,0 +1,224 @@
+/*
+ * Branch Trace Store (BTS) support
+ *
+ * This provides a low-level interface to the hardware's Branch Trace Store
+ * feature that is used for execution tracing.
+ *
+ * The abstraction is an array of branch records viewed backwards in time.
+ * That is, the newest record is at index 0; bigger indices give older data.
+ *
+ *
+ * It manages:
+ * - per-thread and per-cpu BTS configuration
+ * - buffer memory allocation and overflow handling
+ *
+ * It assumes:
+ * - get_task_struct on all parameter tasks
+ * - current is allowed to trace parameter tasks
+ *
+ *
+ * Copyright (C) 2008 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2008
+ */
+
+#ifndef _ASM_X86_BTS_H
+#define _ASM_X86_BTS_H
+
+#ifdef CONFIG_X86_BTS
+
+#include <linux/types.h>
+#include <linux/init.h>
+
+struct task_struct;
+
+
+/* A branch trace record entry
+ *
+ * In order to unify the interface between various processor versions,
+ * we use the below data structure for all processors.
+ *
+ * The kernel and user-space definitions below must stay layout
+ * compatible, since records are copied to user space verbatim.
+ */
+enum bts_qualifier {
+	BTS_INVALID = 0,
+	BTS_BRANCH,
+	BTS_TASK_ARRIVES,
+	BTS_TASK_DEPARTS
+};
+
+#ifdef __KERNEL__
+struct bts_struct {
+	u64 qualifier;
+	union {
+		/* BTS_BRANCH */
+		struct {
+			u64 from;
+			u64 to;
+		} lbr;
+		/* BTS_TASK_ARRIVES or
+		   BTS_TASK_DEPARTS */
+		u64 jiffies;
+	} variant;
+};
+#else /* !__KERNEL__ */
+/* user-space view: identical layout, <linux/types.h> __u64 spelling */
+struct bts_struct {
+	__u64 qualifier;
+	union {
+		/* BTS_BRANCH */
+		struct {
+			__u64 from;
+			__u64 to;
+		} lbr;
+		/* BTS_TASK_ARRIVES or
+		   BTS_TASK_DEPARTS */
+		__u64 jiffies;
+	} variant;
+};
+#endif /* __KERNEL__ */
+
+/*
+ * Request branch tracing for the parameter task or for the current cpu.
+ *
+ * Due to alignment constraints, the actual buffer may be slightly
+ * smaller than the requested or provided buffer.
+ *
+ * Returns 0 on success; -Eerrno otherwise
+ *
+ * task: the task to request recording for;
+ * NULL for per-cpu recording on the current cpu
+ * base: the base pointer for the (non-pageable) buffer;
+ * NULL if buffer allocation requested
+ * size: the size of the requested or provided buffer
+ * ovfl: pointer to a function to be called on buffer overflow;
+ * NULL if cyclic buffer requested
+ */
+
+typedef void (*bts_ovfl_callback_t)(struct task_struct *);
+extern int bts_request(struct task_struct *task, void *base, size_t size,
+ bts_ovfl_callback_t ovfl);
+
+/*
+ * Release branch recording resources (and stop recording).
+ *
+ * Frees buffers allocated on bts_request.
+ *
+ * Returns 0 on success; -Eerrno otherwise
+ *
+ * task: the task to release resources for;
+ * NULL to release resources for the current cpu
+ */
+extern int bts_release(struct task_struct *task);
+
+/*
+ * Configure branch recording.
+ *
+ * Not all processors support all variants.
+ * If a variant is not supported, the respective flag is ignored.
+ *
+ * Returns 0 on success; -Eerrno otherwise
+ *
+ * task: the task to configure BTS for;
+ * NULL to configure BTS for the current cpu
+ * flags: a bit-mask of the below configuration flags
+ */
+#define BTS_O_TRACE (1<<0) /* record branch trace */
+#define BTS_O_TIMESTAMP (1<<1) /* record scheduling timestamps */
+#define BTS_O_USER_OFF (1<<2) /* do not trace user mode */
+#define BTS_O_KERNEL_OFF (1<<3) /* do not trace kernel mode */
+
+extern int bts_configure(struct task_struct *task, unsigned int flags);
+
+/*
+ * Query the branch recording configuration.
+ *
+ * Returns a bit-mask of configuration flags on success; -Eerrno otherwise
+ * Optionally stores the actual buffer size in bytes at the location
+ * pointed to by bsize, if bsize is not NULL.
+ *
+ * task: the task to query the configuration for;
+ * NULL to query for the current cpu configuration
+ * bsize (out): optional pointer to the buffer size variable
+ */
+extern int bts_status(struct task_struct *task, size_t *bsize);
+
+/*
+ * Return the size of the recorded branch trace in number of trace
+ * records, i.e. returns the index one entry beyond the end of the
+ * branch trace array.
+ *
+ * Returns -Eerrno on error
+ *
+ * task: the task to access;
+ * NULL to access the current cpu
+ * pos (out): if not NULL, will hold the result
+ */
+extern int bts_size(struct task_struct *task, size_t *pos);
+
+/*
+ * Copy count branch trace records starting from index into the
+ * non-NULL buffers.
+ *
+ * The output may be reversed to allow consecutive reads into a bigger
+ * buffer. Indices still follow the array abstraction, i.e. trace
+ * is read from index + count - 1 to index.
+ *
+ * Both buffers are optional. If the respective parameter is not NULL,
+ * it is expected to point to a caller-allocated buffer of sufficient size.
+ *
+ * Returns the number of bytes copied on success; -Eerrno on error
+ *
+ * task: the task to access;
+ * NULL to access the current cpu
+ * index: the index of the first requested record
+ * count: the number of requested records
+ * kbuf (out): pointer to the caller-allocated in-kernel bts_struct array
+ * NULL to ignore this buffer
+ * ubuf (out): pointer to the caller-allocated user-space bts_struct array
+ * NULL to ignore this buffer
+ * flags: a bit-mask of read flags
+ */
+#define BTS_READ_O_REVERSE (1<<0) /* reverse output */
+
+extern int bts_read(struct task_struct *task,
+ size_t index, size_t count,
+ struct bts_struct *kbuf,
+ struct bts_struct __user *ubuf,
+ unsigned int flags);
+
+/*
+ * Reset the write pointer of the BTS buffer.
+ *
+ * Returns 0 on success; -Eerrno on error
+ *
+ * task: the task to access;
+ * NULL to access the current cpu
+ */
+extern int bts_reset(struct task_struct *task);
+
+/*
+ * Clear the BTS buffer and reset the write pointer.
+ * The entire buffer will be zeroed out.
+ *
+ * Returns 0 on success; -Eerrno on error
+ *
+ * task: the task to access;
+ * NULL to access the current cpu
+ */
+extern int bts_clear(struct task_struct *task);
+
+
+/*
+ * Initialization
+ */
+struct cpuinfo_x86;
+extern void __cpuinit bts_init_intel(struct cpuinfo_x86 *);
+
+/*
+ * Called by the scheduler to record a timestamp in the branch trace.
+ */
+extern void bts_take_timestamp(struct task_struct *, enum bts_qualifier);
+
+#else /* CONFIG_X86_BTS */
+
+/* no-op stubs so callers need not be #ifdef'ed */
+#define bts_init_intel(config)			do {} while (0)
+#define bts_take_timestamp(task, qualifier)	do {} while (0)
+
+#endif /* CONFIG_X86_BTS */
+#endif /* _ASM_X86_BTS_H */
Index: gits.x86/arch/x86/kernel/ptrace.c
===================================================================
--- gits.x86.orig/arch/x86/kernel/ptrace.c 2008-04-30 11:16:35.%N +0200
+++ gits.x86/arch/x86/kernel/ptrace.c 2008-04-30 11:35:10.%N +0200
@@ -32,6 +32,7 @@
#include <asm/prctl.h>
#include <asm/proto.h>
#include <asm/ds.h>
+#include <asm/bts.h>
#include "tls.h"
@@ -555,155 +556,6 @@
}
#ifdef CONFIG_X86_PTRACE_BTS
-/*
- * The configuration for a particular BTS hardware implementation.
- */
-struct bts_configuration {
- /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
- unsigned char sizeof_bts;
- /* the size of a field in the BTS record in bytes */
- unsigned char sizeof_field;
- /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
- unsigned long debugctl_mask;
-};
-static struct bts_configuration bts_cfg;
-
-#define BTS_MAX_RECORD_SIZE (8 * 3)
-
-
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- */
-
-enum bts_field {
- bts_from = 0,
- bts_to,
- bts_flags,
-
- bts_escape = (unsigned long)-1,
- bts_qual = bts_to,
- bts_jiffies = bts_flags
-};
-
-static inline unsigned long bts_get(const char *base, enum bts_field field)
-{
- base += (bts_cfg.sizeof_field * field);
- return *(unsigned long *)base;
-}
-
-static inline void bts_set(char *base, enum bts_field field, unsigned long val)
-{
- base += (bts_cfg.sizeof_field * field);;
- (*(unsigned long *)base) = val;
-}
-
-/*
- * Translate a BTS record from the raw format into the bts_struct format
- *
- * out (out): bts_struct interpretation
- * raw: raw BTS record
- */
-static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
-{
- memset(out, 0, sizeof(*out));
- if (bts_get(raw, bts_from) == bts_escape) {
- out->qualifier = bts_get(raw, bts_qual);
- out->variant.jiffies = bts_get(raw, bts_jiffies);
- } else {
- out->qualifier = BTS_BRANCH;
- out->variant.lbr.from_ip = bts_get(raw, bts_from);
- out->variant.lbr.to_ip = bts_get(raw, bts_to);
- }
-}
-
-static int ptrace_bts_read_record(struct task_struct *child, size_t index,
- struct bts_struct __user *out)
-{
- struct bts_struct ret;
- const void *bts_record;
- size_t bts_index, bts_end;
- int error;
-
- error = ds_get_bts_end(child, &bts_end);
- if (error < 0)
- return error;
-
- if (bts_end <= index)
- return -EINVAL;
-
- error = ds_get_bts_index(child, &bts_index);
- if (error < 0)
- return error;
-
- /* translate the ptrace bts index into the ds bts index */
- bts_index += bts_end - (index + 1);
- if (bts_end <= bts_index)
- bts_index -= bts_end;
-
- error = ds_access_bts(child, bts_index, &bts_record);
- if (error < 0)
- return error;
-
- ptrace_bts_translate_record(&ret, bts_record);
-
- if (copy_to_user(out, &ret, sizeof(ret)))
- return -EFAULT;
-
- return sizeof(ret);
-}
-
-static int ptrace_bts_drain(struct task_struct *child,
- long size,
- struct bts_struct __user *out)
-{
- struct bts_struct ret;
- const unsigned char *raw;
- size_t end, i;
- int error;
-
- error = ds_get_bts_index(child, &end);
- if (error < 0)
- return error;
-
- if (size < (end * sizeof(struct bts_struct)))
- return -EIO;
-
- error = ds_access_bts(child, 0, (const void **)&raw);
- if (error < 0)
- return error;
-
- for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
- ptrace_bts_translate_record(&ret, raw);
-
- if (copy_to_user(out, &ret, sizeof(ret)))
- return -EFAULT;
- }
-
- error = ds_clear_bts(child);
- if (error < 0)
- return error;
-
- return end;
-}
-
static void ptrace_bts_ovfl(struct task_struct *child)
{
send_sig(child->thread.bts_ovfl_signal, child, 0);
@@ -714,75 +566,47 @@
const struct ptrace_bts_config __user *ucfg)
{
struct ptrace_bts_config cfg;
+ unsigned int flags = 0;
int error = 0;
- error = -EOPNOTSUPP;
- if (!bts_cfg.sizeof_bts)
- goto errout;
-
- error = -EIO;
if (cfg_size < sizeof(cfg))
- goto errout;
+ return -EIO;
- error = -EFAULT;
if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
- goto errout;
+ return -EFAULT;
- error = -EINVAL;
if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
!(cfg.flags & PTRACE_BTS_O_ALLOC))
- goto errout;
+ return -EINVAL;
if (cfg.flags & PTRACE_BTS_O_ALLOC) {
- ds_ovfl_callback_t ovfl = 0;
+ ds_ovfl_callback_t ovfl = NULL;
unsigned int sig = 0;
/* we ignore the error in case we were not tracing child */
- (void)ds_release_bts(child);
+ (void)bts_release(child);
if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
if (!cfg.signal)
- goto errout;
+ return -EINVAL;
sig = cfg.signal;
ovfl = ptrace_bts_ovfl;
}
- error = ds_request_bts(child, /* base = */ 0, cfg.size, ovfl);
+ error = bts_request(child, /* base = */ NULL, cfg.size, ovfl);
if (error < 0)
- goto errout;
+ return error;
child->thread.bts_ovfl_signal = sig;
}
- error = -EINVAL;
- if (!child->thread.ds_ctx && cfg.flags)
- goto errout;
-
if (cfg.flags & PTRACE_BTS_O_TRACE)
- child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
- else
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
-
+ flags |= (BTS_O_TRACE | BTS_O_KERNEL_OFF);
if (cfg.flags & PTRACE_BTS_O_SCHED)
- set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- else
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+ flags |= BTS_O_TIMESTAMP;
- error = sizeof(cfg);
-
-out:
- if (child->thread.debugctlmsr)
- set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- else
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-
- return error;
-
-errout:
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- goto out;
+ return bts_configure(child, flags);
}
static int ptrace_bts_status(struct task_struct *child,
@@ -790,38 +614,28 @@
struct ptrace_bts_config __user *ucfg)
{
struct ptrace_bts_config cfg;
- size_t end;
- const void *base, *max;
- int error;
+ size_t size;
+ int status;
if (cfg_size < sizeof(cfg))
return -EIO;
- error = ds_get_bts_end(child, &end);
- if (error < 0)
- return error;
-
- error = ds_access_bts(child, /* index = */ 0, &base);
- if (error < 0)
- return error;
-
- error = ds_access_bts(child, /* index = */ end, &max);
- if (error < 0)
- return error;
+ status = bts_status(child, &size);
+ if (status < 0)
+ return status;
memset(&cfg, 0, sizeof(cfg));
- cfg.size = (max - base);
+ cfg.size = size;
cfg.signal = child->thread.bts_ovfl_signal;
cfg.bts_size = sizeof(struct bts_struct);
if (cfg.signal)
cfg.flags |= PTRACE_BTS_O_SIGNAL;
- if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
- child->thread.debugctlmsr & bts_cfg.debugctl_mask)
+ if (status & BTS_O_TRACE)
cfg.flags |= PTRACE_BTS_O_TRACE;
- if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
+ if (status & BTS_O_TIMESTAMP)
cfg.flags |= PTRACE_BTS_O_SCHED;
if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
@@ -830,108 +644,28 @@
return sizeof(cfg);
}
-static int ptrace_bts_write_record(struct task_struct *child,
- const struct bts_struct *in)
-{
- unsigned char bts_record[BTS_MAX_RECORD_SIZE];
-
- BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
-
- memset(bts_record, 0, bts_cfg.sizeof_bts);
- switch (in->qualifier) {
- case BTS_INVALID:
- break;
-
- case BTS_BRANCH:
- bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
- bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
- break;
-
- case BTS_TASK_ARRIVES:
- case BTS_TASK_DEPARTS:
- bts_set(bts_record, bts_from, bts_escape);
- bts_set(bts_record, bts_qual, in->qualifier);
- bts_set(bts_record, bts_jiffies, in->variant.jiffies);
- break;
-
- default:
- return -EINVAL;
- }
-
- /* The writing task will be the switched-to task on a context
- * switch. It needs to write into the switched-from task's BTS
- * buffer. */
- return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
-}
-
-void ptrace_bts_take_timestamp(struct task_struct *tsk,
- enum bts_qualifier qualifier)
+static int ptrace_bts_drain(struct task_struct *child,
+ long size,
+ struct bts_struct __user *out)
{
- struct bts_struct rec = {
- .qualifier = qualifier,
- .variant.jiffies = jiffies_64
- };
+ size_t bsize;
+ int error;
- ptrace_bts_write_record(tsk, &rec);
-}
+ error = bts_size(child, &bsize);
+ if (error < 0)
+ return error;
-static const struct bts_configuration bts_cfg_netburst = {
- .sizeof_bts = sizeof(long) * 3,
- .sizeof_field = sizeof(long),
- .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
-};
+ if (size < (bsize * sizeof(*out)))
+ return -EIO;
-static const struct bts_configuration bts_cfg_pentium_m = {
- .sizeof_bts = sizeof(long) * 3,
- .sizeof_field = sizeof(long),
- .debugctl_mask = (1<<6)|(1<<7)
-};
+ error = bts_read(child, 0, bsize, /* kbuf = */ NULL, out,
+ BTS_READ_O_REVERSE);
+ if (error < 0)
+ return error;
-static const struct bts_configuration bts_cfg_core2 = {
- .sizeof_bts = 8 * 3,
- .sizeof_field = 8,
- .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
-};
+ (void)bts_clear(child);
-static inline void bts_configure(const struct bts_configuration *cfg)
-{
- bts_cfg = *cfg;
-}
-
-void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
-{
- switch (c->x86) {
- case 0x6:
- switch (c->x86_model) {
- case 0xD:
- case 0xE: /* Pentium M */
- bts_configure(&bts_cfg_pentium_m);
- break;
- case 0xF: /* Core2 */
- case 0x1C: /* Atom */
- bts_configure(&bts_cfg_core2);
- break;
- default:
- /* sorry, don't know about them */
- break;
- }
- break;
- case 0xF:
- switch (c->x86_model) {
- case 0x0:
- case 0x1:
- case 0x2: /* Netburst */
- bts_configure(&bts_cfg_netburst);
- break;
- default:
- /* sorry, don't know about them */
- break;
- }
- break;
- default:
- /* sorry, don't know about them */
- break;
- }
+ return (error / sizeof(*out));
}
#endif /* CONFIG_X86_PTRACE_BTS */
@@ -947,13 +681,7 @@
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
#ifdef CONFIG_X86_PTRACE_BTS
- (void)ds_release_bts(child);
-
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+ (void)bts_release(child);
#endif /* CONFIG_X86_PTRACE_BTS */
}
@@ -1086,16 +814,16 @@
break;
case PTRACE_BTS_SIZE:
- ret = ds_get_bts_index(child, /* pos = */ 0);
+ ret = bts_size(child, /* bsize = */ 0);
break;
case PTRACE_BTS_GET:
- ret = ptrace_bts_read_record
- (child, data, (struct bts_struct __user *) addr);
+ ret = bts_read(child, data, /* count = */ 1, /* kbuf = */ NULL,
+ (struct bts_struct __user *) addr, /* flags = */ 0);
break;
case PTRACE_BTS_CLEAR:
- ret = ds_clear_bts(child);
+ ret = bts_clear(child);
break;
case PTRACE_BTS_DRAIN:
Index: gits.x86/arch/x86/Kconfig.cpu
===================================================================
--- gits.x86.orig/arch/x86/Kconfig.cpu 2008-04-30 11:16:32.%N +0200
+++ gits.x86/arch/x86/Kconfig.cpu 2008-04-30 11:36:52.%N +0200
@@ -428,12 +428,19 @@
This allows the kernel to provide a memory buffer to the hardware
to store various profiling and tracing events.
-config X86_PTRACE_BTS
- bool "ptrace interface to Branch Trace Store"
+config X86_BTS
+ bool "Branch Trace Store support"
default y
depends on (X86_DS && X86_DEBUGCTLMSR)
help
- Add a ptrace interface to allow collecting an execution trace
- of the traced task.
- This collects control flow changes in a (cyclic) buffer and allows
- debuggers to fill in the gaps and show an execution trace of the debuggee.
+ Add support for Branch Trace Store.
+ This allows the kernel to collect branch trace information per
+ task or per cpu.
+
+config X86_PTRACE_BTS
+ bool "A ptrace interface to Branch Trace Store support"
+ default y
+ depends on (X86_BTS)
+ help
+ Add a ptrace interface to expose Branch Trace Store support to user space.
+ This allows debuggers to show an execution trace of the debuggee.
Index: gits.x86/arch/x86/kernel/process_32.c
===================================================================
--- gits.x86.orig/arch/x86/kernel/process_32.c 2008-04-30 11:16:35.%N +0200
+++ gits.x86/arch/x86/kernel/process_32.c 2008-04-30 11:46:18.%N +0200
@@ -55,6 +55,8 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
#include <asm/kdebug.h>
+#include <asm/ds.h>
+#include <asm/bts.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -485,8 +487,9 @@
}
#ifdef CONFIG_X86_DS
-static int update_debugctl(struct thread_struct *prev,
- struct thread_struct *next, unsigned long debugctl)
+static unsigned long update_debugctl(struct thread_struct *prev,
+ struct thread_struct *next,
+ unsigned long debugctl)
{
unsigned long ds_prev = 0;
unsigned long ds_next = 0;
@@ -506,8 +509,9 @@
return debugctl;
}
#else
-static int update_debugctl(struct thread_struct *prev,
- struct thread_struct *next, unsigned long debugctl)
+static unsigned long update_debugctl(struct thread_struct *prev,
+ struct thread_struct *next,
+ unsigned long debugctl)
{
return debugctl;
}
@@ -547,13 +551,13 @@
hard_enable_TSC();
}
-#ifdef CONFIG_X86_PTRACE_BTS
+#ifdef CONFIG_X86_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
+ bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
+ bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+#endif /* CONFIG_X86_BTS */
if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
Index: gits.x86/arch/x86/kernel/process_64.c
===================================================================
--- gits.x86.orig/arch/x86/kernel/process_64.c 2008-04-30 11:16:35.%N +0200
+++ gits.x86/arch/x86/kernel/process_64.c 2008-04-30 11:45:12.%N +0200
@@ -52,6 +52,8 @@
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
+#include <asm/ds.h>
+#include <asm/bts.h>
asmlinkage extern void ret_from_fork(void);
@@ -510,20 +512,14 @@
*/
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
-static inline void __switch_to_xtra(struct task_struct *prev_p,
- struct task_struct *next_p,
- struct tss_struct *tss)
+#ifdef CONFIG_X86_DS
+static unsigned long update_debugctl(struct thread_struct *prev,
+ struct thread_struct *next,
+ unsigned long debugctl)
{
- struct thread_struct *prev, *next;
- unsigned long debugctl;
- unsigned long ds_prev = 0, ds_next = 0;
+ unsigned long ds_prev = 0;
+ unsigned long ds_next = 0;
- prev = &prev_p->thread,
- next = &next_p->thread;
-
- debugctl = prev->debugctlmsr;
-
-#ifdef CONFIG_X86_DS
if (prev->ds_ctx)
ds_prev = (unsigned long)prev->ds_ctx->ds;
if (next->ds_ctx)
@@ -536,8 +532,29 @@
update_debugctlmsr(0);
wrmsrl(MSR_IA32_DS_AREA, ds_next);
}
+ return debugctl;
+}
+#else
+static unsigned long update_debugctl(struct thread_struct *prev,
+ struct thread_struct *next,
+ unsigned long debugctl)
+{
+ return debugctl;
+}
#endif /* CONFIG_X86_DS */
+static inline void __switch_to_xtra(struct task_struct *prev_p,
+ struct task_struct *next_p,
+ struct tss_struct *tss)
+{
+ struct thread_struct *prev, *next;
+ unsigned long debugctl;
+
+ prev = &prev_p->thread,
+ next = &next_p->thread;
+
+ debugctl = update_debugctl(prev, next, prev->debugctlmsr);
+
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
@@ -574,13 +591,13 @@
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
-#ifdef CONFIG_X86_PTRACE_BTS
+#ifdef CONFIG_X86_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
+ bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
+ bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+#endif /* CONFIG_X86_BTS */
}
/*
Index: gits.x86/arch/x86/kernel/Makefile
===================================================================
--- gits.x86.orig/arch/x86/kernel/Makefile 2008-04-30 11:16:33.%N +0200
+++ gits.x86/arch/x86/kernel/Makefile 2008-04-30 11:49:48.%N +0200
@@ -33,7 +33,8 @@
obj-y += process.o
obj-y += i387.o
obj-y += ptrace.o
-obj-y += ds.o
+obj-$(CONFIG_X86_DS) += ds.o
+obj-$(CONFIG_X86_BTS) += bts.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
Index: gits.x86/include/asm-x86/processor.h
===================================================================
--- gits.x86.orig/include/asm-x86/processor.h 2008-04-30 11:18:32.%N +0200
+++ gits.x86/include/asm-x86/processor.h 2008-04-30 11:48:25.%N +0200
@@ -750,6 +750,18 @@
extern void cpu_init(void);
extern void init_gdt(int cpu);
+static inline unsigned long get_debugctlmsr(void)
+{
+ unsigned long debugctlmsr;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return 0;
+#endif
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+ return debugctlmsr;
+}
+
static inline void update_debugctlmsr(unsigned long debugctlmsr)
{
#ifndef CONFIG_X86_DEBUGCTLMSR
Index: gits.x86/arch/x86/kernel/cpu/intel.c
===================================================================
--- gits.x86.orig/arch/x86/kernel/cpu/intel.c 2008-04-30 11:16:34.%N +0200
+++ gits.x86/arch/x86/kernel/cpu/intel.c 2008-04-30 11:51:21.%N +0200
@@ -13,6 +13,7 @@
#include <asm/uaccess.h>
#include <asm/ptrace.h>
#include <asm/ds.h>
+#include <asm/bts.h>
#include <asm/bugs.h>
#include "cpu.h"
@@ -226,7 +227,7 @@
}
if (cpu_has_bts)
- ptrace_bts_init_intel(c);
+ bts_init_intel(c);
}
static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
Index: gits.x86/arch/x86/kernel/setup_64.c
===================================================================
--- gits.x86.orig/arch/x86/kernel/setup_64.c 2008-04-30 11:16:35.%N +0200
+++ gits.x86/arch/x86/kernel/setup_64.c 2008-04-30 11:50:53.%N +0200
@@ -68,6 +68,7 @@
#include <asm/cacheflush.h>
#include <asm/mce.h>
#include <asm/ds.h>
+#include <asm/bts.h>
#include <asm/topology.h>
#include <asm/trampoline.h>
@@ -924,7 +925,7 @@
if (cpu_has_bts)
- ptrace_bts_init_intel(c);
+ bts_init_intel(c);
n = c->extended_cpuid_level;
if (n >= 0x80000008) {
Index: gits.x86/include/asm-x86/ptrace-abi.h
===================================================================
--- gits.x86.orig/include/asm-x86/ptrace-abi.h 2008-04-30 11:18:32.%N +0200
+++ gits.x86/include/asm-x86/ptrace-abi.h 2008-04-30 12:20:15.%N +0200
@@ -82,24 +82,6 @@
#ifdef CONFIG_X86_PTRACE_BTS
-#ifndef __ASSEMBLY__
-#include <asm/types.h>
-
-/* configuration/status structure used in PTRACE_BTS_CONFIG and
- PTRACE_BTS_STATUS commands.
-*/
-struct ptrace_bts_config {
- /* requested or actual size of BTS buffer in bytes */
- __u32 size;
- /* bitmask of below flags */
- __u32 flags;
- /* buffer overflow signal */
- __u32 signal;
- /* actual size of bts_struct in bytes */
- __u32 bts_size;
-};
-#endif /* __ASSEMBLY__ */
-
#define PTRACE_BTS_O_TRACE 0x1 /* branch trace */
#define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */
#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG<signal> on buffer overflow
Index: gits.x86/include/asm-x86/ptrace.h
===================================================================
--- gits.x86.orig/include/asm-x86/ptrace.h 2008-04-30 11:18:32.%N +0200
+++ gits.x86/include/asm-x86/ptrace.h 2008-04-30 12:22:26.%N +0200
@@ -7,6 +7,25 @@
#ifndef __ASSEMBLY__
+#ifdef CONFIG_X86_PTRACE_BTS
+#include <linux/types.h>
+#include <asm/bts.h>
+
+/* configuration/status structure used in PTRACE_BTS_CONFIG and
+ PTRACE_BTS_STATUS commands.
+*/
+struct ptrace_bts_config {
+ /* requested or actual size of BTS buffer in bytes */
+ __u32 size;
+ /* bitmask of below flags */
+ __u32 flags;
+ /* buffer overflow signal */
+ __u32 signal;
+ /* actual size of bts_struct in bytes */
+ __u32 bts_size;
+};
+#endif /* CONFIG_X86_PTRACE_BTS */
+
#ifdef __i386__
/* this struct defines the way the registers are stored on the
stack during a system call. */
@@ -126,48 +145,12 @@
#endif /* !__i386__ */
-#ifdef CONFIG_X86_PTRACE_BTS
-/* a branch trace record entry
- *
- * In order to unify the interface between various processor versions,
- * we use the below data structure for all processors.
- */
-enum bts_qualifier {
- BTS_INVALID = 0,
- BTS_BRANCH,
- BTS_TASK_ARRIVES,
- BTS_TASK_DEPARTS
-};
-
-struct bts_struct {
- __u64 qualifier;
- union {
- /* BTS_BRANCH */
- struct {
- __u64 from_ip;
- __u64 to_ip;
- } lbr;
- /* BTS_TASK_ARRIVES or
- BTS_TASK_DEPARTS */
- __u64 jiffies;
- } variant;
-};
-#endif /* CONFIG_X86_PTRACE_BTS */
-
#ifdef __KERNEL__
#include <linux/init.h>
-struct cpuinfo_x86;
struct task_struct;
-#ifdef CONFIG_X86_PTRACE_BTS
-extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *);
-extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier);
-#else
-#define ptrace_bts_init_intel(config) do {} while (0)
-#endif /* CONFIG_X86_PTRACE_BTS */
-
extern unsigned long profile_pc(struct pt_regs *regs);
extern unsigned long
Index: gits.x86/arch/x86/kernel/ds.c
===================================================================
--- gits.x86.orig/arch/x86/kernel/ds.c 2008-04-30 11:16:34.%N +0200
+++ gits.x86/arch/x86/kernel/ds.c 2008-04-30 12:27:02.%N +0200
@@ -29,6 +29,7 @@
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/module.h>
/*
@@ -131,9 +132,14 @@
if (!context)
return -EPERM;
+ /* The one who did the ds_request() is OK */
if (context->owner[qual] == current)
return 0;
+ /* Every task may access its own configuration */
+ if (context->task == current)
+ return 0;
+
return -EPERM;
}
@@ -333,24 +339,27 @@
*/
static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
{
+ struct mm_struct *mm = current->mm;
unsigned long rlim, vm, pgsz;
- void *buffer;
+ void *buffer = NULL;
pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ down_write(&mm->mmap_sem);
+
rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
vm = current->mm->total_vm + pgsz;
if (rlim < vm)
- return NULL;
+ goto out;
rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
vm = current->mm->locked_vm + pgsz;
if (rlim < vm)
- return NULL;
+ goto out;
buffer = kzalloc(size, GFP_KERNEL);
if (!buffer)
- return NULL;
+ goto out;
current->mm->total_vm += pgsz;
current->mm->locked_vm += pgsz;
@@ -358,16 +367,18 @@
if (pages)
*pages = pgsz;
+ out:
+ up_write(&mm->mmap_sem);
return buffer;
}
static int ds_request(struct task_struct *task, void *base, size_t size,
ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
{
- struct ds_context *context;
+ struct ds_context *context = NULL;
unsigned long buffer, adj;
const unsigned long alignment = (1 << 3);
- int error = 0;
+ int error;
if (!ds_cfg.sizeof_ds)
return -EOPNOTSUPP;
@@ -383,8 +394,9 @@
spin_lock(&ds_lock);
+ error = -EPERM;
if (!check_tracer(task))
- return -EPERM;
+ goto out_unlock;
error = -ENOMEM;
context = ds_alloc_context(task);
@@ -395,7 +407,7 @@
if (context->owner[qual] == current)
goto out_unlock;
error = -EPERM;
- if (context->owner[qual] != NULL)
+ if (context->owner[qual] != 0)
goto out_unlock;
context->owner[qual] = current;
@@ -445,7 +457,7 @@
return error;
out_release:
- context->owner[qual] = NULL;
+ context->owner[qual] = 0;
ds_put_context(context);
return error;
@@ -460,15 +472,18 @@
{
return ds_request(task, base, size, ovfl, ds_bts);
}
+EXPORT_SYMBOL(ds_request_bts);
int ds_request_pebs(struct task_struct *task, void *base, size_t size,
ds_ovfl_callback_t ovfl)
{
return ds_request(task, base, size, ovfl, ds_pebs);
}
+EXPORT_SYMBOL(ds_request_pebs);
static int ds_release(struct task_struct *task, enum ds_qualifier qual)
{
+ struct mm_struct *mm = current->mm;
struct ds_context *context;
int error;
@@ -477,13 +492,20 @@
if (error < 0)
goto out;
- kfree(context->buffer[qual]);
- context->buffer[qual] = 0;
+ if (context->buffer[qual]) {
+ kfree(context->buffer[qual]);
+ context->buffer[qual] = NULL;
- current->mm->total_vm -= context->pages[qual];
- current->mm->locked_vm -= context->pages[qual];
- context->pages[qual] = 0;
- context->owner[qual] = 0;
+ down_write(&mm->mmap_sem);
+
+ current->mm->total_vm -= context->pages[qual];
+ current->mm->locked_vm -= context->pages[qual];
+
+ up_write(&mm->mmap_sem);
+
+ context->pages[qual] = 0;
+ context->owner[qual] = 0;
+ }
/*
* we put the context twice:
@@ -500,11 +522,25 @@
{
return ds_release(task, ds_bts);
}
+EXPORT_SYMBOL(ds_release_bts);
int ds_release_pebs(struct task_struct *task)
{
return ds_release(task, ds_pebs);
}
+EXPORT_SYMBOL(ds_release_pebs);
+
+int ds_validate_bts(struct task_struct *task)
+{
+ return ds_validate_access(ds_get_context(task), ds_bts);
+}
+EXPORT_SYMBOL(ds_validate_bts);
+
+int ds_validate_pebs(struct task_struct *task)
+{
+ return ds_validate_access(ds_get_context(task), ds_pebs);
+}
+EXPORT_SYMBOL(ds_validate_pebs);
static int ds_get_index(struct task_struct *task, size_t *pos,
enum ds_qualifier qual)
@@ -533,11 +569,13 @@
{
return ds_get_index(task, pos, ds_bts);
}
+EXPORT_SYMBOL(ds_get_bts_index);
int ds_get_pebs_index(struct task_struct *task, size_t *pos)
{
return ds_get_index(task, pos, ds_pebs);
}
+EXPORT_SYMBOL(ds_get_pebs_index);
static int ds_get_end(struct task_struct *task, size_t *pos,
enum ds_qualifier qual)
@@ -566,11 +604,13 @@
{
return ds_get_end(task, pos, ds_bts);
}
+EXPORT_SYMBOL(ds_get_bts_end);
int ds_get_pebs_end(struct task_struct *task, size_t *pos)
{
return ds_get_end(task, pos, ds_pebs);
}
+EXPORT_SYMBOL(ds_get_pebs_end);
static int ds_access(struct task_struct *task, size_t index,
const void **record, enum ds_qualifier qual)
@@ -605,11 +645,13 @@
{
return ds_access(task, index, record, ds_bts);
}
+EXPORT_SYMBOL(ds_access_bts);
int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
{
return ds_access(task, index, record, ds_pebs);
}
+EXPORT_SYMBOL(ds_access_pebs);
static int ds_write(struct task_struct *task, const void *record, size_t size,
enum ds_qualifier qual, int force)
@@ -693,23 +735,27 @@
{
return ds_write(task, record, size, ds_bts, /* force = */ 0);
}
+EXPORT_SYMBOL(ds_write_bts);
int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
{
return ds_write(task, record, size, ds_pebs, /* force = */ 0);
}
+EXPORT_SYMBOL(ds_write_pebs);
int ds_unchecked_write_bts(struct task_struct *task,
const void *record, size_t size)
{
return ds_write(task, record, size, ds_bts, /* force = */ 1);
}
+EXPORT_SYMBOL(ds_unchecked_write_bts);
int ds_unchecked_write_pebs(struct task_struct *task,
const void *record, size_t size)
{
return ds_write(task, record, size, ds_pebs, /* force = */ 1);
}
+EXPORT_SYMBOL(ds_unchecked_write_pebs);
static int ds_reset_or_clear(struct task_struct *task,
enum ds_qualifier qual, int clear)
@@ -741,21 +787,25 @@
{
return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
}
+EXPORT_SYMBOL(ds_reset_bts);
int ds_reset_pebs(struct task_struct *task)
{
return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
}
+EXPORT_SYMBOL(ds_reset_pebs);
int ds_clear_bts(struct task_struct *task)
{
return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
}
+EXPORT_SYMBOL(ds_clear_bts);
int ds_clear_pebs(struct task_struct *task)
{
return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
}
+EXPORT_SYMBOL(ds_clear_pebs);
int ds_get_pebs_reset(struct task_struct *task, u64 *value)
{
@@ -777,6 +827,7 @@
ds_put_context(context);
return error;
}
+EXPORT_SYMBOL(ds_get_pebs_reset);
int ds_set_pebs_reset(struct task_struct *task, u64 value)
{
@@ -795,22 +846,27 @@
ds_put_context(context);
return error;
}
+EXPORT_SYMBOL(ds_set_pebs_reset);
static const struct ds_configuration ds_cfg_var = {
.sizeof_ds = sizeof(long) * 12,
.sizeof_field = sizeof(long),
.sizeof_rec[ds_bts] = sizeof(long) * 3,
+#ifdef __i386__
.sizeof_rec[ds_pebs] = sizeof(long) * 10
+#else
+ .sizeof_rec[ds_pebs] = sizeof(long) * 18
+#endif
};
static const struct ds_configuration ds_cfg_64 = {
.sizeof_ds = 8 * 12,
.sizeof_field = 8,
.sizeof_rec[ds_bts] = 8 * 3,
- .sizeof_rec[ds_pebs] = 8 * 10
+ .sizeof_rec[ds_pebs] = 8 * 18
};
static inline void
-ds_configure(const struct ds_configuration *cfg)
+ds_init(const struct ds_configuration *cfg)
{
ds_cfg = *cfg;
}
@@ -822,11 +878,11 @@
switch (c->x86_model) {
case 0xD:
case 0xE: /* Pentium M */
- ds_configure(&ds_cfg_var);
+ ds_init(&ds_cfg_var);
break;
case 0xF: /* Core2 */
case 0x1C: /* Atom */
- ds_configure(&ds_cfg_64);
+ ds_init(&ds_cfg_64);
break;
default:
/* sorry, don't know about them */
@@ -838,7 +894,7 @@
case 0x0:
case 0x1:
case 0x2: /* Netburst */
- ds_configure(&ds_cfg_var);
+ ds_init(&ds_cfg_var);
break;
default:
/* sorry, don't know about them */
@@ -860,4 +916,6 @@
while (leftovers--)
ds_put_context(context);
}
+EXPORT_SYMBOL(ds_free);
+
#endif /* CONFIG_X86_DS */
Index: gits.x86/include/asm-x86/ds.h
===================================================================
--- gits.x86.orig/include/asm-x86/ds.h 2008-04-30 11:18:31.%N +0200
+++ gits.x86/include/asm-x86/ds.h 2008-04-30 12:24:17.%N +0200
@@ -67,6 +67,18 @@
extern int ds_release_pebs(struct task_struct *task);
/*
+ * Validate that the current task is allowed to access the BTS or PEBS
+ * resources.
+ *
+ * Returns 0 on success; -Eerrno otherwise
+ *
+ * task: the task to validate ownership for;
+ * NULL to validate for the current cpu
+ */
+extern int ds_validate_bts(struct task_struct *task);
+extern int ds_validate_pebs(struct task_struct *task);
+
+/*
* Return the (array) index of the write pointer.
* (assuming an array of BTS/PEBS records)
*
---------------------------------------------------------------------
Intel GmbH
Dornacher Strasse 1
85622 Feldkirchen/Muenchen Germany
Sitz der Gesellschaft: Feldkirchen bei Muenchen
Geschaeftsfuehrer: Douglas Lusk, Peter Gleissner, Hannes Schwaderer
Registergericht: Muenchen HRB 47456 Ust.-IdNr.
VAT Registration No.: DE129385895
Citibank Frankfurt (BLZ 502 109 00) 600119052
This e-mail and any attachments may contain confidential material for
the sole use of the intended recipient(s). Any review or distribution
by others is strictly prohibited. If you are not the intended
recipient, please contact the sender and delete all copies.
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [patch] x86, ptrace: in-kernel BTS interface
2008-04-30 11:54 [patch] x86, ptrace: in-kernel BTS interface Markus Metzger
@ 2008-04-30 12:26 ` Andrew Morton
2008-04-30 12:43 ` Metzger, Markus T
2008-04-30 13:03 ` Andi Kleen
2008-05-03 1:43 ` Roland McGrath
2 siblings, 1 reply; 13+ messages in thread
From: Andrew Morton @ 2008-04-30 12:26 UTC (permalink / raw)
To: Markus Metzger
Cc: andi-suse, hpa, linux-kernel, mingo, tglx, markus.t.metzger,
suresh.b.siddha, roland, mtk.manpages, eranian, juan.villacis
On Wed, 30 Apr 2008 13:54:40 +0200 Markus Metzger <markus.t.metzger@intel.com> wrote:
> Provide an in-kernel interface to Branch Trace Store and implement the
> ptrace user interface on top of it.
>
> Fix a few bugs that were detected during the perfmon2 adaptation to
> the DS interface by Stephane Eranian.
>
> The ptrace implementation becomes a rather thin layer that only
> forwards requests.
>
> The BTS interface may later be morphed into a utrace interface for
> execution trace, or it may be used to implement such a utrace
> interface on top of it.
This appears to be against a version of x86-ptrace-pebs-support
which I don't have.
I dropped everything - let's start again.
^ permalink raw reply [flat|nested] 13+ messages in thread
* RE: [patch] x86, ptrace: in-kernel BTS interface
2008-04-30 12:26 ` Andrew Morton
@ 2008-04-30 12:43 ` Metzger, Markus T
2008-04-30 12:55 ` Andrew Morton
0 siblings, 1 reply; 13+ messages in thread
From: Metzger, Markus T @ 2008-04-30 12:43 UTC (permalink / raw)
To: Andrew Morton
Cc: andi-suse, hpa, linux-kernel, mingo, tglx, markus.t.metzger,
Siddha, Suresh B, roland, mtk.manpages, eranian, Villacis, Juan
>-----Original Message-----
>From: Andrew Morton [mailto:akpm@linux-foundation.org]
>Sent: Mittwoch, 30. April 2008 14:27
>To: Metzger, Markus T
>> Provide an in-kernel interface to Branch Trace Store and
>implement the
>> ptrace user interface on top of it.
>>
>> Fix a few bugs that were detected during the perfmon2 adaptation to
>> the DS interface by Stephane Eranian.
>>
>> The ptrace implementation becomes a rather thin layer that only
>> forwards requests.
>>
>> The BTS interface may later be morphed into a utrace interface for
>> execution trace, or it may be used to implement such a utrace
>> interface on top of it.
>
>This appears to be against a version of x86-ptrace-pebs-support
>which I don't have.
>
>I dropped everything - let's start again.
That was against x86.git#latest.
Do you want me to send patches against some other git? Which one?
You complained about the size of the patch. If we start over, I could
split it into 3 pieces, one for each layer.
regards,
markus.
---------------------------------------------------------------------
Intel GmbH
Dornacher Strasse 1
85622 Feldkirchen/Muenchen Germany
Sitz der Gesellschaft: Feldkirchen bei Muenchen
Geschaeftsfuehrer: Douglas Lusk, Peter Gleissner, Hannes Schwaderer
Registergericht: Muenchen HRB 47456 Ust.-IdNr.
VAT Registration No.: DE129385895
Citibank Frankfurt (BLZ 502 109 00) 600119052
This e-mail and any attachments may contain confidential material for
the sole use of the intended recipient(s). Any review or distribution
by others is strictly prohibited. If you are not the intended
recipient, please contact the sender and delete all copies.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [patch] x86, ptrace: in-kernel BTS interface
2008-04-30 12:43 ` Metzger, Markus T
@ 2008-04-30 12:55 ` Andrew Morton
0 siblings, 0 replies; 13+ messages in thread
From: Andrew Morton @ 2008-04-30 12:55 UTC (permalink / raw)
To: Metzger, Markus T
Cc: andi-suse, hpa, linux-kernel, mingo, tglx, markus.t.metzger,
Siddha, Suresh B, roland, mtk.manpages, eranian, Villacis, Juan
On Wed, 30 Apr 2008 13:43:20 +0100 "Metzger, Markus T" <markus.t.metzger@intel.com> wrote:
> >-----Original Message-----
> >From: Andrew Morton [mailto:akpm@linux-foundation.org]
> >Sent: Mittwoch, 30. April 2008 14:27
> >To: Metzger, Markus T
>
> >> Provide an in-kernel interface to Branch Trace Store and
> >implement the
> >> ptrace user interface on top of it.
> >>
> >> Fix a few bugs that were detected during the perfmon2 adaptation to
> >> the DS interface by Stephane Eranian.
> >>
> >> The ptrace implementation becomes a rather thin layer that only
> >> forwards requests.
> >>
> >> The BTS interface may later be morphed into a utrace interface for
> >> execution trace, or it may be used to implement such a utrace
> >> interface on top of it.
> >
> >This appears to be against a version of x86-ptrace-pebs-support
> >which I don't have.
> >
> >I dropped everything - let's start again.
>
> That was against x86.git#latest.
oh, ok. Hopefully this run-multiple-trees-to-confuse-everyone problem will
go away soon.
> Do you want me to send patches against some other git? Which one?
I'll let you and Ingo sort it out.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [patch] x86, ptrace: in-kernel BTS interface
2008-04-30 11:54 [patch] x86, ptrace: in-kernel BTS interface Markus Metzger
2008-04-30 12:26 ` Andrew Morton
@ 2008-04-30 13:03 ` Andi Kleen
2008-04-30 15:30 ` Metzger, Markus T
2008-05-03 1:43 ` Roland McGrath
2 siblings, 1 reply; 13+ messages in thread
From: Andi Kleen @ 2008-04-30 13:03 UTC (permalink / raw)
To: Markus Metzger
Cc: andi-suse, hpa, linux-kernel, mingo, tglx, markus.t.metzger,
suresh.b.siddha, roland, akpm, mtk.manpages, eranian,
juan.villacis
I'm not quite sure on the kernel interface. How would a in kernel
subsystem use it for tracing itself for example?
> Index: gits.x86/arch/x86/kernel/bts.c
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ gits.x86/arch/x86/kernel/bts.c 2008-04-30 11:30:18.%N +0200
> @@ -0,0 +1,505 @@
> +/*
> + * Branch Trace Store (BTS) support
> + *
> + * This provides a low-level interface to the hardware's Branch Trace Store
> + * feature that is used for execution tracing.
Perhaps say it is only supported on modern Intel CPUs.
> + *
> + * It manages:
> + * - per-thread and per-cpu BTS configuration
> + * - buffer memory allocation and overflow handling
> + *
> + * It assumes:
> + * - get_task_struct on all parameter tasks
What is a parameter task?
> + * - current is allowed to trace parameter tasks
> + *
> + *
> + * Copyright (C) 2008 Intel Corporation.
> + * Markus Metzger <markus.t.metzger@intel.com>, 2008
> + */
> +
> +#ifdef CONFIG_X86_BTS
Ifdef around whole file should be in the Makefile instead.
In fact it is already in there so it is obsolete
> +struct bts_configuration {
> + /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
> + unsigned char sizeof_bts;
> + /* the size of a field in the BTS record in bytes */
> + unsigned char sizeof_field;
> + /* a bitmask to enable/disable various parts of BTS in DEBUGCTL MSR */
> + unsigned long debugctl_tr;
> + unsigned long debugctl_btint;
> + unsigned long debugctl_user_off;
> + unsigned long debugctl_kernel_off;
> + unsigned long debugctl_all;
> +};
> +static struct bts_configuration bts_cfg;
Should have a comment describing the locking of the variable. Is there is
no need for some reason that should be also documented.
> +}
> +EXPORT_SYMBOL(bts_request);
Why again is that all exported?
> + if (kbuf)
> + bts_translate_record(kbuf++, raw);
> +
> + if (ubuf) {
> + bts_translate_record(&bts, raw);
> +
> + if (copy_to_user(ubuf++, &bts, sizeof(bts)))
copy_to_user is a macro and using expressions with side effects in
macro arguments is usually a bad idea.
> +static const struct bts_configuration bts_cfg_netburst = {
> + .sizeof_bts = sizeof(long) * 3,
> + .sizeof_field = sizeof(long),
> + .debugctl_tr = (1<<2)|(1<<3),
> + .debugctl_btint = (1<<4),
> + .debugctl_user_off = (1<<6),
> + .debugctl_kernel_off = (1<<5)
Define symbols for the magic numbers?
> + switch (c->x86_model) {
> + case 0xD:
> + case 0xE: /* Pentium M */
> + bts_init(&bts_cfg_pentium_m);
> + break;
> + case 0xF: /* Core2 */
> + case 0x1C: /* Atom */
> + bts_init(&bts_cfg_core2);
> + break;
> + default:
> + /* sorry, don't know about them */
There should be a printk probably once at kernel boot time.
> + break;
> + }
> + break;
> + case 0xF:
> + switch (c->x86_model) {
> + case 0x0:
> + case 0x1:
> + case 0x2: /* Netburst */
> + bts_init(&bts_cfg_netburst);
Are you sure that's complete?
> +#ifdef __KERNEL__
> +struct bts_struct {
> + u64 qualifier;
> + union {
> + /* BTS_BRANCH */
> + struct {
> + u64 from;
> + u64 to;
> + } lbr;
> + /* BTS_TASK_ARRIVES or
> + BTS_TASK_DEPARTS */
> + u64 jiffies;
> + } variant;
> +};
> +#else /* !__KERNEL__ */
> +struct bts_struct {
> + __u64 qualifier;
You could always use the __ typed variant even for the kernel.
> +
> +/*
> + * Request branch tracing for the parameter task or for the current cpu.
> + *
> + * Due to alignement constraints, the actual buffer may be slightly
> + * smaller than the requested or provided buffer.
> + *
> + * Returns 0 on success; -Eerrno otherwise
> + *
> + * task: the task to request recording for;
> + * NULL for per-cpu recording on the current cpu
> + * base: the base pointer for the (non-pageable) buffer;
> + * NULL if buffer allocation requested
> + * size: the size of the requested or provided buffer
> + * ovfl: pointer to a function to be called on buffer overflow;
> + * NULL if cyclic buffer requested
If you write these comments in kerneldoc format (see
Documentation/kernel-doc-nano-HOWTO.txt)
it might end up automatically in the extracted documentation.
> + * Not all processors support all variants.
> + * If a variant is not supported, the respective flag is ignored.
Is that really a good way to handle such an error? How does the
user program find out?
> }
>
> #ifdef CONFIG_X86_PTRACE_BTS
Hmm I suspect since that is not mainline you'll need to just ask
for the previous patches to be dropped, not remove the code explicitely.
-Andi
^ permalink raw reply [flat|nested] 13+ messages in thread* RE: [patch] x86, ptrace: in-kernel BTS interface
2008-04-30 13:03 ` Andi Kleen
@ 2008-04-30 15:30 ` Metzger, Markus T
2008-04-30 16:06 ` Andi Kleen
0 siblings, 1 reply; 13+ messages in thread
From: Metzger, Markus T @ 2008-04-30 15:30 UTC (permalink / raw)
To: Andi Kleen
Cc: andi-suse, hpa, linux-kernel, mingo, tglx, markus.t.metzger,
Siddha, Suresh B, roland, akpm, mtk.manpages, eranian,
Villacis, Juan
>-----Original Message-----
>From: Andi Kleen [mailto:andi@firstfloor.org]
>Sent: Mittwoch, 30. April 2008 15:03
>To: Metzger, Markus T
Thanks for your feedback.
>I'm not quite sure on the kernel interface. How would a in kernel
>subsystem use it for tracing itself for example?
The task parameter would be current.
Apart from that, you would use it the same way it is used in
arch/x86/kernel/ptrace.c.
You would want to bts_configure() it to disable tracing before you start
reading your own BTS buffer.
>> Index: gits.x86/arch/x86/kernel/bts.c
>> ===================================================================
>> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
>> +++ gits.x86/arch/x86/kernel/bts.c 2008-04-30 11:30:18.%N +0200
>> @@ -0,0 +1,505 @@
>> +/*
>> + * Branch Trace Store (BTS) support
>> + *
>> + * This provides a low-level interface to the hardware's
>Branch Trace Store
>> + * feature that is used for execution tracing.
>
>Perhaps say it is only supported on modern Intel CPUs.
I will add such a comment.
>> + *
>> + * It manages:
>> + * - per-thread and per-cpu BTS configuration
>> + * - buffer memory allocation and overflow handling
>> + *
>> + * It assumes:
>> + * - get_task_struct on all parameter tasks
>
>What is a parameter task?
I was referring to the 'task' parameter for the below functions.
I will try to find a better wording.
>> + * - current is allowed to trace parameter tasks
>> + *
>> + *
>> + * Copyright (C) 2008 Intel Corporation.
>> + * Markus Metzger <markus.t.metzger@intel.com>, 2008
>> + */
>> +
>> +#ifdef CONFIG_X86_BTS
>
>Ifdef around whole file should be in the Makefile instead.
>In fact it is already in there so it is obsolet
I'll remove it - for ds.c, as well.
>> +struct bts_configuration {
>> + /* the size of a BTS record in bytes; at most
>BTS_MAX_RECORD_SIZE */
>> + unsigned char sizeof_bts;
>> + /* the size of a field in the BTS record in bytes */
>> + unsigned char sizeof_field;
>> + /* a bitmask to enable/disable various parts of BTS in
>DEBUGCTL MSR */
>> + unsigned long debugctl_tr;
>> + unsigned long debugctl_btint;
>> + unsigned long debugctl_user_off;
>> + unsigned long debugctl_kernel_off;
>> + unsigned long debugctl_all;
>> +};
>> +static struct bts_configuration bts_cfg;
>
>Should have a comment describing the locking of the variable.
>Is there is
>no need for some reason that should be also documented.
The variable is write-once. It is written during boot time. It is not
locked.
I'll add a comment.
>> +}
>> +EXPORT_SYMBOL(bts_request);
>
>Why again is that all exported?
Stephane asked for those exports - for the DS symbols, at least - in
order to use it with perfmon2.
>> + if (kbuf)
>> + bts_translate_record(kbuf++, raw);
>> +
>> + if (ubuf) {
>> + bts_translate_record(&bts, raw);
>> +
>> + if (copy_to_user(ubuf++, &bts, sizeof(bts)))
>
>copy_to_user is a macro and using expressions with side effects in
>macro arguments is usually a bad idea.
I'll replace it.
>> +static const struct bts_configuration bts_cfg_netburst = {
>> + .sizeof_bts = sizeof(long) * 3,
>> + .sizeof_field = sizeof(long),
>> + .debugctl_tr = (1<<2)|(1<<3),
>> + .debugctl_btint = (1<<4),
>> + .debugctl_user_off = (1<<6),
>> + .debugctl_kernel_off = (1<<5)
>
>Define symbols for the magic numbers?
Hmmm, the symbols would have the same name as the respective
bts_configuration field (except for _tr, which actually is TR | BTS).
>
>> + switch (c->x86_model) {
>> + case 0xD:
>> + case 0xE: /* Pentium M */
>> + bts_init(&bts_cfg_pentium_m);
>> + break;
>> + case 0xF: /* Core2 */
>> + case 0x1C: /* Atom */
>> + bts_init(&bts_cfg_core2);
>> + break;
>> + default:
>> + /* sorry, don't know about them */
>
>There should be a printk probably once at kernel boot time.
Ok.
Is there some special format that I should use?
>> + break;
>> + }
>> + break;
>> + case 0xF:
>> + switch (c->x86_model) {
>> + case 0x0:
>> + case 0x1:
>> + case 0x2: /* Netburst */
>> + bts_init(&bts_cfg_netburst);
>
>Are you sure that's complete?
I'm sure that's not complete.
That's all the hardware I have direct access to.
Supporting more hardware is a small thing, but it adds to the testing
obligation.
I'd rather add more hardware after the feature is working OK.
>> +#ifdef __KERNEL__
>> +struct bts_struct {
>> + u64 qualifier;
>> + union {
>> + /* BTS_BRANCH */
>> + struct {
>> + u64 from;
>> + u64 to;
>> + } lbr;
>> + /* BTS_TASK_ARRIVES or
>> + BTS_TASK_DEPARTS */
>> + u64 jiffies;
>> + } variant;
>> +};
>> +#else /* !__KERNEL__ */
>> +struct bts_struct {
>> + __u64 qualifier;
>
>You could always use the __ typed variant even for the kernel.
Good.
>> + * Request branch tracing for the parameter task or for the
>current cpu.
>> + *
>> + * Due to alignement constraints, the actual buffer may be slightly
>> + * smaller than the requested or provided buffer.
>> + *
>> + * Returns 0 on success; -Eerrno otherwise
>> + *
>> + * task: the task to request recording for;
>> + * NULL for per-cpu recording on the current cpu
>> + * base: the base pointer for the (non-pageable) buffer;
>> + * NULL if buffer allocation requested
>> + * size: the size of the requested or provided buffer
>> + * ovfl: pointer to a function to be called on buffer overflow;
>> + * NULL if cyclic buffer requested
>
>If you write these comments in kerneldoc format (see
>Documentation/kernel-doc-nano-HOWTO.txt)
>it might end up automatically in the extracted documentation.
I'll do that - and for ds.c, as well.
>> + * Not all processors support all variants.
>> + * If a variant is not supported, the respective flag is ignored.
>
>Is that really a good way to handle such an error? How does the
>user program find out?
It's not a serious error condition; it's more a QoI thing. You might get
more trace than you asked for.
I'll add a function to return a bit-mask of supported features for this
architecture and fail with an error if a requested feature is not
supported.
>> }
>>
>> #ifdef CONFIG_X86_PTRACE_BTS
>
>Hmm I suspect since that is not mainline you'll need to just ask
>for the previous patches to be dropped, not remove the code
>explicitely.
I tried to git-revert an old version of the patch to replace it with an
updated version.
I had also split the patch into three smaller pieces for the three
different layers.
But there's so much on top of it that I ended up with conflicts. The
easy thing for me would be to just send out another patch on top of it.
For this one, I expect it won't be applied, so I simply resend a
corrected version some time next week.
thanks and regards,
markus.
---------------------------------------------------------------------
Intel GmbH
Dornacher Strasse 1
85622 Feldkirchen/Muenchen Germany
Sitz der Gesellschaft: Feldkirchen bei Muenchen
Geschaeftsfuehrer: Douglas Lusk, Peter Gleissner, Hannes Schwaderer
Registergericht: Muenchen HRB 47456 Ust.-IdNr.
VAT Registration No.: DE129385895
Citibank Frankfurt (BLZ 502 109 00) 600119052
This e-mail and any attachments may contain confidential material for
the sole use of the intended recipient(s). Any review or distribution
by others is strictly prohibited. If you are not the intended
recipient, please contact the sender and delete all copies.
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [patch] x86, ptrace: in-kernel BTS interface
2008-04-30 15:30 ` Metzger, Markus T
@ 2008-04-30 16:06 ` Andi Kleen
2008-05-05 6:25 ` Metzger, Markus T
0 siblings, 1 reply; 13+ messages in thread
From: Andi Kleen @ 2008-04-30 16:06 UTC (permalink / raw)
To: Metzger, Markus T
Cc: Andi Kleen, andi-suse, hpa, linux-kernel, mingo, tglx,
markus.t.metzger, Siddha, Suresh B, roland, akpm, mtk.manpages,
eranian, Villacis, Juan
>
> >I'm not quite sure on the kernel interface. How would a in kernel
> >subsystem use it for tracing itself for example?
>
> The task parameter would be current.
>
> Apart from that, you would use it the same way it is used in
> arch/x86/kernel/ptrace.c.
Hmm, sounds a little too complicated. Surely that could be easier?
>
> You would want to bts_configure() it to disable tracing before you start
> reading your own BTS buffer.
I would suggest to write a simple example and put it with some overview
into Documentation
-Andi
^ permalink raw reply [flat|nested] 13+ messages in thread
* RE: [patch] x86, ptrace: in-kernel BTS interface
2008-04-30 16:06 ` Andi Kleen
@ 2008-05-05 6:25 ` Metzger, Markus T
0 siblings, 0 replies; 13+ messages in thread
From: Metzger, Markus T @ 2008-05-05 6:25 UTC (permalink / raw)
To: Andi Kleen
Cc: andi-suse, hpa, linux-kernel, mingo, tglx, markus.t.metzger,
Siddha, Suresh B, roland, akpm, mtk.manpages, eranian,
Villacis, Juan
>-----Original Message-----
>From: Andi Kleen [mailto:andi@firstfloor.org]
>Sent: Mittwoch, 30. April 2008 18:06
>> >I'm not quite sure on the kernel interface. How would a in kernel
>> >subsystem use it for tracing itself for example?
>>
>> The task parameter would be current.
>>
>> Apart from that, you would use it the same way it is used in
>> arch/x86/kernel/ptrace.c.
>
>Hmm, sounds a little too complicated. Surely that could be easier?
Actually, it's just a bts_request() followed by a bts_configure() call.
regards,
markus.
---------------------------------------------------------------------
Intel GmbH
Dornacher Strasse 1
85622 Feldkirchen/Muenchen Germany
Sitz der Gesellschaft: Feldkirchen bei Muenchen
Geschaeftsfuehrer: Douglas Lusk, Peter Gleissner, Hannes Schwaderer
Registergericht: Muenchen HRB 47456 Ust.-IdNr.
VAT Registration No.: DE129385895
Citibank Frankfurt (BLZ 502 109 00) 600119052
This e-mail and any attachments may contain confidential material for
the sole use of the intended recipient(s). Any review or distribution
by others is strictly prohibited. If you are not the intended
recipient, please contact the sender and delete all copies.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [patch] x86, ptrace: in-kernel BTS interface
2008-04-30 11:54 [patch] x86, ptrace: in-kernel BTS interface Markus Metzger
2008-04-30 12:26 ` Andrew Morton
2008-04-30 13:03 ` Andi Kleen
@ 2008-05-03 1:43 ` Roland McGrath
2008-05-03 8:45 ` Andi Kleen
2008-05-05 9:09 ` Metzger, Markus T
2 siblings, 2 replies; 13+ messages in thread
From: Roland McGrath @ 2008-05-03 1:43 UTC (permalink / raw)
To: Markus Metzger
Cc: andi-suse, hpa, linux-kernel, mingo, tglx, markus.t.metzger,
suresh.b.siddha, akpm, mtk.manpages, eranian, juan.villacis
There are various nits to fix, some of which Andi mentioned.
I'll just mention a couple of those and look closer on the next round.
For the configured-out cases, don't make e.g. ds_init_intel() a macro.
Make it a static inline with an empty body. This follows the rule of,
"Don't make it a macro unless you have to," and in specific it ensures that
callers get argument type checking in all configurations (reduces future
bit rot potential).
Any #ifdef CONFIG_* in a user-visible header (outside #ifdef __KERNEL__)
is wrong. Users don't have those definitions. If you are defining
user-visible ABI bits, they have to be unconditional.
Now, on more substantive concerns. This is heading in the right general
direction, to having a well-specified internal interface to build from.
* Not all processors support all variants.
* If a variant is not supported, the respective flag is ignored.
...
#define BTS_O_TRACE (1<<0) /* record branch trace */
#define BTS_O_TIMESTAMP (1<<1) /* record scheduling timestamps */
#define BTS_O_USER_OFF (1<<2) /* do not trace user mode */
#define BTS_O_KERNEL_OFF (1<<3) /* do not trace kernel mode */
The three +-- flags are not the natural interface (even if that is what the
hardware bits look like). Just have two bits: trace kernel branches, trace
user branches. Also, the style of silently ignoring flags is generally
bad. I recognize that you have bts_status() to tell what took. But still,
it's awkward. In this case I don't think we need to worry about interface
details for what's not supported, because we can just expose a consistent
interface on all the hardware.
Since we are already transcribing each BTS entry into the ABI form anyway,
it's easy to weed out the ones for the wrong mode along the way. We can
distinguish user from kernel entries with ip < TASK_SIZE. So what I'd do
is code it that way and make it work with ignoring the unwanted entries in
software. Then, add the support for ds_cpl hardware to let the CPU do it,
but that is just an optimization that doesn't change the interface.
This relates to the major thing I find missing in the interface:
multiplexing. I'd like to support an in-kernel global tracing request
at the same time as a user-mode request for tracing on an individual
task. In fact, all permutations and multiple of each. For the
in-kernel interface, I'd like to see an interface to request a new
"tracing context" that can be one task or global and kernel-only or
user-only or both. The hardware setup is to trace the union of all
those requests for the current task plus all the global requests. When
delivering samples from the buffer, we sort them out to who wants to see
what. It seems pretty straightforward. The main thing is deciding how
big a buffer to use.
For global tracing simultaneous with per-task tracing, there are two
ways to go. You can use a single hardware buffer and just write markers
into it at context switch time to distinguish which task each stretch of
entries was in. Or you can switch buffers on context switch, and then
collecting global samples means collecting samples from every task
buffer that's been active since last collection, plus a global buffer
that's switched in for any task that doesn't have anyone doing per-task
tracing.
The implementation of all that would go in the bts.c layer.
Now, a few things about the ds.c layer.
I don't think ds_request should do the buffer allocation, just the
ds_context. This keeps the DS layer just what it has to be: programming
and switching the DS MSR and managing the ds_context for one BTS caller
and one PEBS caller to cooperate. The caller can supply the BTS/PEBS
buffer address and be responsible for meeting the requirements: it has
to be wired and in kernel address space, and aligned (the manual
recommends cacheline-aligned, so might as well make that the interface
requirement) and not odd-sized.
What is ds_validate_access for? I don't think this level of code should
be thinking about concepts like permission at all. What the current
task is making the calls into the ds.c layer should not matter. It's
the job of higher-level code to decide if this is a proper thing to be
doing.
Thanks,
Roland
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [patch] x86, ptrace: in-kernel BTS interface
2008-05-03 1:43 ` Roland McGrath
@ 2008-05-03 8:45 ` Andi Kleen
2008-05-05 9:09 ` Metzger, Markus T
1 sibling, 0 replies; 13+ messages in thread
From: Andi Kleen @ 2008-05-03 8:45 UTC (permalink / raw)
To: Roland McGrath
Cc: Markus Metzger, andi-suse, hpa, linux-kernel, mingo, tglx,
markus.t.metzger, suresh.b.siddha, akpm, mtk.manpages, eranian,
juan.villacis
> Now, on more substantive concerns. This is heading in the right general
> direction, to having a well-specified internal interface to build from.
That's actually an unusual requirement for Linux, a Roland Special. Normally
we don't do complicated internal interfaces that are not used by anything yet.
-Andi
^ permalink raw reply [flat|nested] 13+ messages in thread
* RE: [patch] x86, ptrace: in-kernel BTS interface
2008-05-03 1:43 ` Roland McGrath
2008-05-03 8:45 ` Andi Kleen
@ 2008-05-05 9:09 ` Metzger, Markus T
2008-05-05 23:03 ` Roland McGrath
1 sibling, 1 reply; 13+ messages in thread
From: Metzger, Markus T @ 2008-05-05 9:09 UTC (permalink / raw)
To: Roland McGrath
Cc: andi-suse, hpa, linux-kernel, mingo, tglx, markus.t.metzger,
Siddha, Suresh B, akpm, mtk.manpages, eranian, Villacis, Juan
>-----Original Message-----
>From: Roland McGrath [mailto:roland@redhat.com]
>Sent: Samstag, 3. Mai 2008 03:44
>To: Metzger, Markus T
>Now, on more substantive concerns. This is heading in the
>right general
>direction, to having a well-specified internal interface to build from.
>
> * Not all processors support all variants.
> * If a variant is not supported, the respective flag is ignored.
> ...
> #define BTS_O_TRACE (1<<0) /* record branch trace */
> #define BTS_O_TIMESTAMP (1<<1) /* record scheduling
>timestamps */
> #define BTS_O_USER_OFF (1<<2) /* do not trace user mode */
> #define BTS_O_KERNEL_OFF (1<<3) /* do not trace kernel mode */
>
>The three +-- flags are not the natural interface (even if
>that is what the
>hardware bits look like). Just have two bits: trace kernel
>branches, trace
>user branches. Also, the style of silently ignoring flags is generally
>bad. I recognize that you have bts_status() to tell what
>took. But still,
>it's awkward. In this case I don't think we need to worry
>about interface
>details for what's not supported, because we can just expose a
>consistent
>interface on all the hardware.
OK.
I guess timestamps should be always on.
This would affect the ptrace interface. There will be fewer options and I
need to drop the DRAIN and CLEAR commands. With multiple tracers, I can
no longer clear the BTS buffer. Drain will morph into a whole-buffer
read.
>Since we are already transcribing each BTS entry into the ABI
>form anyway,
>it's easy to weed out the ones for the wrong mode along the
>way. We can
>distinguish user from kernel entries with ip < TASK_SIZE. So
>what I'd do
>is code it that way and make it work with ignoring the
>unwanted entries in
>software. Then, add the support for ds_cpl hardware to let
>the CPU do it,
>but that is just an optimization that doesn't change the interface.
OK.
>This relates to the major thing I find missing in the interface:
>multiplexing. I'd like to support an in-kernel global tracing request
>at the same time as a user-mode request for tracing on an individual
>task. In fact, all permutations and multiple of each. For the
>in-kernel interface, I'd like to see an interface to request a new
>"tracing context" that can be one task or global and kernel-only or
>user-only or both. The hardware setup is to trace the union of all
>those requests for the current task plus all the global requests. When
>delivering samples from the buffer, we sort them out to who
>wants to see
>what. It seems pretty straightforward. The main thing is deciding how
>big a buffer to use.
Branch trace buffers tend to run full pretty fast.
For example:
- switching from a task (from user-mode to __switch_to_xtra()) takes
~300 entries (7.2k) - handling a segv (from null function pointer call
to __switch_to_xtra()) takes ~400 entries (9.6k)
- 512k are not enough to hold a single time slice (measured on a printf
loop)
On the other hand:
- knowing where you died takes 1 entry
- knowing how you got there takes <100 entries or you lose track, anyway
If we add ~10k to the buffer size the user requested, we should be able
to hold the extra kernel-mode trace and do the filtering in software -
assuming that we will never hold more trace than the tail of a single
time slice.
Who should pay for this memory?
Should it be taken from the requesting task's rlimit?
Who should pay in the case of multiple tracers?
What if the initial tracer leaves while other tracers remain? (the
initial tracer would want its rlimit adjusted, but other tracers may not
even have enough resources).
Should I drop the whole rlimit check and allow everybody to request
arbitrarily big buffers?
>For global tracing simultaneous with per-task tracing, there are two
>ways to go. You can use a single hardware buffer and just
>write markers
>into it at context switch time to distinguish which task each
>stretch of
>entries was in. Or you can switch buffers on context switch, and then
>collecting global samples means collecting samples from every task
>buffer that's been active since last collection, plus a global buffer
>that's switched in for any task that doesn't have anyone doing per-task
>tracing.
Given the vast memory consumption, I would only consider circular
buffers. That's all you need for debugging. Trying to collect a full
system trace even for a few seconds will likely fill up an entire disk
server. We may add support for an overflow interrupt for each of the
various buffers, but I doubt that it will be very useful. Trying to get
a consistent global trace will likely eat up more memory and compute
time than it's worth.
I would go for per-task buffers and a catch-all per-cpu buffer (if
per-cpu trace is requested).
I would not try to remember the sequence of recent tasks and construct a
consistent global trace on the fly - we would need too big per-task
buffers. We could copy the per-task trace into the per-cpu buffer on
context switch, but, again, I doubt that this is overly useful.
>I don't think ds_request should do the buffer allocation, just the
>ds_context.
I'm fine to move it into the BTS layer. But it would duplicate the
allocation and accounting code into all of DS users.
>What is ds_validate_access for? I don't think this level of
>code should
>be thinking about concepts like permission at all. What the current
>task is making the calls into the ds.c layer should not matter. It's
>the job of higher-level code to decide if this is a proper thing to be
>doing.
The model was to allow a single owner of the BTS and PEBS configurations
to prevent different users from overwriting their settings.
The first task to make a ds_request() for BTS or PEBS, would own the
respective configuration until it ds_release()s it.
This is essentially a BTS/PEBS resource allocator.
If we cut down on the BTS interface and collect all trace at all times,
anyway, we would not need this, any more.
PEBS would still need something like that, though. I wonder whether a
multiplexing model makes sense for PEBS, at all.
regards,
markus.
---------------------------------------------------------------------
Intel GmbH
Dornacher Strasse 1
85622 Feldkirchen/Muenchen Germany
Sitz der Gesellschaft: Feldkirchen bei Muenchen
Geschaeftsfuehrer: Douglas Lusk, Peter Gleissner, Hannes Schwaderer
Registergericht: Muenchen HRB 47456 Ust.-IdNr.
VAT Registration No.: DE129385895
Citibank Frankfurt (BLZ 502 109 00) 600119052
This e-mail and any attachments may contain confidential material for
the sole use of the intended recipient(s). Any review or distribution
by others is strictly prohibited. If you are not the intended
recipient, please contact the sender and delete all copies.
^ permalink raw reply [flat|nested] 13+ messages in thread
* RE: [patch] x86, ptrace: in-kernel BTS interface
2008-05-05 9:09 ` Metzger, Markus T
@ 2008-05-05 23:03 ` Roland McGrath
2008-05-06 6:39 ` stephane eranian
0 siblings, 1 reply; 13+ messages in thread
From: Roland McGrath @ 2008-05-05 23:03 UTC (permalink / raw)
To: Metzger, Markus T
Cc: linux-kernel, mingo, tglx, Siddha, Suresh B, eranian,
Villacis, Juan
> I guess timestamps should be always on.
Not necessarily. I was just talking about the branch-tracing parts.
> This would affect the ptrace interface. There will be less options and I
> need to drop the DRAIN and CLEAR commands. With multiple tracers, I can
> no longer clear the BTS buffer. Drain will morph into a whole-buffer
> read.
I didn't comment about the higher-level interfaces. For now, I'm
concentrating just on the basic BTS layer (and below). The key point to
me is that the internal buffer the hardware writes need not have a 1:1
relation with any particular higher-level interface's ways of buffering.
> Branch trace buffers tend to run full pretty fast.
[...]
Those numbers are helpful to have in mind while considering this stuff.
Thanks.
> If we add ~10k to the buffer size the user requested, we should be able
> to hold the extra kernel-mode trace and do the filtering in software -
> assuming that we will never hold more trace than the tail of a single
> time slice.
I'd write up a few macros (expressed as number of entries) for the
"rules of thumb" numbers you mentioned, like the user->ctxsw path count.
Then we can calculate from these and tune the values for various
scenarios as we go. Note that for e.g. user-only tracing on hardware
without ds_cpl, there could be many more branch entries for other kinds
of in-kernel paths (system calls, page faults). And, of course, on
ds_cpl hardware when no one wants kernel traces, we don't need to add any
overhead space at all (or maybe one entry to be pedantic).
> Who should pay for this memory?
[...]
My first point about this is that it is not the ds.c layer's problem.
I don't have anything to offer vis a vis PEBS right now. For BTS, the
buffer allocation has a lot to do with how we work out the multiplexing.
> Given the vast memory consumption, I would only consider circular
> buffers. [...]
Firstly, this is not a decision for the ds.c layer. It is
straightforward enough to let the caller who sets up a BTS or PEBS
buffer say whether they want circular or interrupt-threshold mode. It's
easy to implement a basic DS interrupt handler that does trivial
dispatch to make one or both callbacks to function pointers provided by
the BTS/PEBS caller (whichever has hit its threshold). The ds.c layer
would also take care of clearing the MSR bit to disable BTS as the
manual says to do, while the callback runs, and then reenable it (and
the analogous thing for PEBS), but that's about it.
> I'm fine to move it into the BTS layer. But it would duplicate the
> allocation and accounting code into all of DS users.
If both the PEBS code and the BTS code turn out to have similar needs,
nothing says they can't both call the same code for their buffer
allocation. That's just not the ds.c layer's business if they do.
> The model was to allow a single owner of the BTS and PEBS configurations
> to prevent different users from overwriting their settings.
> The first task to make a ds_request() for BTS or PEBS, would own the
> respective configuration until it ds_release()s it.
> This is essentially a BTS/PEBS resource allocator.
The simple mutual exclusion is the right thing to have in the ds.c
layer--it's just that it should not be implicitly tied to calling task.
Instead have it be a struct pointer the caller passes in, or a pointer
the allocator passes back, that must match on later requests or a later
release call. That's all. It's up to each caller to decide how to
organize what callers constitute "its" exclusive use of the DS layer.
> If we cut down on the BTS interface and collect all trace at all times,
> anyway, we would not need this, any more.
This is an orthogonal issue. What I'm saying about the DS layer stands
on its own. The DS layer should be straightforward and complete in
these ways. Then this simple low layer can be finished and stable now,
regardless of how the code that uses the BTS slot or the code that uses
the PEBS slot wind up.
> PEBS would still need something like that, though. I wonder whether a
> multiplexing model makes sense for PEBS, at all.
My discussion about multiplexing referred only to the BTS layer. I have
not discussed PEBS at all, except that the DS layer should provide the
same trivially simple interface to control it. For this discussion, I'm
assuming that PEBS is in the hands of perfmon2 and leaving it at that.
If we're agreed about the plan to make the ds.c layer just the simple
thing it needs to be, we can tie up that part of the code and let it be
stable. Then we can move on to the details of the BTS layer. I'm not
sure I communicated clearly what I had in mind for that, but we can get
back into that.
Thanks,
Roland
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [patch] x86, ptrace: in-kernel BTS interface
2008-05-05 23:03 ` Roland McGrath
@ 2008-05-06 6:39 ` stephane eranian
0 siblings, 0 replies; 13+ messages in thread
From: stephane eranian @ 2008-05-06 6:39 UTC (permalink / raw)
To: Roland McGrath
Cc: Metzger, Markus T, linux-kernel, mingo, tglx, Siddha, Suresh B,
Villacis, Juan
Hi,
On Tue, May 6, 2008 at 1:03 AM, Roland McGrath <roland@redhat.com> wrote:
> > Who should pay for this memory?
> [...]
>
> My first point about this is that it is not the ds.c layer's problem.
> I don't have anything to offer vis a vis PEBS right now. For BTS, the
> buffer allocation has a lot to do with how we work out the multiplexing.
>
I would let the upper layer handle this. Perfmon2 takes care of this today.
But the consumer may be the kernel, so you could not charge this onto
the current/asking task.
>
> > Given the vast memory consumption, I would only consider circular
> > buffers. [...]
>
> Firstly, this is not a decision for the ds.c layer. It is
Exactly!
> If both the PEBS code and the BTS code turn out to have similar needs,
> nothing says they can't both call the same code for their buffer
> allocation. That's just not the ds.c layer's business if they do.
>
In Perfmon2, we want to keep control of the buffer allocation. Thus we want
to be able to pass the buffer base address and size, that's all. Perfmon2
does allow the buffer to be remapped to user level.
>
> > The model was to allow a single owner of the BTS and PEBS configurations
> > to prevent different users from overwriting their settings.
> > The first task to make a ds_request() for BTS or PEBS, would own the
> > respective configuration until it ds_release()s it.
> > This is essentially a BTS/PEBS resource allocator.
>
> The simple mutual exclusion is the right thing to have in the ds.c
> layer--it's just that it should not be implicitly tied to calling task.
I agree. I think I have discussed this with Markus already because I
ran into some
issue when experimenting with ds.c for perfmon PEBS support.
> > PEBS would still need something like that, though. I wonder whether a
> > multiplexing model makes sense for PEBS, at all.
I am not sure I understand what you mean by multiplexing.
Are you talking about per-thread vs. system-wide?
Perfmon2 does support both modes incl. for PEBS, although in system-wide
mode, there is not enough information saved in the PEBS buffer to determine
which task generated the sample.
>
> My discussion about multiplexing referred only to the BTS layer. I have
> not discussed PEBS at all, except that the DS layer should provide the
> same trivially simple interface to control it. For this discussion, I'm
> assuming that PEBS is in the hands of perfmon2 and leaving it at that.
>
The interest for using ds for perfmon2 is:
- hides the details about the DS layout (different from P4 to Intel Core)
- ensures mutual exclusion on the DS/PEBS resource
- manages DS save/restore on context switch
So it removes a lot of code.
But perfmon2 does not use:
- buffer allocation
- PEBS record abstraction
- BTS (not yet)
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2008-05-06 6:39 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-04-30 11:54 [patch] x86, ptrace: in-kernel BTS interface Markus Metzger
2008-04-30 12:26 ` Andrew Morton
2008-04-30 12:43 ` Metzger, Markus T
2008-04-30 12:55 ` Andrew Morton
2008-04-30 13:03 ` Andi Kleen
2008-04-30 15:30 ` Metzger, Markus T
2008-04-30 16:06 ` Andi Kleen
2008-05-05 6:25 ` Metzger, Markus T
2008-05-03 1:43 ` Roland McGrath
2008-05-03 8:45 ` Andi Kleen
2008-05-05 9:09 ` Metzger, Markus T
2008-05-05 23:03 ` Roland McGrath
2008-05-06 6:39 ` stephane eranian
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox