[PATCH] perf_events: AMD event scheduling (v2)

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH]  perf_events: AMD event scheduling (v2)
@ 2010-02-01 20:15 Stephane Eranian
  2010-02-04 14:55 ` Peter Zijlstra
  0 siblings, 1 reply; 9+ messages in thread
From: Stephane Eranian @ 2010-02-01 20:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, mingo, paulus, davem, fweisbec, robert.richter,
	perfmon2-devel, eranian, eranian

	
	This patch adds correct AMD Northbridge event scheduling.
	It must be applied on top tip-x86 + hw_perf_enable() fix.

	NB events are events measuring L3 cache, Hypertransport
	traffic. They are identified by an event code  >= 0xe0.
	They measure events on the Northbride which is shared
	by all cores on a package. NB events are counted on a
	shared set of counters. When a NB event is programmed
	in a counter, the data actually comes from a shared
	counter. Thus, access to those counters needs to be
	synchronized.

	We implement the synchronization such that no two cores
	can be measuring NB events using the same counters. Thus,
	we maintain a per-NB * allocation table. The available slot
	is propagated using the event_constraint structure.
 
	This 2nd version takes into account the changes on how
	constraints are stored by the scheduling code.

	The patch also takes care of hotplug CPU.

	Signed-off-by: Stephane Eranian <eranian@google.com>

--
 arch/x86/kernel/cpu/perf_event.c |  263 ++++++++++++++++++++++++++++++++++++++-
 kernel/perf_event.c              |    5 
 2 files changed, 266 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1846ead..fcdf351 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -80,6 +80,13 @@ struct event_constraint {
 	int	weight;
 };
 
+struct amd_nb {
+	int nb_id;  /* Northbridge id */
+	int refcnt; /* reference count */
+	struct perf_event *owners[X86_PMC_IDX_MAX];
+	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
+};
+
 struct cpu_hw_events {
 	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
@@ -91,6 +98,7 @@ struct cpu_hw_events {
 	int			n_added;
 	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+	struct amd_nb		*amd_nb;
 };
 
 #define EVENT_CONSTRAINT(c, n, m) {	\
@@ -149,6 +157,8 @@ struct x86_pmu {
 
 static struct x86_pmu x86_pmu __read_mostly;
 
+static raw_spinlock_t amd_nb_lock;
+
 static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
 	.enabled = 1,
 };
@@ -2211,6 +2221,7 @@ perf_event_nmi_handler(struct notifier_block *self,
 }
 
 static struct event_constraint unconstrained;
+static struct event_constraint emptyconstraint;
 
 static struct event_constraint bts_constraint =
 	EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
@@ -2250,10 +2261,144 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	return &unconstrained;
 }
 
+/*
+ * AMD64 events are detected based on their event codes.
+ */
+static inline int amd_is_nb_event(struct hw_perf_event *hwc)
+{
+	u64 val = hwc->config;
+	/* event code : bits [35-32] | [7-0] */
+	val = (val >> 24) | ( val & 0xff);
+	return val >= 0x0e0;
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+				      struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct perf_event *old;
+	struct amd_nb *nb;
+	int i;
+
+	/*
+	 * only care about NB events
+	 */
+	if(!amd_is_nb_event(hwc))
+		return;
+
+	/*
+	 * NB not initialized
+	 */
+	nb = cpuc->amd_nb;
+	if (!nb)
+		return;
+
+	if (hwc->idx == -1)
+		return;
+
+	/*
+	 * need to scan whole list because event may not have
+	 * been assigned during scheduling
+	 */
+	for(i=0; i < x86_pmu.num_events; i++) {
+		if (nb->owners[i] == event) {
+			old = cmpxchg(nb->owners+i, event, NULL);
+			return;
+		}
+	}
+}
+
+ /*
+  * AMD64 Northbridge events need special treatment because
+  * counter access needs to be synchronized across all cores
+  * of a package. Refer to BKDG section 3.12
+  *
+  * NB events are events measuring L3 cache, Hypertransport
+  * traffic. They are identified by an event code  >= 0xe0.
+  * They measure events on the Northbride which is shared
+  * by all cores on a package. NB events are counted on a
+  * shared set of counters. When a NB event is programmed
+  * in a counter, the data actually comes from a shared
+  * counter. Thus, access to those counters needs to be
+  * synchronized.
+  * We implement the synchronization such that no two cores
+  * can be measuring NB events using the same counters. Thus,
+  * we maintain a per-NB * allocation table. The available slot
+  * is propagated using the event_constraint structure.
+  *
+  * We provide only one choice for each NB event based on
+  * the fact that only NB events have restrictions. Consequently,
+  * if a counter is available, there is a guarantee the NB event
+  * will be assigned to it. If no slot is available, an empty
+  * constraint is returned and scheduling will evnetually fail
+  * for this event.
+  *
+  * Note that all cores attached the same NB compete for the same
+  * counters to host NB events, this is why we use atomic ops. Some
+  * multi-chip CPUs may have more than one NB.
+  *
+  * Given that resources are allocated (cmpxchg), they must be
+  * eventually freed for others to use. This is accomplished by
+  * calling amd_put_event_constraints().
+  *
+  * Non NB events are not impacted by this restriction.
+  */
 static struct event_constraint *
 amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 {
-	return &unconstrained;
+	struct hw_perf_event *hwc = &event->hw;
+	struct amd_nb *nb = cpuc->amd_nb;
+	struct perf_event *old = NULL;
+	int max = x86_pmu.num_events;
+	int i, j, k = -1;
+
+	/*
+	 * if not NB event or no NB, then no constraints
+	 */
+	if (!amd_is_nb_event(hwc) || !nb)
+		return &unconstrained;
+
+	/*
+	 * detect if already present, if so reuse
+	 *
+	 * cannot merge with actual allocation
+	 * because of possible holes
+	 *
+	 * event can already be present yet not assigned (in hwc->idx)
+	 * because of successive calls to x86_schedule_events() from
+	 * hw_perf_group_sched_in() without hw_perf_enable()
+	 */
+	for(i=0; i < max; i++) {
+		/*
+		 * keep track of first free slot
+		 */
+		if (k == -1 && !nb->owners[i])
+			k = i;
+
+		/* already present, reuse */
+		if (nb->owners[i] == event)
+			goto skip;
+	}
+	/*
+	 * not present, so grab a new slot
+	 *
+	 * try to alllcate same counter as before if
+	 * event has already been assigned once. Otherwise,
+	 * try to use free counter k obtained during the 1st
+	 * pass above.
+	 */
+	i = j = hwc->idx != -1 ? hwc->idx : (k == -1 ? 0 : k);
+	do {
+		old = cmpxchg(nb->owners+i, NULL, event);
+		if (!old)
+			break;
+		if (++i == x86_pmu.num_events)
+			i = 0;
+	} while (i != j);
+skip:
+	if (!old)
+		return &nb->event_constraints[i];
+	return &emptyconstraint;
 }
 
 static int x86_event_sched_in(struct perf_event *event,
@@ -2443,7 +2588,8 @@ static __initconst struct x86_pmu amd_pmu = {
 	.apic			= 1,
 	/* use highest bit to detect overflow */
 	.max_period		= (1ULL << 47) - 1,
-	.get_event_constraints	= amd_get_event_constraints
+	.get_event_constraints	= amd_get_event_constraints,
+	.put_event_constraints	= amd_put_event_constraints
 };
 
 static __init int p6_pmu_init(void)
@@ -2561,6 +2707,96 @@ static __init int intel_pmu_init(void)
 	return 0;
 }
 
+static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
+{
+        struct amd_nb *nb;
+	int i;
+
+        nb= vmalloc_node(sizeof(struct amd_nb), cpu_to_node(cpu));
+        if (!nb)
+                return NULL;
+
+        memset(nb, 0, sizeof(*nb));
+        nb->nb_id = nb_id;
+
+	/*
+	 * initialize all possible NB constraints
+	 */
+	for(i=0; i < x86_pmu.num_events; i++) {
+		set_bit(i, nb->event_constraints[i].idxmsk);
+		nb->event_constraints[i].weight = 1;
+	}
+        return nb;
+}
+
+static void amd_pmu_cpu_online(int cpu)
+{
+	struct cpu_hw_events *cpu1, *cpu2;
+	struct amd_nb *nb = NULL;
+	int i, nb_id;
+
+	if (boot_cpu_data.x86_max_cores < 2)
+		return;
+
+	/*
+	 * function may be called too early in the
+	 * boot process, in which case nb_id is bogus
+	 *
+	 * for BSP, there is an explicit call from
+	 * amd_pmu_init()
+	 */
+	nb_id = amd_get_nb_id(cpu);
+	if (nb_id == BAD_APICID)
+		return;
+
+	cpu1 = &per_cpu(cpu_hw_events, cpu);
+	cpu1->amd_nb = NULL;
+
+	raw_spin_lock(&amd_nb_lock);
+
+	for_each_online_cpu(i) {
+		cpu2 = &per_cpu(cpu_hw_events, i);
+		nb = cpu2->amd_nb;
+		if (!nb)
+			continue;
+		if (nb->nb_id == nb_id)
+			goto found;
+	}
+
+	nb = amd_alloc_nb(cpu, nb_id);
+	if (!nb) {
+		pr_err("perf_events: failed to allocate NB storage for CPU%d\n", cpu);
+		raw_spin_unlock(&amd_nb_lock);
+		return;
+	}
+found:
+	nb->refcnt++;
+	cpu1->amd_nb = nb;
+
+	raw_spin_unlock(&amd_nb_lock);
+
+	pr_info("CPU%d NB%d ref=%d\n", cpu, nb_id, nb->refcnt);
+}
+
+static void amd_pmu_cpu_offline(int cpu)
+{
+	struct cpu_hw_events *cpuhw;
+
+	if (boot_cpu_data.x86_max_cores < 2)
+		return;
+
+	cpuhw = &per_cpu(cpu_hw_events, cpu);
+
+	raw_spin_lock(&amd_nb_lock);
+
+	if (--cpuhw->amd_nb->refcnt == 0)
+		vfree(cpuhw->amd_nb);
+
+	cpuhw->amd_nb = NULL;
+
+	raw_spin_unlock(&amd_nb_lock);
+}
+
 static __init int amd_pmu_init(void)
 {
 	/* Performance-monitoring supported from K7 and later: */
@@ -2573,6 +2809,8 @@ static __init int amd_pmu_init(void)
 	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
 	       sizeof(hw_cache_event_ids));
 
+	/* initialize BSP */
+	amd_pmu_cpu_online(smp_processor_id());
 	return 0;
 }
 
@@ -2903,4 +3141,25 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 void hw_perf_event_setup_online(int cpu)
 {
 	init_debug_store_on_cpu(cpu);
+
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		amd_pmu_cpu_online(cpu);
+		break;
+	default:
+		return;
+	}
+}
+
+void hw_perf_event_setup_offline(int cpu)
+{
+	init_debug_store_on_cpu(cpu);
+
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		amd_pmu_cpu_offline(cpu);
+		break;
+	default:
+		return;
+	}
 }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 40f8b07..f9bbb6c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -98,6 +98,7 @@ void __weak hw_perf_enable(void)		{ barrier(); }
 
 void __weak hw_perf_event_setup(int cpu)	{ barrier(); }
 void __weak hw_perf_event_setup_online(int cpu)	{ barrier(); }
+void __weak hw_perf_event_setup_offline(int cpu){ barrier(); }
 
 int __weak
 hw_perf_group_sched_in(struct perf_event *group_leader,
@@ -5446,6 +5447,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 		perf_event_exit_cpu(cpu);
 		break;
 
+        case CPU_DEAD:
+                hw_perf_event_setup_offline(cpu);
+                break;
+
 	default:
 		break;
 	}

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH]  perf_events: AMD event scheduling (v2)
  2010-02-01 20:15 [PATCH] perf_events: AMD event scheduling (v2) Stephane Eranian
@ 2010-02-04 14:55 ` Peter Zijlstra
  2010-02-04 14:59   ` Peter Zijlstra
                     ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Peter Zijlstra @ 2010-02-04 14:55 UTC (permalink / raw)
  To: eranian
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, robert.richter,
	perfmon2-devel, eranian

On Mon, 2010-02-01 at 22:15 +0200, Stephane Eranian wrote:
> 	
> 	This patch adds correct AMD Northbridge event scheduling.
> 	It must be applied on top tip-x86 + hw_perf_enable() fix.
> 
> 	NB events are events measuring L3 cache, Hypertransport
> 	traffic. They are identified by an event code  >= 0xe0.
> 	They measure events on the Northbride which is shared
> 	by all cores on a package. NB events are counted on a
> 	shared set of counters. When a NB event is programmed
> 	in a counter, the data actually comes from a shared
> 	counter. Thus, access to those counters needs to be
> 	synchronized.
> 
> 	We implement the synchronization such that no two cores
> 	can be measuring NB events using the same counters. Thus,
> 	we maintain a per-NB * allocation table. The available slot
> 	is propagated using the event_constraint structure.
>  
> 	This 2nd version takes into account the changes on how
> 	constraints are stored by the scheduling code.
> 
> 	The patch also takes care of hotplug CPU.
> 
> 	Signed-off-by: Stephane Eranian <eranian@google.com>

Please run the patch through checkpatch, there's lots of trivial coding
style errors (spaces instead of tabs, for(i=0; etc..)

> @@ -2250,10 +2261,144 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
>  	return &unconstrained;
>  }
>  
> +/*
> + * AMD64 events are detected based on their event codes.
> + */
> +static inline int amd_is_nb_event(struct hw_perf_event *hwc)
> +{
> +	u64 val = hwc->config;

& K7_EVNTSEL_EVENT_MASK ?

> +	/* event code : bits [35-32] | [7-0] */
> +	val = (val >> 24) | ( val & 0xff);
> +	return val >= 0x0e0;
> +}
> +
> +static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
> +				      struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct perf_event *old;
> +	struct amd_nb *nb;
> +	int i;
> +
> +	/*
> +	 * only care about NB events
> +	 */
> +	if(!amd_is_nb_event(hwc))
> +		return;
> +
> +	/*
> +	 * NB not initialized
> +	 */
> +	nb = cpuc->amd_nb;
> +	if (!nb)
> +		return;
> +
> +	if (hwc->idx == -1)
> +		return;
> +
> +	/*
> +	 * need to scan whole list because event may not have
> +	 * been assigned during scheduling
> +	 */
> +	for(i=0; i < x86_pmu.num_events; i++) {
> +		if (nb->owners[i] == event) {
> +			old = cmpxchg(nb->owners+i, event, NULL);

might want to validate old is indeed event.

> +			return;
> +		}
> +	}
> +}
> +
> + /*
> +  * AMD64 Northbridge events need special treatment because
> +  * counter access needs to be synchronized across all cores
> +  * of a package. Refer to BKDG section 3.12
> +  *
> +  * NB events are events measuring L3 cache, Hypertransport
> +  * traffic. They are identified by an event code  >= 0xe0.
> +  * They measure events on the Northbride which is shared
> +  * by all cores on a package. NB events are counted on a
> +  * shared set of counters. When a NB event is programmed
> +  * in a counter, the data actually comes from a shared
> +  * counter. Thus, access to those counters needs to be
> +  * synchronized.
> +  * We implement the synchronization such that no two cores
> +  * can be measuring NB events using the same counters. Thus,
> +  * we maintain a per-NB * allocation table. The available slot
> +  * is propagated using the event_constraint structure.
> +  *
> +  * We provide only one choice for each NB event based on
> +  * the fact that only NB events have restrictions. Consequently,
> +  * if a counter is available, there is a guarantee the NB event
> +  * will be assigned to it. If no slot is available, an empty
> +  * constraint is returned and scheduling will evnetually fail
> +  * for this event.
> +  *
> +  * Note that all cores attached the same NB compete for the same
> +  * counters to host NB events, this is why we use atomic ops. Some
> +  * multi-chip CPUs may have more than one NB.
> +  *
> +  * Given that resources are allocated (cmpxchg), they must be
> +  * eventually freed for others to use. This is accomplished by
> +  * calling amd_put_event_constraints().
> +  *
> +  * Non NB events are not impacted by this restriction.
> +  */
>  static struct event_constraint *
>  amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
>  {
> -	return &unconstrained;
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct amd_nb *nb = cpuc->amd_nb;
> +	struct perf_event *old = NULL;
> +	int max = x86_pmu.num_events;
> +	int i, j, k = -1;
> +
> +	/*
> +	 * if not NB event or no NB, then no constraints
> +	 */
> +	if (!amd_is_nb_event(hwc) || !nb)
> +		return &unconstrained;
> +
> +	/*
> +	 * detect if already present, if so reuse
> +	 *
> +	 * cannot merge with actual allocation
> +	 * because of possible holes
> +	 *
> +	 * event can already be present yet not assigned (in hwc->idx)
> +	 * because of successive calls to x86_schedule_events() from
> +	 * hw_perf_group_sched_in() without hw_perf_enable()
> +	 */
> +	for(i=0; i < max; i++) {
> +		/*
> +		 * keep track of first free slot
> +		 */
> +		if (k == -1 && !nb->owners[i])
> +			k = i;

break?

> +
> +		/* already present, reuse */
> +		if (nb->owners[i] == event)
> +			goto skip;

s/skip/done/ ?

> +	}
> +	/*
> +	 * not present, so grab a new slot
> +	 *
> +	 * try to alllcate same counter as before if
> +	 * event has already been assigned once. Otherwise,
> +	 * try to use free counter k obtained during the 1st
> +	 * pass above.
> +	 */
> +	i = j = hwc->idx != -1 ? hwc->idx : (k == -1 ? 0 : k);

That's patently unreadable, and I'm not sure what happens if we failed
to find an eligible spot in the above loop, should we not somehow jump
out and return emptyconstraint?

> +	do {
> +		old = cmpxchg(nb->owners+i, NULL, event);
> +		if (!old)
> +			break;
> +		if (++i == x86_pmu.num_events)
> +			i = 0;
> +	} while (i != j);
> +skip:
> +	if (!old)
> +		return &nb->event_constraints[i];
> +	return &emptyconstraint;
>  }
>  
>  static int x86_event_sched_in(struct perf_event *event,

> @@ -2561,6 +2707,96 @@ static __init int intel_pmu_init(void)
>  	return 0;
>  }
>  
> +static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
> +{
> +        struct amd_nb *nb;
> +	int i;
> +
> +        nb= vmalloc_node(sizeof(struct amd_nb), cpu_to_node(cpu));

$ pahole -C amd_nb build/arch/x86/kernel/cpu/perf_event.o
struct amd_nb {
        int                        nb_id;                /*     0     4 */
        int                        refcnt;               /*     4     4 */
        struct perf_event *        owners[64];           /*     8   512 */
        /* --- cacheline 8 boundary (512 bytes) was 8 bytes ago --- */
        struct event_constraint    event_constraints[64]; /*   520  1536 */
        /* --- cacheline 32 boundary (2048 bytes) was 8 bytes ago --- */

        /* size: 2056, cachelines: 33 */
        /* last cacheline: 8 bytes */
};

Surely we can kmalloc that?

> +        if (!nb)
> +                return NULL;
> +
> +        memset(nb, 0, sizeof(*nb));
> +        nb->nb_id = nb_id;
> +
> +	/*
> +	 * initialize all possible NB constraints
> +	 */
> +	for(i=0; i < x86_pmu.num_events; i++) {
> +		set_bit(i, nb->event_constraints[i].idxmsk);
> +		nb->event_constraints[i].weight = 1;
> +	}
> +        return nb;
> +}

Terrible whilespace damage.

> +
> +static void amd_pmu_cpu_online(int cpu)
> +{
> +	struct cpu_hw_events *cpu1, *cpu2;
> +	struct amd_nb *nb = NULL;
> +	int i, nb_id;
> +
> +	if (boot_cpu_data.x86_max_cores < 2)
> +		return;

So there are no single core AMD chips that have a NorthBridge PMU? What
about the recent 64bit single core laptop chips?

> +
> +	/*
> +	 * function may be called too early in the
> +	 * boot process, in which case nb_id is bogus
> +	 *
> +	 * for BSP, there is an explicit call from
> +	 * amd_pmu_init()
> +	 */

I keep getting flash-backs to doom's graphics engine every time I see
BSP..

> +	nb_id = amd_get_nb_id(cpu);
> +	if (nb_id == BAD_APICID)
> +		return;
> +
> +	cpu1 = &per_cpu(cpu_hw_events, cpu);
> +	cpu1->amd_nb = NULL;
> +
> +	raw_spin_lock(&amd_nb_lock);
> +
> +	for_each_online_cpu(i) {
> +		cpu2 = &per_cpu(cpu_hw_events, i);
> +		nb = cpu2->amd_nb;
> +		if (!nb)
> +			continue;
> +		if (nb->nb_id == nb_id)
> +			goto found;
> +	}
> +
> +	nb = amd_alloc_nb(cpu, nb_id);
> +	if (!nb) {
> +		pr_err("perf_events: failed to allocate NB storage for CPU%d\n", cpu);
> +		raw_spin_unlock(&amd_nb_lock);
> +		return;
> +	}
> +found:
> +	nb->refcnt++;
> +	cpu1->amd_nb = nb;
> +
> +	raw_spin_unlock(&amd_nb_lock);

Can't this be simplified by using the cpu to node mask?

> +	pr_info("CPU%d NB%d ref=%d\n", cpu, nb_id, nb->refcnt);

stray debug stuff?

> +}
> +
> +static void amd_pmu_cpu_offline(int cpu)
> +{
> +	struct cpu_hw_events *cpuhw;
> +
> +	if (boot_cpu_data.x86_max_cores < 2)
> +		return;
> +
> +	cpuhw = &per_cpu(cpu_hw_events, cpu);
> +
> +	raw_spin_lock(&amd_nb_lock);
> +
> +	if (--cpuhw->amd_nb->refcnt == 0)
> +		vfree(cpuhw->amd_nb);
> +
> +	cpuhw->amd_nb = NULL;
> +
> +	raw_spin_unlock(&amd_nb_lock);
> +}
> +
>  static __init int amd_pmu_init(void)
>  {
>  	/* Performance-monitoring supported from K7 and later: */
> @@ -2573,6 +2809,8 @@ static __init int amd_pmu_init(void)
>  	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
>  	       sizeof(hw_cache_event_ids));
>  
> +	/* initialize BSP */

Binary Space Partitioning again?

> +	amd_pmu_cpu_online(smp_processor_id());
>  	return 0;
>  }


Also, I think this is buggy in that:

  perf_disable();
  event->pmu->disable(event);
  ...
  event->pmu->enable(event);
  perf_enable();

can now fail, I think we need to move the put_event_constraint() from
x86_pmu_disable() into x86_perf_enable() or something.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH]  perf_events: AMD event scheduling (v2)
  2010-02-04 14:55 ` Peter Zijlstra
@ 2010-02-04 14:59   ` Peter Zijlstra
  2010-02-04 15:04     ` Stephane Eranian
  2010-02-04 16:05   ` Stephane Eranian
  2010-02-25 18:29   ` Stephane Eranian
  2 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2010-02-04 14:59 UTC (permalink / raw)
  To: eranian
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, robert.richter,
	perfmon2-devel, eranian

On Thu, 2010-02-04 at 15:55 +0100, Peter Zijlstra wrote:
> > +     if (boot_cpu_data.x86_max_cores < 2)
> > +             return;
> 
> So there are no single core AMD chips that have a NorthBridge PMU? What
> about the recent 64bit single core laptop chips?

D'0h, for those there is no need to create inter-cpu constraints.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf_events: AMD event scheduling (v2)
  2010-02-04 14:59   ` Peter Zijlstra
@ 2010-02-04 15:04     ` Stephane Eranian
  0 siblings, 0 replies; 9+ messages in thread
From: Stephane Eranian @ 2010-02-04 15:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, robert.richter,
	perfmon2-devel, eranian

On Thu, Feb 4, 2010 at 3:59 PM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, 2010-02-04 at 15:55 +0100, Peter Zijlstra wrote:
> > > +     if (boot_cpu_data.x86_max_cores < 2)
> > > +             return;
> >
> > So there are no single core AMD chips that have a NorthBridge PMU? What
> > about the recent 64bit single core laptop chips?
>
> D'0h, for those there is no need to create inter-cpu constraints.
>
Exactly!


--
Stephane Eranian  | EMEA Software Engineering
Google France | 38 avenue de l'Opéra | 75002 Paris
Tel : +33 (0) 1 42 68 53 00
This email may be confidential or privileged. If you received this
communication by mistake, please
don't forward it to anyone else, please erase all copies and
attachments, and please let me know that
it went to the wrong person. Thanks

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf_events: AMD event scheduling (v2)
  2010-02-04 14:55 ` Peter Zijlstra
  2010-02-04 14:59   ` Peter Zijlstra
@ 2010-02-04 16:05   ` Stephane Eranian
  2010-02-04 16:19     ` Peter Zijlstra
  2010-02-25 18:29   ` Stephane Eranian
  2 siblings, 1 reply; 9+ messages in thread
From: Stephane Eranian @ 2010-02-04 16:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, robert.richter,
	perfmon2-devel, eranian

On Thu, Feb 4, 2010 at 3:55 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, 2010-02-01 at 22:15 +0200, Stephane Eranian wrote:
>>
>>       This patch adds correct AMD Northbridge event scheduling.
>>       It must be applied on top tip-x86 + hw_perf_enable() fix.
>>
>>       NB events are events measuring L3 cache, Hypertransport
>>       traffic. They are identified by an event code  >= 0xe0.
>>       They measure events on the Northbride which is shared
>>       by all cores on a package. NB events are counted on a
>>       shared set of counters. When a NB event is programmed
>>       in a counter, the data actually comes from a shared
>>       counter. Thus, access to those counters needs to be
>>       synchronized.
>>
>>       We implement the synchronization such that no two cores
>>       can be measuring NB events using the same counters. Thus,
>>       we maintain a per-NB * allocation table. The available slot
>>       is propagated using the event_constraint structure.
>>
>>       This 2nd version takes into account the changes on how
>>       constraints are stored by the scheduling code.
>>
>>       The patch also takes care of hotplug CPU.
>>
>>       Signed-off-by: Stephane Eranian <eranian@google.com>
>
> Please run the patch through checkpatch, there's lots of trivial coding
> style errors (spaces instead of tabs, for(i=0; etc..)
>
Sorry about that.

>> @@ -2250,10 +2261,144 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
>>       return &unconstrained;
>>  }
>>
>> +/*
>> + * AMD64 events are detected based on their event codes.
>> + */
>> +static inline int amd_is_nb_event(struct hw_perf_event *hwc)
>> +{
>> +     u64 val = hwc->config;
>
> & K7_EVNTSEL_EVENT_MASK ?

Yes, except that:
-#define K7_EVNTSEL_EVENT_MASK  0x7000000FFULL
+#define K7_EVNTSEL_EVENT_MASK  0xF000000FFULL

>
>> +     /* event code : bits [35-32] | [7-0] */
>> +     val = (val >> 24) | ( val & 0xff);
>> +     return val >= 0x0e0;
>> +}
>> +
>> +static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
>> +                                   struct perf_event *event)
>> +{
>> +     struct hw_perf_event *hwc = &event->hw;
>> +     struct perf_event *old;
>> +     struct amd_nb *nb;
>> +     int i;
>> +
>> +     /*
>> +      * only care about NB events
>> +      */
>> +     if(!amd_is_nb_event(hwc))
>> +             return;
>> +
>> +     /*
>> +      * NB not initialized
>> +      */
>> +     nb = cpuc->amd_nb;
>> +     if (!nb)
>> +             return;
>> +
>> +     if (hwc->idx == -1)
>> +             return;
>> +
>> +     /*
>> +      * need to scan whole list because event may not have
>> +      * been assigned during scheduling
>> +      */
>> +     for(i=0; i < x86_pmu.num_events; i++) {
>> +             if (nb->owners[i] == event) {
>> +                     old = cmpxchg(nb->owners+i, event, NULL);
>
> might want to validate old is indeed event.
>
It was in there during debugging ;->

>> +      * event can already be present yet not assigned (in hwc->idx)
>> +      * because of successive calls to x86_schedule_events() from
>> +      * hw_perf_group_sched_in() without hw_perf_enable()
>> +      */
>> +     for(i=0; i < max; i++) {
>> +             /*
>> +              * keep track of first free slot
>> +              */
>> +             if (k == -1 && !nb->owners[i])
>> +                     k = i;
>
> break?
>
No, we need to look for event, that's the main purpose
of the loop. We simply overlap this with looking for an
empty slot (before the event).

>> +      *
>> +      * try to alllcate same counter as before if
>> +      * event has already been assigned once. Otherwise,
>> +      * try to use free counter k obtained during the 1st
>> +      * pass above.
>> +      */
>> +     i = j = hwc->idx != -1 ? hwc->idx : (k == -1 ? 0 : k);
>
> That's patently unreadable, and I'm not sure what happens if we failed
> to find an eligible spot in the above loop, should we not somehow jump
> out and return emptyconstraint?
>
I have clarified the nested if. The goal of the first loop is to check
if the event is already present (explained why this is possible in the
comment). We may not have scanned all the slots. Furthermore, there
may be concurrent scans going on on other CPUs. The first pass tries
to find an empty slot, it does not reserve it. The second loop is the actual
allocation. We speculate the slot we found in the first pass is still available.
If the second loops fails, then we return emptyconstraints.

>> +     do {
>> +             old = cmpxchg(nb->owners+i, NULL, event);
>> +             if (!old)
>> +                     break;
>> +             if (++i == x86_pmu.num_events)
>> +                     i = 0;
>> +     } while (i != j);
>> +skip:
>> +     if (!old)
>> +             return &nb->event_constraints[i];
>> +     return &emptyconstraint;
>>  }
>>
>>  static int x86_event_sched_in(struct perf_event *event,
>
>> @@ -2561,6 +2707,96 @@ static __init int intel_pmu_init(void)
>>       return 0;
>>  }
>>
>> +static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
>> +{
>> +        struct amd_nb *nb;
>> +     int i;
>> +
>> +        nb= vmalloc_node(sizeof(struct amd_nb), cpu_to_node(cpu));
>
> $ pahole -C amd_nb build/arch/x86/kernel/cpu/perf_event.o
> struct amd_nb {
>        int                        nb_id;                /*     0     4 */
>        int                        refcnt;               /*     4     4 */
>        struct perf_event *        owners[64];           /*     8   512 */
>        /* --- cacheline 8 boundary (512 bytes) was 8 bytes ago --- */
>        struct event_constraint    event_constraints[64]; /*   520  1536 */
>        /* --- cacheline 32 boundary (2048 bytes) was 8 bytes ago --- */
>
>        /* size: 2056, cachelines: 33 */
>        /* last cacheline: 8 bytes */
> };
>
> Surely we can kmalloc that?
>
Ok, I can switch that.

>> +
>> +     /*
>> +      * function may be called too early in the
>> +      * boot process, in which case nb_id is bogus
>> +      *
>> +      * for BSP, there is an explicit call from
>> +      * amd_pmu_init()
>> +      */
>
> I keep getting flash-backs to doom's graphics engine every time I see
> BSP..
>
So how do you call the initial boot processor?


>> +     nb_id = amd_get_nb_id(cpu);
>> +     if (nb_id == BAD_APICID)
>> +             return;
>> +
>> +     cpu1 = &per_cpu(cpu_hw_events, cpu);
>> +     cpu1->amd_nb = NULL;
>> +
>> +     raw_spin_lock(&amd_nb_lock);
>> +
>> +     for_each_online_cpu(i) {
>> +             cpu2 = &per_cpu(cpu_hw_events, i);
>> +             nb = cpu2->amd_nb;
>> +             if (!nb)
>> +                     continue;
>> +             if (nb->nb_id == nb_id)
>> +                     goto found;
>> +     }
>> +
>> +     nb = amd_alloc_nb(cpu, nb_id);
>> +     if (!nb) {
>> +             pr_err("perf_events: failed to allocate NB storage for CPU%d\n", cpu);
>> +             raw_spin_unlock(&amd_nb_lock);
>> +             return;
>> +     }
>> +found:
>> +     nb->refcnt++;
>> +     cpu1->amd_nb = nb;
>> +
>> +     raw_spin_unlock(&amd_nb_lock);
>
> Can't this be simplified by using the cpu to node mask?

You mean to find the NB that corresponds to a CPU?

> Also, I think this is buggy in that:
>
>  perf_disable();
>  event->pmu->disable(event);
>  ...
>  event->pmu->enable(event);
>  perf_enable();
>
> can now fail, I think we need to move the put_event_constraint() from
> x86_pmu_disable() into x86_perf_enable() or something.

Constraints are reserved during x86_scheduling(), not during enable().
So if you had a conflict it was detected earlier than that.


-- 
Stephane Eranian  | EMEA Software Engineering
Google France | 38 avenue de l'Opéra | 75002 Paris
Tel : +33 (0) 1 42 68 53 00
This email may be confidential or privileged. If you received this
communication by mistake, please
don't forward it to anyone else, please erase all copies and
attachments, and please let me know that
it went to the wrong person. Thanks

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf_events: AMD event scheduling (v2)
  2010-02-04 16:05   ` Stephane Eranian
@ 2010-02-04 16:19     ` Peter Zijlstra
  2010-02-04 16:32       ` Stephane Eranian
  0 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2010-02-04 16:19 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, robert.richter,
	perfmon2-devel, eranian

On Thu, 2010-02-04 at 17:05 +0100, Stephane Eranian wrote:

> >> @@ -2250,10 +2261,144 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
> >>       return &unconstrained;
> >>  }
> >>
> >> +/*
> >> + * AMD64 events are detected based on their event codes.
> >> + */
> >> +static inline int amd_is_nb_event(struct hw_perf_event *hwc)
> >> +{
> >> +     u64 val = hwc->config;
> >
> > & K7_EVNTSEL_EVENT_MASK ?
> 
> Yes, except that:
> -#define K7_EVNTSEL_EVENT_MASK  0x7000000FFULL
> +#define K7_EVNTSEL_EVENT_MASK  0xF000000FFULL

Ah, indeed.

> >> +      *
> >> +      * try to alllcate same counter as before if
> >> +      * event has already been assigned once. Otherwise,
> >> +      * try to use free counter k obtained during the 1st
> >> +      * pass above.
> >> +      */
> >> +     i = j = hwc->idx != -1 ? hwc->idx : (k == -1 ? 0 : k);
> >
> > That's patently unreadable, and I'm not sure what happens if we failed
> > to find an eligible spot in the above loop, should we not somehow jump
> > out and return emptyconstraint?
> >
> I have clarified the nested if. The goal of the first loop is to check
> if the event is already present (explained why this is possible in the
> comment). We may not have scanned all the slots. Furthermore, there
> may be concurrent scans going on on other CPUs. The first pass tries
> to find an empty slot, it does not reserve it. The second loop is the actual
> allocation. We speculate the slot we found in the first pass is still available.
> If the second loops fails, then we return emptyconstraints.

Ah, now I see, yes indeed.

> >> +     do {
> >> +             old = cmpxchg(nb->owners+i, NULL, event);
> >> +             if (!old)
> >> +                     break;
> >> +             if (++i == x86_pmu.num_events)
> >> +                     i = 0;
> >> +     } while (i != j);
> >> +skip:
> >> +     if (!old)
> >> +             return &nb->event_constraints[i];
> >> +     return &emptyconstraint;
> >>  }

> >> +
> >> +     /*
> >> +      * function may be called too early in the
> >> +      * boot process, in which case nb_id is bogus
> >> +      *
> >> +      * for BSP, there is an explicit call from
> >> +      * amd_pmu_init()
> >> +      */
> >
> > I keep getting flash-backs to doom's graphics engine every time I see
> > BSP..
> >
> So how do you call the initial boot processor?

boot cpu, or x86 specific, cpu0.

> 
> >> +     nb_id = amd_get_nb_id(cpu);
> >> +     if (nb_id == BAD_APICID)
> >> +             return;
> >> +
> >> +     cpu1 = &per_cpu(cpu_hw_events, cpu);
> >> +     cpu1->amd_nb = NULL;
> >> +
> >> +     raw_spin_lock(&amd_nb_lock);
> >> +
> >> +     for_each_online_cpu(i) {
> >> +             cpu2 = &per_cpu(cpu_hw_events, i);
> >> +             nb = cpu2->amd_nb;
> >> +             if (!nb)
> >> +                     continue;
> >> +             if (nb->nb_id == nb_id)
> >> +                     goto found;
> >> +     }
> >> +
> >> +     nb = amd_alloc_nb(cpu, nb_id);
> >> +     if (!nb) {
> >> +             pr_err("perf_events: failed to allocate NB storage for CPU%d\n", cpu);
> >> +             raw_spin_unlock(&amd_nb_lock);
> >> +             return;
> >> +     }
> >> +found:
> >> +     nb->refcnt++;
> >> +     cpu1->amd_nb = nb;
> >> +
> >> +     raw_spin_unlock(&amd_nb_lock);
> >
> > Can't this be simplified by using the cpu to node mask?
> 
> You mean to find the NB that corresponds to a CPU?

Yep, saves having to poke at all cpus.

> > Also, I think this is buggy in that:
> >
> >  perf_disable();
> >  event->pmu->disable(event);
> >  ...
> >  event->pmu->enable(event);
> >  perf_enable();
> >
> > can now fail, I think we need to move the put_event_constraint() from
> > x86_pmu_disable() into x86_perf_enable() or something.
> 
> Constraints are reserved during x86_scheduling(), not during enable().
> So if you had a conflict it was detected earlier than that.

What I'm worried about is:

CPU A and B are of the same NorthBridge and all node counters are taken.

CPU-A			CPU-B

perf_disable();
event->pmu->disable(event)
  x86_pmu.put_event_constraint() /* free slot n */

			event->pmu->enable(event);
			  x86_schedule_events();
			    x86_pmu.get_event_constraint() /* grab slot n */

event->pmu->enable(event)
  x86_schedule_events()
    x86_pmu.get_event_constraint() /* FAIL */
perf_enable();

That means you cannot disable/enable the same event within a
perf_disable/enable section.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf_events: AMD event scheduling (v2)
  2010-02-04 16:19     ` Peter Zijlstra
@ 2010-02-04 16:32       ` Stephane Eranian
  2010-02-04 16:35         ` Stephane Eranian
  0 siblings, 1 reply; 9+ messages in thread
From: Stephane Eranian @ 2010-02-04 16:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, robert.richter,
	perfmon2-devel, eranian

>> >> +     nb_id = amd_get_nb_id(cpu);
>> >> +     if (nb_id == BAD_APICID)
>> >> +             return;
>> >> +
>> >> +     cpu1 = &per_cpu(cpu_hw_events, cpu);
>> >> +     cpu1->amd_nb = NULL;
>> >> +
>> >> +     raw_spin_lock(&amd_nb_lock);
>> >> +
>> >> +     for_each_online_cpu(i) {
>> >> +             cpu2 = &per_cpu(cpu_hw_events, i);
>> >> +             nb = cpu2->amd_nb;
>> >> +             if (!nb)
>> >> +                     continue;
>> >> +             if (nb->nb_id == nb_id)
>> >> +                     goto found;
>> >> +     }
>> >> +
>> >> +     nb = amd_alloc_nb(cpu, nb_id);
>> >> +     if (!nb) {
>> >> +             pr_err("perf_events: failed to allocate NB storage for CPU%d\n", cpu);
>> >> +             raw_spin_unlock(&amd_nb_lock);
>> >> +             return;
>> >> +     }
>> >> +found:
>> >> +     nb->refcnt++;
>> >> +     cpu1->amd_nb = nb;
>> >> +
>> >> +     raw_spin_unlock(&amd_nb_lock);
>> >
>> > Can't this be simplified by using the cpu to node mask?
>>
>> You mean to find the NB that corresponds to a CPU?
>
> Yep, saves having to poke at all cpus.

That's not what the loop is about.

amd_get_nb_id() is the function which returns the id
of the NB based on cpu. It is not clear to me whether
NB and NUMA node always perfectly overlap. What
matter here is the HW config, not how NUMA nodes
were setup.

We allocate one amd_nb struct per NB. Once we get
our NB, we need to find if it has already been allocated
and refcnt++, otherwise we must allocate it.

>> > Also, I think this is buggy in that:
>> >
>> >  perf_disable();
>> >  event->pmu->disable(event);
>> >  ...
>> >  event->pmu->enable(event);
>> >  perf_enable();
>> >
>> > can now fail, I think we need to move the put_event_constraint() from
>> > x86_pmu_disable() into x86_perf_enable() or something.
>>
>> Constraints are reserved during x86_scheduling(), not during enable().
>> So if you had a conflict it was detected earlier than that.
>
> What I'm worried about is:
>
> CPU A and B are of the same NorthBridge and all node counters are taken.
>
> CPU-A                   CPU-B
>
> perf_disable();
> event->pmu->disable(event)
>  x86_pmu.put_event_constraint() /* free slot n */
>
>                        event->pmu->enable(event);
>                          x86_schedule_events();
>                            x86_pmu.get_event_constraint() /* grab slot n */
>
> event->pmu->enable(event)
>  x86_schedule_events()
>    x86_pmu.get_event_constraint() /* FAIL */
> perf_enable();
>
> That means you cannot disable/enable the same event within a
> perf_disable/enable section.
>
Yes but I would say that is because of the logic behind the enable/disable
interface. Here you are disabling "for a short period", you know you're going
to re-enable. Yet you are using an API that is a generic enable/disable.
You would need to pass some argument to disable() to say "temporary"
or "stop but don't release the resource".

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf_events: AMD event scheduling (v2)
  2010-02-04 16:32       ` Stephane Eranian
@ 2010-02-04 16:35         ` Stephane Eranian
  0 siblings, 0 replies; 9+ messages in thread
From: Stephane Eranian @ 2010-02-04 16:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, robert.richter,
	perfmon2-devel, eranian

>> What I'm worried about is:
>>
>> CPU A and B are of the same NorthBridge and all node counters are taken.
>>
>> CPU-A                   CPU-B
>>
>> perf_disable();
>> event->pmu->disable(event)
>>  x86_pmu.put_event_constraint() /* free slot n */
>>
>>                        event->pmu->enable(event);
>>                          x86_schedule_events();
>>                            x86_pmu.get_event_constraint() /* grab slot n */
>>
>> event->pmu->enable(event)
>>  x86_schedule_events()
>>    x86_pmu.get_event_constraint() /* FAIL */
>> perf_enable();
>>
>> That means you cannot disable/enable the same event within a
>> perf_disable/enable section.
>>
> Yes but I would say that is because of the logic behind the enable/disable
> interface. Here you are disabling "for a short period", you know you're going
> to re-enable. Yet you are using an API that is a generic enable/disable.
> You would need to pass some argument to disable() to say "temporary"
> or "stop but don't release the resource".
>
I think if you were to distinguish between "stop the counter", and
"free the counter"
things would work better here.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] perf_events: AMD event scheduling (v2)
  2010-02-04 14:55 ` Peter Zijlstra
  2010-02-04 14:59   ` Peter Zijlstra
  2010-02-04 16:05   ` Stephane Eranian
@ 2010-02-25 18:29   ` Stephane Eranian
  2 siblings, 0 replies; 9+ messages in thread
From: Stephane Eranian @ 2010-02-25 18:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, paulus, davem, fweisbec, robert.richter,
	perfmon2-devel, eranian

Peter,

Was the AMD Northbridge patch ever applied to tip?
I don't see it in 2.6.33-rc8-tip+.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2010-02-25 18:29 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-02-01 20:15 [PATCH] perf_events: AMD event scheduling (v2) Stephane Eranian
2010-02-04 14:55 ` Peter Zijlstra
2010-02-04 14:59   ` Peter Zijlstra
2010-02-04 15:04     ` Stephane Eranian
2010-02-04 16:05   ` Stephane Eranian
2010-02-04 16:19     ` Peter Zijlstra
2010-02-04 16:32       ` Stephane Eranian
2010-02-04 16:35         ` Stephane Eranian
2010-02-25 18:29   ` Stephane Eranian

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).