Re: [PATCH 0/4] x86: Add Cache QoS Monitoring (CQM) support

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <peterz-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
To: "H. Peter Anvin" <hpa-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
Cc: "containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org"
	<containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org>,
	"Waskiewicz Jr,
	Peter P"
	<peter.p.waskiewicz.jr-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>,
	"linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org"
	<linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>,
	Stephane Eranian
	<eranian-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>,
	Ingo Molnar <mingo-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,
	Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>,
	"cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org"
	<cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>,
	Thomas Gleixner <tglx-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org>
Subject: Re: [PATCH 0/4] x86: Add Cache QoS Monitoring (CQM) support
Date: Mon, 27 Jan 2014 18:34:20 +0100	[thread overview]
Message-ID: <20140127173420.GA9636@twins.programming.kicks-ass.net> (raw)
In-Reply-To: <52D57AC2.3090109-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>

On Tue, Jan 14, 2014 at 09:58:26AM -0800, H. Peter Anvin wrote:
> On 01/12/2014 11:55 PM, Peter Zijlstra wrote:
> > 
> > The problem is, since there's a limited number of RMIDs we have to
> > rotate at some point, but since changing RMIDs is nondeterministic we
> > can't.
> > 
> 
> This is fundamentally the crux here.  RMIDs are quite expensive for the
> hardware to implement, so they are limited - but recycling them is
> *very* expensive because you literally have to touch every line in the
> cache.

It's not a problem that changing the task:RMID map is expensive; what is
a problem is that there's no deterministic fashion of doing it.

That said; I think I've got a sort-of workaround for that. See the
largish comment near cache_pmu_rotate().

I've also illustrated how to use perf-cgroup for this.

The below is a rough draft, most if not all XXXs should be
fixed/finished. But given I don't actually have hardware that supports
this stuff (afaik) I couldn't be arsed.

---
 include/linux/perf_event.h              |   33 +
 kernel/events/core.c                    |   22 -
 x86/kernel/cpu/perf_event_intel_cache.c |  687 ++++++++++++++++++++++++++++++++
 3 files changed, 725 insertions(+), 17 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -126,6 +126,14 @@ struct hw_perf_event {
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
+		struct { /* cache_pmu */
+			struct task_struct	*cache_target;
+			int			cache_state;
+			int			cache_rmid;
+			struct list_head	cache_events_entry;
+			struct list_head	cache_groups_entry;
+			struct list_head	cache_group_entry;
+		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
 			/*
@@ -526,6 +534,31 @@ struct perf_output_handle {
 	int				page;
 };
 
+#ifdef CONFIG_CGROUP_PERF
+
+struct perf_cgroup_info;
+
+struct perf_cgroup {
+	struct cgroup_subsys_state	css;
+	struct perf_cgroup_info	__percpu *info;
+};
+
+/*
+ * Must ensure cgroup is pinned (css_get) before calling
+ * this function. In other words, we cannot call this function
+ * if there is no cgroup event for the current CPU context.
+ *
+ * XXX: its not safe to use this thing!!!
+ */
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	return container_of(task_css(task, perf_subsys_id),
+			    struct perf_cgroup, css);
+}
+
+#endif /* CONFIG_CGROUP_PERF */
+
 #ifdef CONFIG_PERF_EVENTS
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -329,23 +329,6 @@ struct perf_cgroup_info {
 	u64				timestamp;
 };
 
-struct perf_cgroup {
-	struct cgroup_subsys_state	css;
-	struct perf_cgroup_info	__percpu *info;
-};
-
-/*
- * Must ensure cgroup is pinned (css_get) before calling
- * this function. In other words, we cannot call this function
- * if there is no cgroup event for the current CPU context.
- */
-static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
-{
-	return container_of(task_css(task, perf_subsys_id),
-			    struct perf_cgroup, css);
-}
-
 static inline bool
 perf_cgroup_match(struct perf_event *event)
 {
@@ -6711,6 +6694,11 @@ perf_event_alloc(struct perf_event_attr
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
 
+		/*
+		 * XXX fix for cache_target, dynamic type won't have an easy test,
+		 * maybe move target crap into generic event.
+		 */
+
 		if (attr->type == PERF_TYPE_TRACEPOINT)
 			event->hw.tp_target = task;
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
--- /dev/null
+++ b/x86/kernel/cpu/perf_event_intel_cache.c
@@ -0,0 +1,687 @@
+#include <asm/processor.h>
+#include <linux/idr.h>
+#include <linux/raw_spinlock.h>
+#include <linux/perf_event.h>
+
+
+#define MSR_IA32_PQR_ASSOC	0x0c8f
+#define MSR_IA32_QM_CTR		0x0c8e
+#define MSR_IA32_QM_EVTSEL	0x0c8d
+
+unsigned int max_rmid;
+
+unsigned int l3_scale; /* supposedly cacheline size */
+unsigned int l3_max_rmid;
+
+
+/*
+ * Per-CPU record of the RMID currently written to MSR_IA32_PQR_ASSOC.
+ */
+struct cache_pmu_state {
+	raw_spinlock_t		lock;	/* serialises updates of rmid and cnt */
+	int			rmid;	/* RMID programmed on this CPU, 0 when none */
+	int			cnt;	/* number of started events sharing rmid */
+};
+
+static DEFINE_PER_CPU(struct cache_pmu_state, state);
+
+/*
+ * Protects the global state, hold both for modification, hold either for
+ * stability.
+ *
+ * XXX we modify RMID with only cache_mutex held, racy!
+ */
+static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
+
+static unsigned long *cache_rmid_bitmap;
+
+/*
+ * All events
+ */
+static LIST_HEAD(cache_events);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * The new RMIDs we must not read until cache_pmu_stabilize() has run.
+ * See cache_pmu_rotate().
+ */
+static unsigned long *cache_limbo_bitmap;
+
+/*
+ * The spare RMID that makes rotation possible; keep out of the
+ * cache_rmid_bitmap to avoid it getting used for new events.
+ */
+static int cache_rotation_rmid;
+
+/*
+ * The freed RMIDs, see cache_pmu_rotate().
+ */
+static int cache_freed_nr;
+static int *cache_freed_rmid;
+
+/*
+ * One online cpu per package, for cache_pmu_is_stable().
+ */
+static cpumask_t cache_cpus;
+
+/*
+ * Claim one unused RMID from cache_rmid_bitmap.
+ *
+ * Returns the RMID, or a negative value when the bitmap has no free bit.
+ */
+static int __get_rmid(void)
+{
+	int rmid;
+
+	/* order 0: a "region" of exactly one bit */
+	rmid = bitmap_find_free_region(cache_rmid_bitmap, max_rmid, 0);
+
+	return rmid;
+}
+
+/*
+ * Hand @rmid back to cache_rmid_bitmap so __get_rmid() can return it again.
+ */
+static void __put_rmid(int rmid)
+{
+	const int single_bit_order = 0;
+
+	bitmap_release_region(cache_rmid_bitmap, rmid, single_bit_order);
+}
+
+/*
+ * Queue @rmid on the freed list. It needs a quiescent state before it can be
+ * __put_rmid()'ed, see cache_pmu_stabilize().
+ */
+static void __free_rmid(int rmid)
+{
+	int slot = cache_freed_nr;
+
+	cache_freed_rmid[slot] = rmid;
+	cache_freed_nr = slot + 1;
+}
+
+#define RMID_VAL_ERROR		(1ULL << 63)
+#define RMID_VAL_UNAVAIL	(1ULL << 62)
+
+/*
+ * Read the counter value the hardware reports for @rmid on this CPU.
+ *
+ * Selects event id 1 for @rmid through MSR_IA32_QM_EVTSEL and returns the raw
+ * 64-bit content of MSR_IA32_QM_CTR, including the RMID_VAL_ERROR and
+ * RMID_VAL_UNAVAIL bits; callers must check those themselves.
+ */
+static u64 __rmid_read(unsigned long rmid)
+{
+	u64 val;
+
+	/*
+	 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+	 * it just says that to increase confusion.
+	 *
+	 * Both MSRs carry a full 64-bit value, so use the 64-bit accessors;
+	 * widen @rmid first so the shift is well defined on 32-bit builds.
+	 */
+	wrmsrl(MSR_IA32_QM_EVTSEL, 1 | ((u64)rmid << 32));
+	rdmsrl(MSR_IA32_QM_CTR, val);
+
+	/*
+	 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+	 * the number of cachelines tagged with @rmid.
+	 */
+	return val;
+}
+
+/*
+ * Cross-CPU callback: clear the flag behind @info as soon as any RMID on the
+ * freed list still reads back non-zero on this CPU.
+ */
+static void smp_test_stable(void *info)
+{
+	bool *stable = info;
+	int idx = 0;
+
+	while (idx < cache_freed_nr) {
+		u64 reading = __rmid_read(cache_freed_rmid[idx]);
+
+		if (reading)
+			*stable = false;
+		idx++;
+	}
+}
+
+/*
+ * Test whether all freed RMIDs read back as unused on every CPU in
+ * cache_cpus; see the comment near cache_pmu_rotate().
+ *
+ * Returns true when no CPU reported a non-zero reading.
+ */
+static bool cache_pmu_is_stable(void)
+{
+	bool stable = true;
+
+	/*
+	 * on_each_cpu_mask() instead of smp_call_function_many(): the latter
+	 * never runs the callback on the calling CPU and has to be called
+	 * with preemption disabled, so the local package could be missed.
+	 */
+	on_each_cpu_mask(&cache_cpus, smp_test_stable, &stable, true);
+
+	return stable;
+}
+
+/*
+ * Quiescent state; wait for all the 'freed' RMIDs to become unused. After
+ * this we can reuse them and know that the current set of active RMIDs is
+ * stable.
+ */
+static void cache_pmu_stabilize(void)
+{
+	int first = 0;
+	int idx;
+
+	if (cache_freed_nr == 0)
+		return;
+
+	/*
+	 * Poll until the old RMIDs drop back to 0 again, this means all
+	 * cachelines have acquired a new tag and the new RMIDs are now stable.
+	 */
+	for (;;) {
+		if (cache_pmu_is_stable())
+			break;
+		/*
+		 * XXX adaptive timeout? Ideally the hardware would get us an
+		 * interrupt :/
+		 */
+		schedule_timeout_uninterruptible(1);
+	}
+
+	/* nothing is in limbo any more */
+	bitmap_clear(cache_limbo_bitmap, 0, max_rmid);
+
+	/* refill the spare rotation RMID before returning the rest */
+	if (cache_rotation_rmid <= 0) {
+		cache_rotation_rmid = cache_freed_rmid[0];
+		first = 1;
+	}
+
+	for (idx = first; idx < cache_freed_nr; idx++)
+		__put_rmid(cache_freed_rmid[idx]);
+
+	cache_freed_nr = 0;
+}
+
+/*
+ * Exchange the RMID of a group of events.
+ *
+ * Stores @rmid in the group leader @group and in every event linked on its
+ * cache_group_entry list, and returns the RMID the leader held before.
+ *
+ * RMIDs are plain ints here (matching hw.cache_rmid) because callers pass and
+ * test for non-positive "no RMID" values such as -1.
+ */
+static int cache_group_xchg_rmid(struct perf_event *group, int rmid)
+{
+	struct perf_event *event;
+	int old_rmid = group->hw.cache_rmid;
+
+	group->hw.cache_rmid = rmid;
+	list_for_each_entry(event, &group->hw.cache_group_entry, hw.cache_group_entry)
+		event->hw.cache_rmid = rmid;
+
+	return old_rmid;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ *
+ * Task events match when they target the same task; non-task events match
+ * when they belong to the same cgroup or are both machine wide.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+	if ((a->attach_state & PERF_ATTACH_TASK) !=
+	    (b->attach_state & PERF_ATTACH_TASK))
+		return false;
+
+	if (a->attach_state & PERF_ATTACH_TASK)
+		return a->hw.cache_target == b->hw.cache_target;
+
+	/* not task */
+
+#ifdef CONFIG_CGROUP_PERF
+	/*
+	 * Different cgroups, or a cgroup event versus a machine wide one
+	 * (NULL cgrp), do not cover the same tasks.
+	 */
+	if (a->cgrp != b->cgrp)
+		return false;
+#endif
+
+	return true; /* same cgroup, or both machine wide */
+}
+
+static struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+	if (event->cgrp)
+		return event->cgrp;
+
+	if (event->attach_state & PERF_ATTACH_TASK) /* XXX */
+		return perf_cgroup_from_task(event->hw.cache_target);
+
+	return NULL;
+}
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks.
+ *
+ * Returns true when the two events may cover a common task and therefore
+ * must not hold an RMID at the same time, false when they cannot overlap.
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+#ifdef CONFIG_CGROUP_PERF
+	struct perf_cgroup *ac, *bc;
+
+	ac = event_to_cgroup(a);
+	bc = event_to_cgroup(b);
+
+	if (!ac || !bc) {
+		/*
+		 * If either is NULL, its a system wide event and that
+		 * always conflicts with a cgroup one.
+		 *
+		 * If both are system wide, __match_event() should've
+		 * been true and we'll never get here, if we did fail.
+		 */
+		return true;
+	}
+
+	/*
+	 * If one is a parent of the other, we've got an intersection.
+	 */
+	if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+	    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+		return true;
+#endif
+
+	/*
+	 * If one of them is not a task, same story as above with cgroups.
+	 */
+	if (!(a->attach_state & PERF_ATTACH_TASK) ||
+	    !(b->attach_state & PERF_ATTACH_TASK))
+		return true;
+
+	/*
+	 * Again, if they're the same __match_event() should've caught us, if not fail.
+	 */
+	if (a->hw.cache_target == b->hw.cache_target)
+		return true;
+
+	/*
+	 * Must be non-overlapping.
+	 */
+	return false;
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs, ought to run from a
+ * delayed work or somesuch.
+ *
+ * Rotating RMIDs is complicated; firstly because the hardware doesn't give us
+ * any clues; secondly because of cgroups.
+ *
+ * There are problems with the hardware interface; when you change the
+ * task:RMID map cachelines retain their 'old' tags, giving a skewed picture.
+ * In order to work around this, we must always keep one free RMID.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID), and
+ * assigning the free RMID to another group (the new RMID). We must then wait
+ * for the old RMID to not be used (no cachelines tagged). This ensures that
+ * all cachelines are tagged with 'active' RMIDs. At this point we can start
+ * reading values for the new RMID and treat the old RMID as the free RMID for
+ * the next rotation.
+ *
+ * Secondly, since cgroups can nest, we must make sure to not program
+ * conflicting cgroups at the same time. A conflicting cgroup is one that has a
+ * parent<->child relation. After all, a task of the child cgroup will also be
+ * covered by the parent cgroup.
+ *
+ * Therefore, when selecting a new group, we must invalidate all conflicting
+ * groups. Rotation allows us to measure all (conflicting) groups
+ * sequentially.
+ *
+ * XXX there's a further problem in that because we do our own rotation and
+ * cheat with schedulability the event {enabled,running} times are incorrect.
+ *
+ * Returns true when a rotation pass was performed, false when there were no
+ * groups to rotate.
+ */
+static bool cache_pmu_rotate(void)
+{
+	struct perf_event *rotor, *group;
+	bool rotated = false;
+	int rmid;
+
+	mutex_lock(&cache_mutex);
+
+	if (list_empty(&cache_groups))
+		goto unlock_mutex;
+
+	rotor = list_first_entry(&cache_groups, struct perf_event, hw.cache_groups_entry);
+
+	/* Take the first group off the list and strip its RMID. */
+	raw_spin_lock_irq(&cache_lock);
+	list_del(&rotor->hw.cache_groups_entry);
+	rmid = cache_group_xchg_rmid(rotor, -1);
+	/* first entry must always have an RMID; never queue a bogus one */
+	if (!WARN_ON_ONCE(rmid <= 0))
+		__free_rmid(rmid);
+	raw_spin_unlock_irq(&cache_lock);
+
+	/*
+	 * XXX O(n^2) schedulability
+	 */
+
+	list_for_each_entry(group, &cache_groups, hw.cache_groups_entry) {
+		bool conflicts = false;
+		struct perf_event *iter;
+
+		/* Only groups ahead of @group in the list can veto it. */
+		list_for_each_entry(iter, &cache_groups, hw.cache_groups_entry) {
+			if (iter == group)
+				break;
+			if (__conflict_event(group, iter)) {
+				conflicts = true;
+				break;
+			}
+		}
+
+		if (conflicts && group->hw.cache_rmid > 0) {
+			rmid = cache_group_xchg_rmid(group, -1);
+			WARN_ON_ONCE(rmid <= 0);
+			__free_rmid(rmid);
+			continue;
+		}
+
+		if (!conflicts && group->hw.cache_rmid <= 0) {
+			rmid = __get_rmid();
+			if (rmid <= 0) {
+				/* fall back to the spare rotation RMID */
+				rmid = cache_rotation_rmid;
+				cache_rotation_rmid = -1;
+			}
+			if (rmid <= 0)
+				break; /* we're out of RMIDs, more next time */
+
+			/*
+			 * Only mark a valid RMID as in limbo; a non-positive
+			 * value must never be used as a bitmap index.
+			 */
+			set_bit(rmid, cache_limbo_bitmap);
+
+			rmid = cache_group_xchg_rmid(group, rmid);
+			WARN_ON_ONCE(rmid > 0);
+			continue;
+		}
+
+		/*
+		 * either we conflict and do not have an RMID -> good,
+		 * or we do not conflict and have an RMID -> also good.
+		 */
+	}
+
+	/* The rotated group goes to the back of the queue. */
+	raw_spin_lock_irq(&cache_lock);
+	list_add_tail(&rotor->hw.cache_groups_entry, &cache_groups);
+	raw_spin_unlock_irq(&cache_lock);
+
+	/*
+	 * XXX force a PMU reprogram here such that the new RMIDs are in
+	 * effect.
+	 */
+
+	cache_pmu_stabilize();
+	rotated = true;
+
+unlock_mutex:
+	mutex_unlock(&cache_mutex);
+
+	/*
+	 * XXX reschedule work.
+	 */
+
+	return rotated;
+}
+
+/*
+ * Find a group for @event and set up its RMID.
+ *
+ * Returns the leader of an existing group that measures the same tasks, in
+ * which case @event inherits that group's RMID. Returns NULL when there is no
+ * such group; a fresh RMID is then requested for @event unless it conflicts
+ * with one of the existing groups.
+ */
+static struct perf_event *cache_pmu_setup_event(struct perf_event *event)
+{
+	struct perf_event *leader;
+	bool conflicting = false;
+
+	list_for_each_entry(leader, &cache_groups, hw.cache_groups_entry) {
+		if (__match_event(leader, event)) {
+			event->hw.cache_rmid = leader->hw.cache_rmid;
+			return leader;
+		}
+		if (__conflict_event(leader, event))
+			conflicting = true;
+	}
+
+	/* a conflicting event has to wait for rotation to hand it an RMID */
+	if (conflicting)
+		return NULL;
+
+	/* XXX lacks stabilization */
+	event->hw.cache_rmid = __get_rmid();
+
+	return NULL;
+}
+
+/*
+ * Refresh event->count from the hardware reading for the event's RMID.
+ *
+ * The count is left untouched when the event has no RMID, when the RMID is
+ * still in limbo, or when the hardware flags the reading as error/unavailable.
+ */
+static void cache_pmu_event_read(struct perf_event *event)
+{
+	int rmid = event->hw.cache_rmid;
+	u64 val;
+
+	/*
+	 * Non-positive means "no RMID"; it must not be used as a bitmap
+	 * index or written to the event select MSR.
+	 */
+	if (rmid <= 0)
+		return;
+
+	if (test_bit(rmid, cache_limbo_bitmap))
+		return;
+
+	val = __rmid_read(rmid);
+
+	/*
+	 * Ignore this reading on error states and do not update the value.
+	 */
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	val *= l3_scale; /* cachelines -> bytes */
+
+	local64_set(&event->count, val);
+}
+
+/*
+ * Start a stopped event: account it on this CPU and program the event's RMID
+ * into MSR_IA32_PQR_ASSOC. Does nothing when the event is already running.
+ */
+static void cache_pmu_event_start(struct perf_event *event, int mode)
+{
+	/* named cpu_state so it does not shadow the per-cpu variable 'state' */
+	struct cache_pmu_state *cpu_state = &__get_cpu_var(state);
+	int rmid = event->hw.cache_rmid;
+	unsigned long flags;
+
+	if (!(event->hw.cache_state & PERF_HES_STOPPED))
+		return;
+
+	event->hw.cache_state &= ~PERF_HES_STOPPED;
+
+	raw_spin_lock_irqsave(&cpu_state->lock, flags);
+	/* all events started on one CPU are expected to share a single RMID */
+	if (cpu_state->cnt++)
+		WARN_ON_ONCE(cpu_state->rmid != rmid);
+	else
+		WARN_ON_ONCE(cpu_state->rmid);
+	cpu_state->rmid = rmid;
+	wrmsrl(MSR_IA32_PQR_ASSOC, cpu_state->rmid);
+	raw_spin_unlock_irqrestore(&cpu_state->lock, flags);
+}
+
+/*
+ * Stop a running event: take a final reading, drop it from this CPU's
+ * accounting and, when it was the last one, reset MSR_IA32_PQR_ASSOC to 0.
+ * Does nothing when the event is already stopped.
+ */
+static void cache_pmu_event_stop(struct perf_event *event, int mode)
+{
+	/* named cpu_state so it does not shadow the per-cpu variable 'state' */
+	struct cache_pmu_state *cpu_state = &__get_cpu_var(state);
+	unsigned long flags;
+
+	if (event->hw.cache_state & PERF_HES_STOPPED)
+		return;
+
+	event->hw.cache_state |= PERF_HES_STOPPED;
+
+	raw_spin_lock_irqsave(&cpu_state->lock, flags);
+	cache_pmu_event_read(event);
+	if (!--cpu_state->cnt) {
+		cpu_state->rmid = 0;
+		wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+	} else {
+		WARN_ON_ONCE(!cpu_state->rmid);
+	}
+	raw_spin_unlock_irqrestore(&cpu_state->lock, flags);
+}
+
+/*
+ * Add @event to the PMU. The event begins in the stopped state and is started
+ * right away only when it holds an RMID and @mode has PERF_EF_START set.
+ *
+ * Always returns 0.
+ */
+static int cache_pmu_event_add(struct perf_event *event, int mode)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&cache_lock, flags);
+
+	event->hw.cache_state = PERF_HES_STOPPED;
+
+	/* without an RMID there is nothing to program; stay stopped */
+	if (event->hw.cache_rmid > 0 && (mode & PERF_EF_START))
+		cache_pmu_event_start(event, mode);
+
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+	return 0;
+}
+
+/*
+ * Remove @event from the PMU by stopping it under cache_lock.
+ */
+static void cache_pmu_event_del(struct perf_event *event, int mode)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&cache_lock, flags);
+	cache_pmu_event_stop(event, mode);
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+/*
+ * Destroy callback: unlink @event from the global event list and from its
+ * group. When @event leads a group, leadership passes to another member if
+ * there is one; otherwise the group goes away and its RMID is released.
+ *
+ * NOTE(review): the list_empty() tests below assume the cache_group_entry and
+ * cache_groups_entry heads were initialised when the event was set up --
+ * confirm against the event_init path.
+ */
+static void cache_pmu_event_destroy(struct perf_event *event)
+{
+	struct perf_event *group_other = NULL;
+
+	/* both locks are taken: this modifies the global lists */
+	mutex_lock(&cache_mutex);
+	raw_spin_lock_irq(&cache_lock);
+
+	list_del(&event->hw.cache_events_entry);
+
+	/*
+	 * If there's another event in this group...
+	 */
+	if (!list_empty(&event->hw.cache_group_entry)) {
+		/* remember a sibling before unlinking ourselves from the group */
+		group_other = list_first_entry(&event->hw.cache_group_entry,
+					       struct perf_event,
+					       hw.cache_group_entry);
+		list_del(&event->hw.cache_group_entry);
+	}
+	/*
+	 * And we're the group leader..
+	 */
+	if (!list_empty(&event->hw.cache_groups_entry)) {
+		/*
+		 * If there was a group_other, make that leader, otherwise
+		 * destroy the group and return the RMID.
+		 */
+		if (group_other) {
+			list_replace(&event->hw.cache_groups_entry,
+				     &group_other->hw.cache_groups_entry);
+		} else {
+			int rmid = event->hw.cache_rmid;
+			/* only a positive value is a real RMID */
+			if (rmid > 0)
+				__put_rmid(rmid);
+			list_del(&event->hw.cache_groups_entry);
+		}
+	}
+
+	raw_spin_unlock_irq(&cache_lock);
+	mutex_unlock(&cache_mutex);
+}
+
+static struct pmu cache_pmu;
+
+/*
+ * Takes non-sampling task, cgroup or machine wide events.
+ *
+ * XXX there's a bit of a problem in that we cannot simply do the one event per
+ * node as one would want, since that one event would only get scheduled on the
+ * one cpu. But we want to 'schedule' the RMID on all CPUs.
+ *
+ * This means we want events for each CPU, however, that generates a lot of
+ * duplicate values out to userspace -- this is not to be helped unless we want
+ * to change the core code in some way.
+ *
+ * Returns 0 on success, -ENOENT when the event is not for this PMU and
+ * -EINVAL for unsupported attributes.
+ */
+static int cache_pmu_event_init(struct perf_event *event)
+{
+	struct perf_event *group;
+
+	if (event->attr.type != cache_pmu.type)
+		return -ENOENT;
+
+	if (event->attr.config != 0)
+		return -EINVAL;
+
+	if (event->cpu == -1) /* must have per-cpu events; see above */
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	/*
+	 * The destroy callback uses list_empty() on these entries to tell
+	 * whether the event is a group member and/or a group leader, so they
+	 * must be valid, empty lists before the event is linked anywhere.
+	 */
+	INIT_LIST_HEAD(&event->hw.cache_events_entry);
+	INIT_LIST_HEAD(&event->hw.cache_groups_entry);
+	INIT_LIST_HEAD(&event->hw.cache_group_entry);
+
+	event->destroy = cache_pmu_event_destroy;
+
+	mutex_lock(&cache_mutex);
+
+	group = cache_pmu_setup_event(event); /* will also set rmid */
+
+	raw_spin_lock_irq(&cache_lock);
+	if (group) {
+		/* join the existing group and share its RMID */
+		event->hw.cache_rmid = group->hw.cache_rmid;
+		list_add_tail(&event->hw.cache_group_entry,
+			      &group->hw.cache_group_entry);
+	} else {
+		/* no matching group: this event leads a new one */
+		list_add_tail(&event->hw.cache_groups_entry,
+			      &cache_groups);
+	}
+
+	list_add_tail(&event->hw.cache_events_entry, &cache_events);
+	raw_spin_unlock_irq(&cache_lock);
+
+	mutex_unlock(&cache_mutex);
+
+	return 0;
+}
+
+/*
+ * PMU descriptor wiring the cache_pmu_event_*() callbacks defined in this
+ * file into the perf core.
+ */
+static struct pmu cache_pmu = {
+	.task_ctx_nr	= perf_sw_context, /* we cheat: our add will never fail */
+	.event_init	= cache_pmu_event_init,
+	.add		= cache_pmu_event_add,
+	.del		= cache_pmu_event_del,
+	.start		= cache_pmu_event_start,
+	.stop		= cache_pmu_event_stop,
+	.read		= cache_pmu_event_read,
+};
+
+/*
+ * Probe for cache QoS monitoring via CPUID, allocate the RMID bookkeeping and
+ * register the PMU.
+ *
+ * Returns 0 when the feature is absent or the PMU was registered, a negative
+ * errno when allocation or registration failed; everything allocated here is
+ * freed again on failure.
+ */
+static int __init cache_pmu_init(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	size_t bitmap_bytes;
+	int cpu, ret;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return 0;
+
+	if (boot_cpu_data.x86 != 6)
+		return 0;
+
+	cpuid_count(0x07, 0, &eax, &ebx, &ecx, &edx);
+
+	/* CPUID.(EAX=07H, ECX=0).EBX.QOS[bit12] */
+	if (!(ebx & (1 << 12)))
+		return 0;
+
+	cpuid_count(0x0f, 0, &eax, &ebx, &ecx, &edx);
+
+	max_rmid = ebx;
+
+	/*
+	 * We should iterate bits in CPUID(EAX=0FH, ECX=0).EDX
+	 * For now, only support L3 (bit 1).
+	 */
+	if (!(edx & (1 << 1)))
+		return 0;
+
+	cpuid_count(0x0f, 1, &eax, &ebx, &ecx, &edx);
+
+	l3_scale = ebx;
+	l3_max_rmid = ecx;
+
+	if (l3_max_rmid != max_rmid)
+		return 0;
+
+	bitmap_bytes = sizeof(long) * BITS_TO_LONGS(max_rmid);
+
+	/* zeroed allocations: both bitmaps must start out empty */
+	cache_rmid_bitmap = kzalloc(bitmap_bytes, GFP_KERNEL);
+	if (!cache_rmid_bitmap)
+		return -ENOMEM;
+
+	cache_limbo_bitmap = kzalloc(bitmap_bytes, GFP_KERNEL);
+	if (!cache_limbo_bitmap) {
+		ret = -ENOMEM;
+		goto free_rmid_bitmap;
+	}
+
+	cache_freed_rmid = kcalloc(max_rmid, sizeof(int), GFP_KERNEL);
+	if (!cache_freed_rmid) {
+		ret = -ENOMEM;
+		goto free_limbo_bitmap;
+	}
+
+	bitmap_set(cache_rmid_bitmap, 0, 1); /* RMID 0 is special */
+	cache_rotation_rmid = __get_rmid(); /* keep one free RMID for rotation */
+	if (WARN_ON_ONCE(cache_rotation_rmid < 0)) {
+		ret = cache_rotation_rmid;
+		goto free_freed_rmid;
+	}
+
+	/*
+	 * XXX hotplug notifiers!
+	 */
+	for_each_possible_cpu(cpu) {
+		/* named cpu_state so it does not shadow the per-cpu 'state' */
+		struct cache_pmu_state *cpu_state = &per_cpu(state, cpu);
+
+		raw_spin_lock_init(&cpu_state->lock);
+		cpu_state->rmid = 0;
+		cpu_state->cnt = 0;
+	}
+
+	ret = perf_pmu_register(&cache_pmu, "cache_qos", -1);
+	if (WARN_ON(ret)) {
+		pr_info("Cache QoS detected, registration failed (%d), disabled\n", ret);
+		goto free_freed_rmid;
+	}
+
+	return 0;
+
+free_freed_rmid:
+	kfree(cache_freed_rmid);
+	cache_freed_rmid = NULL;
+free_limbo_bitmap:
+	kfree(cache_limbo_bitmap);
+	cache_limbo_bitmap = NULL;
+free_rmid_bitmap:
+	kfree(cache_rmid_bitmap);
+	cache_rmid_bitmap = NULL;
+	return ret;
+}
+device_initcall(cache_pmu_init);

WARNING: multiple messages have this Message-ID (diff)

From: Peter Zijlstra <peterz@infradead.org>
To: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Waskiewicz Jr, Peter P" <peter.p.waskiewicz.jr@intel.com>,
	Tejun Heo <tj@kernel.org>, Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Li Zefan <lizefan@huawei.com>,
	"containers@lists.linux-foundation.org" 
	<containers@lists.linux-foundation.org>,
	"cgroups@vger.kernel.org" <cgroups@vger.kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	Stephane Eranian <eranian@google.com>
Subject: Re: [PATCH 0/4] x86: Add Cache QoS Monitoring (CQM) support
Date: Mon, 27 Jan 2014 18:34:20 +0100	[thread overview]
Message-ID: <20140127173420.GA9636@twins.programming.kicks-ass.net> (raw)
In-Reply-To: <52D57AC2.3090109@zytor.com>

On Tue, Jan 14, 2014 at 09:58:26AM -0800, H. Peter Anvin wrote:
> On 01/12/2014 11:55 PM, Peter Zijlstra wrote:
> > 
> > The problem is, since there's a limited number of RMIDs we have to
> > rotate at some point, but since changing RMIDs is nondeterministic we
> > can't.
> > 
> 
> This is fundamentally the crux here.  RMIDs are quite expensive for the
> hardware to implement, so they are limited - but recycling them is
> *very* expensive because you literally have to touch every line in the
> cache.

It's not a problem that changing the task:RMID map is expensive; what is
a problem is that there's no deterministic fashion of doing it.

That said; I think I've got a sort-of workaround for that. See the
largish comment near cache_pmu_rotate().

I've also illustrated how to use perf-cgroup for this.

The below is a rough draft, most if not all XXXs should be
fixed/finished. But given I don't actually have hardware that supports
this stuff (afaik) I couldn't be arsed.

---
 include/linux/perf_event.h              |   33 +
 kernel/events/core.c                    |   22 -
 x86/kernel/cpu/perf_event_intel_cache.c |  687 ++++++++++++++++++++++++++++++++
 3 files changed, 725 insertions(+), 17 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -126,6 +126,14 @@ struct hw_perf_event {
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
+		struct { /* cache_pmu */
+			struct task_struct	*cache_target;
+			int			cache_state;
+			int			cache_rmid;
+			struct list_head	cache_events_entry;
+			struct list_head	cache_groups_entry;
+			struct list_head	cache_group_entry;
+		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
 			/*
@@ -526,6 +534,31 @@ struct perf_output_handle {
 	int				page;
 };
 
+#ifdef CONFIG_CGROUP_PERF
+
+struct perf_cgroup_info;
+
+struct perf_cgroup {
+	struct cgroup_subsys_state	css;
+	struct perf_cgroup_info	__percpu *info;
+};
+
+/*
+ * Must ensure cgroup is pinned (css_get) before calling
+ * this function. In other words, we cannot call this function
+ * if there is no cgroup event for the current CPU context.
+ *
+ * XXX: its not safe to use this thing!!!
+ */
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	return container_of(task_css(task, perf_subsys_id),
+			    struct perf_cgroup, css);
+}
+
+#endif /* CONFIG_CGROUP_PERF */
+
 #ifdef CONFIG_PERF_EVENTS
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -329,23 +329,6 @@ struct perf_cgroup_info {
 	u64				timestamp;
 };
 
-struct perf_cgroup {
-	struct cgroup_subsys_state	css;
-	struct perf_cgroup_info	__percpu *info;
-};
-
-/*
- * Must ensure cgroup is pinned (css_get) before calling
- * this function. In other words, we cannot call this function
- * if there is no cgroup event for the current CPU context.
- */
-static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
-{
-	return container_of(task_css(task, perf_subsys_id),
-			    struct perf_cgroup, css);
-}
-
 static inline bool
 perf_cgroup_match(struct perf_event *event)
 {
@@ -6711,6 +6694,11 @@ perf_event_alloc(struct perf_event_attr
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
 
+		/*
+		 * XXX fix for cache_target, dynamic type won't have an easy test,
+		 * maybe move target crap into generic event.
+		 */
+
 		if (attr->type == PERF_TYPE_TRACEPOINT)
 			event->hw.tp_target = task;
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
--- /dev/null
+++ b/x86/kernel/cpu/perf_event_intel_cache.c
@@ -0,0 +1,687 @@
+#include <asm/processor.h>
+#include <linux/idr.h>
+#include <linux/raw_spinlock.h>
+#include <linux/perf_event.h>
+
+
+#define MSR_IA32_PQR_ASSOC	0x0c8f
+#define MSR_IA32_QM_CTR		0x0c8e
+#define MSR_IA32_QM_EVTSEL	0x0c8d
+
+unsigned int max_rmid;
+
+unsigned int l3_scale; /* supposedly cacheline size */
+unsigned int l3_max_rmid;
+
+
+/*
+ * Per-CPU record of the RMID currently written to MSR_IA32_PQR_ASSOC.
+ */
+struct cache_pmu_state {
+	raw_spinlock_t		lock;	/* serialises updates of rmid and cnt */
+	int			rmid;	/* RMID programmed on this CPU, 0 when none */
+	int			cnt;	/* number of started events sharing rmid */
+};
+
+static DEFINE_PER_CPU(struct cache_pmu_state, state);
+
+/*
+ * Protects the global state, hold both for modification, hold either for
+ * stability.
+ *
+ * XXX we modify RMID with only cache_mutex held, racy!
+ */
+static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
+
+static unsigned long *cache_rmid_bitmap;
+
+/*
+ * All events
+ */
+static LIST_HEAD(cache_events);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * The new RMIDs we must not read until cache_pmu_stabilize() has run.
+ * See cache_pmu_rotate().
+ */
+static unsigned long *cache_limbo_bitmap;
+
+/*
+ * The spare RMID that makes rotation possible; keep out of the
+ * cache_rmid_bitmap to avoid it getting used for new events.
+ */
+static int cache_rotation_rmid;
+
+/*
+ * The freed RMIDs, see cache_pmu_rotate().
+ */
+static int cache_freed_nr;
+static int *cache_freed_rmid;
+
+/*
+ * One online cpu per package, for cache_pmu_is_stable().
+ */
+static cpumask_t cache_cpus;
+
+/*
+ * Claim one unused RMID from cache_rmid_bitmap.
+ *
+ * Returns the RMID, or a negative value when the bitmap has no free bit.
+ */
+static int __get_rmid(void)
+{
+	int rmid;
+
+	/* order 0: a "region" of exactly one bit */
+	rmid = bitmap_find_free_region(cache_rmid_bitmap, max_rmid, 0);
+
+	return rmid;
+}
+
+/*
+ * Hand @rmid back to cache_rmid_bitmap so __get_rmid() can return it again.
+ */
+static void __put_rmid(int rmid)
+{
+	const int single_bit_order = 0;
+
+	bitmap_release_region(cache_rmid_bitmap, rmid, single_bit_order);
+}
+
+/*
+ * Queue @rmid on the freed list. It needs a quiescent state before it can be
+ * __put_rmid()'ed, see cache_pmu_stabilize().
+ */
+static void __free_rmid(int rmid)
+{
+	int slot = cache_freed_nr;
+
+	cache_freed_rmid[slot] = rmid;
+	cache_freed_nr = slot + 1;
+}
+
+#define RMID_VAL_ERROR		(1ULL << 63)
+#define RMID_VAL_UNAVAIL	(1ULL << 62)
+
+/*
+ * Read the counter value the hardware reports for @rmid on this CPU.
+ *
+ * Selects event id 1 for @rmid through MSR_IA32_QM_EVTSEL and returns the raw
+ * 64-bit content of MSR_IA32_QM_CTR, including the RMID_VAL_ERROR and
+ * RMID_VAL_UNAVAIL bits; callers must check those themselves.
+ */
+static u64 __rmid_read(unsigned long rmid)
+{
+	u64 val;
+
+	/*
+	 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+	 * it just says that to increase confusion.
+	 *
+	 * Both MSRs carry a full 64-bit value, so use the 64-bit accessors;
+	 * widen @rmid first so the shift is well defined on 32-bit builds.
+	 */
+	wrmsrl(MSR_IA32_QM_EVTSEL, 1 | ((u64)rmid << 32));
+	rdmsrl(MSR_IA32_QM_CTR, val);
+
+	/*
+	 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+	 * the number of cachelines tagged with @rmid.
+	 */
+	return val;
+}
+
+/*
+ * Cross-CPU callback: clear the flag behind @info as soon as any RMID on the
+ * freed list still reads back non-zero on this CPU.
+ */
+static void smp_test_stable(void *info)
+{
+	bool *stable = info;
+	int idx = 0;
+
+	while (idx < cache_freed_nr) {
+		u64 reading = __rmid_read(cache_freed_rmid[idx]);
+
+		if (reading)
+			*stable = false;
+		idx++;
+	}
+}
+
+/*
+ * Test whether all freed RMIDs read back as unused on every CPU in
+ * cache_cpus; see the comment near cache_pmu_rotate().
+ *
+ * Returns true when no CPU reported a non-zero reading.
+ */
+static bool cache_pmu_is_stable(void)
+{
+	bool stable = true;
+
+	/*
+	 * on_each_cpu_mask() instead of smp_call_function_many(): the latter
+	 * never runs the callback on the calling CPU and has to be called
+	 * with preemption disabled, so the local package could be missed.
+	 */
+	on_each_cpu_mask(&cache_cpus, smp_test_stable, &stable, true);
+
+	return stable;
+}
+
+/*
+ * Quiescent state; wait for all the 'freed' RMIDs to become unused. After
+ * this we can reuse them and know that the current set of active RMIDs is
+ * stable.
+ */
+static void cache_pmu_stabilize(void)
+{
+	int first = 0;
+	int idx;
+
+	if (cache_freed_nr == 0)
+		return;
+
+	/*
+	 * Poll until the old RMIDs drop back to 0 again, this means all
+	 * cachelines have acquired a new tag and the new RMIDs are now stable.
+	 */
+	for (;;) {
+		if (cache_pmu_is_stable())
+			break;
+		/*
+		 * XXX adaptive timeout? Ideally the hardware would get us an
+		 * interrupt :/
+		 */
+		schedule_timeout_uninterruptible(1);
+	}
+
+	/* nothing is in limbo any more */
+	bitmap_clear(cache_limbo_bitmap, 0, max_rmid);
+
+	/* refill the spare rotation RMID before returning the rest */
+	if (cache_rotation_rmid <= 0) {
+		cache_rotation_rmid = cache_freed_rmid[0];
+		first = 1;
+	}
+
+	for (idx = first; idx < cache_freed_nr; idx++)
+		__put_rmid(cache_freed_rmid[idx]);
+
+	cache_freed_nr = 0;
+}
+
+/*
+ * Exchange the RMID of a group of events.
+ *
+ * Stores @rmid in the group leader @group and in every event linked on its
+ * cache_group_entry list, and returns the RMID the leader held before.
+ *
+ * RMIDs are plain ints here (matching hw.cache_rmid) because callers pass and
+ * test for non-positive "no RMID" values such as -1.
+ */
+static int cache_group_xchg_rmid(struct perf_event *group, int rmid)
+{
+	struct perf_event *event;
+	int old_rmid = group->hw.cache_rmid;
+
+	group->hw.cache_rmid = rmid;
+	list_for_each_entry(event, &group->hw.cache_group_entry, hw.cache_group_entry)
+		event->hw.cache_rmid = rmid;
+
+	return old_rmid;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ *
+ * Task events match when they target the same task; non-task events match
+ * when they belong to the same cgroup or are both machine wide.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+	if ((a->attach_state & PERF_ATTACH_TASK) !=
+	    (b->attach_state & PERF_ATTACH_TASK))
+		return false;
+
+	if (a->attach_state & PERF_ATTACH_TASK)
+		return a->hw.cache_target == b->hw.cache_target;
+
+	/* not task */
+
+#ifdef CONFIG_CGROUP_PERF
+	/*
+	 * Different cgroups, or a cgroup event versus a machine wide one
+	 * (NULL cgrp), do not cover the same tasks.
+	 */
+	if (a->cgrp != b->cgrp)
+		return false;
+#endif
+
+	return true; /* same cgroup, or both machine wide */
+}
+
+static struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+	if (event->cgrp)
+		return event->cgrp;
+
+	if (event->attach_state & PERF_ATTACH_TASK) /* XXX */
+		return perf_cgroup_from_task(event->hw.cache_target);
+
+	return NULL;
+}
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks.
+ *
+ * Returns true when the two events may cover a common task and therefore
+ * must not hold an RMID at the same time, false when they cannot overlap.
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+#ifdef CONFIG_CGROUP_PERF
+	struct perf_cgroup *ac, *bc;
+
+	ac = event_to_cgroup(a);
+	bc = event_to_cgroup(b);
+
+	if (!ac || !bc) {
+		/*
+		 * If either is NULL, its a system wide event and that
+		 * always conflicts with a cgroup one.
+		 *
+		 * If both are system wide, __match_event() should've
+		 * been true and we'll never get here, if we did fail.
+		 */
+		return true;
+	}
+
+	/*
+	 * If one is a parent of the other, we've got an intersection.
+	 */
+	if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+	    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+		return true;
+#endif
+
+	/*
+	 * If one of them is not a task, same story as above with cgroups.
+	 */
+	if (!(a->attach_state & PERF_ATTACH_TASK) ||
+	    !(b->attach_state & PERF_ATTACH_TASK))
+		return true;
+
+	/*
+	 * Again, if they're the same __match_event() should've caught us, if not fail.
+	 */
+	if (a->hw.cache_target == b->hw.cache_target)
+		return true;
+
+	/*
+	 * Must be non-overlapping.
+	 */
+	return false;
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs, ought to run from a
+ * delayed work or somesuch.
+ *
+ * Rotating RMIDs is complicated; firstly because the hardware doesn't give us
+ * any clues; secondly because of cgroups.
+ *
+ * There's problems with the hardware interface; when you change the task:RMID
+ * map cachelines retain their 'old' tags, giving a skewed picture. In order to
+ * work around this, we must always keep one free RMID.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID), and
+ * assigning the free RMID to another group (the new RMID). We must then wait
+ * for the old RMID to not be used (no cachelines tagged). This ensures that
+ * all cachelines are tagged with 'active' RMIDs. At this point we can start
+ * reading values for the new RMID and treat the old RMID as the free RMID for
+ * the next rotation.
+ *
+ * Secondly, since cgroups can nest, we must make sure to not program
+ * conflicting cgroups at the same time. A conflicting cgroup is one that has a
+ * parent<->child relation. After all, a task of the child cgroup will also be
+ * covered by the parent cgroup.
+ *
+ * Therefore, when selecting a new group, we must invalidate all conflicting
+ * groups. Rotation allows us to measure all (conflicting) groups
+ * sequentially.
+ *
+ * XXX there's a further problem in that because we do our own rotation and
+ * cheat with schedulability the event {enabled,running} times are incorrect.
+ *
+ * Returns true when a rotation pass was performed, false when there were no
+ * groups to rotate. NOTE(review): the original had no return statement at
+ * all, so this meaning of the bool is an assumption -- confirm with the
+ * (future) caller.
+ */
+static bool cache_pmu_rotate(void)
+{
+	struct perf_event *rotor, *group;
+	bool rotated = false;
+	int rmid;
+
+	mutex_lock(&cache_mutex);
+
+	if (list_empty(&cache_groups))
+		goto unlock_mutex;
+
+	rotor = list_first_entry(&cache_groups, struct perf_event, hw.cache_groups_entry);
+
+	/*
+	 * Take the head group out of the list and strip its RMID; it gets
+	 * re-queued at the tail once the remaining groups were (re)assigned.
+	 */
+	raw_spin_lock_irq(&cache_lock);
+	list_del(&rotor->hw.cache_groups_entry);
+	rmid = cache_group_xchg_rmid(rotor, -1);
+	/* first entry must always have an RMID; never free an invalid one */
+	if (!WARN_ON_ONCE(rmid <= 0))
+		__free_rmid(rmid);
+	raw_spin_unlock_irq(&cache_lock);
+
+	/*
+	 * XXX O(n^2) schedulability
+	 */
+
+	list_for_each_entry(group, &cache_groups, hw.cache_groups_entry) {
+		bool conflicts = false;
+		struct perf_event *iter;
+
+		/* Only groups queued before @group can veto it. */
+		list_for_each_entry(iter, &cache_groups, hw.cache_groups_entry) {
+			if (iter == group)
+				break;
+			if (__conflict_event(group, iter)) {
+				conflicts = true;
+				break;
+			}
+		}
+
+		if (conflicts && group->hw.cache_rmid > 0) {
+			rmid = cache_group_xchg_rmid(group, -1);
+			WARN_ON_ONCE(rmid <= 0);
+			__free_rmid(rmid);
+			continue;
+		}
+
+		if (!conflicts && group->hw.cache_rmid <= 0) {
+			rmid = __get_rmid();
+			if (rmid <= 0) {
+				/* Fall back to the spare rotation RMID. */
+				rmid = cache_rotation_rmid;
+				cache_rotation_rmid = -1;
+			}
+			if (rmid <= 0)
+				break; /* we're out of RMIDs, more next time */
+
+			/*
+			 * Mark the new RMID as in limbo only once we know it
+			 * is valid; a negative index must never reach
+			 * set_bit(). cache_pmu_event_read() tests this same
+			 * bitmap before reading an RMID.
+			 */
+			set_bit(rmid, cache_limbo_bitmap);
+
+			rmid = cache_group_xchg_rmid(group, rmid);
+			WARN_ON_ONCE(rmid > 0);
+			continue;
+		}
+
+		/*
+		 * either we conflict and do not have an RMID -> good,
+		 * or we do not conflict and have an RMID -> also good.
+		 */
+	}
+
+	raw_spin_lock_irq(&cache_lock);
+	list_add_tail(&rotor->hw.cache_groups_entry, &cache_groups);
+	raw_spin_unlock_irq(&cache_lock);
+
+	/*
+	 * XXX force a PMU reprogram here such that the new RMIDs are in
+	 * effect.
+	 */
+
+	cache_pmu_stabilize();
+	rotated = true;
+
+unlock_mutex:
+	mutex_unlock(&cache_mutex);
+
+	/*
+	 * XXX reschedule work.
+	 */
+
+	return rotated;
+}
+
+/*
+ * Find the group @event belongs to and set up its RMID.
+ *
+ * Walks all group leaders: on the first leader that matches, @event inherits
+ * that leader's RMID and the leader is returned. Otherwise NULL is returned,
+ * and a fresh RMID is taken for @event only if no leader conflicts with it.
+ */
+static struct perf_event *cache_pmu_setup_event(struct perf_event *event)
+{
+	struct perf_event *leader;
+	bool conflict = false;
+
+	list_for_each_entry(leader, &cache_groups, hw.cache_groups_entry) {
+		if (__match_event(leader, event)) {
+			event->hw.cache_rmid = leader->hw.cache_rmid;
+			return leader;
+		}
+		/* Keep scanning: a later leader may still match. */
+		if (__conflict_event(leader, event))
+			conflict = true;
+	}
+
+	/* A conflicting event gets no RMID here; its cache_rmid is left alone. */
+	if (conflict)
+		return NULL;
+
+	/* XXX lacks stabilization */
+	event->hw.cache_rmid = __get_rmid();
+
+	return NULL;
+}
+
+/*
+ * Refresh @event's count from the occupancy reported for its RMID.
+ *
+ * The count is left untouched when the event currently has no RMID, when the
+ * RMID is still marked in the limbo bitmap, or when the read reports an error
+ * or unavailable value.
+ */
+static void cache_pmu_event_read(struct perf_event *event)
+{
+	int rmid = event->hw.cache_rmid;
+	u64 val = RMID_VAL_UNAVAIL;
+
+	/*
+	 * No RMID assigned (0 is the special RMID, -1 is what rotation leaves
+	 * behind): there is nothing meaningful to read, and a negative value
+	 * must not be used as a bitmap index.
+	 */
+	if (rmid <= 0)
+		return;
+
+	if (!test_bit(rmid, cache_limbo_bitmap))
+		val = __rmid_read(rmid);
+
+	/*
+	 * Ignore this reading on error states and do not update the value.
+	 */
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	val *= l3_scale; /* cachelines -> bytes */
+
+	local64_set(&event->count, val);
+}
+
+/*
+ * Start @event on this CPU: account it in the per-CPU state and program the
+ * event's RMID into MSR_IA32_PQR_ASSOC. Does nothing if the event is not in
+ * the stopped state. @mode is unused.
+ */
+static void cache_pmu_event_start(struct perf_event *event, int mode)
+{
+	/*
+	 * The local must not be called 'state': that would shadow the per-CPU
+	 * variable inside its own initializer.
+	 */
+	struct cache_pmu_state *pstate = &__get_cpu_var(state);
+	unsigned long flags;
+
+	if (!(event->hw.cache_state & PERF_HES_STOPPED))
+		return;
+
+	event->hw.cache_state &= ~PERF_HES_STOPPED;
+
+	raw_spin_lock_irqsave(&pstate->lock, flags);
+	/*
+	 * All events running on a CPU at the same time are expected to share
+	 * one RMID; the first one in must find the CPU without an RMID.
+	 */
+	if (pstate->cnt++)
+		WARN_ON_ONCE(pstate->rmid != event->hw.cache_rmid);
+	else
+		WARN_ON_ONCE(pstate->rmid);
+	pstate->rmid = event->hw.cache_rmid;
+	/* wrmsr() takes separate low/high halves; wrmsrl() takes one value. */
+	wrmsrl(MSR_IA32_PQR_ASSOC, pstate->rmid);
+	raw_spin_unlock_irqrestore(&pstate->lock, flags);
+}
+
+/*
+ * Stop @event on this CPU: take a final reading, drop it from the per-CPU
+ * accounting and, when it was the last running event, reset
+ * MSR_IA32_PQR_ASSOC to RMID 0. Does nothing if the event is already stopped.
+ * @mode is unused.
+ */
+static void cache_pmu_event_stop(struct perf_event *event, int mode)
+{
+	/*
+	 * The local must not be called 'state': that would shadow the per-CPU
+	 * variable inside its own initializer.
+	 */
+	struct cache_pmu_state *pstate = &__get_cpu_var(state);
+	unsigned long flags;
+
+	if (event->hw.cache_state & PERF_HES_STOPPED)
+		return;
+
+	event->hw.cache_state |= PERF_HES_STOPPED;
+
+	raw_spin_lock_irqsave(&pstate->lock, flags);
+	cache_pmu_event_read(event);
+	if (!--pstate->cnt) {
+		pstate->rmid = 0;
+		/* wrmsr() takes separate low/high halves; wrmsrl() takes one value. */
+		wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+	} else {
+		/* Other events still run here, so an RMID must be programmed. */
+		WARN_ON_ONCE(!pstate->rmid);
+	}
+	raw_spin_unlock_irqrestore(&pstate->lock, flags);
+}
+
+/*
+ * Add @event to the PMU. The event always starts out stopped; it is started
+ * right away only when it holds a valid RMID and @mode has PERF_EF_START set.
+ *
+ * Always returns 0: an event without an RMID is accepted but left stopped.
+ */
+static int cache_pmu_event_add(struct perf_event *event, int mode)
+{
+	unsigned long flags;
+	int rmid;
+
+	raw_spin_lock_irqsave(&cache_lock, flags);
+
+	event->hw.cache_state = PERF_HES_STOPPED;
+	rmid = event->hw.cache_rmid;
+	if (rmid <= 0)
+		goto unlock; /* no RMID assigned, nothing to program */
+
+	if (mode & PERF_EF_START)
+		cache_pmu_event_start(event, mode);
+
+unlock:
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+	return 0;
+}
+
+/*
+ * Remove @event from the PMU by stopping it under cache_lock.
+ * Returns nothing: the function is declared void.
+ */
+static void cache_pmu_event_del(struct perf_event *event, int mode)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&cache_lock, flags);
+	cache_pmu_event_stop(event, mode);
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+/*
+ * Tear down @event: unlink it from the global event list and from its group.
+ * When @event leads its group, leadership passes to another member if there
+ * is one; otherwise the group is dissolved and its RMID is handed back.
+ */
+static void cache_pmu_event_destroy(struct perf_event *event)
+{
+	struct list_head *members = &event->hw.cache_group_entry;
+	struct list_head *leaders = &event->hw.cache_groups_entry;
+	struct perf_event *successor = NULL;
+
+	mutex_lock(&cache_mutex);
+	raw_spin_lock_irq(&cache_lock);
+
+	list_del(&event->hw.cache_events_entry);
+
+	/* Pick a sibling (if any) before leaving the group's member ring. */
+	if (!list_empty(members)) {
+		successor = list_first_entry(members, struct perf_event,
+					     hw.cache_group_entry);
+		list_del(members);
+	}
+
+	/* Not linked as a group leader: nothing more to do. */
+	if (list_empty(leaders))
+		goto unlock;
+
+	if (successor) {
+		/* The sibling takes over our slot among the leaders. */
+		list_replace(leaders, &successor->hw.cache_groups_entry);
+	} else {
+		/* Last event of the group: release the RMID and the slot. */
+		int rmid = event->hw.cache_rmid;
+
+		if (rmid > 0)
+			__put_rmid(rmid);
+		list_del(leaders);
+	}
+
+unlock:
+	raw_spin_unlock_irq(&cache_lock);
+	mutex_unlock(&cache_mutex);
+}
+
+static struct pmu cache_pmu;
+
+/*
+ * Takes non-sampling task, cgroup or machine wide events.
+ *
+ * XXX there's a bit of a problem in that we cannot simply do the one event per
+ * node as one would want, since that one event would only get scheduled on the
+ * one cpu. But we want to 'schedule' the RMID on all CPUs.
+ *
+ * This means we want events for each CPU, however, that generates a lot of
+ * duplicate values out to userspace -- this is not to be helped unless we want
+ * to change the core code in some way.
+ *
+ * Returns 0 on success, -ENOENT when @event is not for this PMU and -EINVAL
+ * for unsupported configurations.
+ */
+static int cache_pmu_event_init(struct perf_event *event)
+{
+	struct perf_event *group;
+
+	if (event->attr.type != cache_pmu.type)
+		return -ENOENT;
+
+	if (event->attr.config != 0)
+		return -EINVAL;
+
+	if (event->cpu == -1) /* must have per-cpu events; see above */
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	/*
+	 * Make every list node self-linked before it is used: siblings get
+	 * chained onto a leader's cache_group_entry, and
+	 * cache_pmu_event_destroy() relies on list_empty() of both group
+	 * nodes to tell members from leaders.
+	 */
+	INIT_LIST_HEAD(&event->hw.cache_events_entry);
+	INIT_LIST_HEAD(&event->hw.cache_group_entry);
+	INIT_LIST_HEAD(&event->hw.cache_groups_entry);
+
+	event->destroy = cache_pmu_event_destroy;
+
+	mutex_lock(&cache_mutex);
+
+	group = cache_pmu_setup_event(event); /* will also set rmid */
+
+	raw_spin_lock_irq(&cache_lock);
+	if (group) {
+		/* Join the existing group and share its RMID. */
+		event->hw.cache_rmid = group->hw.cache_rmid;
+		list_add_tail(&event->hw.cache_group_entry,
+			      &group->hw.cache_group_entry);
+	} else {
+		/* No matching group: this event leads a new one. */
+		list_add_tail(&event->hw.cache_groups_entry,
+			      &cache_groups);
+	}
+
+	list_add_tail(&event->hw.cache_events_entry, &cache_events);
+	raw_spin_unlock_irq(&cache_lock);
+
+	mutex_unlock(&cache_mutex);
+
+	return 0;
+}
+
+/*
+ * PMU descriptor handed to perf_pmu_register() by cache_pmu_init(); it wires
+ * the perf callbacks to the cache_pmu_event_*() handlers in this file.
+ */
+static struct pmu cache_pmu = {
+	.task_ctx_nr	= perf_sw_context, /* we cheat: our add will never fail */
+	.event_init	= cache_pmu_event_init,
+	.add		= cache_pmu_event_add,
+	.del		= cache_pmu_event_del,
+	.start		= cache_pmu_event_start,
+	.stop		= cache_pmu_event_stop,
+	.read		= cache_pmu_event_read,
+};
+
+/*
+ * Probe for cache QoS monitoring support and register the PMU.
+ *
+ * Returns 0 when the feature is absent or registration succeeded, a negative
+ * errno when an allocation, the rotation RMID reservation or the PMU
+ * registration failed. All allocations are released on failure.
+ */
+static int __init cache_pmu_init(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	int cpu, ret;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return 0;
+
+	if (boot_cpu_data.x86 != 6)
+		return 0;
+
+	cpuid_count(0x07, 0, &eax, &ebx, &ecx, &edx);
+
+	/* CPUID.(EAX=07H, ECX=0).EBX.QOS[bit12] */
+	if (!(ebx & (1 << 12)))
+		return 0;
+
+	cpuid_count(0x0f, 0, &eax, &ebx, &ecx, &edx);
+
+	max_rmid = ebx;
+
+	/*
+	 * We should iterate bits in CPUID(EAX=0FH, ECX=0).EDX
+	 * For now, only support L3 (bit 1).
+	 */
+	if (!(edx & (1 << 1)))
+		return 0;
+
+	cpuid_count(0x0f, 1, &eax, &ebx, &ecx, &edx);
+
+	l3_scale = ebx;
+	l3_max_rmid = ecx;
+
+	if (l3_max_rmid != max_rmid)
+		return 0;
+
+	/*
+	 * NOTE(review): the bitmaps are sized for max_rmid bits; if CPUID
+	 * reports the highest valid RMID (zero-based) rather than a count,
+	 * max_rmid + 1 bits are needed -- confirm against __get_rmid().
+	 */
+	cache_rmid_bitmap = kmalloc(sizeof(long) * BITS_TO_LONGS(max_rmid), GFP_KERNEL);
+	if (!cache_rmid_bitmap)
+		return -ENOMEM;
+
+	cache_limbo_bitmap = kmalloc(sizeof(long) * BITS_TO_LONGS(max_rmid), GFP_KERNEL);
+	if (!cache_limbo_bitmap) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	cache_freed_rmid = kmalloc(sizeof(int) * max_rmid, GFP_KERNEL);
+	if (!cache_freed_rmid) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	bitmap_zero(cache_rmid_bitmap, max_rmid);
+	/* kmalloc() does not clear memory: no RMID starts out in limbo. */
+	bitmap_zero(cache_limbo_bitmap, max_rmid);
+	bitmap_set(cache_rmid_bitmap, 0, 1); /* RMID 0 is special */
+	cache_rotation_rmid = __get_rmid(); /* keep one free RMID for rotation */
+	if (WARN_ON_ONCE(cache_rotation_rmid < 0)) {
+		ret = cache_rotation_rmid;
+		goto err_free;
+	}
+
+	/*
+	 * XXX hotplug notifiers!
+	 */
+	for_each_possible_cpu(cpu) {
+		/*
+		 * Not named 'state': that would shadow the per-CPU variable
+		 * inside its own initializer.
+		 */
+		struct cache_pmu_state *pstate = &per_cpu(state, cpu);
+
+		raw_spin_lock_init(&pstate->lock);
+		pstate->rmid = 0;
+	}
+
+	ret = perf_pmu_register(&cache_pmu, "cache_qos", -1);
+	if (WARN_ON(ret)) {
+		pr_info("Cache QoS detected, registration failed (%d), disabled\n", ret);
+		goto err_free;
+	}
+
+	return 0;
+
+err_free:
+	/* kfree(NULL) is a no-op, so one label serves every failure point. */
+	kfree(cache_freed_rmid);
+	cache_freed_rmid = NULL;
+	kfree(cache_limbo_bitmap);
+	cache_limbo_bitmap = NULL;
+	kfree(cache_rmid_bitmap);
+	cache_rmid_bitmap = NULL;
+
+	return ret;
+}
+device_initcall(cache_pmu_init);

next prev parent reply	other threads:[~2014-01-27 17:34 UTC|newest]

Thread overview: 75+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-01-03 20:34 [PATCH 0/4] x86: Add Cache QoS Monitoring (CQM) support Peter P Waskiewicz Jr
2014-01-03 20:34 ` Peter P Waskiewicz Jr
2014-01-03 20:34 ` [PATCH 1/4] x86: Add support for Cache QoS Monitoring (CQM) detection Peter P Waskiewicz Jr
2014-01-03 20:34 ` [PATCH 2/4] x86: Add Cache QoS Monitoring support to x86 perf uncore Peter P Waskiewicz Jr
     [not found] ` <1388781285-18067-1-git-send-email-peter.p.waskiewicz.jr-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2014-01-03 20:34   ` [PATCH 3/4] cgroup: Add new cacheqos cgroup subsys to support Cache QoS Monitoring Peter P Waskiewicz Jr
2014-01-03 20:34     ` Peter P Waskiewicz Jr
2014-01-03 20:34   ` [PATCH 4/4] Documentation: Add documentation for cacheqos cgroup Peter P Waskiewicz Jr
2014-01-03 20:34     ` Peter P Waskiewicz Jr
2014-01-04 16:10   ` [PATCH 0/4] x86: Add Cache QoS Monitoring (CQM) support Tejun Heo
2014-01-04 16:10   ` Tejun Heo
2014-01-04 16:10     ` Tejun Heo
     [not found]     ` <20140104161050.GA24306-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>
2014-01-04 22:43       ` Waskiewicz Jr, Peter P
2014-01-04 22:43         ` Waskiewicz Jr, Peter P
     [not found]         ` <1388875369.9761.25.camel-29DAm2eTeB2q+SSgkFU3IPooFf0ArEBIu+b9c/7xato@public.gmane.org>
2014-01-04 22:50           ` Tejun Heo
2014-01-04 22:50             ` Tejun Heo
     [not found]             ` <20140104225058.GC24306-Gd/HAXX7CRxy/B6EtB590w@public.gmane.org>
2014-01-05  5:23               ` Waskiewicz Jr, Peter P
2014-01-05  5:23                 ` Waskiewicz Jr, Peter P
     [not found]                 ` <1388899376.9761.45.camel-29DAm2eTeB2q+SSgkFU3IPooFf0ArEBIu+b9c/7xato@public.gmane.org>
2014-01-06 11:16                   ` Peter Zijlstra
2014-01-06 11:16                     ` Peter Zijlstra
     [not found]                     ` <20140106111624.GB5623-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>
2014-01-06 16:34                       ` Waskiewicz Jr, Peter P
2014-01-06 16:34                         ` Waskiewicz Jr, Peter P
     [not found]                         ` <1389026035.32504.3.camel-29DAm2eTeB2q+SSgkFU3IPooFf0ArEBIu+b9c/7xato@public.gmane.org>
2014-01-06 16:41                           ` Peter Zijlstra
2014-01-06 16:41                             ` Peter Zijlstra
     [not found]                             ` <20140106164150.GQ31570-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>
2014-01-06 16:47                               ` Waskiewicz Jr, Peter P
2014-01-06 16:47                                 ` Waskiewicz Jr, Peter P
     [not found]                                 ` <1389026867.32504.16.camel-29DAm2eTeB2q+SSgkFU3IPooFf0ArEBIu+b9c/7xato@public.gmane.org>
2014-01-06 17:53                                   ` Peter Zijlstra
2014-01-06 17:53                                     ` Peter Zijlstra
     [not found]                                     ` <20140106175338.GF30183-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>
2014-01-06 18:05                                       ` Waskiewicz Jr, Peter P
2014-01-06 18:05                                         ` Waskiewicz Jr, Peter P
2014-01-06 18:06                                   ` Peter Zijlstra
2014-01-06 18:06                                     ` Peter Zijlstra
     [not found]                                     ` <20140106180636.GG30183-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>
2014-01-06 20:10                                       ` Waskiewicz Jr, Peter P
2014-01-06 20:10                                         ` Waskiewicz Jr, Peter P
     [not found]                                         ` <1389039035.32504.35.camel-29DAm2eTeB2q+SSgkFU3IPooFf0ArEBIu+b9c/7xato@public.gmane.org>
2014-01-06 21:26                                           ` Peter Zijlstra
2014-01-06 21:26                                           ` Peter Zijlstra
2014-01-06 21:26                                             ` Peter Zijlstra
     [not found]                                             ` <20140106212623.GH30183-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>
2014-01-06 21:48                                               ` Waskiewicz Jr, Peter P
2014-01-06 21:48                                                 ` Waskiewicz Jr, Peter P
     [not found]                                                 ` <1389044899.32504.43.camel-29DAm2eTeB2q+SSgkFU3IPooFf0ArEBIu+b9c/7xato@public.gmane.org>
2014-01-06 22:12                                                   ` Peter Zijlstra
2014-01-06 22:12                                                     ` Peter Zijlstra
     [not found]                                                     ` <20140106221251.GJ30183-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>
2014-01-06 22:45                                                       ` Waskiewicz Jr, Peter P
2014-01-06 22:45                                                         ` Waskiewicz Jr, Peter P
     [not found]                                                         ` <1389048315.32504.57.camel-29DAm2eTeB2q+SSgkFU3IPooFf0ArEBIu+b9c/7xato@public.gmane.org>
2014-01-07  8:34                                                           ` Peter Zijlstra
2014-01-07  8:34                                                             ` Peter Zijlstra
     [not found]                                                             ` <20140107083440.GL30183-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>
2014-01-07 15:15                                                               ` Waskiewicz Jr, Peter P
2014-01-07 15:15                                                                 ` Waskiewicz Jr, Peter P
     [not found]                                                                 ` <1389107743.32504.69.camel-29DAm2eTeB2q+SSgkFU3IPooFf0ArEBIu+b9c/7xato@public.gmane.org>
2014-01-07 21:12                                                                   ` Peter Zijlstra
2014-01-07 21:12                                                                     ` Peter Zijlstra
     [not found]                                                                     ` <20140107211229.GF2480-RM5+C6weyIYnLiPH7yDmwOa11wxjtiyuLtmvbW2Dspo@public.gmane.org>
2014-01-10 18:55                                                                       ` Waskiewicz Jr, Peter P
2014-01-10 18:55                                                                         ` Waskiewicz Jr, Peter P
     [not found]                                                                         ` <1389380100.32504.172.camel-29DAm2eTeB2q+SSgkFU3IPooFf0ArEBIu+b9c/7xato@public.gmane.org>
2014-01-13  7:55                                                                           ` Peter Zijlstra
2014-01-13  7:55                                                                             ` Peter Zijlstra
     [not found]                                                                             ` <20140113075528.GR7572-RM5+C6weyIYnLiPH7yDmwOa11wxjtiyuLtmvbW2Dspo@public.gmane.org>
2014-01-14 17:58                                                                               ` H. Peter Anvin
2014-01-14 17:58                                                                               ` H. Peter Anvin
2014-01-14 17:58                                                                                 ` H. Peter Anvin
     [not found]                                                                                 ` <52D57AC2.3090109-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
2014-01-27 17:34                                                                                   ` Peter Zijlstra [this message]
2014-01-27 17:34                                                                                     ` Peter Zijlstra
     [not found]                                                                                     ` <20140127173420.GA9636-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>
2014-02-18 17:29                                                                                       ` Waskiewicz Jr, Peter P
2014-02-18 17:29                                                                                         ` Waskiewicz Jr, Peter P
     [not found]                                                                                         ` <1392744567.3069.42.camel-29DAm2eTeB2q+SSgkFU3IPooFf0ArEBIu+b9c/7xato@public.gmane.org>
2014-02-18 19:35                                                                                           ` Peter Zijlstra
2014-02-18 19:35                                                                                           ` Peter Zijlstra
2014-02-18 19:35                                                                                             ` Peter Zijlstra
     [not found]                                                                                             ` <20140218193528.GQ14089-RM5+C6weyIYnLiPH7yDmwOa11wxjtiyuLtmvbW2Dspo@public.gmane.org>
2014-02-18 19:54                                                                                               ` Waskiewicz Jr, Peter P
2014-02-18 19:54                                                                                                 ` Waskiewicz Jr, Peter P
     [not found]                                                                                                 ` <1392753259.607.9.camel-29DAm2eTeB2q+SSgkFU3IPooFf0ArEBIu+b9c/7xato@public.gmane.org>
2014-02-20 16:58                                                                                                   ` Peter Zijlstra
2014-02-20 16:58                                                                                                     ` Peter Zijlstra
2014-01-14 20:46                                                                               ` Waskiewicz Jr, Peter P
2014-01-14 20:46                                                                                 ` Waskiewicz Jr, Peter P
2014-01-14 20:46                                                                               ` Waskiewicz Jr, Peter P
2014-01-06 16:47                               ` Waskiewicz Jr, Peter P
2014-01-06 11:08   ` Peter Zijlstra
2014-01-06 11:08     ` Peter Zijlstra
     [not found]     ` <20140106110803.GA5623-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>
2014-01-06 16:42       ` Waskiewicz Jr, Peter P
2014-01-06 16:42         ` Waskiewicz Jr, Peter P
2014-01-06 16:42       ` Waskiewicz Jr, Peter P

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140127173420.GA9636@twins.programming.kicks-ass.net \
    --to=peterz-wegcikhe2lqwvfeawa7xhq@public.gmane.org \
    --cc=cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org \
    --cc=eranian-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org \
    --cc=hpa-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org \
    --cc=linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=mingo-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
    --cc=peter.p.waskiewicz.jr-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org \
    --cc=tglx-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org \
    --cc=tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.