[RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3)

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3)
@ 2010-09-09 13:09 Stephane Eranian
  2010-09-09 13:52 ` Eric Dumazet
  0 siblings, 1 reply; 14+ messages in thread
From: Stephane Eranian @ 2010-09-09 13:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, mingo, paulus, davem, fweisbec, perfmon2-devel, eranian,
	eranian, robert.richter, acme

This kernel patch adds the ability to filter monitoring based on
container groups (cgroups). This is for use in per-cpu mode only.
    
The patch adds perf_event_attr.cgroup, a boolean, to activate
this new mode. The cgroup is designated by passing in
perf_event_attr.cgroup_fd, an opened file descriptor to
the <mnt>/<cgroup>/perf_event.perf file.
    
This is the second version of this patch. It corrects the way
time_enabled is accounted for. In cgroup mode, time_enabled reflects
the time the cgroup was active, i.e., threads from the cgroup executed
on the monitored CPU.  This is a more useful metric than just
wall-clock. The meaning of time_enabled without cgroup is unaffected.

Signed-off-by: Stephane Eranian <eranian@google.com>

--
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3cb7d04..ed76357 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -618,6 +618,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
 unsigned short css_id(struct cgroup_subsys_state *css);
 unsigned short css_depth(struct cgroup_subsys_state *css);
 
+struct cgroup_subsys_state *cgroup_css_from_file(struct file *f, int id);
+
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ccefff0..93f86b7 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -65,4 +65,8 @@ SUBSYS(net_cls)
 SUBSYS(blkio)
 #endif
 
+#ifdef CONFIG_PERF_EVENTS
+SUBSYS(perf)
+#endif
+
 /* */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 000610c..f84b38e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -215,8 +215,9 @@ struct perf_event_attr {
 				 */
 				precise_ip     :  2, /* skid constraint       */
 				mmap_data      :  1, /* non-exec mmap data    */
+				cgroup         :  1, /* cgroup aggregation    */
 
-				__reserved_1   : 46;
+				__reserved_1   : 45;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -226,6 +227,8 @@ struct perf_event_attr {
 	__u32			bp_type;
 	__u64			bp_addr;
 	__u64			bp_len;
+
+	int			cgroup_fd;
 };
 
 /*
@@ -461,6 +464,7 @@ enum perf_callchain_context {
  */
 
 #ifdef CONFIG_PERF_EVENTS
+# include <linux/cgroup.h>
 # include <asm/perf_event.h>
 # include <asm/local64.h>
 #endif
@@ -657,6 +661,18 @@ struct swevent_hlist {
 #define PERF_ATTACH_CONTEXT	0x01
 #define PERF_ATTACH_GROUP	0x02
 
+#ifdef CONFIG_CGROUPS
+struct perf_cgroup_time {
+	u64 time;
+	u64 timestamp;
+};
+
+struct perf_cgroup {
+	struct cgroup_subsys_state css;
+	struct perf_cgroup_time *time;
+};
+#endif
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -759,7 +775,9 @@ struct perf_event {
 	struct ftrace_event_call	*tp_event;
 	struct event_filter		*filter;
 #endif
-
+#ifdef CONFIG_CGROUPS
+	struct perf_cgroup		*css;
+#endif
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -806,6 +824,8 @@ struct perf_event_context {
 	u64				generation;
 	int				pin_count;
 	struct rcu_head			rcu_head;
+
+	int				nr_cgroups;
 };
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e5c5497..3e56354 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4722,6 +4722,23 @@ css_get_next(struct cgroup_subsys *ss, int id,
 	return ret;
 }
 
+struct cgroup_subsys_state *cgroup_css_from_file(struct file *f, int id)
+{
+	struct cgroup *cgrp;
+
+	/* check in cgroup filesystem */
+	if (f->f_op != &cgroup_seqfile_operations)
+		return ERR_PTR(-EBADF);
+
+	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+		return ERR_PTR(-EINVAL);
+
+	/* get cgroup */
+	cgrp = __d_cgrp(f->f_dentry->d_parent);
+
+	return cgrp->subsys[id];
+}
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
 						   struct cgroup *cont)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 4b84e63..2723d52 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -35,6 +35,7 @@
 
 #include <asm/irq_regs.h>
 
+#define PERF_TSTAMP_ENABLE_INVALID (~0) /* invalid marker, cannot be zero */
 /*
  * Each CPU has a list of per CPU events:
  */
@@ -49,6 +50,228 @@ static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 
+enum event_type_t {
+	EVENT_FLEXIBLE = 0x1,
+	EVENT_PINNED = 0x2,
+	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+			      enum event_type_t event_type);
+
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+			     enum event_type_t event_type,
+			     struct task_struct *task, int css_sw);
+static inline u64 perf_clock(void)
+{
+	return local_clock();
+}
+
+#ifdef CONFIG_CGROUPS
+
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	if (!task)
+		return NULL;
+	return container_of(task_subsys_state(task, perf_subsys_id),
+			struct perf_cgroup, css);
+}
+
+static inline
+struct perf_cgroup *perf_cgroup_from_cont(struct cgroup *cont)
+{
+	return container_of(cgroup_subsys_state(cont, perf_subsys_id),
+			struct perf_cgroup, css);
+}
+
+static inline bool
+perf_cgroup_match(struct perf_event *event, struct task_struct *task)
+{
+	struct perf_cgroup *css = perf_cgroup_from_task(task);
+	return !event->css || event->css == css;
+}
+
+static void *perf_get_cgroup(int fd)
+{
+	struct cgroup_subsys_state *css;
+	struct file *file;
+	int fput_needed;
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return ERR_PTR(-EBADF);
+
+	css = cgroup_css_from_file(file, perf_subsys_id);
+	if (!IS_ERR(css))
+		css_get(css);
+
+	fput_light(file, fput_needed);
+
+	return css;
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{
+	if (event->css)
+		css_put(&event->css->css);
+}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+	return event->css != NULL;
+}
+
+static inline int is_css_current(struct perf_event *event)
+{
+	struct perf_cgroup *css = perf_cgroup_from_task(current);
+
+	return css == event->css;
+}
+
+static inline u64 __perf_event_css_time(struct perf_event *event)
+{
+	struct perf_cgroup_time *t;
+	t = per_cpu_ptr(event->css->time, event->cpu);
+	return t->time;
+}
+
+static inline void __update_css_time(struct perf_cgroup *css)
+{
+	u64 now;
+	struct perf_cgroup_time *t;
+	int cpu = smp_processor_id();
+
+	if (!css)
+		return;
+
+	now = perf_clock();
+
+	t = per_cpu_ptr(css->time, cpu);
+
+	t->time += now - t->timestamp;
+	t->timestamp = now;
+}
+
+static inline void update_task_css_time(struct task_struct *task)
+{
+	struct perf_cgroup *css_out = perf_cgroup_from_task(task);
+	__update_css_time(css_out);
+}
+
+static inline void update_event_css_time(struct perf_event *event)
+{
+	if (!is_css_current(event))
+		return;
+	__update_css_time(event->css);
+}
+
+static inline void perf_cgroup_switch(struct task_struct *task,
+				      struct task_struct *next)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_cgroup *css_out = perf_cgroup_from_task(task);
+	struct perf_cgroup *css_in = perf_cgroup_from_task(next);
+	struct perf_cgroup_time *t;
+	int css_sw;
+
+	if (css_out != css_in) {
+		css_sw = 1;
+		update_task_css_time(task);
+		t = per_cpu_ptr(css_in->time, smp_processor_id());
+		t->timestamp = perf_clock();
+	}
+
+	/*
+	 * if cpu context has at least one event with cgroup constraint,
+	 * then flushout all existing events and scheduled again taking
+	 * into account the incoming cgroup. This is a cgroup switch
+	 */
+	if (cpuctx->ctx.nr_cgroups > 0 && css_sw) {
+		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+		cpu_ctx_sched_in(cpuctx, EVENT_ALL, next, 1);
+	}
+}
+
+static inline int perf_connect_cgroup(struct perf_event *event,
+				      struct perf_event_attr *attr,
+				      struct perf_event *group_leader)
+{
+	struct perf_cgroup *css;
+
+	css = perf_get_cgroup(attr->cgroup_fd);
+	if (IS_ERR(css))
+		return PTR_ERR(css);
+	/*
+	 * all events in a group must monitor
+	 * the same cgroup because a thread belongs
+	 * to only one cgroup at a time
+	 */
+	if (group_leader && group_leader->css != css) {
+		event->css = css;
+		perf_put_cgroup(event);
+		return -EINVAL;
+	}
+
+	event->css = css;
+
+	return 0;
+}
+
+#else /* !CONFIG_CGROUP */
+
+static inline bool
+perf_cgroup_match(struct perf_event *event, struct task_struct *task)
+{
+	return true;
+}
+
+static inline void *perf_get_cgroup(int fd)
+{
+	return ERR_PTR(-ENOTSUPP);
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline int is_css_current(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline u64 __perf_event_css_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void update_css_time(void *css)
+{}
+
+static inline void update_event_css_time(struct perf_event *event)
+{}
+
+static inline void update_task_css_time(struct task_struct *t)
+{}
+static inline void perf_cgroup_switch(struct task_struct *task,
+				      struct task_struct *next)
+{}
+
+static inline int perf_connect_cgroup(struct perf_event *event,
+				      struct perf_event_attr *attr,
+				      struct perf_event *gorup_leader)
+{
+	return -EINVAL;
+}
+
+#endif
+
+
+
 /*
  * perf event paranoia level:
  *  -1 - not paranoid at all
@@ -212,11 +435,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 	put_ctx(ctx);
 }
 
-static inline u64 perf_clock(void)
-{
-	return local_clock();
-}
-
 /*
  * Update the record of the current time in a context.
  */
@@ -228,29 +446,46 @@ static void update_context_time(struct perf_event_context *ctx)
 	ctx->timestamp = now;
 }
 
+static u64 perf_event_time(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+
+	if (is_cgroup_event(event)) {
+		if (event->cpu == -1) {
+			WARN_ON(event->cpu != smp_processor_id());
+			return 0;
+		}
+		return __perf_event_css_time(event);
+	}
+
+	return ctx ? ctx->time : 0;
+}
+
 /*
  * Update the total_time_enabled and total_time_running fields for a event.
  */
 static void update_event_times(struct perf_event *event)
 {
-	struct perf_event_context *ctx = event->ctx;
-	u64 run_end;
+	u64 run_end, run_start;
 
 	if (event->state < PERF_EVENT_STATE_INACTIVE ||
 	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 		return;
 
-	if (ctx->is_active)
-		run_end = ctx->time;
-	else
-		run_end = event->tstamp_stopped;
+	run_end = perf_event_time(event);
+	run_start = event->tstamp_enabled;
 
-	event->total_time_enabled = run_end - event->tstamp_enabled;
+	/*
+	 * that means the cgroup never got scheduled in
+	 * so ensure total_time_enabled is zero
+	 */
+	if (run_start == PERF_TSTAMP_ENABLE_INVALID)
+		run_start = run_end;
+
+	event->total_time_enabled = run_end - run_start;
 
 	if (event->state == PERF_EVENT_STATE_INACTIVE)
 		run_end = event->tstamp_stopped;
-	else
-		run_end = ctx->time;
 
 	event->total_time_running = run_end - event->tstamp_running;
 }
@@ -301,6 +536,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		list_add_tail(&event->group_entry, list);
 	}
 
+	if (is_cgroup_event(event))
+		ctx->nr_cgroups++;
+
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
@@ -340,6 +578,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
+	if (is_cgroup_event(event))
+		ctx->nr_cgroups--;
+
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -403,9 +644,10 @@ static void perf_group_detach(struct perf_event *event)
 }
 
 static inline int
-event_filter_match(struct perf_event *event)
+event_filter_match(struct perf_event *event, struct task_struct *task)
 {
-	return event->cpu == -1 || event->cpu == smp_processor_id();
+	return (event->cpu == -1 || event->cpu == smp_processor_id())
+	    && perf_cgroup_match(event, task);
 }
 
 static void
@@ -413,6 +655,7 @@ event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
+	u64 tstamp = perf_event_time(event);
 	u64 delta;
 	/*
 	 * An event which could not be activated because of
@@ -421,10 +664,10 @@ event_sched_out(struct perf_event *event,
 	 * via read() for time_enabled, time_running:
 	 */
 	if (event->state == PERF_EVENT_STATE_INACTIVE
-	    && !event_filter_match(event)) {
-		delta = ctx->time - event->tstamp_stopped;
+	    && !event_filter_match(event, current)) {
+		delta = tstamp - event->tstamp_stopped;
 		event->tstamp_running += delta;
-		event->tstamp_stopped = ctx->time;
+		event->tstamp_stopped = tstamp;
 	}
 
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -435,7 +678,7 @@ event_sched_out(struct perf_event *event,
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = ctx->time;
+	event->tstamp_stopped = tstamp;
 	event->pmu->disable(event);
 	event->oncpu = -1;
 
@@ -589,6 +832,11 @@ static void __perf_event_disable(void *info)
 	 * If it is in error state, leave it in error state.
 	 */
 	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+		/*
+		 * update css time only if current->css corresponds
+		 * to event. This is used to update tstamp->stopped
+		 */
+		update_event_css_time(event);
 		update_context_time(ctx);
 		update_group_times(event);
 		if (event == event->group_leader)
@@ -673,7 +921,8 @@ event_sched_in(struct perf_event *event,
 		return -EAGAIN;
 	}
 
-	event->tstamp_running += ctx->time - event->tstamp_stopped;
+	event->tstamp_running +=
+			perf_event_time(event) - event->tstamp_stopped;
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
@@ -775,11 +1024,33 @@ static int group_can_go_on(struct perf_event *event,
 static void add_event_to_ctx(struct perf_event *event,
 			       struct perf_event_context *ctx)
 {
+	u64 tstamp = perf_event_time(event);
+
 	list_add_event(event, ctx);
 	perf_group_attach(event);
-	event->tstamp_enabled = ctx->time;
-	event->tstamp_running = ctx->time;
-	event->tstamp_stopped = ctx->time;
+
+	event->tstamp_running = tstamp;
+	event->tstamp_stopped = tstamp;
+	event->tstamp_enabled = tstamp;
+
+	/*
+	 * an event is added to a context even if the css constraint
+	 * is not satisfied.  In per-cgroup mode, time_enabled only
+	 * counts when threads from the css are active on the CPU.
+	 *
+	 * tstamp_enabled denotes the first time the event CAN be
+	 * enabled, i.e., the first time threads from the css are
+	 * scheduled in. Note that the event may not be scheduled
+	 * immediately if the PMU is overcommitted yet the timestamp
+	 * points to the first css activation.
+	 *
+	 * If css is not currently active, then we mark
+	 * tstamp_enabled = ~0 to remember that it needs to be
+	 * corrected in ctx_flexible_sched_in() and
+	 * ctx_pinned_sched_in()
+	 */
+	if (is_cgroup_event(event) && !is_css_current(event))
+		event->tstamp_enabled = PERF_TSTAMP_ENABLE_INVALID;
 }
 
 /*
@@ -818,9 +1089,16 @@ static void __perf_install_in_context(void *info)
 	 */
 	perf_disable();
 
+	/*
+	 * in cgroup mode, we know the event matches
+	 * the current cgroup, so update the cgroup's
+	 * time so we timestamp correctly.
+	 */
+	update_event_css_time(event);
+
 	add_event_to_ctx(event, ctx);
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event, current))
 		goto unlock;
 
 	/*
@@ -928,13 +1206,14 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 					struct perf_event_context *ctx)
 {
 	struct perf_event *sub;
+	u64 tstamp = perf_event_time(event);
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
-	event->tstamp_enabled = ctx->time - event->total_time_enabled;
+	event->tstamp_enabled = tstamp - event->total_time_enabled;
+
 	list_for_each_entry(sub, &event->sibling_list, group_entry)
 		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
-			sub->tstamp_enabled =
-				ctx->time - sub->total_time_enabled;
+			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
 }
 
 /*
@@ -964,9 +1243,17 @@ static void __perf_event_enable(void *info)
 
 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
 		goto unlock;
+
+	/*
+	 * in cgroup mode, we know the event matches
+	 * the current cgroup, so update the cgroup's
+	 * time so we timestamp correctly.
+	 */
+	update_event_css_time(event);
+
 	__perf_event_mark_enabled(event, ctx);
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event, current))
 		goto unlock;
 
 	/*
@@ -1079,12 +1366,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
 	return 0;
 }
 
-enum event_type_t {
-	EVENT_FLEXIBLE = 0x1,
-	EVENT_PINNED = 0x2,
-	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
 static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
 			  enum event_type_t event_type)
@@ -1096,6 +1377,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	if (likely(!ctx->nr_events))
 		goto out;
 	update_context_time(ctx);
+	update_task_css_time(current);
 
 	perf_disable();
 	if (!ctx->nr_active)
@@ -1209,71 +1491,6 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 	}
 }
 
-/*
- * Called from scheduler to remove the events of the current task,
- * with interrupts disabled.
- *
- * We stop each event and update the event value in event->count.
- *
- * This does not protect us against NMI, but disable()
- * sets the disabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * not restart the event.
- */
-void perf_event_task_sched_out(struct task_struct *task,
-				 struct task_struct *next)
-{
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_event_context *ctx = task->perf_event_ctxp;
-	struct perf_event_context *next_ctx;
-	struct perf_event_context *parent;
-	int do_switch = 1;
-
-	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
-
-	if (likely(!ctx || !cpuctx->task_ctx))
-		return;
-
-	rcu_read_lock();
-	parent = rcu_dereference(ctx->parent_ctx);
-	next_ctx = next->perf_event_ctxp;
-	if (parent && next_ctx &&
-	    rcu_dereference(next_ctx->parent_ctx) == parent) {
-		/*
-		 * Looks like the two contexts are clones, so we might be
-		 * able to optimize the context switch.  We lock both
-		 * contexts and check that they are clones under the
-		 * lock (including re-checking that neither has been
-		 * uncloned in the meantime).  It doesn't matter which
-		 * order we take the locks because no other cpu could
-		 * be trying to lock both of these tasks.
-		 */
-		raw_spin_lock(&ctx->lock);
-		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
-		if (context_equiv(ctx, next_ctx)) {
-			/*
-			 * XXX do we need a memory barrier of sorts
-			 * wrt to rcu_dereference() of perf_event_ctxp
-			 */
-			task->perf_event_ctxp = next_ctx;
-			next->perf_event_ctxp = ctx;
-			ctx->task = next;
-			next_ctx->task = task;
-			do_switch = 0;
-
-			perf_event_sync_stat(ctx, next_ctx);
-		}
-		raw_spin_unlock(&next_ctx->lock);
-		raw_spin_unlock(&ctx->lock);
-	}
-	rcu_read_unlock();
-
-	if (do_switch) {
-		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
-		cpuctx->task_ctx = NULL;
-	}
-}
-
 static void task_ctx_sched_out(struct perf_event_context *ctx,
 			       enum event_type_t event_type)
 {
@@ -1308,16 +1525,40 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 
 static void
 ctx_pinned_sched_in(struct perf_event_context *ctx,
-		    struct perf_cpu_context *cpuctx)
+		    struct perf_cpu_context *cpuctx,
+		    struct task_struct *task, int css_sw)
 {
 	struct perf_event *event;
 
 	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+		u64 tstamp = perf_event_time(event);
+
 		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
-		if (event->cpu != -1 && event->cpu != smp_processor_id())
+		if (!event_filter_match(event, task))
 			continue;
 
+		if (is_cgroup_event(event)) {
+			/*
+			 * if css was not active when the event was
+			 * added to ctx, then this is the first time
+			 * the event can be effectively scheduled, thus
+			 * we update tstamp_enabled
+			 */
+			if (event->tstamp_enabled == PERF_TSTAMP_ENABLE_INVALID)
+				event->tstamp_enabled = tstamp;
+			/*
+			 * if we come here because of a context switch
+			 * with cgroup switch, then we need to update
+			 * the point in time at which all cgroup events
+			 * have been stopped. Oterwise, we would compute
+			 * bogus tstamp_running deltas, which would include
+			 * time the cgorup is not active.
+			 */
+			if (css_sw)
+				event->tstamp_stopped = tstamp;
+		}
+
 		if (group_can_go_on(event, cpuctx, 1))
 			group_sched_in(event, cpuctx, ctx);
 
@@ -1334,7 +1575,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 
 static void
 ctx_flexible_sched_in(struct perf_event_context *ctx,
-		      struct perf_cpu_context *cpuctx)
+		      struct perf_cpu_context *cpuctx,
+		      struct task_struct *task, int css_sw)
 {
 	struct perf_event *event;
 	int can_add_hw = 1;
@@ -1347,9 +1589,31 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of events:
 		 */
-		if (event->cpu != -1 && event->cpu != smp_processor_id())
+		if (!event_filter_match(event, task))
 			continue;
 
+		if (is_cgroup_event(event)) {
+			u64 tstamp = perf_event_time(event);
+			/*
+			 * if css was not active when the event was
+			 * added to ctx, then this is the first time
+			 * the event can be effectively scheduled, thus
+			 * we update tstamp_enabled
+			 */
+			if (event->tstamp_enabled == PERF_TSTAMP_ENABLE_INVALID)
+				event->tstamp_enabled = tstamp;
+			/*
+			 * if we come here because of a context switch
+			 * with cgroup switch, then we need to update
+			 * the point in time at which all cgroup events
+			 * have been stopped. Oterwise, we would compute
+			 * bogus tstamp_running deltas, which would include
+			 * time the cgorup is not active.
+			 */
+			if (css_sw)
+				event->tstamp_stopped = tstamp;
+		}
+
 		if (group_can_go_on(event, cpuctx, can_add_hw))
 			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
@@ -1359,7 +1623,8 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type)
+	     enum event_type_t event_type,
+	     struct task_struct *task, int css_sw)
 {
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
@@ -1375,11 +1640,11 @@ ctx_sched_in(struct perf_event_context *ctx,
 	 * in order to give them the best chance of going on.
 	 */
 	if (event_type & EVENT_PINNED)
-		ctx_pinned_sched_in(ctx, cpuctx);
+		ctx_pinned_sched_in(ctx, cpuctx, task, css_sw);
 
 	/* Then walk through the lower prio flexible groups */
 	if (event_type & EVENT_FLEXIBLE)
-		ctx_flexible_sched_in(ctx, cpuctx);
+		ctx_flexible_sched_in(ctx, cpuctx, task, css_sw);
 
 	perf_enable();
  out:
@@ -1387,11 +1652,12 @@ ctx_sched_in(struct perf_event_context *ctx,
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type)
+			     enum event_type_t event_type,
+			     struct task_struct *task, int css_sw)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, task, css_sw);
 }
 
 static void task_ctx_sched_in(struct task_struct *task,
@@ -1404,7 +1670,7 @@ static void task_ctx_sched_in(struct task_struct *task,
 		return;
 	if (cpuctx->task_ctx == ctx)
 		return;
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, task, 0);
 	cpuctx->task_ctx = ctx;
 }
 /*
@@ -1438,15 +1704,88 @@ void perf_event_task_sched_in(struct task_struct *task)
 	 */
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
-	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
-	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+	ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, 0);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, 0);
+	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, 0);
 
 	cpuctx->task_ctx = ctx;
 
 	perf_enable();
 }
 
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void perf_event_task_sched_out(struct task_struct *task,
+				 struct task_struct *next)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_event_context *next_ctx;
+	struct perf_event_context *parent;
+	int do_switch = 1;
+
+	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+
+	/*
+	 * switching cgroups
+	 * must update time in going out cgroup
+	 * mark new start time in coming in cgroup
+	 */
+	perf_cgroup_switch(task, next);
+
+	if (likely(!ctx || !cpuctx->task_ctx))
+		return;
+
+	rcu_read_lock();
+	parent = rcu_dereference(ctx->parent_ctx);
+	next_ctx = next->perf_event_ctxp;
+	if (parent && next_ctx &&
+	    rcu_dereference(next_ctx->parent_ctx) == parent) {
+		/*
+		 * Looks like the two contexts are clones, so we might be
+		 * able to optimize the context switch.  We lock both
+		 * contexts and check that they are clones under the
+		 * lock (including re-checking that neither has been
+		 * uncloned in the meantime).  It doesn't matter which
+		 * order we take the locks because no other cpu could
+		 * be trying to lock both of these tasks.
+		 */
+		raw_spin_lock(&ctx->lock);
+		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+		if (context_equiv(ctx, next_ctx)) {
+			/*
+			 * XXX do we need a memory barrier of sorts
+			 * wrt to rcu_dereference() of perf_event_ctxp
+			 */
+			task->perf_event_ctxp = next_ctx;
+			next->perf_event_ctxp = ctx;
+			ctx->task = next;
+			next_ctx->task = task;
+			do_switch = 0;
+
+			perf_event_sync_stat(ctx, next_ctx);
+		}
+		raw_spin_unlock(&next_ctx->lock);
+		raw_spin_unlock(&ctx->lock);
+	}
+	rcu_read_unlock();
+
+	if (do_switch) {
+		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+		cpuctx->task_ctx = NULL;
+	}
+}
+
+
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_event *event, int enable);
@@ -1579,7 +1918,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
 
-		if (event->cpu != -1 && event->cpu != smp_processor_id())
+		if (!event_filter_match(event, current))
 			continue;
 
 		hwc = &event->hw;
@@ -1660,7 +1999,7 @@ void perf_event_task_tick(struct task_struct *curr)
 	if (ctx)
 		rotate_ctx(ctx);
 
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, curr, 0);
 	if (ctx)
 		task_ctx_sched_in(curr, EVENT_FLEXIBLE);
 	perf_enable();
@@ -1747,6 +2086,7 @@ static void __perf_event_read(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
+	update_event_css_time(event);
 	update_context_time(ctx);
 	update_event_times(event);
 	raw_spin_unlock(&ctx->lock);
@@ -1773,6 +2113,7 @@ static u64 perf_event_read(struct perf_event *event)
 		unsigned long flags;
 
 		raw_spin_lock_irqsave(&ctx->lock, flags);
+		update_event_css_time(event);
 		update_context_time(ctx);
 		update_event_times(event);
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -2132,6 +2473,9 @@ static void free_event(struct perf_event *event)
 		event->buffer = NULL;
 	}
 
+	if (is_cgroup_event(event))
+		perf_put_cgroup(event);
+
 	if (event->destroy)
 		event->destroy(event);
 
@@ -3764,7 +4108,7 @@ static int perf_event_task_match(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event, current))
 		return 0;
 
 	if (event->attr.comm || event->attr.mmap ||
@@ -3878,7 +4222,7 @@ static int perf_event_comm_match(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event, current))
 		return 0;
 
 	if (event->attr.comm)
@@ -3999,7 +4343,7 @@ static int perf_event_mmap_match(struct perf_event *event,
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event, current))
 		return 0;
 
 	if ((!executable && event->attr.mmap_data) ||
@@ -4660,6 +5004,7 @@ static void task_clock_perf_event_read(struct perf_event *event)
 	u64 time;
 
 	if (!in_nmi()) {
+		update_event_css_time(event);
 		update_context_time(event->ctx);
 		time = event->ctx->time;
 	} else {
@@ -5037,6 +5382,14 @@ perf_event_alloc(struct perf_event_attr *attr,
 	if (!event)
 		return ERR_PTR(-ENOMEM);
 
+	if (attr->cgroup) {
+		err = perf_connect_cgroup(event, attr, group_leader);
+		if (err) {
+			kfree(event);
+			return ERR_PTR(err);
+		}
+	}
+
 	/*
 	 * Single events are their own group leaders, with an
 	 * empty sibling list:
@@ -5125,6 +5478,7 @@ done:
 	if (err) {
 		if (event->ns)
 			put_pid_ns(event->ns);
+		perf_put_cgroup(event);
 		kfree(event);
 		return ERR_PTR(err);
 	}
@@ -5320,6 +5674,10 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/* cgroup reserved for system-wide */
+	if (attr.cgroup && pid != -1)
+		return -EINVAL;
+
 	event_fd = get_unused_fd_flags(O_RDWR);
 	if (event_fd < 0)
 		return event_fd;
@@ -6094,3 +6452,69 @@ static int __init perf_event_sysfs_init(void)
 				  &perfclass_attr_group);
 }
 device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUPS
+static int perf_cgroup_read_map(struct cgroup *cgrp, struct cftype *cft,
+				struct cgroup_map_cb *cb)
+{
+	return 0;
+}
+
+static struct cftype perf_cgroup_files[] = {
+	{ .name = "perf",
+	  .read_map = perf_cgroup_read_map,
+	},
+};
+
+static struct cgroup_subsys_state *perf_cgroup_create(
+	struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	struct perf_cgroup *jc;
+	struct perf_cgroup_time *t;
+	int c;
+
+	jc = vmalloc(sizeof(*jc));
+	if (!jc)
+		return ERR_PTR(-ENOMEM);
+
+	memset(jc, 0, sizeof(*jc));
+
+	jc->time = alloc_percpu(struct perf_cgroup_time);
+	if (!jc->time) {
+		vfree(jc);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for_each_possible_cpu(c) {
+		t = per_cpu_ptr(jc->time, c);
+		t->time = 0;
+		t->timestamp = 0;
+	}
+	return &jc->css;
+}
+
+static void perf_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+	struct perf_cgroup *jc = perf_cgroup_from_cont(cont);
+
+	free_percpu(jc->time);
+	vfree(jc);
+}
+
+static int perf_cgroup_populate(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+	return cgroup_add_files(cont, ss, perf_cgroup_files,
+			ARRAY_SIZE(perf_cgroup_files));
+}
+
+struct cgroup_subsys perf_subsys = {
+	.name = "perf_event",
+	.subsys_id = perf_subsys_id,
+	.create = perf_cgroup_create,
+	.destroy = perf_cgroup_destroy,
+	.populate = perf_cgroup_populate,
+	.early_init = 0,
+};
+#endif /* CONFIG_CGROUP */

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3)
  2010-09-09 13:09 [RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3) Stephane Eranian
@ 2010-09-09 13:52 ` Eric Dumazet
  2010-09-09 21:41   ` Stephane Eranian
  0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2010-09-09 13:52 UTC (permalink / raw)
  To: eranian
  Cc: linux-kernel, peterz, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme

Le jeudi 09 septembre 2010 à 06:09 -0700, Stephane Eranian a écrit :
> This kernel patch adds the ability to filter monitoring based on
> container groups (cgroups). This is for use in per-cpu mode only.
>     
> The patch adds perf_event_attr.cgroup, a boolean, to activate
> this new mode. The cgroup is designated by passing in
> perf_event_attr.cgroup_fd, an opened file descriptor to
> the <mnt>/<cgroup>/perf_event.perf file.
>     
> This is the second version of this patch. It corrects the way
> time_enabled is accounted for. In cgroup mode, time_enabled reflects
> the time the cgroup was active, i.e., threads from the cgroup executed
> on the monitored CPU.  This is a more useful metric than just
> wall-clock. The meaning of time_enabled without cgroup is unaffected.
> 
> Signed-off-by: Stephane Eranian <eranian@google.com>


> +#ifdef CONFIG_CGROUPS
> +struct perf_cgroup_time {
> +	u64 time;
> +	u64 timestamp;
> +};
> +
> +struct perf_cgroup {
> +	struct cgroup_subsys_state css;
> +	struct perf_cgroup_time *time;

	struct perf_cgroup_time __percpu *time;

Please run sparse  after this "__percpu" change.



> +};
> +#endif
> +
>  /*
> +
> +	jc->time = alloc_percpu(struct perf_cgroup_time);
> +	if (!jc->time) {
> +		vfree(jc);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	for_each_possible_cpu(c) {
> +		t = per_cpu_ptr(jc->time, c);
> +		t->time = 0;
> +		t->timestamp = 0;
> +	}

alloc_percpu() is zalloc_percpu() in fact, memory is already cleared.



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3)
  2010-09-09 13:52 ` Eric Dumazet
@ 2010-09-09 21:41   ` Stephane Eranian
  2010-09-10  8:16     ` Peter Zijlstra
  0 siblings, 1 reply; 14+ messages in thread
From: Stephane Eranian @ 2010-09-09 21:41 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: linux-kernel, peterz, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme

Eric,

On Thu, Sep 9, 2010 at 3:52 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le jeudi 09 septembre 2010 à 06:09 -0700, Stephane Eranian a écrit :
>> This kernel patch adds the ability to filter monitoring based on
>> container groups (cgroups). This is for use in per-cpu mode only.
>>
>> The patch adds perf_event_attr.cgroup, a boolean, to activate
>> this new mode. The cgroup is designated by passing in
>> perf_event_attr.cgroup_fd, an opened file descriptor to
>> the <mnt>/<cgroup>/perf_event.perf file.
>>
>> This is the second version of this patch. It corrects the way
>> time_enabled is accounted for. In cgroup mode, time_enabled reflects
>> the time the cgroup was active, i.e., threads from the cgroup executed
>> on the monitored CPU.  This is a more useful metric than just
>> wall-clock. The meaning of time_enabled without cgroup is unaffected.
>>
>> Signed-off-by: Stephane Eranian <eranian@google.com>
>
>
>> +#ifdef CONFIG_CGROUPS
>> +struct perf_cgroup_time {
>> +     u64 time;
>> +     u64 timestamp;
>> +};
>> +
>> +struct perf_cgroup {
>> +     struct cgroup_subsys_state css;
>> +     struct perf_cgroup_time *time;
>
>        struct perf_cgroup_time __percpu *time;
>
> Please run sparse  after this "__percpu" change.
>
>
Will do.

>> +     jc->time = alloc_percpu(struct perf_cgroup_time);
>> +     if (!jc->time) {
>> +             vfree(jc);
>> +             return ERR_PTR(-ENOMEM);
>> +     }
>> +
>> +     for_each_possible_cpu(c) {
>> +             t = per_cpu_ptr(jc->time, c);
>> +             t->time = 0;
>> +             t->timestamp = 0;
>> +     }
>
> alloc_percpu() is zalloc_percpu() in fact, memory is already cleared.
>
I remember thinking about this and trying to trace to the code down
to figure this out. But it is rather complicated. If alloc_percpu() always
clears the memory, then I think that calling is zalloc_percpu()
would be more helpful....

>
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3)
  2010-09-09 21:41   ` Stephane Eranian
@ 2010-09-10  8:16     ` Peter Zijlstra
  2010-09-10  8:41       ` Tejun Heo
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2010-09-10  8:16 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Eric Dumazet, linux-kernel, mingo, paulus, davem, fweisbec,
	perfmon2-devel, eranian, robert.richter, acme, Tejun Heo

On Thu, 2010-09-09 at 23:41 +0200, Stephane Eranian wrote:
> > alloc_percpu() is zalloc_percpu() in fact, memory is already cleared.
> >
> I remember thinking about this and trying to trace to the code down
> to figure this out. But it is rather complicated. If alloc_percpu() always
> clears the memory, then I think that calling is zalloc_percpu()
> would be more helpful.... 

pcpu_populate_chunk() in mm/percpu-vm.c does indeed do that memset, the
one in mm/percpu-km.c does not.

It is not obviously clear to me the -km allocator does indeed result in
zero filled memory.

Tejun?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3)
  2010-09-10  8:16     ` Peter Zijlstra
@ 2010-09-10  8:41       ` Tejun Heo
  2010-09-10  8:52         ` [PATCH percpu#for-next] percpu: clear memory allocated with the km allocator Tejun Heo
  2010-09-10  8:59         ` [RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3) Stephane Eranian
  0 siblings, 2 replies; 14+ messages in thread
From: Tejun Heo @ 2010-09-10  8:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Stephane Eranian, Eric Dumazet, linux-kernel, mingo, paulus,
	davem, fweisbec, perfmon2-devel, eranian, robert.richter, acme

Hello,

On 09/10/2010 10:16 AM, Peter Zijlstra wrote:
> On Thu, 2010-09-09 at 23:41 +0200, Stephane Eranian wrote:
>>> alloc_percpu() is zalloc_percpu() in fact, memory is already cleared.
>>>
>> I remember thinking about this and trying to trace to the code down
>> to figure this out. But it is rather complicated. If alloc_percpu() always
>> clears the memory, then I think that calling is zalloc_percpu()
>> would be more helpful.... 

Maybe but at this point it might be a bit too late.  The allocator has
been that way since the beginning.

> pcpu_populate_chunk() in mm/percpu-vm.c does indeed do that memset, the
> one in mm/percpu-km.c does not.
> 
> It is not obviously clear to me the -km allocator does indeed result in
> zero filled memory.

Nice catch.  Fortunately, the -km allocator isn't currently being used
in upstrea although it was enabled for linux-next a couple of days
ago.  I'll fix it up.

Thank you.

-- 
tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH percpu#for-next] percpu: clear memory allocated with the km allocator
  2010-09-10  8:41       ` Tejun Heo
@ 2010-09-10  8:52         ` Tejun Heo
  2010-09-10  8:55           ` Peter Zijlstra
  2010-09-10  8:59         ` [RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3) Stephane Eranian
  1 sibling, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2010-09-10  8:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Stephane Eranian, Eric Dumazet, linux-kernel, mingo, paulus,
	davem, fweisbec, perfmon2-devel, eranian, robert.richter, acme

Percpu allocator should clear memory before returning it but the km
allocator forgot to do it.  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Spotted-by: Peter Zijlstra <peterz@infradead.org>
---
 mm/percpu-km.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 7037bc7..89633fe 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -35,7 +35,11 @@

 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
-	/* noop */
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
+
 	return 0;
 }

-- 
1.7.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH percpu#for-next] percpu: clear memory allocated with the km allocator
  2010-09-10  8:52         ` [PATCH percpu#for-next] percpu: clear memory allocated with the km allocator Tejun Heo
@ 2010-09-10  8:55           ` Peter Zijlstra
  2010-09-10 16:31             ` Randy Dunlap
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2010-09-10  8:55 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Stephane Eranian, Eric Dumazet, linux-kernel, mingo, paulus,
	davem, fweisbec, perfmon2-devel, eranian, robert.richter, acme

On Fri, 2010-09-10 at 10:52 +0200, Tejun Heo wrote:
> Percpu allocator should clear memory before returning it but the km
> allocator forgot to do it.  Fix it.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Spotted-by: Peter Zijlstra <peterz@infradead.org>

(fwiw, -tip uses Reported-by)

Acked-by: Peter Zijlstra <peterz@infradead.org>

> ---
>  mm/percpu-km.c |    6 +++++-
>  1 files changed, 5 insertions(+), 1 deletions(-)
> 
> diff --git a/mm/percpu-km.c b/mm/percpu-km.c
> index 7037bc7..89633fe 100644
> --- a/mm/percpu-km.c
> +++ b/mm/percpu-km.c
> @@ -35,7 +35,11 @@
> 
>  static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
>  {
> -	/* noop */
> +	unsigned int cpu;
> +
> +	for_each_possible_cpu(cpu)
> +		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
> +
>  	return 0;
>  }
> 



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH percpu#for-next] percpu: clear memory allocated with the km allocator
  2010-09-10  8:55           ` Peter Zijlstra
@ 2010-09-10 16:31             ` Randy Dunlap
  2010-09-10 16:45               ` Ingo Molnar
  0 siblings, 1 reply; 14+ messages in thread
From: Randy Dunlap @ 2010-09-10 16:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Tejun Heo, Stephane Eranian, Eric Dumazet, linux-kernel, mingo,
	paulus, davem, fweisbec, perfmon2-devel, eranian, robert.richter,
	acme

On Fri, 10 Sep 2010 10:55:18 +0200 Peter Zijlstra wrote:

> On Fri, 2010-09-10 at 10:52 +0200, Tejun Heo wrote:
> > Percpu allocator should clear memory before returning it but the km
> > allocator forgot to do it.  Fix it.
> > 
> > Signed-off-by: Tejun Heo <tj@kernel.org>
> > Spotted-by: Peter Zijlstra <peterz@infradead.org>
> 
> (fwiw, -tip uses Reported-by)

fyi, Documentation/SubmittingPatches does also.


> Acked-by: Peter Zijlstra <peterz@infradead.org>
> 
> > ---
> >  mm/percpu-km.c |    6 +++++-
> >  1 files changed, 5 insertions(+), 1 deletions(-)
> > 
> > diff --git a/mm/percpu-km.c b/mm/percpu-km.c
> > index 7037bc7..89633fe 100644
> > --- a/mm/percpu-km.c
> > +++ b/mm/percpu-km.c
> > @@ -35,7 +35,11 @@
> > 
> >  static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
> >  {
> > -	/* noop */
> > +	unsigned int cpu;
> > +
> > +	for_each_possible_cpu(cpu)
> > +		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
> > +
> >  	return 0;
> >  }


---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH percpu#for-next] percpu: clear memory allocated with the km allocator
  2010-09-10 16:31             ` Randy Dunlap
@ 2010-09-10 16:45               ` Ingo Molnar
  2010-09-10 23:28                 ` Tejun Heo
  0 siblings, 1 reply; 14+ messages in thread
From: Ingo Molnar @ 2010-09-10 16:45 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: Peter Zijlstra, Tejun Heo, Stephane Eranian, Eric Dumazet,
	linux-kernel, paulus, davem, fweisbec, perfmon2-devel, eranian,
	robert.richter, acme


* Randy Dunlap <randy.dunlap@oracle.com> wrote:

> On Fri, 10 Sep 2010 10:55:18 +0200 Peter Zijlstra wrote:
> 
> > On Fri, 2010-09-10 at 10:52 +0200, Tejun Heo wrote:
> > > Percpu allocator should clear memory before returning it but the km
> > > allocator forgot to do it.  Fix it.
> > > 
> > > Signed-off-by: Tejun Heo <tj@kernel.org>
> > > Spotted-by: Peter Zijlstra <peterz@infradead.org>
> > 
> > (fwiw, -tip uses Reported-by)
> 
> fyi, Documentation/SubmittingPatches does also.

Yeah, and, FYI, there's observed strong correlation between the 
maintenance policies of -tip and Documentation/SubmittingPatches.
[i'd even venture so far to claim causation]

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH percpu#for-next] percpu: clear memory allocated with the km allocator
  2010-09-10 16:45               ` Ingo Molnar
@ 2010-09-10 23:28                 ` Tejun Heo
  0 siblings, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2010-09-10 23:28 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Randy Dunlap, Peter Zijlstra, Stephane Eranian, Eric Dumazet,
	linux-kernel, paulus, davem, fweisbec, perfmon2-devel, eranian,
	robert.richter, acme

On 09/10/2010 06:45 PM, Ingo Molnar wrote:
> 
> * Randy Dunlap <randy.dunlap@oracle.com> wrote:
> 
>> On Fri, 10 Sep 2010 10:55:18 +0200 Peter Zijlstra wrote:
>>
>>> On Fri, 2010-09-10 at 10:52 +0200, Tejun Heo wrote:
>>>> Percpu allocator should clear memory before returning it but the km
>>>> allocator forgot to do it.  Fix it.
>>>>
>>>> Signed-off-by: Tejun Heo <tj@kernel.org>
>>>> Spotted-by: Peter Zijlstra <peterz@infradead.org>
>>>
>>> (fwiw, -tip uses Reported-by)
>>
>> fyi, Documentation/SubmittingPatches does also.
> 
> Yeah, and, FYI, there's observed strong correlation between the 
> maintenance policies of -tip and Documentation/SubmittingPatches.
> [i'd even venture so far to claim causation]

Oh, I already updated it to be Reported-by.  I use Spotted-by when to
distinguish the cases where the root cause is diagnosed by the
reporter as opposed to when the reporter just reported a bug but I
suppose it's just causing more confusion.  I'll just use Reproted-by
from now on.

Thank you.

-- 
tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3)
  2010-09-10  8:41       ` Tejun Heo
  2010-09-10  8:52         ` [PATCH percpu#for-next] percpu: clear memory allocated with the km allocator Tejun Heo
@ 2010-09-10  8:59         ` Stephane Eranian
  2010-09-10  9:03           ` [PATCH] percpu: update comments to reflect that percpu allocations are always zero-filled Tejun Heo
  1 sibling, 1 reply; 14+ messages in thread
From: Stephane Eranian @ 2010-09-10  8:59 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Peter Zijlstra, Eric Dumazet, linux-kernel, mingo, paulus, davem,
	fweisbec, perfmon2-devel, eranian, robert.richter, acme

On Fri, Sep 10, 2010 at 10:41 AM, Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On 09/10/2010 10:16 AM, Peter Zijlstra wrote:
> > On Thu, 2010-09-09 at 23:41 +0200, Stephane Eranian wrote:
> >>> alloc_percpu() is zalloc_percpu() in fact, memory is already cleared.
> >>>
> >> I remember thinking about this and trying to trace to the code down
> >> to figure this out. But it is rather complicated. If alloc_percpu() always
> >> clears the memory, then I think that calling is zalloc_percpu()
> >> would be more helpful....
>
> Maybe but at this point it might be a bit too late.  The allocator has
> been that way since the beginning.
>
Then add a comment in the header or at the definition of the function
to make this more explicit.


> > pcpu_populate_chunk() in mm/percpu-vm.c does indeed do that memset, the
> > one in mm/percpu-km.c does not.
> >
> > It is not obviously clear to me the -km allocator does indeed result in
> > zero filled memory.
>
> Nice catch.  Fortunately, the -km allocator isn't currently being used
> in upstrea although it was enabled for linux-next a couple of days
> ago.  I'll fix it up.
>
> Thank you.
>
> --
> tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] percpu: update comments to reflect that percpu allocations are always zero-filled
  2010-09-10  8:59         ` [RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3) Stephane Eranian
@ 2010-09-10  9:03           ` Tejun Heo
  2010-09-10  9:07             ` Stephane Eranian
  2010-09-10  9:09             ` Eric Dumazet
  0 siblings, 2 replies; 14+ messages in thread
From: Tejun Heo @ 2010-09-10  9:03 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Peter Zijlstra, Eric Dumazet, linux-kernel, mingo, paulus, davem,
	fweisbec, perfmon2-devel, eranian, robert.richter, acme

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Stephane Eranian <eranian@google.com>
---
 mm/percpu.c |   11 ++++++-----
 1 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index 0cd4bf6..12dea33 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -826,8 +826,8 @@ fail_unlock_mutex:
  * @size: size of area to allocate in bytes
  * @align: alignment of area (max PAGE_SIZE)
  *
- * Allocate percpu area of @size bytes aligned at @align.  Might
- * sleep.  Might trigger writeouts.
+ * Allocate zero-filled percpu area of @size bytes aligned at @align.
+ * Might sleep.  Might trigger writeouts.
  *
  * CONTEXT:
  * Does GFP_KERNEL allocation.
@@ -846,9 +846,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  * @size: size of area to allocate in bytes
  * @align: alignment of area (max PAGE_SIZE)
  *
- * Allocate percpu area of @size bytes aligned at @align from reserved
- * percpu area if arch has set it up; otherwise, allocation is served
- * from the same dynamic area.  Might sleep.  Might trigger writeouts.
+ * Allocate zero-filled percpu area of @size bytes aligned at @align
+ * from reserved percpu area if arch has set it up; otherwise,
+ * allocation is served from the same dynamic area.  Might sleep.
+ * Might trigger writeouts.
  *
  * CONTEXT:
  * Does GFP_KERNEL allocation.
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] percpu: update comments to reflect that percpu allocations are always zero-filled
  2010-09-10  9:03           ` [PATCH] percpu: update comments to reflect that percpu allocations are always zero-filled Tejun Heo
@ 2010-09-10  9:07             ` Stephane Eranian
  2010-09-10  9:09             ` Eric Dumazet
  1 sibling, 0 replies; 14+ messages in thread
From: Stephane Eranian @ 2010-09-10  9:07 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Peter Zijlstra, Eric Dumazet, linux-kernel, mingo, paulus, davem,
	fweisbec, perfmon2-devel, eranian, robert.richter, acme

Thanks

On Fri, Sep 10, 2010 at 11:03 AM, Tejun Heo <tj@kernel.org> wrote:
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Stephane Eranian <eranian@google.com>
> ---
>  mm/percpu.c |   11 ++++++-----
>  1 files changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/mm/percpu.c b/mm/percpu.c
> index 0cd4bf6..12dea33 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -826,8 +826,8 @@ fail_unlock_mutex:
>  * @size: size of area to allocate in bytes
>  * @align: alignment of area (max PAGE_SIZE)
>  *
> - * Allocate percpu area of @size bytes aligned at @align.  Might
> - * sleep.  Might trigger writeouts.
> + * Allocate zero-filled percpu area of @size bytes aligned at @align.
> + * Might sleep.  Might trigger writeouts.
>  *
>  * CONTEXT:
>  * Does GFP_KERNEL allocation.
> @@ -846,9 +846,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
>  * @size: size of area to allocate in bytes
>  * @align: alignment of area (max PAGE_SIZE)
>  *
> - * Allocate percpu area of @size bytes aligned at @align from reserved
> - * percpu area if arch has set it up; otherwise, allocation is served
> - * from the same dynamic area.  Might sleep.  Might trigger writeouts.
> + * Allocate zero-filled percpu area of @size bytes aligned at @align
> + * from reserved percpu area if arch has set it up; otherwise,
> + * allocation is served from the same dynamic area.  Might sleep.
> + * Might trigger writeouts.
>  *
>  * CONTEXT:
>  * Does GFP_KERNEL allocation.
> --
> 1.7.1
>
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] percpu: update comments to reflect that percpu allocations are always zero-filled
  2010-09-10  9:03           ` [PATCH] percpu: update comments to reflect that percpu allocations are always zero-filled Tejun Heo
  2010-09-10  9:07             ` Stephane Eranian
@ 2010-09-10  9:09             ` Eric Dumazet
  1 sibling, 0 replies; 14+ messages in thread
From: Eric Dumazet @ 2010-09-10  9:09 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Stephane Eranian, Peter Zijlstra, linux-kernel, mingo, paulus,
	davem, fweisbec, perfmon2-devel, eranian, robert.richter, acme

Le vendredi 10 septembre 2010 à 11:03 +0200, Tejun Heo a écrit :
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Stephane Eranian <eranian@google.com>
> ---

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>



^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2010-09-10 23:27 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-09-09 13:09 [RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3) Stephane Eranian
2010-09-09 13:52 ` Eric Dumazet
2010-09-09 21:41   ` Stephane Eranian
2010-09-10  8:16     ` Peter Zijlstra
2010-09-10  8:41       ` Tejun Heo
2010-09-10  8:52         ` [PATCH percpu#for-next] percpu: clear memory allocated with the km allocator Tejun Heo
2010-09-10  8:55           ` Peter Zijlstra
2010-09-10 16:31             ` Randy Dunlap
2010-09-10 16:45               ` Ingo Molnar
2010-09-10 23:28                 ` Tejun Heo
2010-09-10  8:59         ` [RFC PATCH 1/2] perf_events: add support for per-cpu per-cgroup monitoring (v3) Stephane Eranian
2010-09-10  9:03           ` [PATCH] percpu: update comments to reflect that percpu allocations are always zero-filled Tejun Heo
2010-09-10  9:07             ` Stephane Eranian
2010-09-10  9:09             ` Eric Dumazet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox