All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Song Liu <songliubraving@fb.com>
Cc: linux-kernel@vger.kernel.org, kernel-team@fb.com,
	Arnaldo Carvalho de Melo <acme@redhat.com>,
	Jiri Olsa <jolsa@kernel.org>,
	Alexey Budankov <alexey.budankov@linux.intel.com>,
	Namhyung Kim <namhyung@kernel.org>, Tejun Heo <tj@kernel.org>,
	kernel test robot <rong.a.chen@intel.com>
Subject: Re: [PATCH v12] perf: Sharing PMU counters across compatible events
Date: Tue, 21 Apr 2020 01:04:29 +0200	[thread overview]
Message-ID: <20200420230429.GT2483@worktop.programming.kicks-ass.net> (raw)
In-Reply-To: <20200331075533.2347393-1-songliubraving@fb.com>

On Tue, Mar 31, 2020 at 12:55:33AM -0700, Song Liu wrote:
> @@ -1778,6 +1775,246 @@ perf_event_groups_next(struct perf_event *event)
>  		event = rb_entry_safe(rb_next(&event->group_node),	\
>  				typeof(*event), group_node))
>  
> +static inline bool perf_event_can_share(struct perf_event *event)
> +{
> +	/* only share hardware counting events */
> +	return !is_sampling_event(event);
> +	return !is_software_event(event) && !is_sampling_event(event);
> +}

ISTR pointing out before that one of those returns is superfluous.

> +/*
> + * Returns whether the two events can share a PMU counter.
> + *
> + * Note: This function does NOT check perf_event_can_share() for
> + * the two events, they should be checked before this function
> + */
> +static inline bool perf_event_compatible(struct perf_event *event_a,
> +					 struct perf_event *event_b)
> +{
> +	return memcmp(&event_a->attr, &event_b->attr, event_a->attr.size) == 0;
> +}
> +
> +static void perf_event_init_dup_master(struct perf_event *event)
> +{
> +	bool is_active = event->state == PERF_EVENT_STATE_ACTIVE;
> +	s64 count;
> +
> +	WARN_ON_ONCE(event->dup_active != 0);
> +	/*
> +	 * The event sharing scheme allows for duplicate events to be ACTIVE
> +	 * while the master is not. In order to facilitate this, the master
> +	 * will be put in the ENABLED state whenever it has active duplicates
> +	 * but is itself *not* ACTIVE.
> +	 *
> +	 * When ENABLED the master event is scheduled, but its counter must
> +	 * appear stalled. Since the PMU driver updates event->count, the
> +	 * master must keep a shadow counter for itself, this is
> +	 * event->master_count.
> +	 */
> +	count = local64_read(&event->count);
> +	local64_set(&event->master_count, count);
> +
> +	if (is_active) {
> +		local64_set(&event->dup_count, count);
> +		event->dup_active = 1;
> +	}
> +
> +	barrier();
> +
> +	WRITE_ONCE(event->dup_master, event);
> +	event->dup_master = event;
> +}

OK, set up master_count, dup_count, barrier, set dup_master.

> +/* tear down dup_master, no more sharing for this event */
> +static void perf_event_exit_dup_master(struct perf_event *event)
> +{
> +	WARN_ON_ONCE(event->state < PERF_EVENT_STATE_OFF ||
> +		     event->state > PERF_EVENT_STATE_INACTIVE);
> +
> +	/* restore event->count and event->child_count */
> +	local64_set(&event->count, local64_read(&event->master_count));
> +
> +	event->dup_active = 0;
> +	WRITE_ONCE(event->dup_master, NULL);
> +
> +	barrier();
> +}

XXX ordering doesn't make sense, I think you want the barrier() between
setting count and dup_master, that way you'll get the correct result out
of perf_event_count() no matter the timing.

> +#define EVENT_TOMBSTONE ((void *)-1L)
> +
> +/*
> + * sync data count from dup_master to event, called on event_pmu_read()
> + * and event_pmu_del()
> + */
> +static void
> +event_pmu_read_dup(struct perf_event *event, struct perf_event *master)
> +{
> +	u64 prev_count, new_count;
> +
> +	if (master == EVENT_TOMBSTONE)
> +		return;
> +
> +again:
> +	prev_count = local64_read(&event->dup_count);
> +	if (master->state > PERF_EVENT_STATE_INACTIVE)
> +		master->pmu->read(master);
> +	new_count = local64_read(&master->count);
> +	if (local64_cmpxchg(&event->dup_count, prev_count, new_count) != prev_count)
> +		goto again;
> +
> +	if (event == master)
> +		local64_add(new_count - prev_count, &event->master_count);
> +	else
> +		local64_add(new_count - prev_count, &event->count);
> +}
> +
> +/* After adding a event to the ctx, try find compatible event(s). */
> +static void
> +perf_event_setup_dup(struct perf_event *event, struct perf_event_context *ctx)
> +{
> +	struct perf_event *tmp;
> +
> +	if (event->state < PERF_EVENT_STATE_OFF ||
> +	    !perf_event_can_share(event))
> +		return;
> +
> +	/* look for dup with other events */
> +	list_for_each_entry(tmp, &ctx->event_list, event_entry) {
> +		if (tmp == event ||
> +		    !perf_event_can_share(tmp) ||
> +		    !perf_event_compatible(event, tmp))
> +			continue;
> +		if (tmp->state < PERF_EVENT_STATE_INACTIVE)
> +			continue;
> +
> +		/* first dup, pick tmp as the master */
> +		if (!tmp->dup_master) {
> +			if (tmp->state == PERF_EVENT_STATE_ACTIVE)
> +				tmp->pmu->read(tmp);
> +			perf_event_init_dup_master(tmp);
> +		}
> +
> +		event->dup_master = tmp->dup_master;
> +
> +		break;
> +	}
> +}
> +
> +/* Remove dup_master for the event */
> +static void
> +perf_event_remove_dup(struct perf_event *event, struct perf_event_context *ctx)
> +
> +{
> +	struct perf_event *tmp, *new_master;
> +	int dup_count, active_count;
> +	int ret;
> +
> +	/* no sharing */
> +	if (!event->dup_master)
> +		return;
> +
> +	WARN_ON_ONCE(event->state < PERF_EVENT_STATE_OFF ||
> +		     event->state > PERF_EVENT_STATE_ENABLED);
> +
> +	/* this event is not the master */
> +	if (event->dup_master != event) {
> +		event->dup_master = NULL;
> +		return;
> +	}
> +
> +	active_count = event->dup_active;
> +	if (active_count) {
> +		perf_pmu_disable(event->pmu);
> +		event->pmu->del(event, 0);
> +	}
> +
> +	/* this event is the master */
> +	dup_count = 0;
> +	new_master = NULL;
> +	list_for_each_entry(tmp, &ctx->event_list, event_entry) {
> +		u64 count;
> +
> +		if (tmp->dup_master != event || tmp == event)
> +			continue;
> +		if (!new_master) {
> +			new_master = tmp;
> +			list_del_init(&new_master->dup_active_list);
> +		}
> +
> +		if (tmp->state == PERF_EVENT_STATE_ACTIVE) {
> +			/* sync read from old master */
> +			event_pmu_read_dup(tmp, event);
> +		}
> +		/*
> +		 * Flip an active event to a new master; this is tricky
> +		 * because for an active event event_pmu_read() can be
> +		 * called at any time from NMI context.
> +		 *
> +		 * This means we need to have ->dup_master and ->dup_count
> +		 * consistent at all times. Of course we cannot do two
> +		 * writes at once :/
> +		 *
> +		 * Instead, flip ->dup_master to EVENT_TOMBSTONE, this will
> +		 * make event_pmu_read_dup() NOP. Then we can set
> +		 * ->dup_count and finally set ->dup_master to the
> +		 * new_master to let event_pmu_read_dup() rip.
> +		 */
> +		WRITE_ONCE(tmp->dup_master, EVENT_TOMBSTONE);
> +		barrier();
> +
> +		count = local64_read(&new_master->count);
> +		local64_set(&tmp->dup_count, count);
> +
> +		if (tmp == new_master)
> +			local64_set(&tmp->master_count, count);
> +		else if (tmp->state == PERF_EVENT_STATE_ACTIVE)
> +			list_move_tail(&tmp->dup_active_list,
> +				       &new_master->dup_active_list);
> +
> +		barrier();
> +		WRITE_ONCE(tmp->dup_master, new_master);
> +		dup_count++;
> +	}
> +
> +	perf_event_exit_dup_master(event);
> +
> +	if (!dup_count)
> +		return;
> +
> +	if (active_count) {
> +		/* copy hardware configure to switch faster */
> +		if (event->pmu->copy_hw_config)
> +			event->pmu->copy_hw_config(event, new_master);
> +
> +		ret = new_master->pmu->add(new_master, PERF_EF_START);
> +		/*
> +		 * Since we just removed the old master (@event), it should be
> +		 * impossible to fail to schedule the new master, an identical
> +		 * event.
> +		 */
> +		WARN_ON_ONCE(ret);
> +		if (new_master->state == PERF_EVENT_STATE_INACTIVE) {
> +			/*
> +			 * We don't need to update time, so don't call
> +			 * perf_event_set_state().
> +			 */
> +			new_master->state = PERF_EVENT_STATE_ENABLED;
> +		}
> +		perf_pmu_enable(new_master->pmu);
> +	}
> +
> +	if (dup_count == 1) {
> +		/*
> +		 * We set up as a master, but there aren't any more duplicates.
> +		 * Simply clear ->dup_master, as ->master_count == ->count per
> +		 * the above.
> +		 */
> +		WRITE_ONCE(new_master->dup_master, NULL);
> +	} else {
> +		new_master->dup_active = active_count;
> +	}
> +}
> +
>  /*
>   * Add an event from the lists for its context.
>   * Must be called with ctx->mutex and ctx->lock held.

> @@ -4223,7 +4534,14 @@ static void __perf_event_read(void *info)
>  
>  static inline u64 perf_event_count(struct perf_event *event)
>  {
> -	return local64_read(&event->count) + atomic64_read(&event->child_count);
> +	u64 count;
> +
> +	if (likely(event->dup_master != event))
> +		count = local64_read(&event->count);
> +	else
> +		count = local64_read(&event->master_count);
> +
> +	return count + atomic64_read(&event->child_count);
>  }

So lsat time I said something about SMP ordering here. Where did that
go?

Specifically, without ordering it is possible to observe dup_master and
dup_count out of order. So while we might then see dup_master, we might
then also see an old dup_count, which would give 'interesting' effects.

Granted, adding memory barriers all over this will suck :/

  parent reply	other threads:[~2020-04-20 23:04 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-03-31  7:55 [PATCH v12] perf: Sharing PMU counters across compatible events Song Liu
2020-04-09 20:40 ` Song Liu
2020-04-20 23:04 ` Peter Zijlstra [this message]
2020-04-21  1:13   ` Song Liu
2020-04-21  7:18     ` Peter Zijlstra
2020-04-29 20:44   ` Song Liu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200420230429.GT2483@worktop.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=acme@redhat.com \
    --cc=alexey.budankov@linux.intel.com \
    --cc=jolsa@kernel.org \
    --cc=kernel-team@fb.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=namhyung@kernel.org \
    --cc=rong.a.chen@intel.com \
    --cc=songliubraving@fb.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.