public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Ingo Molnar <mingo@elte.hu>
To: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org, Paul Mackerras <paulus@samba.org>,
	Mike Galbraith <efault@gmx.de>,
	Arjan van de Ven <arjan@infradead.org>,
	Wu Fengguang <fengguang.wu@intel.com>
Subject: Re: [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument
Date: Wed, 25 Mar 2009 18:16:19 +0100	[thread overview]
Message-ID: <20090325171619.GC14250@elte.hu> (raw)
In-Reply-To: <1237992774.7972.1178.camel@twins>


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, 2009-03-25 at 13:57 +0100, Peter Zijlstra wrote:
> > On Wed, 2009-03-25 at 13:54 +0100, Ingo Molnar wrote:
> > > * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > > 
> > > > On Wed, 2009-03-25 at 13:35 +0100, Ingo Molnar wrote:
> > > > 
> > > > > > Also, when mixing streams (events,mmap) is a single: you missed 
> > > > > > 'n' events still good?
> > > > > 
> > > > > How would such mixing work? Multiple counters streaming into the 
> > > > > same mmap area?
> > > > 
> > > > No basically having overflow events and mmap-vma changed events in 
> > > > a single output stream.
> > > 
> > > ah, and i missed the impact of variable size records - that too 
> > > makes it somewhat impractical to emit overflow records in situ. (the 
> > > kernel does not really know the precise start of the previous 
> > > record, typically.)
> > 
> > Alternatively, we could simply not emit new events until the read
> > position increases; that's much simpler.
> > 
> > Still don't like mapping the stuff writable though..
> 
> This is what it would look like I suppose...
> 
> Any thoughts?
> 
> Not-signed-off-by: me

(you don't like it?)

> ---
>  include/linux/perf_counter.h |    4 ++
>  kernel/perf_counter.c        |   67 +++++++++++++++++++++++++++++++++++++----
>  2 files changed, 64 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
> index 6bf67ce..d5a599c 100644
> --- a/include/linux/perf_counter.h
> +++ b/include/linux/perf_counter.h
> @@ -165,6 +165,8 @@ struct perf_counter_mmap_page {
>  	__s64	offset;			/* add to hardware counter value */
>  
>  	__u32   data_head;		/* head in the data section */
> +	__u32	data_tail;		/* user-space written tail */
> +	__u32	overflow;		/* number of lost events */

small detail: i'd suggest always padding things up to 64 bits, in 
case someone later adds a new u64 field.

>  };
>  
>  struct perf_event_header {
> @@ -269,8 +271,10 @@ struct file;
>  struct perf_mmap_data {
>  	struct rcu_head			rcu_head;
>  	int				nr_pages;
> +	int				writable;
>  	atomic_t			wakeup;
>  	atomic_t			head;
> +	atomic_t			overflow;
>  	struct perf_counter_mmap_page   *user_page;
>  	void 				*data_pages[0];
>  };
> diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
> index 3b862a7..1f5c515 100644
> --- a/kernel/perf_counter.c
> +++ b/kernel/perf_counter.c
> @@ -1330,6 +1330,7 @@ static void __perf_counter_update_userpage(struct perf_counter *counter,
>  		userpg->offset -= atomic64_read(&counter->hw.prev_count);
>  
>  	userpg->data_head = atomic_read(&data->head);
> +	userpg->overflow = atomic_read(&data->overflow);
>  	smp_wmb();
>  	++userpg->lock;
>  	preempt_enable();
> @@ -1375,6 +1376,28 @@ unlock:
>  	return ret;
>  }
>  
> +static int perf_mmap_mkwrite(struct vm_area_struct *vma, struct page *page)
> +{
> +	int ret = -EINVAL;
> +
> +	rcu_read_lock();
> +	data = rcu_dereference(counter->data);
> +	if (!data)
> +		goto unlock;
> +
> +	/*
> +	 * Only allow writes to the control page.
> +	 */
> +	if (page != virt_to_page(data->user_page))
> +		goto unlock;
> +
> +	ret = 0;
> +unlock:
> +	rcu_read_unlock();
> +
> +	return ret;
> +}
> +

I guess this:

	rcu_read_lock();
	data = rcu_dereference(counter->data);

	/*
	 * Only allow writes to the control page.
	 */
	if (data && (page == virt_to_page(data->user_page)))
		ret = 0;

	rcu_read_unlock();

is more compact?

>  static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
>  {
>  	struct perf_mmap_data *data;
> @@ -1463,6 +1486,7 @@ static struct vm_operations_struct perf_mmap_vmops = {
>  	.open = perf_mmap_open,
>  	.close = perf_mmap_close,
>  	.fault = perf_mmap_fault,
> +	.page_mkwrite = perf_mmap_mkwrite,
>  };

(nit: this structure should align vertically)

>  
>  static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> @@ -1473,7 +1497,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
>  	unsigned long locked, lock_limit;
>  	int ret = 0;
>  
> -	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
> +	if (!(vma->vm_flags & VM_SHARED))
>  		return -EINVAL;
>  
>  	vma_size = vma->vm_end - vma->vm_start;
> @@ -1503,16 +1527,19 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
>  
>  	mutex_lock(&counter->mmap_mutex);
>  	if (atomic_inc_not_zero(&counter->mmap_count))
> -		goto out;
> +		goto unlock;
>  
>  	WARN_ON(counter->data);
>  	ret = perf_mmap_data_alloc(counter, nr_pages);
> -	if (!ret)
> -		atomic_set(&counter->mmap_count, 1);
> -out:
> +	if (ret)
> +		goto unlock;
> +
> +	atomic_set(&counter->mmap_count, 1);
> +	if (vma->vm_flags & VM_WRITE)
> +		counter->data->writable = 1;
> +unlock:
>  	mutex_unlock(&counter->mmap_mutex);
>  
> -	vma->vm_flags &= ~VM_MAYWRITE;

does ->vm_flags have VM_MAYWRITE by default?

>  	vma->vm_flags |= VM_RESERVED;
>  	vma->vm_ops = &perf_mmap_vmops;
>  
> @@ -1540,6 +1567,28 @@ struct perf_output_handle {
>  	int			wakeup;
>  };
>  
> +static int perf_output_overflow(struct perf_mmap_data *data,
> +				unsigned int offset, unsigned int head)
> +{
> +	unsigned int tail;
> +	unsigned int mask;
> +
> +	if (!data->writable)
> +		return 0;

so mmap()-ing it readonly turns off overflow detection 
automatically? That's a nice touch i think - user-space can avoid 
this overhead, if it does not care about overflows.

> +	mask = (data->nr_pages << PAGE_SHIFT) - 1;

btw., we could have a data->mask.

> +	smp_rmb();
> +	tail = ACCESS_ONCE(data->user_page->data_tail);
> +
> +	offset = (offset - tail) & mask;
> +	head   = (head   - tail) & mask;
> +
> +	if ((int)(head - offset) < 0)
> +		return 1;
> +
> +	return 0;

I guess it should use bool and return true/false.

> +}
> +
>  static int perf_output_begin(struct perf_output_handle *handle,
>  			     struct perf_counter *counter, unsigned int size)
>  {
> @@ -1552,11 +1601,13 @@ static int perf_output_begin(struct perf_output_handle *handle,
>  		goto out;
>  
>  	if (!data->nr_pages)
> -		goto out;
> +		goto fail;
>  
>  	do {
>  		offset = head = atomic_read(&data->head);
>  		head += size;
> +		if (unlikely(perf_output_overflow(data, offset, head)))
> +			goto fail;
>  	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
>  
>  	handle->counter	= counter;
> @@ -1567,6 +1618,8 @@ static int perf_output_begin(struct perf_output_handle *handle,
>  
>  	return 0;
>  
> +fail:
> +	atomic_inc(&data->overflow);

data->user_page->overflow should be increased too - so that 
user-space can see it.

And do we really need data->overflow?

	Ingo

  reply	other threads:[~2009-03-25 17:16 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-03-25 11:30 [PATCH 0/6] perf_counter: new output ABI Peter Zijlstra
2009-03-25 11:30 ` [PATCH 1/6] perf_counter: more elaborate write API Peter Zijlstra
2009-03-25 12:06   ` [tip:perfcounters/core] " Peter Zijlstra
2009-03-25 11:30 ` [PATCH 2/6] perf_counter: output objects Peter Zijlstra
2009-03-25 12:06   ` [tip:perfcounters/core] " Peter Zijlstra
2009-03-25 11:30 ` [PATCH 3/6] perf_counter: sanity check on the output API Peter Zijlstra
2009-03-25 12:06   ` [tip:perfcounters/core] " Peter Zijlstra
2009-03-25 11:30 ` [PATCH 4/6] perf_counter: optionally provide the pid/tid of the sampled task Peter Zijlstra
2009-03-25 12:06   ` [tip:perfcounters/core] " Peter Zijlstra
2009-03-25 11:30 ` [PATCH 5/6] perf_counter: kerneltop: mmap_pages argument Peter Zijlstra
2009-03-25 12:07   ` [tip:perfcounters/core] " Peter Zijlstra
2009-03-25 12:18   ` [PATCH 5/6] " Ingo Molnar
2009-03-25 12:27     ` Peter Zijlstra
2009-03-25 12:35       ` Ingo Molnar
2009-03-25 12:41         ` Peter Zijlstra
2009-03-25 12:54           ` Ingo Molnar
2009-03-25 12:57             ` Peter Zijlstra
2009-03-25 14:52               ` Peter Zijlstra
2009-03-25 17:16                 ` Ingo Molnar [this message]
2009-03-25 21:18                   ` Peter Zijlstra
2009-03-26  2:22       ` Paul Mackerras
2009-03-25 11:30 ` [PATCH 6/6] perf_counter: kerneltop: output event support Peter Zijlstra
2009-03-25 12:07   ` [tip:perfcounters/core] " Peter Zijlstra
2009-04-04  0:21   ` [PATCH 6/6] " Corey Ashford
2009-04-04 12:17     ` Peter Zijlstra
2009-04-04 18:10       ` Corey Ashford
2009-03-25 12:05 ` [PATCH 0/6] perf_counter: new output ABI Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090325171619.GC14250@elte.hu \
    --to=mingo@elte.hu \
    --cc=arjan@infradead.org \
    --cc=efault@gmx.de \
    --cc=fengguang.wu@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=paulus@samba.org \
    --cc=peterz@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox