From: Jason Wang <jasowang@redhat.com>
To: Xi Wang <xii@google.com>, "David S. Miller" <davem@davemloft.net>,
netdev@vger.kernel.org
Cc: Maxim Krasnyansky <maxk@qti.qualcomm.com>,
Neal Cardwell <ncardwell@google.com>,
Eric Dumazet <edumazet@google.com>
Subject: Re: [PATCH] net-tun: restructure tun_do_read for better sleep/wakeup efficiency
Date: Wed, 07 May 2014 11:40:38 +0800 [thread overview]
Message-ID: <5369AB36.6030609@redhat.com> (raw)
In-Reply-To: <1399422244-22751-1-git-send-email-xii@google.com>
On 05/07/2014 08:24 AM, Xi Wang wrote:
> tun_do_read always adds current thread to wait queue, even if a packet
> is ready to read. This is inefficient because both sleeper and waker
> want to acquire the wait queue spin lock when packet rate is high.
After commit 61a5ff15ebdab87887861a6b128b108404e4706d, this will only
help blocking reads. It looks like performance-critical userspace
consumers use non-blocking reads anyway.
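(For reference, such a consumer typically opens the fd with O_NONBLOCK,
waits in poll(), then drains until EAGAIN. A minimal hypothetical sketch
of the drain step, shown here against an ordinary fd since it is the
read-side pattern that matters, not the tun-specific setup:)

```c
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

/* Drain a non-blocking fd until the queue is empty. With O_NONBLOCK,
 * tun_do_read() takes the noblock path and the reader never touches
 * the wait queue while packets are pending; it only re-arms via poll()
 * once read() reports EAGAIN. Returns the number of reads completed,
 * or -1 on a real error. */
static int drain_fd(int fd, char *buf, size_t len)
{
	int npkts = 0;

	for (;;) {
		ssize_t n = read(fd, buf, len);

		if (n < 0) {
			if (errno == EAGAIN || errno == EWOULDBLOCK)
				return npkts;	/* queue empty, back to poll() */
			return -1;		/* real error */
		}
		npkts++;			/* consumed one packet */
	}
}
```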
>
> We restructure the read function and use common kernel networking
> routines to handle receive, sleep and wakeup. With the change
> available packets are checked first before the reading thread is added
> to the wait queue.
This is interesting, since it may help if we want to add an rx busy
loop for tun. (In fact I worked on a similar patch.)
>
> Ran performance tests with the following configuration:
>
> - my packet generator -> tap1 -> br0 -> tap0 -> my packet consumer
> - sender pinned to one core and receiver pinned to another core
> - sender send small UDP packets (64 bytes total) as fast as it can
> - sandy bridge cores
> - throughput are receiver side goodput numbers
>
> The results are
>
> baseline: 757k pkts/sec, cpu utilization at 1.54 cpus
> changed: 804k pkts/sec, cpu utilization at 1.57 cpus
>
> The performance difference is largely determined by packet rate and
> inter-cpu communication cost. For example, if the sender and
> receiver are pinned to different cpu sockets, the results are
>
> baseline: 558k pkts/sec, cpu utilization at 1.71 cpus
> changed: 690k pkts/sec, cpu utilization at 1.67 cpus
So I believe your consumer is using blocking reads. How about
re-testing with non-blocking reads to make sure there is no regression?
>
> Co-authored-by: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Xi Wang <xii@google.com>
> ---
> drivers/net/tun.c | 68 +++++++++++++++++++++----------------------------------
> 1 file changed, 26 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index ee328ba..cb25385 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -133,8 +133,7 @@ struct tap_filter {
> struct tun_file {
> struct sock sk;
> struct socket socket;
> - struct socket_wq wq;
> - struct tun_struct __rcu *tun;
> + struct tun_struct __rcu *tun ____cacheline_aligned_in_smp;
This seems to be an optimization unrelated to the topic; it may be
better sent as a separate patch. Did you actually measure an
improvement from it?
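(A userspace model of what that annotation does, with illustrative
names and sizes: the hot 'tun' pointer is pushed onto its own cache
line so it does not false-share with the socket/wait-queue fields the
waker writes to.)

```c
#include <stdalign.h>
#include <stddef.h>

/* Hypothetical model of the proposed layout change. In the kernel,
 * ____cacheline_aligned_in_smp aligns the member to the CPU cache
 * line (64 bytes here); everything before it ends up on separate
 * lines from the frequently dereferenced pointer. */
struct tun_file_model {
	char sk_and_socket[200];	/* stand-in for sk + socket + wq */
	alignas(64) void *tun;		/* ____cacheline_aligned_in_smp */
};
```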
> struct net *net;
> struct fasync_struct *fasync;
> /* only used for fasnyc */
> @@ -498,12 +497,12 @@ static void tun_detach_all(struct net_device *dev)
> for (i = 0; i < n; i++) {
> tfile = rtnl_dereference(tun->tfiles[i]);
> BUG_ON(!tfile);
> - wake_up_all(&tfile->wq.wait);
> + tfile->socket.sk->sk_data_ready(tfile->socket.sk);
> RCU_INIT_POINTER(tfile->tun, NULL);
> --tun->numqueues;
> }
> list_for_each_entry(tfile, &tun->disabled, next) {
> - wake_up_all(&tfile->wq.wait);
> + tfile->socket.sk->sk_data_ready(tfile->socket.sk);
> RCU_INIT_POINTER(tfile->tun, NULL);
> }
> BUG_ON(tun->numqueues != 0);
> @@ -807,8 +806,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> /* Notify and wake up reader process */
> if (tfile->flags & TUN_FASYNC)
> kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
> - wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
> - POLLRDNORM | POLLRDBAND);
> + tfile->socket.sk->sk_data_ready(tfile->socket.sk);
>
> rcu_read_unlock();
> return NETDEV_TX_OK;
> @@ -965,7 +963,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
>
> tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
>
> - poll_wait(file, &tfile->wq.wait, wait);
> + poll_wait(file, sk_sleep(sk), wait);
>
> if (!skb_queue_empty(&sk->sk_receive_queue))
> mask |= POLLIN | POLLRDNORM;
> @@ -1330,46 +1328,21 @@ done:
> static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
> const struct iovec *iv, ssize_t len, int noblock)
> {
> - DECLARE_WAITQUEUE(wait, current);
> struct sk_buff *skb;
> ssize_t ret = 0;
> + int peeked, err, off = 0;
>
> tun_debug(KERN_INFO, tun, "tun_do_read\n");
>
> - if (unlikely(!noblock))
> - add_wait_queue(&tfile->wq.wait, &wait);
> - while (len) {
> - if (unlikely(!noblock))
> - current->state = TASK_INTERRUPTIBLE;
> -
> - /* Read frames from the queue */
> - if (!(skb = skb_dequeue(&tfile->socket.sk->sk_receive_queue))) {
> - if (noblock) {
> - ret = -EAGAIN;
> - break;
> - }
> - if (signal_pending(current)) {
> - ret = -ERESTARTSYS;
> - break;
> - }
> - if (tun->dev->reg_state != NETREG_REGISTERED) {
> - ret = -EIO;
> - break;
> - }
> -
> - /* Nothing to read, let's sleep */
> - schedule();
> - continue;
> - }
> + if (!len)
> + return ret;
>
> + /* Read frames from queue */
> + skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
> + &peeked, &off, &err);
> + if (skb) {
This changes the userspace ABI slightly. Originally userspace could see
different error codes and respond to them, but here it can only see zero.
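(In other words, something like the following hypothetical fix-up would
be needed so that the err value filled in by __skb_recv_datagram() still
reaches the caller; this is a plain-C model of the dispatch, not the
actual patch.)

```c
#include <stddef.h>

struct fake_skb { int len; };	/* stand-in for struct sk_buff */

/* Model of the return-value logic: when no skb is dequeued, propagate
 * the error reported by the receive helper (e.g. -EAGAIN, -ERESTARTSYS)
 * instead of returning the initial 0. */
static long do_read_result(struct fake_skb *skb, int err, long copied)
{
	if (!skb)
		return err;	/* preserve the old error codes */
	return copied;		/* bytes copied to userspace */
}
```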
> ret = tun_put_user(tun, tfile, skb, iv, len);
> kfree_skb(skb);
> - break;
> - }
> -
> - if (unlikely(!noblock)) {
> - current->state = TASK_RUNNING;
> - remove_wait_queue(&tfile->wq.wait, &wait);
> }
>
> return ret;
> @@ -2187,20 +2160,28 @@ out:
> static int tun_chr_open(struct inode *inode, struct file * file)
> {
> struct tun_file *tfile;
> + struct socket_wq *wq;
>
> DBG1(KERN_INFO, "tunX: tun_chr_open\n");
>
> + wq = kzalloc(sizeof(*wq), GFP_KERNEL);
> + if (!wq)
> + return -ENOMEM;
> +
Why not just reuse the socket_wq structure embedded in tun_file, as we
did before?
> tfile = (struct tun_file *)sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL,
> &tun_proto);
> - if (!tfile)
> + if (!tfile) {
> + kfree(wq);
> return -ENOMEM;
> + }
> +
> RCU_INIT_POINTER(tfile->tun, NULL);
> tfile->net = get_net(current->nsproxy->net_ns);
> tfile->flags = 0;
> tfile->ifindex = 0;
>
> - rcu_assign_pointer(tfile->socket.wq, &tfile->wq);
> - init_waitqueue_head(&tfile->wq.wait);
> + init_waitqueue_head(&wq->wait);
> + RCU_INIT_POINTER(tfile->socket.wq, wq);
>
> tfile->socket.file = file;
> tfile->socket.ops = &tun_socket_ops;
> @@ -2224,9 +2205,12 @@ static int tun_chr_close(struct inode *inode, struct file *file)
> {
> struct tun_file *tfile = file->private_data;
> struct net *net = tfile->net;
> + struct socket_wq *wq;
>
> + wq = rcu_dereference_protected(tfile->socket.wq, 1);
> tun_detach(tfile, true);
> put_net(net);
> + kfree_rcu(wq, rcu);
>
> return 0;
> }
Thread overview: 10+ messages
2014-05-07 0:24 [PATCH] net-tun: restructure tun_do_read for better sleep/wakeup efficiency Xi Wang
2014-05-07 3:40 ` Jason Wang [this message]
2014-05-08 18:22 ` Xi Wang
2014-05-09 3:10 ` Jason Wang
2014-05-09 6:34 ` Xi Wang
2014-05-12 6:15 ` Michael S. Tsirkin
2014-05-13 6:15 ` Jason Wang
2014-05-13 8:20 ` Michael S. Tsirkin
2014-05-13 8:46 ` Jason Wang
-- strict thread matches above, loose matches on Subject: below --
2014-05-07 0:08 Xi Wang