From: "Michael S. Tsirkin" <mst@redhat.com>
To: Jason Wang <jasowang@redhat.com>
Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH 1/3] tuntap: rx batching
Date: Wed, 9 Nov 2016 18:38:38 +0200 [thread overview]
Message-ID: <20161109183259-mutt-send-email-mst@kernel.org> (raw)
In-Reply-To: <1478677113-13126-1-git-send-email-jasowang@redhat.com>
On Wed, Nov 09, 2016 at 03:38:31PM +0800, Jason Wang wrote:
> The backlog NAPI was used for tuntap rx, but it can only process one
> packet at a time since it is scheduled synchronously from sendmsg() in
> process context. This leads to poor cache utilization, so this patch
> tries to do some batching before calling into rx NAPI. This is done through:
>
> - accept MSG_MORE as a hint from the sendmsg() caller; if it is set,
> batch the packet temporarily in a linked list and submit them all
> once MSG_MORE is cleared.
> - implement a tuntap-specific NAPI handler for processing this kind of
> batching. (This could be done by extending the backlog to support
> skb lists, but a tun-specific handler looks cleaner and is easier to
> extend in the future).
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
So why do we need an extra queue? This is not what hardware devices do.
How about adding the packet to the queue unconditionally, and deferring
the signalling until we get a sendmsg() without MSG_MORE?
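
Roughly something like this on the poll side (a completely untested
sketch, just consuming sk_write_queue directly instead of splicing into
a second process_queue):

	static int tun_poll(struct napi_struct *napi, int budget)
	{
		struct tun_file *tfile = container_of(napi, struct tun_file, napi);
		struct sk_buff_head *queue = &tfile->socket.sk->sk_write_queue;
		struct sk_buff *skb;
		int received = 0;

		/* skb_dequeue() takes the queue lock itself, so no extra
		 * intermediate list is needed here.
		 */
		while (received < budget &&
		       (skb = skb_dequeue(queue)) != NULL) {
			netif_receive_skb(skb);
			received++;
		}

		if (received < budget) {
			napi_complete(napi);
			/* Re-arm if more packets slipped in meanwhile. */
			if (!skb_queue_empty(queue) &&
			    napi_schedule_prep(napi))
				__napi_schedule(napi);
		}

		return received;
	}

The sendmsg side would then stay pretty much as in your patch: queue
unconditionally, and only kick napi when MSG_MORE is not set.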
> ---
> drivers/net/tun.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 65 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 1588469..d40583b 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -74,6 +74,7 @@
> #include <linux/skb_array.h>
>
> #include <asm/uaccess.h>
> +#include <linux/interrupt.h>
>
> /* Uncomment to enable debugging */
> /* #define TUN_DEBUG 1 */
> @@ -169,6 +170,8 @@ struct tun_file {
> struct list_head next;
> struct tun_struct *detached;
> struct skb_array tx_array;
> + struct napi_struct napi;
> + struct sk_buff_head process_queue;
> };
>
> struct tun_flow_entry {
> @@ -522,6 +525,8 @@ static void tun_queue_purge(struct tun_file *tfile)
> while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
> kfree_skb(skb);
>
> + skb_queue_purge(&tfile->sk.sk_write_queue);
> + skb_queue_purge(&tfile->process_queue);
> skb_queue_purge(&tfile->sk.sk_error_queue);
> }
>
> @@ -532,6 +537,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
>
> tun = rtnl_dereference(tfile->tun);
>
> + if (tun && clean) {
> + napi_disable(&tfile->napi);
> + netif_napi_del(&tfile->napi);
> + }
> +
> if (tun && !tfile->detached) {
> u16 index = tfile->queue_index;
> BUG_ON(index >= tun->numqueues);
> @@ -587,6 +597,7 @@ static void tun_detach_all(struct net_device *dev)
>
> for (i = 0; i < n; i++) {
> tfile = rtnl_dereference(tun->tfiles[i]);
> + napi_disable(&tfile->napi);
> BUG_ON(!tfile);
> tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
> tfile->socket.sk->sk_data_ready(tfile->socket.sk);
> @@ -603,6 +614,7 @@ static void tun_detach_all(struct net_device *dev)
> synchronize_net();
> for (i = 0; i < n; i++) {
> tfile = rtnl_dereference(tun->tfiles[i]);
> + netif_napi_del(&tfile->napi);
> /* Drop read queue */
> tun_queue_purge(tfile);
> sock_put(&tfile->sk);
> @@ -618,6 +630,41 @@ static void tun_detach_all(struct net_device *dev)
> module_put(THIS_MODULE);
> }
>
> +static int tun_poll(struct napi_struct *napi, int budget)
> +{
> + struct tun_file *tfile = container_of(napi, struct tun_file, napi);
> + struct sk_buff_head *input_queue =
> + &tfile->socket.sk->sk_write_queue;
> + struct sk_buff *skb;
> + unsigned int received = 0;
> +
> + while (1) {
> + while ((skb = __skb_dequeue(&tfile->process_queue))) {
> + netif_receive_skb(skb);
> + if (++received >= budget)
> + return received;
> + }
> +
> + spin_lock(&input_queue->lock);
> + if (skb_queue_empty(input_queue)) {
> + spin_unlock(&input_queue->lock);
> + break;
> + }
> + skb_queue_splice_tail_init(input_queue, &tfile->process_queue);
> + spin_unlock(&input_queue->lock);
> + }
> +
> + if (received < budget) {
> + napi_complete(napi);
> + if (skb_peek(&tfile->socket.sk->sk_write_queue) &&
> + unlikely(napi_schedule_prep(napi))) {
> + __napi_schedule(napi);
> + }
> + }
> +
> + return received;
> +}
> +
> static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
> {
> struct tun_file *tfile = file->private_data;
> @@ -666,9 +713,11 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
>
> if (tfile->detached)
> tun_enable_queue(tfile);
> - else
> + else {
> sock_hold(&tfile->sk);
> -
> + netif_napi_add(tun->dev, &tfile->napi, tun_poll, 64);
> + napi_enable(&tfile->napi);
> + }
> tun_set_real_num_queues(tun);
>
> /* device is allowed to go away first, so no need to hold extra
> @@ -1150,7 +1199,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
> /* Get packet from user space buffer */
> static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
> void *msg_control, struct iov_iter *from,
> - int noblock)
> + int noblock, bool more)
> {
> struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
> struct sk_buff *skb;
> @@ -1296,7 +1345,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
> skb_probe_transport_header(skb, 0);
>
> rxhash = skb_get_hash(skb);
> - netif_rx_ni(skb);
> + skb_queue_tail(&tfile->socket.sk->sk_write_queue, skb);
> +
> + if (!more) {
> + local_bh_disable();
> + napi_schedule(&tfile->napi);
> + local_bh_enable();
Why do we need to disable bh here? I thought napi_schedule can
be called from any context.
> + }
>
> stats = get_cpu_ptr(tun->pcpu_stats);
> u64_stats_update_begin(&stats->syncp);
> @@ -1319,7 +1374,8 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
> if (!tun)
> return -EBADFD;
>
> - result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK);
> + result = tun_get_user(tun, tfile, NULL, from,
> + file->f_flags & O_NONBLOCK, false);
>
> tun_put(tun);
> return result;
> @@ -1579,7 +1635,8 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
> return -EBADFD;
>
> ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
> - m->msg_flags & MSG_DONTWAIT);
> + m->msg_flags & MSG_DONTWAIT,
> + m->msg_flags & MSG_MORE);
> tun_put(tun);
> return ret;
> }
> @@ -2336,6 +2393,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
> file->private_data = tfile;
> INIT_LIST_HEAD(&tfile->next);
>
> + skb_queue_head_init(&tfile->process_queue);
> +
> sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
>
> return 0;
> --
> 2.7.4