Netdev List
 help / color / mirror / Atom feed
* Re:[RFC][PATCH v3 2/3] Provides multiple submits and asynchronous notifications.
From: xiaohui.xin @ 2010-04-22  8:37 UTC (permalink / raw)
  To: mst; +Cc: arnd, netdev, kvm, linux-kernel, mingo, davem, jdike, Xin Xiaohui
In-Reply-To: <20100415090324.GA15135@redhat.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

The vhost-net backend currently supports only synchronous send/recv
operations. This patch adds multiple submits and asynchronous
notifications, which are needed for the zero-copy case.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
---

Michael,

>Can't vhost supply a kiocb completion callback that will handle the list?

Yes, thanks. And with it I can finally remove vq->receiver as well.

Thanks
Xiaohui

 drivers/vhost/net.c   |  227 +++++++++++++++++++++++++++++++++++++++++++++++--
 drivers/vhost/vhost.c |  115 ++++++++++++++-----------
 drivers/vhost/vhost.h |   14 +++
 3 files changed, 301 insertions(+), 55 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..4a70f66 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
 #include <linux/workqueue.h>
 #include <linux/rcupdate.h>
 #include <linux/file.h>
+#include <linux/aio.h>
 
 #include <linux/net.h>
 #include <linux/if_packet.h>
 #include <linux/if_arp.h>
 #include <linux/if_tun.h>
+#include <linux/mpassthru.h>
 
 #include <net/sock.h>
 
@@ -47,6 +49,7 @@ struct vhost_net {
 	struct vhost_dev dev;
 	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
 	struct vhost_poll poll[VHOST_NET_VQ_MAX];
+	struct kmem_cache       *cache;
 	/* Tells us whether we are polling a socket for TX.
 	 * We only do this when socket buffer fills up.
 	 * Protected by tx vq lock. */
@@ -91,11 +94,132 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
 	net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+	struct kiocb *iocb = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vq->notify_lock, flags);
+	if (!list_empty(&vq->notifier)) {
+		iocb = list_first_entry(&vq->notifier,
+				struct kiocb, ki_list);
+		list_del(&iocb->ki_list);
+	}
+	spin_unlock_irqrestore(&vq->notify_lock, flags);
+	return iocb;
+}
+
+static void handle_iocb(struct kiocb *iocb)
+{
+	struct vhost_virtqueue *vq = iocb->private;
+	unsigned long flags;
+
+        spin_lock_irqsave(&vq->notify_lock, flags);
+        list_add_tail(&iocb->ki_list, &vq->notifier);
+        spin_unlock_irqrestore(&vq->notify_lock, flags);
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+					 struct vhost_virtqueue *vq,
+					 struct socket *sock)
+{
+	struct kiocb *iocb = NULL;
+	struct vhost_log *vq_log = NULL;
+	int rx_total_len = 0;
+	unsigned int head, log, in, out;
+	int size;
+
+	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+		return;
+
+	if (sock->sk->sk_data_ready)
+		sock->sk->sk_data_ready(sock->sk, 0);
+
+	vq_log = unlikely(vhost_has_feature(
+				&net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
+	while ((iocb = notify_dequeue(vq)) != NULL) {
+		vhost_add_used_and_signal(&net->dev, vq,
+				iocb->ki_pos, iocb->ki_nbytes);
+		log = (int)(iocb->ki_user_data >> 32);
+		size = iocb->ki_nbytes;
+		head = iocb->ki_pos;
+		rx_total_len += iocb->ki_nbytes;
+
+		if (iocb->ki_dtor)
+			iocb->ki_dtor(iocb);
+		kmem_cache_free(net->cache, iocb);
+
+		/* When logging is enabled the log info may need to be
+		 * recomputed, since these buffers were in the async queue
+		 * and may not have had their log info computed before.
+		 */
+		if (unlikely(vq_log)) {
+			if (!log)
+				__vhost_get_vq_desc(&net->dev, vq, vq->iov,
+						    ARRAY_SIZE(vq->iov),
+						    &out, &in, vq_log,
+						    &log, head);
+			vhost_log_write(vq, vq_log, log, size);
+		}
+		if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+					struct vhost_virtqueue *vq)
+{
+	struct kiocb *iocb = NULL;
+	int tx_total_len = 0;
+
+	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+		return;
+
+	while ((iocb = notify_dequeue(vq)) != NULL) {
+		vhost_add_used_and_signal(&net->dev, vq,
+				iocb->ki_pos, 0);
+		tx_total_len += iocb->ki_nbytes;
+
+		if (iocb->ki_dtor)
+			iocb->ki_dtor(iocb);
+
+		kmem_cache_free(net->cache, iocb);
+		if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+}
+
+static struct kiocb *create_iocb(struct vhost_net *net,
+				 struct vhost_virtqueue *vq,
+				 unsigned head, unsigned log)
+{
+	struct kiocb *iocb = NULL;
+
+	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+		return NULL; 
+	iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
+	if (!iocb)
+		return NULL;
+	iocb->private = vq;
+	iocb->ki_pos = head;
+	iocb->ki_dtor = handle_iocb;
+	if (vq == &net->dev.vqs[VHOST_NET_VQ_RX]) {
+		iocb->ki_user_data = ((unsigned long)log << 32 | vq->num);
+		iocb->ki_iovec = vq->hdr;
+	}
+	return iocb;
+}
+				 
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+	struct kiocb *iocb = NULL;
 	unsigned head, out, in, s;
 	struct msghdr msg = {
 		.msg_name = NULL,
@@ -124,6 +248,8 @@ static void handle_tx(struct vhost_net *net)
 		tx_poll_stop(net);
 	hdr_size = vq->hdr_size;
 
+	handle_async_tx_events_notify(net, vq);
+
 	for (;;) {
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
@@ -151,6 +277,11 @@ static void handle_tx(struct vhost_net *net)
 		/* Skip header. TODO: support TSO. */
 		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
 		msg.msg_iovlen = out;
+
+		iocb = create_iocb(net, vq, head, 0);
+		if (vq->link_state == VHOST_VQ_LINK_ASYNC && !iocb)
+			break;
+
 		len = iov_length(vq->iov, out);
 		/* Sanity check */
 		if (!len) {
@@ -160,12 +291,18 @@ static void handle_tx(struct vhost_net *net)
 			break;
 		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
-		err = sock->ops->sendmsg(NULL, sock, &msg, len);
+		err = sock->ops->sendmsg(iocb, sock, &msg, len);
 		if (unlikely(err < 0)) {
+			if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+				kmem_cache_free(net->cache, iocb);
 			vhost_discard_vq_desc(vq);
 			tx_poll_start(net, sock);
 			break;
 		}
+
+		if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+			continue;
+
 		if (err != len)
 			pr_err("Truncated TX packet: "
 			       " len %d != %zd\n", err, len);
@@ -177,6 +314,8 @@ static void handle_tx(struct vhost_net *net)
 		}
 	}
 
+	handle_async_tx_events_notify(net, vq);
+
 	mutex_unlock(&vq->mutex);
 	unuse_mm(net->dev.mm);
 }
@@ -186,6 +325,7 @@ static void handle_tx(struct vhost_net *net)
 static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+	struct kiocb *iocb = NULL;
 	unsigned head, out, in, log, s;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
@@ -206,7 +346,8 @@ static void handle_rx(struct vhost_net *net)
 	int err;
 	size_t hdr_size;
 	struct socket *sock = rcu_dereference(vq->private_data);
-	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+	if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
+			vq->link_state == VHOST_VQ_LINK_SYNC))
 		return;
 
 	use_mm(net->dev.mm);
@@ -214,9 +355,17 @@ static void handle_rx(struct vhost_net *net)
 	vhost_disable_notify(vq);
 	hdr_size = vq->hdr_size;
 
+	/* In the async case the submitted buffers may not have had their
+	 * log info computed before write logging was enabled, so the log
+	 * info is recomputed when needed. This is done in
+	 * handle_async_rx_events_notify().
+	 */
+
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
 
+	handle_async_rx_events_notify(net, vq, sock);
+
 	for (;;) {
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
@@ -245,6 +394,11 @@ static void handle_rx(struct vhost_net *net)
 		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
 		msg.msg_iovlen = in;
 		len = iov_length(vq->iov, in);
+
+		iocb = create_iocb(net, vq, head, log);
+		if (vq->link_state == VHOST_VQ_LINK_ASYNC && !iocb)
+			break;
+
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for RX: "
@@ -252,13 +406,20 @@ static void handle_rx(struct vhost_net *net)
 			       iov_length(vq->hdr, s), hdr_size);
 			break;
 		}
-		err = sock->ops->recvmsg(NULL, sock, &msg,
+
+		err = sock->ops->recvmsg(iocb, sock, &msg,
 					 len, MSG_DONTWAIT | MSG_TRUNC);
 		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
+			if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+				kmem_cache_free(net->cache, iocb);
 			vhost_discard_vq_desc(vq);
 			break;
 		}
+
+		if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+			continue;
+
 		/* TODO: Should check and handle checksum. */
 		if (err > len) {
 			pr_err("Discarded truncated rx packet: "
@@ -284,10 +445,13 @@ static void handle_rx(struct vhost_net *net)
 		}
 	}
 
+	handle_async_rx_events_notify(net, vq, sock);
+
 	mutex_unlock(&vq->mutex);
 	unuse_mm(net->dev.mm);
 }
 
+
 static void handle_tx_kick(struct work_struct *work)
 {
 	struct vhost_virtqueue *vq;
@@ -338,6 +502,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
 	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
 	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
+	n->cache = NULL;
 	return 0;
 }
 
@@ -398,6 +563,18 @@ static void vhost_net_flush(struct vhost_net *n)
 	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
 }
 
+static void vhost_async_cleanup(struct vhost_net *n)
+{
+	/* clean the notifier */
+	struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
+	struct kiocb *iocb = NULL;
+	if (n->cache) {
+		while ((iocb = notify_dequeue(vq)) != NULL)
+			kmem_cache_free(n->cache, iocb);
+		kmem_cache_destroy(n->cache);
+	}
+}
+
 static int vhost_net_release(struct inode *inode, struct file *f)
 {
 	struct vhost_net *n = f->private_data;
@@ -414,6 +591,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
 	/* We do an extra flush before freeing memory,
 	 * since jobs can re-queue themselves. */
 	vhost_net_flush(n);
+	vhost_async_cleanup(n);
 	kfree(n);
 	return 0;
 }
@@ -462,7 +640,19 @@ static struct socket *get_tun_socket(int fd)
 	return sock;
 }
 
-static struct socket *get_socket(int fd)
+static struct socket *get_mp_socket(int fd)
+{
+	struct file *file = fget(fd);
+	struct socket *sock;
+	if (!file)
+		return ERR_PTR(-EBADF);
+	sock = mp_get_socket(file);
+	if (IS_ERR(sock))
+		fput(file);
+	return sock;
+}
+
+static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
 {
 	struct socket *sock;
 	if (fd == -1)
@@ -473,9 +663,30 @@ static struct socket *get_socket(int fd)
 	sock = get_tun_socket(fd);
 	if (!IS_ERR(sock))
 		return sock;
+	sock = get_mp_socket(fd);
+	if (!IS_ERR(sock)) {
+		vq->link_state = VHOST_VQ_LINK_ASYNC;
+		return sock;
+	}
 	return ERR_PTR(-ENOTSOCK);
 }
 
+static void vhost_init_link_state(struct vhost_net *n, int index)
+{
+	struct vhost_virtqueue *vq = n->vqs + index;
+
+	WARN_ON(!mutex_is_locked(&vq->mutex));
+	if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+		INIT_LIST_HEAD(&vq->notifier);
+		spin_lock_init(&vq->notify_lock);
+		if (!n->cache) {
+			n->cache = kmem_cache_create("vhost_kiocb",
+					sizeof(struct kiocb), 0,
+					SLAB_HWCACHE_ALIGN, NULL);
+		}
+	}
+}
+
 static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 {
 	struct socket *sock, *oldsock;
@@ -493,12 +704,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	}
 	vq = n->vqs + index;
 	mutex_lock(&vq->mutex);
-	sock = get_socket(fd);
+	vq->link_state = VHOST_VQ_LINK_SYNC;
+	sock = get_socket(vq, fd);
 	if (IS_ERR(sock)) {
 		r = PTR_ERR(sock);
 		goto err;
 	}
 
+	vhost_init_link_state(n, index);
+
 	/* start polling new socket */
 	oldsock = vq->private_data;
 	if (sock == oldsock)
@@ -507,8 +721,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	vhost_net_disable_vq(n, vq);
 	rcu_assign_pointer(vq->private_data, sock);
 	vhost_net_enable_vq(n, vq);
-	mutex_unlock(&vq->mutex);
 done:
+	mutex_unlock(&vq->mutex);
 	mutex_unlock(&n->dev.mutex);
 	if (oldsock) {
 		vhost_net_flush_vq(n, index);
@@ -516,6 +730,7 @@ done:
 	}
 	return r;
 err:
+	mutex_unlock(&vq->mutex);
 	mutex_unlock(&n->dev.mutex);
 	return r;
 }
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 97233d5..53dab80 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -715,66 +715,21 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 	return 0;
 }
 
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which
- * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 			   struct iovec iov[], unsigned int iov_size,
 			   unsigned int *out_num, unsigned int *in_num,
-			   struct vhost_log *log, unsigned int *log_num)
+			   struct vhost_log *log, unsigned int *log_num,
+			   unsigned int head)
 {
 	struct vring_desc desc;
-	unsigned int i, head, found = 0;
-	u16 last_avail_idx;
+	unsigned int i = head, found = 0;
 	int ret;
 
-	/* Check it isn't doing very strange things with descriptor numbers. */
-	last_avail_idx = vq->last_avail_idx;
-	if (get_user(vq->avail_idx, &vq->avail->idx)) {
-		vq_err(vq, "Failed to access avail idx at %p\n",
-		       &vq->avail->idx);
-		return vq->num;
-	}
-
-	if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
-		vq_err(vq, "Guest moved used index from %u to %u",
-		       last_avail_idx, vq->avail_idx);
-		return vq->num;
-	}
-
-	/* If there's nothing new since last we looked, return invalid. */
-	if (vq->avail_idx == last_avail_idx)
-		return vq->num;
-
-	/* Only get avail ring entries after they have been exposed by guest. */
-	rmb();
-
-	/* Grab the next descriptor number they're advertising, and increment
-	 * the index we've seen. */
-	if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
-		vq_err(vq, "Failed to read head: idx %d address %p\n",
-		       last_avail_idx,
-		       &vq->avail->ring[last_avail_idx % vq->num]);
-		return vq->num;
-	}
-
-	/* If their number is silly, that's an error. */
-	if (head >= vq->num) {
-		vq_err(vq, "Guest says index %u > %u is available",
-		       head, vq->num);
-		return vq->num;
-	}
-
 	/* When we start there are none of either input nor output. */
 	*out_num = *in_num = 0;
 	if (unlikely(log))
 		*log_num = 0;
 
-	i = head;
 	do {
 		unsigned iov_count = *in_num + *out_num;
 		if (i >= vq->num) {
@@ -833,8 +788,70 @@ unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 			*out_num += ret;
 		}
 	} while ((i = next_desc(&desc)) != -1);
+	return head;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access.  Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which
+ * is never a valid descriptor number) if none was found. */
+unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+			   struct iovec iov[], unsigned int iov_size,
+			   unsigned int *out_num, unsigned int *in_num,
+			   struct vhost_log *log, unsigned int *log_num)
+{
+	struct vring_desc desc;
+	unsigned int i, head, found = 0;
+	u16 last_avail_idx;
+	unsigned int ret;
+
+	/* Check it isn't doing very strange things with descriptor numbers. */
+	last_avail_idx = vq->last_avail_idx;
+	if (get_user(vq->avail_idx, &vq->avail->idx)) {
+		vq_err(vq, "Failed to access avail idx at %p\n",
+		       &vq->avail->idx);
+		return vq->num;
+	}
+
+	if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
+		vq_err(vq, "Guest moved used index from %u to %u",
+		       last_avail_idx, vq->avail_idx);
+		return vq->num;
+	}
+
+	/* If there's nothing new since last we looked, return invalid. */
+	if (vq->avail_idx == last_avail_idx)
+		return vq->num;
+
+	/* Only get avail ring entries after they have been exposed by guest. */
+	rmb();
+
+	/* Grab the next descriptor number they're advertising, and increment
+	 * the index we've seen. */
+	if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
+		vq_err(vq, "Failed to read head: idx %d address %p\n",
+		       last_avail_idx,
+		       &vq->avail->ring[last_avail_idx % vq->num]);
+		return vq->num;
+	}
+
+	/* If their number is silly, that's an error. */
+	if (head >= vq->num) {
+		vq_err(vq, "Guest says index %u > %u is available",
+		       head, vq->num);
+		return vq->num;
+	}
+
+	ret = __vhost_get_vq_desc(dev, vq, iov, iov_size,
+				  out_num, in_num,
+				  log, log_num, head);
 
 	/* On success, increment avail index. */
+	if (ret == vq->num)
+		return ret;
 	vq->last_avail_idx++;
 	return head;
 }
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d1f0453..8b95df8 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -43,6 +43,11 @@ struct vhost_log {
 	u64 len;
 };
 
+enum vhost_vq_link_state {
+	VHOST_VQ_LINK_SYNC = 	0,
+	VHOST_VQ_LINK_ASYNC = 	1,
+};
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -96,6 +101,10 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log log[VHOST_NET_MAX_SG];
+	/* Differentiate async socket for 0-copy from normal */
+	enum vhost_vq_link_state link_state;
+	struct list_head notifier;
+	spinlock_t notify_lock;
 };
 
 struct vhost_dev {
@@ -122,6 +131,11 @@ unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
 			   struct iovec iov[], unsigned int iov_count,
 			   unsigned int *out_num, unsigned int *in_num,
 			   struct vhost_log *log, unsigned int *log_num);
+unsigned __vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
+			   struct iovec iov[], unsigned int iov_count,
+			   unsigned int *out_num, unsigned int *in_num,
+			   struct vhost_log *log, unsigned int *log_num,
+			   unsigned int head);
 void vhost_discard_vq_desc(struct vhost_virtqueue *);
 
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-- 
1.5.4.4


^ permalink raw reply related

* RE: [RFC][PATCH v2 0/3] Provide a zero-copy method on KVM virtio-net.
From: Xin, Xiaohui @ 2010-04-22  8:57 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, mingo@elte.hu,
	jdike@linux.intel.com, davem@davemloft.net
In-Reply-To: <20100421083507.GA30855@redhat.com>

Michael,

>Yes, I think this packet split mode probably maps well to mergeable buffer
>support. Note that
>1. Not all devices support large packets in this way, others might map
>   to indirect buffers better

Do the indirect buffers accord to deal with the skb->frag_list?

>   So we have to figure out how migration is going to work
Yes, different guest virtio-net driver may contain different features.
Does the qemu migration work with different features supported by virtio-net
driver now?

>2. It's up to guest driver whether to enable features such as
>   mergeable buffers and indirect buffers
>   So we have to figure out how to notify guest which mode
>   is optimal for a given device
Yes. When a device is bound, the mp device may query the capabilities from the driver.
Actually, there is already a structure in the mp device that can do this; we can add
some fields to support more.

>3. We don't want to depend on jumbo frames for decent performance
>   So we probably should support GSO/GRO
GSO is for the tx side, right? I think the driver can handle it itself.
For GRO, I'm not sure whether it's easy or not. Basically, the mp device we
support now does what a raw socket does. The packets do not go to the host stack.
-- 
MST

^ permalink raw reply

* Re: [RFC][PATCH v2 0/3] Provide a zero-copy method on KVM virtio-net.
From: Michael S. Tsirkin @ 2010-04-22  9:19 UTC (permalink / raw)
  To: Xin, Xiaohui
  Cc: netdev@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, mingo@elte.hu,
	jdike@linux.intel.com, davem@davemloft.net
In-Reply-To: <F2E9EB7348B8264F86B6AB8151CE2D79026FAE0BDB@shsmsx502.ccr.corp.intel.com>

On Thu, Apr 22, 2010 at 04:57:56PM +0800, Xin, Xiaohui wrote:
> Michael,
> 
> >Yes, I think this packet split mode probably maps well to mergeable buffer
> >support. Note that
> >1. Not all devices support large packets in this way, others might map
> >   to indirect buffers better
> 
> Do the indirect buffers accord to deal with the skb->frag_list?

We currently use skb->frags.

> >   So we have to figure out how migration is going to work
> Yes, different guest virtio-net driver may contain different features.
> Does the qemu migration work with different features supported by virtio-net
> driver now?

For now, you must have identical feature-sets for migration to work.
And as long as we manage the buffers in software, we can always make
features match.

> >2. It's up to guest driver whether to enable features such as
> >   mergeable buffers and indirect buffers
> >   So we have to figure out how to notify guest which mode
> >   is optimal for a given device
> Yes. When a device is binded, the mp device may query the capabilities from driver.
> Actually, there is a structure now in mp device can do this, we can add some field
> to support more.
> 
> >3. We don't want to depend on jumbo frames for decent performance
> >   So we probably should support GSO/GRO
> GSO is for the tx side, right? I think driver can handle it itself.
> For GRO, I'm not sure it's easy or not. Basically, the mp device now
> we have support is doing what raw socket is doing. The packets are not going to host stack.

See commit bfd5f4a3d605e0f6054df0b59fe0907ff7e696d3
(it doesn't currently work with vhost net, but that's
 a separate story).

> -- 
> MST

^ permalink raw reply

* [PATCH v5] net: batch skb dequeueing from softnet input_pkt_queue
From: Changli Gao @ 2010-04-22  9:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: jamal, Tom Herbert, Eric Dumazet, netdev, Changli Gao

batch skb dequeueing from softnet input_pkt_queue

Batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
contention when RPS is enabled. input_pkt_queue is reimplemented as a singly
linked list (FIFO) to keep enqueueing and dequeueing as fast as possible, and
input_pkt_queue_lock is moved into the RPS section to save 4 bytes on 32-bit
machines.

Note: input_pkt_queue_len isn't decreased until process_backlog()
returns.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 include/linux/netdevice.h |   12 ++++-
 net/core/dev.c            |   99 +++++++++++++++++++++++++++++++++-------------
 2 files changed, 82 insertions(+), 29 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3c5ed5f..58abdd5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1387,6 +1387,7 @@ struct softnet_data {
 	struct Qdisc		*output_queue;
 	struct list_head	poll_list;
 	struct sk_buff		*completion_queue;
+	struct sk_buff		*process_queue;
 
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
@@ -1396,15 +1397,20 @@ struct softnet_data {
 	struct softnet_data	*rps_ipi_next;
 	unsigned int		cpu;
 	unsigned int		input_queue_head;
+	spinlock_t		input_pkt_queue_lock;
 #endif
-	struct sk_buff_head	input_pkt_queue;
+	unsigned int		input_pkt_queue_len;
+	struct sk_buff		*input_pkt_queue_head;
+	struct sk_buff		**input_pkt_queue_tailp;
+
 	struct napi_struct	backlog;
 };
 
-static inline void input_queue_head_incr(struct softnet_data *sd)
+static inline void input_queue_head_add(struct softnet_data *sd,
+					unsigned int len)
 {
 #ifdef CONFIG_RPS
-	sd->input_queue_head++;
+	sd->input_queue_head += len;
 #endif
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index e904c47..f37c223 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -211,14 +211,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 static inline void rps_lock(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
-	spin_lock(&sd->input_pkt_queue.lock);
+	spin_lock(&sd->input_pkt_queue_lock);
 #endif
 }
 
 static inline void rps_unlock(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
-	spin_unlock(&sd->input_pkt_queue.lock);
+	spin_unlock(&sd->input_pkt_queue_lock);
 #endif
 }
 
@@ -2409,12 +2409,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	__get_cpu_var(netdev_rx_stat).total++;
 
 	rps_lock(sd);
-	if (sd->input_pkt_queue.qlen <= netdev_max_backlog) {
-		if (sd->input_pkt_queue.qlen) {
+	if (sd->input_pkt_queue_len <= netdev_max_backlog) {
+		if (sd->input_pkt_queue_len) {
 enqueue:
-			__skb_queue_tail(&sd->input_pkt_queue, skb);
+			skb->next = NULL;
+			*sd->input_pkt_queue_tailp = skb;
+			sd->input_pkt_queue_tailp = &skb->next;
+			sd->input_pkt_queue_len++;
 #ifdef CONFIG_RPS
-			*qtail = sd->input_queue_head + sd->input_pkt_queue.qlen;
+			*qtail = sd->input_queue_head + sd->input_pkt_queue_len;
 #endif
 			rps_unlock(sd);
 			local_irq_restore(flags);
@@ -2927,19 +2930,37 @@ EXPORT_SYMBOL(netif_receive_skb);
 /* Network device is going away, flush any packets still pending
  * Called with irqs disabled.
  */
-static void flush_backlog(void *arg)
+
+static struct sk_buff **__flush_backlog(struct softnet_data *sd,
+					struct sk_buff **pskb,
+					struct net_device *dev)
 {
-	struct net_device *dev = arg;
-	struct softnet_data *sd = &__get_cpu_var(softnet_data);
-	struct sk_buff *skb, *tmp;
+	struct sk_buff *skb;
 
-	rps_lock(sd);
-	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp)
+	while (*pskb) {
+		skb = *pskb;
 		if (skb->dev == dev) {
-			__skb_unlink(skb, &sd->input_pkt_queue);
+			*pskb = skb->next;
 			kfree_skb(skb);
-			input_queue_head_incr(sd);
+			input_queue_head_add(sd, 1);
+			sd->input_pkt_queue_len--;
+		} else {
+			pskb = &skb->next;
 		}
+	}
+
+	return pskb;
+}
+
+static void flush_backlog(void *arg)
+{
+	struct softnet_data *sd = &__get_cpu_var(softnet_data);
+	struct sk_buff **tailp;
+
+	rps_lock(sd);
+	tailp = __flush_backlog(sd, &sd->input_pkt_queue_head, arg);
+	sd->input_pkt_queue_tailp = tailp;
+	__flush_backlog(sd, &sd->process_queue, arg);
 	rps_unlock(sd);
 }
 
@@ -3249,24 +3270,39 @@ static int process_backlog(struct napi_struct *napi, int quota)
 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
 
 	napi->weight = weight_p;
+	local_irq_disable();
 	do {
 		struct sk_buff *skb;
 
-		local_irq_disable();
+		while (sd->process_queue) {
+			skb = sd->process_queue;
+			sd->process_queue = skb->next;
+			local_irq_enable();
+			__netif_receive_skb(skb);
+			if (++work >= quota) {
+				local_irq_disable();
+				rps_lock(sd);
+				goto out;
+			}
+			local_irq_disable();
+		}
+
 		rps_lock(sd);
-		skb = __skb_dequeue(&sd->input_pkt_queue);
-		if (!skb) {
+		if (sd->input_pkt_queue_head == NULL) {
 			__napi_complete(napi);
-			rps_unlock(sd);
-			local_irq_enable();
 			break;
 		}
-		input_queue_head_incr(sd);
+		sd->process_queue = sd->input_pkt_queue_head;
+		sd->input_pkt_queue_head = NULL;
+		sd->input_pkt_queue_tailp = &sd->input_pkt_queue_head;
 		rps_unlock(sd);
-		local_irq_enable();
+	} while (1);
 
-		__netif_receive_skb(skb);
-	} while (++work < quota);
+out:
+	sd->input_pkt_queue_len -= work;
+	input_queue_head_add(sd, work);
+	rps_unlock(sd);
+	local_irq_enable();
 
 	return work;
 }
@@ -5621,10 +5657,17 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 	local_irq_enable();
 
 	/* Process offline CPU's input_pkt_queue */
-	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
+	while ((skb = oldsd->input_pkt_queue_head)) {
+		oldsd->input_pkt_queue_head = skb->next;
+		netif_rx(skb);
+	}
+	while ((skb = oldsd->process_queue)) {
+		oldsd->process_queue = skb->next;
 		netif_rx(skb);
-		input_queue_head_incr(oldsd);
 	}
+	oldsd->input_pkt_queue_tailp = &oldsd->input_pkt_queue_head;
+	input_queue_head_add(oldsd, oldsd->input_pkt_queue_len);
+	oldsd->input_pkt_queue_len = 0;
 
 	return NOTIFY_OK;
 }
@@ -5842,11 +5885,15 @@ static int __init net_dev_init(void)
 	for_each_possible_cpu(i) {
 		struct softnet_data *sd = &per_cpu(softnet_data, i);
 
-		skb_queue_head_init(&sd->input_pkt_queue);
+		sd->input_pkt_queue_head = NULL;
+		sd->input_pkt_queue_tailp = &sd->input_pkt_queue_head;
+		sd->input_pkt_queue_len = 0;
+		sd->process_queue = NULL;
 		sd->completion_queue = NULL;
 		INIT_LIST_HEAD(&sd->poll_list);
 
 #ifdef CONFIG_RPS
+		spin_lock_init(&sd->input_pkt_queue_lock);
 		sd->csd.func = rps_trigger_softirq;
 		sd->csd.info = sd;
 		sd->csd.flags = 0;

^ permalink raw reply related

* Re: [PATCH v5] net: batch skb dequeueing from softnet input_pkt_queue
From: David Miller @ 2010-04-22  9:43 UTC (permalink / raw)
  To: xiaosuo; +Cc: hadi, therbert, eric.dumazet, netdev
In-Reply-To: <1271927357-2973-1-git-send-email-xiaosuo@gmail.com>

From: Changli Gao <xiaosuo@gmail.com>
Date: Thu, 22 Apr 2010 17:09:17 +0800

> +	unsigned int		input_pkt_queue_len;
> +	struct sk_buff		*input_pkt_queue_head;
> +	struct sk_buff		**input_pkt_queue_tailp;
> +

Please do not ignore Stephen Hemminger's feedback.

We already have enough odd SKB queue implementations, we
do not need yet another one in a core location.  This makes
it harder and harder to eventually convert sk_buff to use
"struct list_head".

Instead, use "struct sk_buff_head" and the lockless accessors
(__skb_insert, etc.) and initializer (__skb_queue_head_init).

^ permalink raw reply

* [patch] wimax: wimax_msg_alloc() returns ERR_PTR not null
From: Dan Carpenter @ 2010-04-22  9:46 UTC (permalink / raw)
  To: Inaky Perez-Gonzalez
  Cc: linux-wimax, André Goddard Rosa, wimax, netdev,
	kernel-janitors

wimax_msg_alloc() returns an ERR_PTR and not null.  I changed it to test
for ERR_PTR instead of null.  I also added a check in front of the
kfree_skb() because kfree_skb() can handle null but not ERR_PTR.

Signed-off-by: Dan Carpenter <error27@gmail.com>

diff --git a/drivers/net/wimax/i2400m/rx.c b/drivers/net/wimax/i2400m/rx.c
index fa2e11e..05e2247 100644
--- a/drivers/net/wimax/i2400m/rx.c
+++ b/drivers/net/wimax/i2400m/rx.c
@@ -300,17 +300,16 @@ void i2400m_rx_ctl_ack(struct i2400m *i2400m,
 		d_printf(1, dev, "Huh? waiter for command reply cancelled\n");
 		goto error_waiter_cancelled;
 	}
-	if (ack_skb == NULL) {
+	if (IS_ERR(ack_skb))
 		dev_err(dev, "CMD/GET/SET ack: cannot allocate SKB\n");
-		i2400m->ack_skb = ERR_PTR(-ENOMEM);
-	} else
-		i2400m->ack_skb = ack_skb;
+	i2400m->ack_skb = ack_skb;
 	spin_unlock_irqrestore(&i2400m->rx_lock, flags);
 	complete(&i2400m->msg_completion);
 	return;
 
 error_waiter_cancelled:
-	kfree_skb(ack_skb);
+	if (!IS_ERR(ack_skb))
+		kfree_skb(ack_skb);
 error_no_waiter:
 	spin_unlock_irqrestore(&i2400m->rx_lock, flags);
 	return;

^ permalink raw reply related

* Re: [RFC][PATCH v3 2/3] Provides multiple submits and asynchronous notifications.
From: Michael S. Tsirkin @ 2010-04-22  9:49 UTC (permalink / raw)
  To: xiaohui.xin; +Cc: arnd, netdev, kvm, linux-kernel, mingo, davem, jdike
In-Reply-To: <1271925436-4861-1-git-send-email-xiaohui.xin@intel.com>

On Thu, Apr 22, 2010 at 04:37:16PM +0800, xiaohui.xin@intel.com wrote:
> From: Xin Xiaohui <xiaohui.xin@intel.com>
> 
> The vhost-net backend now only supports synchronous send/recv
> operations. The patch provides multiple submits and asynchronous
> notifications. This is needed for zero-copy case.
> 
> Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
> ---
> 
> Michael,
> 
> >Can't vhost supply a kiocb completion callback that will handle the list?
> 
> Yes, thanks. And with it I also remove the vq->receiver finally.
> 
> Thanks
> Xiaohui

Nice progress. I commented on some minor issues below.
Thanks!

>  drivers/vhost/net.c   |  227 +++++++++++++++++++++++++++++++++++++++++++++++--
>  drivers/vhost/vhost.c |  115 ++++++++++++++-----------
>  drivers/vhost/vhost.h |   14 +++
>  3 files changed, 301 insertions(+), 55 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 22d5fef..4a70f66 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -17,11 +17,13 @@
>  #include <linux/workqueue.h>
>  #include <linux/rcupdate.h>
>  #include <linux/file.h>
> +#include <linux/aio.h>
>  
>  #include <linux/net.h>
>  #include <linux/if_packet.h>
>  #include <linux/if_arp.h>
>  #include <linux/if_tun.h>
> +#include <linux/mpassthru.h>
>  #include <net/sock.h>
>  
> @@ -47,6 +49,7 @@ struct vhost_net {
>  	struct vhost_dev dev;
>  	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
>  	struct vhost_poll poll[VHOST_NET_VQ_MAX];
> +	struct kmem_cache       *cache;
>  	/* Tells us whether we are polling a socket for TX.
>  	 * We only do this when socket buffer fills up.
>  	 * Protected by tx vq lock. */
> @@ -91,11 +94,132 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
>  	net->tx_poll_state = VHOST_NET_POLL_STARTED;
>  }
>  
> +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> +{
> +	struct kiocb *iocb = NULL;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&vq->notify_lock, flags);
> +	if (!list_empty(&vq->notifier)) {
> +		iocb = list_first_entry(&vq->notifier,
> +				struct kiocb, ki_list);
> +		list_del(&iocb->ki_list);
> +	}
> +	spin_unlock_irqrestore(&vq->notify_lock, flags);
> +	return iocb;
> +}
> +
> +static void handle_iocb(struct kiocb *iocb)
> +{
> +	struct vhost_virtqueue *vq = iocb->private;
> +	unsigned long flags;
> +
> +        spin_lock_irqsave(&vq->notify_lock, flags);
> +        list_add_tail(&iocb->ki_list, &vq->notifier);
> +        spin_unlock_irqrestore(&vq->notify_lock, flags);
> +}
> +

checkpatch.pl does not complain about the above?

> +static void handle_async_rx_events_notify(struct vhost_net *net,
> +					 struct vhost_virtqueue *vq,
> +					 struct socket *sock)

continuation lines should start to the right of (.

> +{
> +	struct kiocb *iocb = NULL;
> +	struct vhost_log *vq_log = NULL;
> +	int rx_total_len = 0;
> +	unsigned int head, log, in, out;
> +	int size;
> +
> +	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> +		return;
> +
> +	if (sock->sk->sk_data_ready)
> +		sock->sk->sk_data_ready(sock->sk, 0);
> +
> +	vq_log = unlikely(vhost_has_feature(
> +				&net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;

split the above line at ?, continuation being to the left of ( looks
ugly.

> +	while ((iocb = notify_dequeue(vq)) != NULL) {
> +		vhost_add_used_and_signal(&net->dev, vq,
> +				iocb->ki_pos, iocb->ki_nbytes);
> +		log = (int)(iocb->ki_user_data >> 32);

how about we always do the recompute step, and not encode
the log bit in ki_user_data?

> +		size = iocb->ki_nbytes;
> +		head = iocb->ki_pos;
> +		rx_total_len += iocb->ki_nbytes;
> +
> +		if (iocb->ki_dtor)
> +			iocb->ki_dtor(iocb);
> +		kmem_cache_free(net->cache, iocb);
> +
> +		/* when log is enabled, recomputing the log info is needed,
> +		 * since these buffers are in async queue, and may not get
> +		 * the log info before.
> +		 */
> +		if (unlikely(vq_log)) {
> +			if (!log)
> +				__vhost_get_vq_desc(&net->dev, vq, vq->iov,
> +						    ARRAY_SIZE(vq->iov),
> +						    &out, &in, vq_log,
> +						    &log, head);
> +			vhost_log_write(vq, vq_log, log, size);
> +		}
> +		if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> +			vhost_poll_queue(&vq->poll);
> +			break;
> +		}
> +	}
> +}
> +
> +static void handle_async_tx_events_notify(struct vhost_net *net,
> +					struct vhost_virtqueue *vq)
> +{
> +	struct kiocb *iocb = NULL;
> +	int tx_total_len = 0;
> +
> +	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> +		return;
> +
> +	while ((iocb = notify_dequeue(vq)) != NULL) {
> +		vhost_add_used_and_signal(&net->dev, vq,
> +				iocb->ki_pos, 0);
> +		tx_total_len += iocb->ki_nbytes;
> +
> +		if (iocb->ki_dtor)
> +			iocb->ki_dtor(iocb);
> +
> +		kmem_cache_free(net->cache, iocb);
> +		if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
> +			vhost_poll_queue(&vq->poll);
> +			break;
> +		}
> +	}
> +}
> +
> +static struct kiocb *create_iocb(struct vhost_net *net,
> +				 struct vhost_virtqueue *vq,
> +				 unsigned head, unsigned log)
> +{
> +	struct kiocb *iocb = NULL;
> +
> +	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> +		return NULL; 
> +	iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> +	if (!iocb)
> +		return NULL;
> +	iocb->private = vq;
> +	iocb->ki_pos = head;
> +	iocb->ki_dtor = handle_iocb;
> +	if (vq == &net->dev.vqs[VHOST_NET_VQ_RX]) {
> +		iocb->ki_user_data = ((unsigned long)log << 32 | vq->num);
> +		iocb->ki_iovec = vq->hdr;
> +	}
> +	return iocb;
> +}
> +				 
>  /* Expects to be always run from workqueue - which acts as
>   * read-size critical section for our kind of RCU. */
>  static void handle_tx(struct vhost_net *net)
>  {
>  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> +	struct kiocb *iocb = NULL;

do we need to init this?

>  	unsigned head, out, in, s;
>  	struct msghdr msg = {
>  		.msg_name = NULL,
> @@ -124,6 +248,8 @@ static void handle_tx(struct vhost_net *net)
>  		tx_poll_stop(net);
>  	hdr_size = vq->hdr_size;
>  
> +	handle_async_tx_events_notify(net, vq);
> +
>  	for (;;) {
>  		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>  					 ARRAY_SIZE(vq->iov),
> @@ -151,6 +277,11 @@ static void handle_tx(struct vhost_net *net)
>  		/* Skip header. TODO: support TSO. */
>  		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
>  		msg.msg_iovlen = out;
> +
> +		iocb = create_iocb(net, vq, head, 0);

For sync case, we can save some cycles by using iocb = NULL.

> +		if (vq->link_state == VHOST_VQ_LINK_ASYNC && !iocb)
> +			break;
> +
>  		len = iov_length(vq->iov, out);
>  		/* Sanity check */
>  		if (!len) {

Generally, I would like to reduce the number of places
where we do if (link_state == XXX) in code.
It should be possible to do this by splitting common code
out into functions.

> @@ -160,12 +291,18 @@ static void handle_tx(struct vhost_net *net)
>  			break;
>  		}
>  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
> -		err = sock->ops->sendmsg(NULL, sock, &msg, len);
> +		err = sock->ops->sendmsg(iocb, sock, &msg, len);
>  		if (unlikely(err < 0)) {
> +			if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> +				kmem_cache_free(net->cache, iocb);
>  			vhost_discard_vq_desc(vq);
>  			tx_poll_start(net, sock);
>  			break;
>  		}
> +
> +		if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> +			continue;
> +
>  		if (err != len)
>  			pr_err("Truncated TX packet: "
>  			       " len %d != %zd\n", err, len);
> @@ -177,6 +314,8 @@ static void handle_tx(struct vhost_net *net)
>  		}
>  	}
>  
> +	handle_async_tx_events_notify(net, vq);
> +
>  	mutex_unlock(&vq->mutex);
>  	unuse_mm(net->dev.mm);
>  }
> @@ -186,6 +325,7 @@ static void handle_tx(struct vhost_net *net)
>  static void handle_rx(struct vhost_net *net)
>  {
>  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> +	struct kiocb *iocb = NULL;
>  	unsigned head, out, in, log, s;
>  	struct vhost_log *vq_log;
>  	struct msghdr msg = {
> @@ -206,7 +346,8 @@ static void handle_rx(struct vhost_net *net)
>  	int err;
>  	size_t hdr_size;
>  	struct socket *sock = rcu_dereference(vq->private_data);
> -	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> +	if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
> +			vq->link_state == VHOST_VQ_LINK_SYNC))
>  		return;
>  
>  	use_mm(net->dev.mm);
> @@ -214,9 +355,17 @@ static void handle_rx(struct vhost_net *net)
>  	vhost_disable_notify(vq);
>  	hdr_size = vq->hdr_size;
>  
> +	/* In async cases, when write log is enabled, in case the submitted
> +	 * buffers did not get log info before the log enabling, so we'd
> +	 * better recompute the log info when needed. We do this in
> +	 * handle_async_rx_events_notify().
> +	 */
> +
>  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
>  		vq->log : NULL;
>  
> +	handle_async_rx_events_notify(net, vq, sock);
> +
>  	for (;;) {
>  		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>  					 ARRAY_SIZE(vq->iov),
> @@ -245,6 +394,11 @@ static void handle_rx(struct vhost_net *net)
>  		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
>  		msg.msg_iovlen = in;
>  		len = iov_length(vq->iov, in);
> +
> +		iocb = create_iocb(net, vq, head, log);
> +		if (vq->link_state == VHOST_VQ_LINK_ASYNC && !iocb)
> +			break;
> +
>  		/* Sanity check */
>  		if (!len) {
>  			vq_err(vq, "Unexpected header len for RX: "
> @@ -252,13 +406,20 @@ static void handle_rx(struct vhost_net *net)
>  			       iov_length(vq->hdr, s), hdr_size);
>  			break;
>  		}
> -		err = sock->ops->recvmsg(NULL, sock, &msg,
> +
> +		err = sock->ops->recvmsg(iocb, sock, &msg,
>  					 len, MSG_DONTWAIT | MSG_TRUNC);
>  		/* TODO: Check specific error and bomb out unless EAGAIN? */
>  		if (err < 0) {
> +			if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> +				kmem_cache_free(net->cache, iocb);
>  			vhost_discard_vq_desc(vq);
>  			break;
>  		}
> +
> +		if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> +			continue;
> +
>  		/* TODO: Should check and handle checksum. */
>  		if (err > len) {
>  			pr_err("Discarded truncated rx packet: "
> @@ -284,10 +445,13 @@ static void handle_rx(struct vhost_net *net)
>  		}
>  	}
>  
> +	handle_async_rx_events_notify(net, vq, sock);
> +
>  	mutex_unlock(&vq->mutex);
>  	unuse_mm(net->dev.mm);
>  }
>  
> +

don't do this

>  static void handle_tx_kick(struct work_struct *work)
>  {
>  	struct vhost_virtqueue *vq;
> @@ -338,6 +502,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>  	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
>  	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
>  	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
> +	n->cache = NULL;
>  	return 0;
>  }
>  
> @@ -398,6 +563,18 @@ static void vhost_net_flush(struct vhost_net *n)
>  	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
>  }
>  
> +static void vhost_async_cleanup(struct vhost_net *n)
> +{
> +	/* clean the notifier */
> +	struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
> +	struct kiocb *iocb = NULL;
> +	if (n->cache) {
> +		while ((iocb = notify_dequeue(vq)) != NULL)
> +			kmem_cache_free(n->cache, iocb);
> +		kmem_cache_destroy(n->cache);
> +	}
> +}
> +
>  static int vhost_net_release(struct inode *inode, struct file *f)
>  {
>  	struct vhost_net *n = f->private_data;
> @@ -414,6 +591,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
>  	/* We do an extra flush before freeing memory,
>  	 * since jobs can re-queue themselves. */
>  	vhost_net_flush(n);
> +	vhost_async_cleanup(n);
>  	kfree(n);
>  	return 0;
>  }
> @@ -462,7 +640,19 @@ static struct socket *get_tun_socket(int fd)
>  	return sock;
>  }
>  
> -static struct socket *get_socket(int fd)
> +static struct socket *get_mp_socket(int fd)
> +{
> +	struct file *file = fget(fd);
> +	struct socket *sock;
> +	if (!file)
> +		return ERR_PTR(-EBADF);
> +	sock = mp_get_socket(file);
> +	if (IS_ERR(sock))
> +		fput(file);
> +	return sock;
> +}
> +
> +static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
>  {
>  	struct socket *sock;
>  	if (fd == -1)
> @@ -473,9 +663,30 @@ static struct socket *get_socket(int fd)
>  	sock = get_tun_socket(fd);
>  	if (!IS_ERR(sock))
>  		return sock;
> +	sock = get_mp_socket(fd);
> +	if (!IS_ERR(sock)) {
> +		vq->link_state = VHOST_VQ_LINK_ASYNC;
> +		return sock;
> +	}
>  	return ERR_PTR(-ENOTSOCK);
>  }
>  
> +static void vhost_init_link_state(struct vhost_net *n, int index)
> +{
> +	struct vhost_virtqueue *vq = n->vqs + index;
> +
> +	WARN_ON(!mutex_is_locked(&vq->mutex));
> +	if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> +		INIT_LIST_HEAD(&vq->notifier);
> +		spin_lock_init(&vq->notify_lock);
> +		if (!n->cache) {
> +			n->cache = kmem_cache_create("vhost_kiocb",
> +					sizeof(struct kiocb), 0,
> +					SLAB_HWCACHE_ALIGN, NULL);
> +		}
> +	}
> +}
> +
>  static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>  {
>  	struct socket *sock, *oldsock;
> @@ -493,12 +704,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>  	}
>  	vq = n->vqs + index;
>  	mutex_lock(&vq->mutex);
> -	sock = get_socket(fd);
> +	vq->link_state = VHOST_VQ_LINK_SYNC;
> +	sock = get_socket(vq, fd);
>  	if (IS_ERR(sock)) {
>  		r = PTR_ERR(sock);
>  		goto err;
>  	}
>  
> +	vhost_init_link_state(n, index);
> +

I think we should just teach get_socket to return link_state
in addition to the socket pointer, and pass the returned value to
vhost_init_link_state.

>  	/* start polling new socket */
>  	oldsock = vq->private_data;
>  	if (sock == oldsock)
> @@ -507,8 +721,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>  	vhost_net_disable_vq(n, vq);
>  	rcu_assign_pointer(vq->private_data, sock);
>  	vhost_net_enable_vq(n, vq);
> -	mutex_unlock(&vq->mutex);
>  done:
> +	mutex_unlock(&vq->mutex);
>  	mutex_unlock(&n->dev.mutex);
>  	if (oldsock) {
>  		vhost_net_flush_vq(n, index);

why the change above? Are you sure it's safe?  Need to be careful here:
doing everything under vq and dev mutex is much simpler.
If this change is required, need to review locking carefully
to make sure we are not introducing races.

> @@ -516,6 +730,7 @@ done:
>  	}
>  	return r;
>  err:
> +	mutex_unlock(&vq->mutex);
>  	mutex_unlock(&n->dev.mutex);
>  	return r;
>  }
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 97233d5..53dab80 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -715,66 +715,21 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
>  	return 0;
>  }
>  
> -/* This looks in the virtqueue and for the first available buffer, and converts
> - * it to an iovec for convenient access.  Since descriptors consist of some
> - * number of output then some number of input descriptors, it's actually two
> - * iovecs, but we pack them into one and note how many of each there were.
> - *
> - * This function returns the descriptor number found, or vq->num (which
> - * is never a valid descriptor number) if none was found. */
> -unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
> +unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
>  			   struct iovec iov[], unsigned int iov_size,
>  			   unsigned int *out_num, unsigned int *in_num,
> -			   struct vhost_log *log, unsigned int *log_num)
> +			   struct vhost_log *log, unsigned int *log_num,
> +			   unsigned int head)
>  {
>  	struct vring_desc desc;
> -	unsigned int i, head, found = 0;
> -	u16 last_avail_idx;
> +	unsigned int i = head, found = 0;
>  	int ret;
>  
> -	/* Check it isn't doing very strange things with descriptor numbers. */
> -	last_avail_idx = vq->last_avail_idx;
> -	if (get_user(vq->avail_idx, &vq->avail->idx)) {
> -		vq_err(vq, "Failed to access avail idx at %p\n",
> -		       &vq->avail->idx);
> -		return vq->num;
> -	}
> -
> -	if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
> -		vq_err(vq, "Guest moved used index from %u to %u",
> -		       last_avail_idx, vq->avail_idx);
> -		return vq->num;
> -	}
> -
> -	/* If there's nothing new since last we looked, return invalid. */
> -	if (vq->avail_idx == last_avail_idx)
> -		return vq->num;
> -
> -	/* Only get avail ring entries after they have been exposed by guest. */
> -	rmb();
> -
> -	/* Grab the next descriptor number they're advertising, and increment
> -	 * the index we've seen. */
> -	if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
> -		vq_err(vq, "Failed to read head: idx %d address %p\n",
> -		       last_avail_idx,
> -		       &vq->avail->ring[last_avail_idx % vq->num]);
> -		return vq->num;
> -	}
> -
> -	/* If their number is silly, that's an error. */
> -	if (head >= vq->num) {
> -		vq_err(vq, "Guest says index %u > %u is available",
> -		       head, vq->num);
> -		return vq->num;
> -	}
> -
>  	/* When we start there are none of either input nor output. */
>  	*out_num = *in_num = 0;
>  	if (unlikely(log))
>  		*log_num = 0;
>  
> -	i = head;
>  	do {
>  		unsigned iov_count = *in_num + *out_num;
>  		if (i >= vq->num) {
> @@ -833,8 +788,70 @@ unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
>  			*out_num += ret;
>  		}
>  	} while ((i = next_desc(&desc)) != -1);
> +	return head;
> +}
> +
> +/* This looks in the virtqueue and for the first available buffer, and converts
> + * it to an iovec for convenient access.  Since descriptors consist of some
> + * number of output then some number of input descriptors, it's actually two
> + * iovecs, but we pack them into one and note how many of each there were.
> + *
> + * This function returns the descriptor number found, or vq->num (which
> + * is never a valid descriptor number) if none was found. */
> +unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
> +			   struct iovec iov[], unsigned int iov_size,
> +			   unsigned int *out_num, unsigned int *in_num,
> +			   struct vhost_log *log, unsigned int *log_num)
> +{
> +	struct vring_desc desc;
> +	unsigned int i, head, found = 0;
> +	u16 last_avail_idx;
> +	unsigned int ret;
> +
> +	/* Check it isn't doing very strange things with descriptor numbers. */
> +	last_avail_idx = vq->last_avail_idx;
> +	if (get_user(vq->avail_idx, &vq->avail->idx)) {
> +		vq_err(vq, "Failed to access avail idx at %p\n",
> +		       &vq->avail->idx);
> +		return vq->num;
> +	}
> +
> +	if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
> +		vq_err(vq, "Guest moved used index from %u to %u",
> +		       last_avail_idx, vq->avail_idx);
> +		return vq->num;
> +	}
> +
> +	/* If there's nothing new since last we looked, return invalid. */
> +	if (vq->avail_idx == last_avail_idx)
> +		return vq->num;
> +
> +	/* Only get avail ring entries after they have been exposed by guest. */
> +	rmb();
> +
> +	/* Grab the next descriptor number they're advertising, and increment
> +	 * the index we've seen. */
> +	if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
> +		vq_err(vq, "Failed to read head: idx %d address %p\n",
> +		       last_avail_idx,
> +		       &vq->avail->ring[last_avail_idx % vq->num]);
> +		return vq->num;
> +	}
> +
> +	/* If their number is silly, that's an error. */
> +	if (head >= vq->num) {
> +		vq_err(vq, "Guest says index %u > %u is available",
> +		       head, vq->num);
> +		return vq->num;
> +	}
> +
> +	ret = __vhost_get_vq_desc(dev, vq, iov, iov_size,
> +				  out_num, in_num,
> +				  log, log_num, head);
>  
>  	/* On success, increment avail index. */
> +	if (ret == vq->num)
> +		return ret;
>  	vq->last_avail_idx++;
>  	return head;
>  }
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index d1f0453..8b95df8 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -43,6 +43,11 @@ struct vhost_log {
>  	u64 len;
>  };
>  
> +enum vhost_vq_link_state {
> +	VHOST_VQ_LINK_SYNC = 	0,
> +	VHOST_VQ_LINK_ASYNC = 	1,

don't try to align values to the right, just put a single space each
side of =.

> +};
> +
>  /* The virtqueue structure describes a queue attached to a device. */
>  struct vhost_virtqueue {
>  	struct vhost_dev *dev;
> @@ -96,6 +101,10 @@ struct vhost_virtqueue {
>  	/* Log write descriptors */
>  	void __user *log_base;
>  	struct vhost_log log[VHOST_NET_MAX_SG];
> +	/*Differiate async socket for 0-copy from normal*/

spaces after /* and before */.

> +	enum vhost_vq_link_state link_state;
> +	struct list_head notifier;
> +	spinlock_t notify_lock;
>  };
>  
>  struct vhost_dev {
> @@ -122,6 +131,11 @@ unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
>  			   struct iovec iov[], unsigned int iov_count,
>  			   unsigned int *out_num, unsigned int *in_num,
>  			   struct vhost_log *log, unsigned int *log_num);
> +unsigned __vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
> +			   struct iovec iov[], unsigned int iov_count,
> +			   unsigned int *out_num, unsigned int *in_num,
> +			   struct vhost_log *log, unsigned int *log_num,
> +			   unsigned int head);
>  void vhost_discard_vq_desc(struct vhost_virtqueue *);
>  
>  int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
> -- 
> 1.5.4.4

^ permalink raw reply

* [patch] wimax: checking ERR_PTR vs null
From: Dan Carpenter @ 2010-04-22  9:50 UTC (permalink / raw)
  To: netdev
  Cc: Inaky Perez-Gonzalez, Alexey Dobriyan, Paulius Zaleckas,
	David S. Miller, wimax, kernel-janitors

stch_skb is allocated with wimax_gnl_re_state_change_alloc().  That
function returns ERR_PTRs on failure and doesn't return NULL.

Signed-off-by: Dan Carpenter <error27@gmail.com>

diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 1ed65db..62b1a66 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -315,7 +315,7 @@ void __wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state)
 		BUG();
 	}
 	__wimax_state_set(wimax_dev, new_state);
-	if (stch_skb)
+	if (!IS_ERR(stch_skb))
 		wimax_gnl_re_state_change_send(wimax_dev, stch_skb, header);
 out:
 	d_fnend(3, dev, "(wimax_dev %p new_state %u [old %u]) = void\n",

^ permalink raw reply related

* [patch] bluetooth: handle l2cap_create_connless_pdu() errors
From: Dan Carpenter @ 2010-04-22  9:52 UTC (permalink / raw)
  To: Marcel Holtmann
  Cc: David S. Miller, Gustavo F. Padovan, Andrei Emeltchenko,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	kernel-janitors-u79uwXL29TY76Z2rM5mHXA

l2cap_create_connless_pdu() can sometimes return ERR_PTR(-ENOMEM) or
ERR_PTR(-EFAULT).

Signed-off-by: Dan Carpenter <error27-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 99d68c3..9753b69 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -1626,7 +1626,10 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
 	/* Connectionless channel */
 	if (sk->sk_type == SOCK_DGRAM) {
 		skb = l2cap_create_connless_pdu(sk, msg, len);
-		err = l2cap_do_send(sk, skb);
+		if (IS_ERR(skb))
+			err = PTR_ERR(skb);
+		else
+			err = l2cap_do_send(sk, skb);
 		goto done;
 	}
 

^ permalink raw reply related

* [patch] rtnetlink: potential ERR_PTR dereference
From: Dan Carpenter @ 2010-04-22  9:53 UTC (permalink / raw)
  To: netdev
  Cc: Eric Dumazet, Patrick McHardy, Eric W. Biederman, Mitch Williams,
	David S. Miller, kernel-janitors

In the original code, if rtnl_create_link() returned an ERR_PTR then that
would get passed to rtnl_configure_link() which dereferences it.

Signed-off-by: Dan Carpenter <error27@gmail.com>
---
Found by a static checker, and compile tested only.  :/

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4568120..fe776c9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1270,10 +1270,11 @@ replay:
 			err = ops->newlink(net, dev, tb, data);
 		else
 			err = register_netdevice(dev);
-		if (err < 0 && !IS_ERR(dev)) {
+
+		if (err < 0 && !IS_ERR(dev))
 			free_netdev(dev);
+		if (err < 0)
 			goto out;
-		}
 
 		err = rtnl_configure_link(dev, ifm);
 		if (err < 0)

^ permalink raw reply related

* [patch] rdma: potential ERR_PTR dereference
From: Dan Carpenter @ 2010-04-22  9:55 UTC (permalink / raw)
  To: Andy Grover; +Cc: David S. Miller, rds-devel, netdev, kernel-janitors

In the original code, the "goto out" calls "rdma_destroy_id(cm_id);"
That isn't needed here and would cause problems because "cm_id" is an 
ERR_PTR.  The new code just returns directly.

Signed-off-by: Dan Carpenter <error27@gmail.com>

diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 9ece910..7b15508 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -134,7 +134,7 @@ static int __init rds_rdma_listen_init(void)
 		ret = PTR_ERR(cm_id);
 		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
 		       "rdma_create_id() returned %d\n", ret);
-		goto out;
+		return ret;
 	}
 
 	sin.sin_family = AF_INET,

^ permalink raw reply related

* Re: [patch] rtnetlink: potential ERR_PTR dereference
From: Patrick McHardy @ 2010-04-22 10:35 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: netdev, Eric Dumazet, Eric W. Biederman, Mitch Williams,
	David S. Miller, kernel-janitors
In-Reply-To: <20100422095327.GP29647@bicker>

Dan Carpenter wrote:
> In the original code, if rtnl_create_link() returned an ERR_PTR then that
> would get passed to rtnl_configure_link() which dereferences it.
> 
> Signed-off-by: Dan Carpenter <error27@gmail.com>
> ---
> Found by a static checker, and compile tested only.  :/

Looks fine to me.

Acked-by: Patrick McHardy <kaber@trash.net>

^ permalink raw reply

* Re: [net-next PATCH 1/2] add iovnl netlink support
From: Arnd Bergmann @ 2010-04-22 10:53 UTC (permalink / raw)
  To: David Miller; +Cc: scofeldm, netdev, chrisw
In-Reply-To: <20100421.235236.69366636.davem@davemloft.net>

On Thursday 22 April 2010, David Miller wrote:
> From: Scott Feldman <scofeldm@cisco.com>
> Date: Mon, 19 Apr 2010 12:18:07 -0700
> 
> > +     if (tb[IOV_ATTR_VF_IFNAME])
> > +             vf_dev = dev_get_by_name(&init_net,
> > +                     nla_data(tb[IOV_ATTR_VF_IFNAME]));
> 
> It's probably best to check this for NULL and notify
> the user with an error in that case (don't forget to
> put 'dev' in that error path :-)

Since you brought up that hunk: shouldn't the namespace better
be current->nsproxy->net_ns instead of init_net? If the sender
is confined in a separate network namespace, I would expect
that it should be able to modify devices in its own namespace
but none that are in the root namespace.

	Arnd

^ permalink raw reply

* Re: [net-next PATCH 1/2] add iovnl netlink support
From: David Miller @ 2010-04-22 10:56 UTC (permalink / raw)
  To: arnd; +Cc: scofeldm, netdev, chrisw
In-Reply-To: <201004221253.11290.arnd@arndb.de>

From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 22 Apr 2010 12:53:11 +0200

> On Thursday 22 April 2010, David Miller wrote:
>> From: Scott Feldman <scofeldm@cisco.com>
>> Date: Mon, 19 Apr 2010 12:18:07 -0700
>> 
>> > +     if (tb[IOV_ATTR_VF_IFNAME])
>> > +             vf_dev = dev_get_by_name(&init_net,
>> > +                     nla_data(tb[IOV_ATTR_VF_IFNAME]));
>> 
>> It's probably best to check this for NULL and notify
>> the user with an error in that case (don't forget to
>> put 'dev' in that error path :-)
> 
> Since you brought up that hunk: shouldn't the namespace better
> be current->nsproxy->net_ns instead of init_net? If the sender
> is confined in a separate network namespace, I would expect
> that it should be able to modify devices in its own namespace
> but none that are in the root namespace.

Yes, the namespace needs to be handled better.

But reading other parts of the discussion it seems that
IOV_ATTR_VF_IFNAME and some other bits will likely be
removed in the initial implementation of this stuff.

^ permalink raw reply

* Re: [net-next PATCH 1/2] add iovnl netlink support
From: Arnd Bergmann @ 2010-04-22 11:12 UTC (permalink / raw)
  To: David Miller; +Cc: scofeldm, netdev, chrisw
In-Reply-To: <20100422.035615.176728799.davem@davemloft.net>

On Thursday 22 April 2010, David Miller wrote:
> But reading other parts of the discussion it seems that
> IOV_ATTR_VF_IFNAME and some other bits will likely be
> removed in the initial implementation of this stuff.

That's what I suggested, yes. However, I'm still waiting for
a reply from Scott on whether it's actually possible to remove
it based on the way that the enic firmware works.

	Arnd

^ permalink raw reply

* [PATCH] NIU support for skb->rxhash
From: David Miller @ 2010-04-22 11:21 UTC (permalink / raw)
  To: netdev


But it turns out using it is largely pointless since the only way to
get the hash value(s) is through a structure which is prepended to the
packet data (so we take a cache miss on the packet data anyways)
instead of being able to fetch it out of the RX descriptors :-/

If anyone out there is trying to design sane hardware, please put the
following into your RX descriptors:

1) ethernet protocol type (u16)
2) a flag bit indicating if the packet destination matched one
   of the programmed unicast MAC addresses
3) a flag bit indicating "multicast"
4) a flag bit indicating "broadcast"
5) at least 32-bits of the computed flow hash (u32)

kthx, bye!

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/drivers/net/niu.c b/drivers/net/niu.c
index 493e25c..f8ee985 100644
--- a/drivers/net/niu.c
+++ b/drivers/net/niu.c
@@ -36,8 +36,8 @@
 #include "niu.h"
 
 #define DRV_MODULE_NAME		"niu"
-#define DRV_MODULE_VERSION	"1.0"
-#define DRV_MODULE_RELDATE	"Nov 14, 2008"
+#define DRV_MODULE_VERSION	"1.1"
+#define DRV_MODULE_RELDATE	"Apr 22, 2010"
 
 static char version[] __devinitdata =
 	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
@@ -3444,6 +3444,7 @@ static int niu_process_rx_pkt(struct napi_struct *napi, struct niu *np,
 			      struct rx_ring_info *rp)
 {
 	unsigned int index = rp->rcr_index;
+	struct rx_pkt_hdr1 *rh;
 	struct sk_buff *skb;
 	int len, num_rcr;
 
@@ -3477,9 +3478,6 @@ static int niu_process_rx_pkt(struct napi_struct *napi, struct niu *np,
 		if (num_rcr == 1) {
 			int ptype;
 
-			off += 2;
-			append_size -= 2;
-
 			ptype = (val >> RCR_ENTRY_PKT_TYPE_SHIFT);
 			if ((ptype == RCR_PKT_TYPE_TCP ||
 			     ptype == RCR_PKT_TYPE_UDP) &&
@@ -3488,8 +3486,7 @@ static int niu_process_rx_pkt(struct napi_struct *napi, struct niu *np,
 				skb->ip_summed = CHECKSUM_UNNECESSARY;
 			else
 				skb->ip_summed = CHECKSUM_NONE;
-		}
-		if (!(val & RCR_ENTRY_MULTI))
+		} else if (!(val & RCR_ENTRY_MULTI))
 			append_size = len - skb->len;
 
 		niu_rx_skb_append(skb, page, off, append_size);
@@ -3510,8 +3507,16 @@ static int niu_process_rx_pkt(struct napi_struct *napi, struct niu *np,
 	}
 	rp->rcr_index = index;
 
-	skb_reserve(skb, NET_IP_ALIGN);
-	__pskb_pull_tail(skb, min(len, VLAN_ETH_HLEN));
+	len += sizeof(*rh);
+	len = min_t(int, len, sizeof(*rh) + VLAN_ETH_HLEN);
+	__pskb_pull_tail(skb, len);
+
+	rh = (struct rx_pkt_hdr1 *) skb->data;
+	skb->rxhash = ((u32)rh->hashval2_0 << 24 |
+		       (u32)rh->hashval2_1 << 16 |
+		       (u32)rh->hashval1_1 << 8 |
+		       (u32)rh->hashval1_2 << 0);
+	skb_pull(skb, sizeof(*rh));
 
 	rp->rx_packets++;
 	rp->rx_bytes += skb->len;
@@ -4946,7 +4951,9 @@ static int niu_init_one_rx_channel(struct niu *np, struct rx_ring_info *rp)
 	      RX_DMA_CTL_STAT_RCRTO |
 	      RX_DMA_CTL_STAT_RBR_EMPTY));
 	nw64(RXDMA_CFIG1(channel), rp->mbox_dma >> 32);
-	nw64(RXDMA_CFIG2(channel), (rp->mbox_dma & 0x00000000ffffffc0));
+	nw64(RXDMA_CFIG2(channel),
+	     ((rp->mbox_dma & RXDMA_CFIG2_MBADDR_L) |
+	      RXDMA_CFIG2_FULL_HDR));
 	nw64(RBR_CFIG_A(channel),
 	     ((u64)rp->rbr_table_size << RBR_CFIG_A_LEN_SHIFT) |
 	     (rp->rbr_dma & (RBR_CFIG_A_STADDR_BASE | RBR_CFIG_A_STADDR)));
diff --git a/drivers/net/niu.h b/drivers/net/niu.h
index 3bd0b59..d671546 100644
--- a/drivers/net/niu.h
+++ b/drivers/net/niu.h
@@ -2706,7 +2706,7 @@ struct rx_pkt_hdr0 {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	u8	inputport:2,
 		maccheck:1,
-		class:4;
+		class:5;
 	u8	vlan:1,
 		llcsnap:1,
 		noport:1,
@@ -2715,7 +2715,7 @@ struct rx_pkt_hdr0 {
 		tres:2,
 		tzfvld:1;
 #elif defined(__BIG_ENDIAN_BITFIELD)
-	u8	class:4,
+	u8	class:5,
 		maccheck:1,
 		inputport:2;
 	u8	tzfvld:1,
@@ -2775,6 +2775,9 @@ struct rx_pkt_hdr1 {
 	/* Bits 7:0 of hash value, H1.  */
 	u8	hashval1_2;
 
+	u8	hwrsvd5;
+	u8	hwrsvd6;
+
 	u8	usrdata_0;	/* Bits 39:32 of user data.  */
 	u8	usrdata_1;	/* Bits 31:24 of user data.  */
 	u8	usrdata_2;	/* Bits 23:16 of user data.  */

^ permalink raw reply related

* Re: [PATCH v5] net: batch skb dequeueing from softnet input_pkt_queue
From: Eric Dumazet @ 2010-04-22 11:37 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, jamal, Tom Herbert, netdev
In-Reply-To: <1271927357-2973-1-git-send-email-xiaosuo@gmail.com>

Le jeudi 22 avril 2010 à 17:09 +0800, Changli Gao a écrit :
> batch skb dequeueing from softnet input_pkt_queue
> 
> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
> contention when RPS is enabled. input_pkt_queue is reimplemented as a single
> linked list (FIFO) to keep enqueueing and dequeueing as fast as posible, and
> input_pkt_queue_lock is moved into RPS section to reduce 4 bytes on 32bits
> machine.
> 
> Note: input_pkt_queue_len doesn't been decreased until process_backlog()
> returns.
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----
>  include/linux/netdevice.h |   12 ++++-
>  net/core/dev.c            |   99 +++++++++++++++++++++++++++++++++-------------
>  2 files changed, 82 insertions(+), 29 deletions(-)
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 3c5ed5f..58abdd5 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1387,6 +1387,7 @@ struct softnet_data {
>  	struct Qdisc		*output_queue;
>  	struct list_head	poll_list;
>  	struct sk_buff		*completion_queue;
> +	struct sk_buff		*process_queue;
>  
>  #ifdef CONFIG_RPS
>  	struct softnet_data	*rps_ipi_list;
> @@ -1396,15 +1397,20 @@ struct softnet_data {
>  	struct softnet_data	*rps_ipi_next;
>  	unsigned int		cpu;
>  	unsigned int		input_queue_head;
> +	spinlock_t		input_pkt_queue_lock;
>  #endif
> -	struct sk_buff_head	input_pkt_queue;
> +	unsigned int		input_pkt_queue_len;
> +	struct sk_buff		*input_pkt_queue_head;
> +	struct sk_buff		**input_pkt_queue_tailp;
> +
>  	struct napi_struct	backlog;
>  };
>  
> -static inline void input_queue_head_incr(struct softnet_data *sd)
> +static inline void input_queue_head_add(struct softnet_data *sd,
> +					unsigned int len)
>  {
>  #ifdef CONFIG_RPS
> -	sd->input_queue_head++;
> +	sd->input_queue_head += len;
>  #endif
>  }
>  
> diff --git a/net/core/dev.c b/net/core/dev.c
> index e904c47..f37c223 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -211,14 +211,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
>  static inline void rps_lock(struct softnet_data *sd)
>  {
>  #ifdef CONFIG_RPS
> -	spin_lock(&sd->input_pkt_queue.lock);
> +	spin_lock(&sd->input_pkt_queue_lock);
>  #endif
>  }
>  
>  static inline void rps_unlock(struct softnet_data *sd)
>  {
>  #ifdef CONFIG_RPS
> -	spin_unlock(&sd->input_pkt_queue.lock);
> +	spin_unlock(&sd->input_pkt_queue_lock);
>  #endif
>  }
>  
> @@ -2409,12 +2409,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
>  	__get_cpu_var(netdev_rx_stat).total++;
>  
>  	rps_lock(sd);
> -	if (sd->input_pkt_queue.qlen <= netdev_max_backlog) {
> -		if (sd->input_pkt_queue.qlen) {
> +	if (sd->input_pkt_queue_len <= netdev_max_backlog) {
> +		if (sd->input_pkt_queue_len) {
>  enqueue:
> -			__skb_queue_tail(&sd->input_pkt_queue, skb);
> +			skb->next = NULL;
> +			*sd->input_pkt_queue_tailp = skb;
> +			sd->input_pkt_queue_tailp = &skb->next;
> +			sd->input_pkt_queue_len++;
>  #ifdef CONFIG_RPS
> -			*qtail = sd->input_queue_head + sd->input_pkt_queue.qlen;
> +			*qtail = sd->input_queue_head + sd->input_pkt_queue_len;
>  #endif
>  			rps_unlock(sd);
>  			local_irq_restore(flags);
> @@ -2927,19 +2930,37 @@ EXPORT_SYMBOL(netif_receive_skb);
>  /* Network device is going away, flush any packets still pending
>   * Called with irqs disabled.
>   */
> -static void flush_backlog(void *arg)
> +
> +static struct sk_buff **__flush_backlog(struct softnet_data *sd,
> +					struct sk_buff **pskb,
> +					struct net_device *dev)
>  {
> -	struct net_device *dev = arg;
> -	struct softnet_data *sd = &__get_cpu_var(softnet_data);
> -	struct sk_buff *skb, *tmp;
> +	struct sk_buff *skb;
>  
> -	rps_lock(sd);
> -	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp)
> +	while (*pskb) {
> +		skb = *pskb;
>  		if (skb->dev == dev) {
> -			__skb_unlink(skb, &sd->input_pkt_queue);
> +			*pskb = skb->next;
>  			kfree_skb(skb);
> -			input_queue_head_incr(sd);
> +			input_queue_head_add(sd, 1);
> +			sd->input_pkt_queue_len--;
> +		} else {
> +			pskb = &skb->next;
>  		}
> +	}
> +
> +	return pskb;
> +}
> +
> +static void flush_backlog(void *arg)
> +{
> +	struct softnet_data *sd = &__get_cpu_var(softnet_data);
> +	struct sk_buff **tailp;
> +
> +	rps_lock(sd);
> +	tailp = __flush_backlog(sd, &sd->input_pkt_queue_head, arg);
> +	sd->input_pkt_queue_tailp = tailp;
> +	__flush_backlog(sd, &sd->process_queue, arg);
>  	rps_unlock(sd);
>  }
>  
> @@ -3249,24 +3270,39 @@ static int process_backlog(struct napi_struct *napi, int quota)
>  	struct softnet_data *sd = &__get_cpu_var(softnet_data);
>  
>  	napi->weight = weight_p;
> +	local_irq_disable();
>  	do {
>  		struct sk_buff *skb;
>  
> -		local_irq_disable();
> +		while (sd->process_queue) {
> +			skb = sd->process_queue;
> +			sd->process_queue = skb->next;
> +			local_irq_enable();
> +			__netif_receive_skb(skb);
> +			if (++work >= quota) {
> +				local_irq_disable();
> +				rps_lock(sd);
> +				goto out;
> +			}
> +			local_irq_disable();
> +		}
> +
>  		rps_lock(sd);
> -		skb = __skb_dequeue(&sd->input_pkt_queue);
> -		if (!skb) {
> +		if (sd->input_pkt_queue_head == NULL) {
>  			__napi_complete(napi);
> -			rps_unlock(sd);
> -			local_irq_enable();
>  			break;
>  		}
> -		input_queue_head_incr(sd);
> +		sd->process_queue = sd->input_pkt_queue_head;
> +		sd->input_pkt_queue_head = NULL;
> +		sd->input_pkt_queue_tailp = &sd->input_pkt_queue_head;
>  		rps_unlock(sd);
> -		local_irq_enable();
> +	} while (1);
>  
> -		__netif_receive_skb(skb);
> -	} while (++work < quota);
> +out:
> +	sd->input_pkt_queue_len -= work;
> +	input_queue_head_add(sd, work);
> +	rps_unlock(sd);
> +	local_irq_enable();
>  



Please reorder things better.

Most likely this function is called for one packet.

In your version you take twice the rps_lock()/rps_unlock() path, so
it'll be slower.

Once to 'transfer' one list to the process list

Once to be able to do the 'label out:' post processing.




^ permalink raw reply

* Re: [PATCH] NIU support for skb->rxhash
From: Eric Dumazet @ 2010-04-22 11:43 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20100422.042157.99869295.davem@davemloft.net>

Le jeudi 22 avril 2010 à 04:21 -0700, David Miller a écrit :
> But it turns out using it is largely pointless since the only way to
> get the hash value(s) is through a structure which is prepended to the
> packet data (so we take a cache miss on the packet data anyways)
> instead of being able to fetch it out of the RX descriptors :-/
> 
> If anyone out there is trying to design sane hardware, please put the
> following into your RX descriptors:
> 
> 1) ethernet protocol type (u16)
> 2) a flag bit indicating if the packet destination matched one
>    of the programmed unicast MAC addresses
> 3) a flag bit indicating "multicast"
> 4) a flag bit indicating "broadcast"
> 5) at least 32-bits of the computed flow hash (u32)
> 
> kthx, bye!

Then, our stack also touches all 256 bytes of the skb structure itself.

offsetof(struct sk_buff, next)    =0x0
offsetof(struct sk_buff, rxhash)  =0xa8
offsetof(struct sk_buff, dev)     =0x20
offsetof(struct sk_buff, len)     =0x68
offsetof(struct sk_buff, protocol)=0x7e
offsetof(struct sk_buff, network_header)=0xc0
offsetof(struct sk_buff, data)    =0xd8
offsetof(struct sk_buff, head)    =0xd0

Time for a reordering I guess ;)



^ permalink raw reply

* Re: [PATCH linux-next 1/2] irq: Add CPU mask affinity hint callback framework
From: Peter P Waskiewicz Jr @ 2010-04-22 12:11 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: tglx@linutronix.de, davem@davemloft.net, arjan@linux.jf.intel.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <1271854785.2101.17.camel@achroite.uk.solarflarecom.com>

On Wed, 21 Apr 2010, Ben Hutchings wrote:

> On Tue, 2010-04-20 at 11:01 -0700, Peter P Waskiewicz Jr wrote:
>> This patch adds a callback function pointer to the irq_desc
>> structure, along with a registration function and a read-only
>> proc entry for each interrupt.
>>
>> This affinity_hint handle for each interrupt can be used by
>> underlying drivers that need a better mechanism to control
>> interrupt affinity.  The underlying driver can register a
>> callback for the interrupt, which will allow the driver to
>> provide the CPU mask for the interrupt to anything that
>> requests it.  The intent is to extend the userspace daemon,
>> irqbalance, to help hint to it a preferred CPU mask to balance
>> the interrupt into.
>
> Doesn't it make more sense to have the driver follow affinity decisions
> made from user-space?  I realise that reallocating queues is disruptive
> and we probably don't want irqbalance to trigger that, but there should
> be a mechanism for the administrator to trigger it.

The driver here would be assisting userspace (irqbalance) by providing 
better details on how the HW is laid out with respect to flows.  As it stands 
today, irqbalance is almost guaranteed to move interrupts to CPUs that are 
not aligned with where applications are running for network adapters. 
This is very apparent when running at speeds in the 10 Gigabit range, or 
even with multiple 1 Gigabit ports running at the same time.

>
> Looking at your patch for ixgbe:
>
> [...]
>> diff --git a/drivers/net/ixgbe/ixgbe_main.c
>> b/drivers/net/ixgbe/ixgbe_main.c
>> index 1b1419c..3e00d41 100644
>> --- a/drivers/net/ixgbe/ixgbe_main.c
>> +++ b/drivers/net/ixgbe/ixgbe_main.c
> [...]
>> @@ -1083,6 +1113,16 @@ static void ixgbe_configure_msix(struct ixgbe_adapter *adapter)
>>                         q_vector->eitr = adapter->rx_eitr_param;
>>
>>                 ixgbe_write_eitr(q_vector);
>> +
>> +               /*
>> +                * Allocate the affinity_hint cpumask, assign the mask for
>> +                * this vector, and register our affinity_hint callback.
>> +                */
>> +               alloc_cpumask_var(&q_vector->affinity_mask, GFP_KERNEL);
>> +               cpumask_set_cpu(v_idx, q_vector->affinity_mask);
>> +               irq_register_affinity_hint(adapter->msix_entries[v_idx].vector,
>> +                                          adapter,
>> +                                          &ixgbe_irq_affinity_callback);
>>         }
>>
>>         if (adapter->hw.mac.type == ixgbe_mac_82598EB)
> [...]
>
> This just assigns IRQs to the first n CPU threads.  Depending on the
> enumeration order, this might result in assigning an IRQ to each of 2
> threads on a core while leaving other cores unused!

This ixgbe patch is only meant to be an example of how you could use it. 
I didn't hammer out all the corner cases of interrupt alignment in it yet. 
However, ixgbe is already aligning Tx flows onto the CPU/queue pair the Tx 
occurred (i.e. Tx session from CPU 4 will be queued on Tx queue 4), and 
then uses our Flow Director HW offload to steer Rx to Rx queue 4, assuming 
that the interrupt for Rx queue 4 is affinitized to CPU 4.  The flow 
alignment breaks when the IRQ affinity has no knowledge of what the 
underlying set of vectors is bound to, and what mode the HW is running 
in.

FCoE offloads that spread multiple SCSI exchange IDs across CPU cores also 
need this to properly align things.  John Fastabend is going to provide 
some examples where this is very useful in the FCoE case.

> irqbalance can already find the various IRQs associated with a single
> net device by looking at the handler names.  So it can do at least as
> well as this without such a hint.  Unless drivers have *useful* hints to
> give, I don't see the point in adding this mechanism.

irqbalance identifies which interrupts go with which network device.  But 
it has no clue about flow management, and often will make a decision that 
hurts performance scaling.  I have data showing when scaling multiple 10 
Gigabit ports (4 in the current test), I can gain an extra 10 Gigabits of 
throughput just by aligning the interrupts properly (go from ~58 Gbps to 
~68 Gbps in bi-directional tests).

I do have the patches for irqbalance that use this new handle to make 
better decisions for devices implementing the mask.  I can send those to 
help show the whole picture of what's happening.

Appreciate the feedback though Ben.

Cheers,
-PJ

^ permalink raw reply

* Re: rps perfomance WAS(Re: rps: question
From: jamal @ 2010-04-22 12:12 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, Rick Jones, David Miller, therbert, netdev, robert,
	andi
In-Reply-To: <1271876480.7895.3106.camel@edumazet-laptop>

On Wed, 2010-04-21 at 21:01 +0200, Eric Dumazet wrote:

> Drawback of using a fixed src ip from your generator is that all flows
> share the same struct dst entry on SUT. This might explain some glitches
> you noticed (ip_route_input + ip_rcv at high level on slave/application
> cpus)

yes, that would explain it ;-> I could have flows going to each cpu
generating different unique dst. It is good i didnt ;->

> Also note your test is one way. If some data was replied we would see
> much use of the 'flows'
> 

In my next step i wanted to "route" these packets at app level and for
this stage of testing just wanted to sink the data to reduce experiment
variables. Reason:
The netdev structure would hit a lot of cache misses if i started using
it to both send/recv since lots of things are shared on tx/rx (example
napi tx pruning could happen on either tx or receive path); same thing
with qdisc path which is at netdev granularity.. I think there may be
room for interesting improvements in this area..

> I notice epoll_ctl() used a lot, are you re-arming epoll each time you
> receive a datagram ?

I am using default libevent on debian. It looks very old and maybe
buggy. I will try to upgrade first and if still see the same
investigate.
  
> I see slave/application cpus hit _raw_spin_lock_irqsave() and  
> _raw_spin_unlock_irqrestore().
> 
> Maybe a ring buffer could help (instead of a double linked queue) for
> backlog, or the double queue trick, if Changli wants to respin his
> patch.
> 

Ok, I will have some cycles later today/tomorrow, or for sure on the weekend.
My setup is still intact - so i can test.

cheers,
jamal


^ permalink raw reply

* [PATCH] Socket filter ancilliary data access for skb->dev->type
From: Paul LeoNerd Evans @ 2010-04-22 12:12 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 1393 bytes --]

Add an SKF_AD_HATYPE field to the packet ancillary data area, giving
access to skb->dev->type, as reported in the sll_hatype field.

When capturing packets on a PF_PACKET/SOCK_RAW socket bound to all
interfaces, there doesn't appear to be a way for the filter program to
actually find out the underlying hardware type the packet was captured
on. This patch adds such ability.

Signed-off-by: Paul Evans <leonerd@leonerd.org.uk>

---

diff -ur linux-2.6.33.2.orig/include/linux/filter.h linux-2.6.33.2/include/linux/filter.h
--- linux-2.6.33.2.orig/include/linux/filter.h	2010-04-02 00:02:33.000000000 +0100
+++ linux-2.6.33.2/include/linux/filter.h	2010-04-20 22:40:25.000000000 +0100
@@ -123,7 +123,8 @@
 #define SKF_AD_NLATTR_NEST	16
 #define SKF_AD_MARK 	20
 #define SKF_AD_QUEUE	24
-#define SKF_AD_MAX	28
+#define SKF_AD_HATYPE	28
+#define SKF_AD_MAX	32
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff -ur linux-2.6.33.2.orig/net/core/filter.c linux-2.6.33.2/net/core/filter.c
--- linux-2.6.33.2.orig/net/core/filter.c	2010-04-02 00:02:33.000000000 +0100
+++ linux-2.6.33.2/net/core/filter.c	2010-04-20 22:41:01.000000000 +0100
@@ -309,6 +309,9 @@
 		case SKF_AD_QUEUE:
 			A = skb->queue_mapping;
 			continue;
+		case SKF_AD_HATYPE:
+			A = skb->dev->type;
+			continue;
 		case SKF_AD_NLATTR: {
 			struct nlattr *nla;

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 190 bytes --]

^ permalink raw reply

* Re: [PATCH v3] net: batch skb dequeueing from softnet input_pkt_queue
From: jamal @ 2010-04-22 12:13 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Changli Gao, David S. Miller, netdev, Tom Herbert
In-Reply-To: <1271891149.7895.3751.camel@edumazet-laptop>

On Thu, 2010-04-22 at 01:05 +0200, Eric Dumazet wrote:


> [RFC] net: introduce a batch mode in process_backlog()
> 
> We see a lock contention on input_pkt_queue.lock in RPS benches.
> 
> As suggested by Changli Gao, we can batch several skbs at once in
> process_backlog(), so that we dirty input_pkt_queue less often.
> 

Ok, so i grab the latest and greatest net-next and apply this before
testing? Let me know..

cheers,
jamal


^ permalink raw reply

* Re: [PATCH v5] net: batch skb dequeueing from softnet input_pkt_queue
From: jamal @ 2010-04-22 12:17 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Changli Gao, David S. Miller, Tom Herbert, netdev
In-Reply-To: <1271936227.7895.5285.camel@edumazet-laptop>

On Thu, 2010-04-22 at 13:37 +0200, Eric Dumazet wrote:

> Please reorder things better.
> 
> Most likely this function is called for one packet.
> 
> In your version you take twice the rps_lock()/rps_unlock() path, so
> it'll be slower.
> 
> Once to 'transfert' one list to process list
> 
> Once to be able to do the 'label out:' post processing.
> 

Ok, once it is settled what the right changes are, I will test.

cheers,
jamal


^ permalink raw reply

* Re: [PATCH v5] net: batch skb dequeueing from softnet input_pkt_queue
From: Changli Gao @ 2010-04-22 12:27 UTC (permalink / raw)
  To: David Miller, Eric Dumazet; +Cc: hadi, therbert, netdev
In-Reply-To: <20100422.024336.119875321.davem@davemloft.net>

On Thu, Apr 22, 2010 at 5:43 PM, David Miller <davem@davemloft.net> wrote:
> From: Changli Gao <xiaosuo@gmail.com>
> Date: Thu, 22 Apr 2010 17:09:17 +0800
>
>> +     unsigned int            input_pkt_queue_len;
>> +     struct sk_buff          *input_pkt_queue_head;
>> +     struct sk_buff          **input_pkt_queue_tailp;
>> +
>
> Please do not ignore Stephen Hemminger's feedback.
>
> We already have enough odd SKB queue implementations, we
> do not need yet another one in a core location.  This makes
> it harder and harder to eventually convert sk_buff to use
> "struct list_head".
>
> Instead, use "struct sk_buff_head" and the lockless accessors
> (__skb_insert, etc.) and initializer (__skb_queue_head_init).
>

If I want to keep softnet_data small, I have to access the internal
fields of sk_buff_head, and modify them in a hacky way. It doesn't
sound good. If not, softnet_data will become:

struct softnet_data {
        struct Qdisc            *output_queue;
        struct list_head        poll_list;
        struct sk_buff          *completion_queue;
        struct sk_buff_head     process_queue;

#ifdef CONFIG_RPS
        struct softnet_data     *rps_ipi_list;

        /* Elements below can be accessed between CPUs for RPS */
        struct call_single_data csd ____cacheline_aligned_in_smp;
        struct softnet_data     *rps_ipi_next;
        unsigned int            cpu;
        unsigned int            input_queue_head;
#endif
        unsigned int            input_pkt_queue_len;
        struct sk_buff_head     input_pkt_queue;
        struct napi_struct      backlog;
};

Eric, do you think it is too fat?

^ permalink raw reply

* Re: [PATCH] Socket filter ancilliary data access for skb->dev->type
From: Patrick McHardy @ 2010-04-22 12:28 UTC (permalink / raw)
  To: Paul LeoNerd Evans; +Cc: netdev
In-Reply-To: <20100422121253.GR19334@cel.leo>

Paul LeoNerd Evans wrote:
> Add an SKF_AD_HATYPE field to the packet ancilliary data area, giving
> access to skb->dev->type, as reported in the sll_hatype field.
> 
> When capturing packets on a PF_PACKET/SOCK_RAW socket bound to all
> interfaces, there doesn't appear to be a way for the filter program to
> actually find out the underlying hardware type the packet was captured
> on. This patch adds such ability.
> 
> +		case SKF_AD_HATYPE:
> +			A = skb->dev->type;
> +			continue;

I think we should be adding a check whether skb->dev is non-NULL here
since filters can also be attached to netlink sockets. The same applies
to SKF_AD_IFINDEX.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox