Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC][PATCH v3 2/3] Provides multiple submits and asynchronous notifications.
From: xiaohui.xin @ 2010-04-09  9:37 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1270805865-16901-2-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
---
 drivers/vhost/net.c   |  203 +++++++++++++++++++++++++++++++++++++++++++++++--
 drivers/vhost/vhost.c |  115 ++++++++++++++++------------
 drivers/vhost/vhost.h |   15 ++++
 3 files changed, 278 insertions(+), 55 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..d3fb3fc 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
 #include <linux/workqueue.h>
 #include <linux/rcupdate.h>
 #include <linux/file.h>
+#include <linux/aio.h>
 
 #include <linux/net.h>
 #include <linux/if_packet.h>
 #include <linux/if_arp.h>
 #include <linux/if_tun.h>
+#include <linux/mpassthru.h>
 
 #include <net/sock.h>
 
@@ -47,6 +49,7 @@ struct vhost_net {
 	struct vhost_dev dev;
 	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
 	struct vhost_poll poll[VHOST_NET_VQ_MAX];
+	struct kmem_cache       *cache;
 	/* Tells us whether we are polling a socket for TX.
 	 * We only do this when socket buffer fills up.
 	 * Protected by tx vq lock. */
@@ -91,11 +94,100 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
 	net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+	struct kiocb *iocb = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vq->notify_lock, flags);
+	if (!list_empty(&vq->notifier)) {
+		iocb = list_first_entry(&vq->notifier,
+				struct kiocb, ki_list);
+		list_del(&iocb->ki_list);
+	}
+	spin_unlock_irqrestore(&vq->notify_lock, flags);
+	return iocb;
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+					struct vhost_virtqueue *vq)
+{
+	struct kiocb *iocb = NULL;
+	struct vhost_log *vq_log = NULL;
+	int rx_total_len = 0;
+	unsigned int head, log, in, out;
+	int size;
+
+	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+		return;
+
+	if (vq->receiver)
+		vq->receiver(vq);
+
+	vq_log = unlikely(vhost_has_feature(
+				&net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
+	while ((iocb = notify_dequeue(vq)) != NULL) {
+		vhost_add_used_and_signal(&net->dev, vq,
+				iocb->ki_pos, iocb->ki_nbytes);
+		log = (int)iocb->ki_user_data;
+		size = iocb->ki_nbytes;
+		head = iocb->ki_pos;
+		rx_total_len += iocb->ki_nbytes;
+
+		if (iocb->ki_dtor)
+			iocb->ki_dtor(iocb);
+		kmem_cache_free(net->cache, iocb);
+
+		/* when log is enabled, recomputing the log info is needed,
+		 * since these buffers are in async queue, and may not get
+		 * the log info before.
+		 */
+		if (unlikely(vq_log)) {
+			if (!log)
+				__vhost_get_vq_desc(&net->dev, vq, vq->iov,
+						    ARRAY_SIZE(vq->iov),
+						    &out, &in, vq_log,
+						    &log, head);
+			vhost_log_write(vq, vq_log, log, size);
+		}
+		if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+					struct vhost_virtqueue *vq)
+{
+	struct kiocb *iocb = NULL;
+	int tx_total_len = 0;
+
+	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+		return;
+
+	while ((iocb = notify_dequeue(vq)) != NULL) {
+		vhost_add_used_and_signal(&net->dev, vq,
+				iocb->ki_pos, 0);
+		tx_total_len += iocb->ki_nbytes;
+
+		if (iocb->ki_dtor)
+			iocb->ki_dtor(iocb);
+
+		kmem_cache_free(net->cache, iocb);
+		if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+	struct kiocb *iocb = NULL;
 	unsigned head, out, in, s;
 	struct msghdr msg = {
 		.msg_name = NULL,
@@ -124,6 +216,8 @@ static void handle_tx(struct vhost_net *net)
 		tx_poll_stop(net);
 	hdr_size = vq->hdr_size;
 
+	handle_async_tx_events_notify(net, vq);
+
 	for (;;) {
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
@@ -151,6 +245,15 @@ static void handle_tx(struct vhost_net *net)
 		/* Skip header. TODO: support TSO. */
 		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
 		msg.msg_iovlen = out;
+
+		if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+			iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
+			if (!iocb)
+				break;
+			iocb->ki_pos = head;
+			iocb->private = (void *)vq;
+		}
+
 		len = iov_length(vq->iov, out);
 		/* Sanity check */
 		if (!len) {
@@ -160,12 +263,18 @@ static void handle_tx(struct vhost_net *net)
 			break;
 		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
-		err = sock->ops->sendmsg(NULL, sock, &msg, len);
+		err = sock->ops->sendmsg(iocb, sock, &msg, len);
 		if (unlikely(err < 0)) {
+			if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+				kmem_cache_free(net->cache, iocb);
 			vhost_discard_vq_desc(vq);
 			tx_poll_start(net, sock);
 			break;
 		}
+
+		if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+			continue;
+
 		if (err != len)
 			pr_err("Truncated TX packet: "
 			       " len %d != %zd\n", err, len);
@@ -177,6 +286,8 @@ static void handle_tx(struct vhost_net *net)
 		}
 	}
 
+	handle_async_tx_events_notify(net, vq);
+
 	mutex_unlock(&vq->mutex);
 	unuse_mm(net->dev.mm);
 }
@@ -186,6 +297,7 @@ static void handle_tx(struct vhost_net *net)
 static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+	struct kiocb *iocb = NULL;
 	unsigned head, out, in, log, s;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
@@ -206,7 +318,8 @@ static void handle_rx(struct vhost_net *net)
 	int err;
 	size_t hdr_size;
 	struct socket *sock = rcu_dereference(vq->private_data);
-	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+	if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
+			vq->link_state == VHOST_VQ_LINK_SYNC))
 		return;
 
 	use_mm(net->dev.mm);
@@ -214,9 +327,17 @@ static void handle_rx(struct vhost_net *net)
 	vhost_disable_notify(vq);
 	hdr_size = vq->hdr_size;
 
+	/* In async cases, when write log is enabled, in case the submitted
+	 * buffers did not get log info before the log enabling, so we'd
+	 * better recompute the log info when needed. We do this in
+	 * handle_async_rx_events_notify().
+	 */
+
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
 
+	handle_async_rx_events_notify(net, vq);
+
 	for (;;) {
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
@@ -245,6 +366,14 @@ static void handle_rx(struct vhost_net *net)
 		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
 		msg.msg_iovlen = in;
 		len = iov_length(vq->iov, in);
+		if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+			iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
+			if (!iocb)
+				break;
+			iocb->private = vq;
+			iocb->ki_pos = head;
+			iocb->ki_user_data = log;
+		}
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for RX: "
@@ -252,13 +381,20 @@ static void handle_rx(struct vhost_net *net)
 			       iov_length(vq->hdr, s), hdr_size);
 			break;
 		}
-		err = sock->ops->recvmsg(NULL, sock, &msg,
+
+		err = sock->ops->recvmsg(iocb, sock, &msg,
 					 len, MSG_DONTWAIT | MSG_TRUNC);
 		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
+			if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+				kmem_cache_free(net->cache, iocb);
 			vhost_discard_vq_desc(vq);
 			break;
 		}
+
+		if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+			continue;
+
 		/* TODO: Should check and handle checksum. */
 		if (err > len) {
 			pr_err("Discarded truncated rx packet: "
@@ -284,10 +420,13 @@ static void handle_rx(struct vhost_net *net)
 		}
 	}
 
+	handle_async_rx_events_notify(net, vq);
+
 	mutex_unlock(&vq->mutex);
 	unuse_mm(net->dev.mm);
 }
 
+
 static void handle_tx_kick(struct work_struct *work)
 {
 	struct vhost_virtqueue *vq;
@@ -338,6 +477,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
 	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
 	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
+	n->cache = NULL;
 	return 0;
 }
 
@@ -398,6 +538,18 @@ static void vhost_net_flush(struct vhost_net *n)
 	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
 }
 
+static void vhost_async_cleanup(struct vhost_net *n)
+{
+	/* clean the notifier */
+	struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
+	struct kiocb *iocb = NULL;
+	if (n->cache) {
+		while ((iocb = notify_dequeue(vq)) != NULL)
+			kmem_cache_free(n->cache, iocb);
+		kmem_cache_destroy(n->cache);
+	}
+}
+
 static int vhost_net_release(struct inode *inode, struct file *f)
 {
 	struct vhost_net *n = f->private_data;
@@ -414,6 +566,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
 	/* We do an extra flush before freeing memory,
 	 * since jobs can re-queue themselves. */
 	vhost_net_flush(n);
+	vhost_async_cleanup(n);
 	kfree(n);
 	return 0;
 }
@@ -462,7 +615,19 @@ static struct socket *get_tun_socket(int fd)
 	return sock;
 }
 
-static struct socket *get_socket(int fd)
+static struct socket *get_mp_socket(int fd)
+{
+	struct file *file = fget(fd);
+	struct socket *sock;
+	if (!file)
+		return ERR_PTR(-EBADF);
+	sock = mp_get_socket(file);
+	if (IS_ERR(sock))
+		fput(file);
+	return sock;
+}
+
+static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
 {
 	struct socket *sock;
 	if (fd == -1)
@@ -473,9 +638,31 @@ static struct socket *get_socket(int fd)
 	sock = get_tun_socket(fd);
 	if (!IS_ERR(sock))
 		return sock;
+	sock = get_mp_socket(fd);
+	if (!IS_ERR(sock)) {
+		vq->link_state = VHOST_VQ_LINK_ASYNC;
+		return sock;
+	}
 	return ERR_PTR(-ENOTSOCK);
 }
 
+static void vhost_init_link_state(struct vhost_net *n, int index)
+{
+	struct vhost_virtqueue *vq = n->vqs + index;
+
+	WARN_ON(!mutex_is_locked(&vq->mutex));
+	if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+		vq->receiver = NULL;
+		INIT_LIST_HEAD(&vq->notifier);
+		spin_lock_init(&vq->notify_lock);
+		if (!n->cache) {
+			n->cache = kmem_cache_create("vhost_kiocb",
+					sizeof(struct kiocb), 0,
+					SLAB_HWCACHE_ALIGN, NULL);
+		}
+	}
+}
+
 static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 {
 	struct socket *sock, *oldsock;
@@ -493,12 +680,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	}
 	vq = n->vqs + index;
 	mutex_lock(&vq->mutex);
-	sock = get_socket(fd);
+	vq->link_state = VHOST_VQ_LINK_SYNC;
+	sock = get_socket(vq, fd);
 	if (IS_ERR(sock)) {
 		r = PTR_ERR(sock);
 		goto err;
 	}
 
+	vhost_init_link_state(n, index);
+
 	/* start polling new socket */
 	oldsock = vq->private_data;
 	if (sock == oldsock)
@@ -507,8 +697,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	vhost_net_disable_vq(n, vq);
 	rcu_assign_pointer(vq->private_data, sock);
 	vhost_net_enable_vq(n, vq);
-	mutex_unlock(&vq->mutex);
 done:
+	mutex_unlock(&vq->mutex);
 	mutex_unlock(&n->dev.mutex);
 	if (oldsock) {
 		vhost_net_flush_vq(n, index);
@@ -516,6 +706,7 @@ done:
 	}
 	return r;
 err:
+	mutex_unlock(&vq->mutex);
 	mutex_unlock(&n->dev.mutex);
 	return r;
 }
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 97233d5..53dab80 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -715,66 +715,21 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 	return 0;
 }
 
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which
- * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 			   struct iovec iov[], unsigned int iov_size,
 			   unsigned int *out_num, unsigned int *in_num,
-			   struct vhost_log *log, unsigned int *log_num)
+			   struct vhost_log *log, unsigned int *log_num,
+			   unsigned int head)
 {
 	struct vring_desc desc;
-	unsigned int i, head, found = 0;
-	u16 last_avail_idx;
+	unsigned int i = head, found = 0;
 	int ret;
 
-	/* Check it isn't doing very strange things with descriptor numbers. */
-	last_avail_idx = vq->last_avail_idx;
-	if (get_user(vq->avail_idx, &vq->avail->idx)) {
-		vq_err(vq, "Failed to access avail idx at %p\n",
-		       &vq->avail->idx);
-		return vq->num;
-	}
-
-	if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
-		vq_err(vq, "Guest moved used index from %u to %u",
-		       last_avail_idx, vq->avail_idx);
-		return vq->num;
-	}
-
-	/* If there's nothing new since last we looked, return invalid. */
-	if (vq->avail_idx == last_avail_idx)
-		return vq->num;
-
-	/* Only get avail ring entries after they have been exposed by guest. */
-	rmb();
-
-	/* Grab the next descriptor number they're advertising, and increment
-	 * the index we've seen. */
-	if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
-		vq_err(vq, "Failed to read head: idx %d address %p\n",
-		       last_avail_idx,
-		       &vq->avail->ring[last_avail_idx % vq->num]);
-		return vq->num;
-	}
-
-	/* If their number is silly, that's an error. */
-	if (head >= vq->num) {
-		vq_err(vq, "Guest says index %u > %u is available",
-		       head, vq->num);
-		return vq->num;
-	}
-
 	/* When we start there are none of either input nor output. */
 	*out_num = *in_num = 0;
 	if (unlikely(log))
 		*log_num = 0;
 
-	i = head;
 	do {
 		unsigned iov_count = *in_num + *out_num;
 		if (i >= vq->num) {
@@ -833,8 +788,70 @@ unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 			*out_num += ret;
 		}
 	} while ((i = next_desc(&desc)) != -1);
+	return head;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access.  Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which
+ * is never a valid descriptor number) if none was found. */
+unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+			   struct iovec iov[], unsigned int iov_size,
+			   unsigned int *out_num, unsigned int *in_num,
+			   struct vhost_log *log, unsigned int *log_num)
+{
+	struct vring_desc desc;
+	unsigned int i, head, found = 0;
+	u16 last_avail_idx;
+	unsigned int ret;
+
+	/* Check it isn't doing very strange things with descriptor numbers. */
+	last_avail_idx = vq->last_avail_idx;
+	if (get_user(vq->avail_idx, &vq->avail->idx)) {
+		vq_err(vq, "Failed to access avail idx at %p\n",
+		       &vq->avail->idx);
+		return vq->num;
+	}
+
+	if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
+		vq_err(vq, "Guest moved used index from %u to %u",
+		       last_avail_idx, vq->avail_idx);
+		return vq->num;
+	}
+
+	/* If there's nothing new since last we looked, return invalid. */
+	if (vq->avail_idx == last_avail_idx)
+		return vq->num;
+
+	/* Only get avail ring entries after they have been exposed by guest. */
+	rmb();
+
+	/* Grab the next descriptor number they're advertising, and increment
+	 * the index we've seen. */
+	if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
+		vq_err(vq, "Failed to read head: idx %d address %p\n",
+		       last_avail_idx,
+		       &vq->avail->ring[last_avail_idx % vq->num]);
+		return vq->num;
+	}
+
+	/* If their number is silly, that's an error. */
+	if (head >= vq->num) {
+		vq_err(vq, "Guest says index %u > %u is available",
+		       head, vq->num);
+		return vq->num;
+	}
+
+	ret = __vhost_get_vq_desc(dev, vq, iov, iov_size,
+				  out_num, in_num,
+				  log, log_num, head);
 
 	/* On success, increment avail index. */
+	if (ret == vq->num)
+		return ret;
 	vq->last_avail_idx++;
 	return head;
 }
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d1f0453..a74a6d4 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -43,6 +43,11 @@ struct vhost_log {
 	u64 len;
 };
 
+enum vhost_vq_link_state {
+	VHOST_VQ_LINK_SYNC = 	0,
+	VHOST_VQ_LINK_ASYNC = 	1,
+};
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -96,6 +101,11 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log log[VHOST_NET_MAX_SG];
+	/*Differiate async socket for 0-copy from normal*/
+	enum vhost_vq_link_state link_state;
+	struct list_head notifier;
+	spinlock_t notify_lock;
+	void (*receiver)(struct vhost_virtqueue *);
 };
 
 struct vhost_dev {
@@ -122,6 +132,11 @@ unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
 			   struct iovec iov[], unsigned int iov_count,
 			   unsigned int *out_num, unsigned int *in_num,
 			   struct vhost_log *log, unsigned int *log_num);
+unsigned __vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
+			   struct iovec iov[], unsigned int iov_count,
+			   unsigned int *out_num, unsigned int *in_num,
+			   struct vhost_log *log, unsigned int *log_num,
+			   unsigned int head);
 void vhost_discard_vq_desc(struct vhost_virtqueue *);
 
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-- 
1.5.4.4


^ permalink raw reply related

* Re: Crashes in xfrm_lookup
From: Herbert Xu @ 2010-04-09  9:25 UTC (permalink / raw)
  To: Timo Teräs; +Cc: broonie, netdev
In-Reply-To: <4BBEE990.6080807@iki.fi>

On Fri, Apr 09, 2010 at 11:47:12AM +0300, Timo Teräs wrote:
>
> No. Prior it had one element unconditionally. My patch made
> it have zero or one element. The non-SUB_POLICY case crashed
> because xfrm_pols_put(xxx, 0) unconditionally calls
> xfrm_policy_put on unused pointer.

OK I was mistaken.

> __xfrm_lookup(). In the end it uses: "xfrm_pols_put(pols, drop_pols)"
> to free up policies that are looked up with xfrm_sk_policy_lookup().
> The only major code path there is, if per-socket policy has no
> transformations (which is common case, ike daemons do this so they
> can talk IKE without transformations).
>
> If we have cached bundle, the policies are referenced to from the
> bundle and we do not need to reference, or release them in the
> lookup function.
>
> It is a bit icky. But it's the only way to do it, since no one
> wanted to cache per-socket bundles in the flow cache.

BTW, socket policies cannot be sub-policies so you don't need
to expand policies for them.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH v2 net-next 2/2] gianfar: Add hardware TX timestamping support
From: Manfred Rudigier @ 2010-04-09  9:10 UTC (permalink / raw)
  To: 'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'
  Cc: 'sandeep.kumar@freescale.com', 'David Miller',
	'Scott Wood'

If a packet has the skb_shared_tx->hardware flag set the device is
instructed to generate a TX timestamp and write it back to memory after
the frame is transmitted. During the clean_tx_ring operation the
timestamp will be extracted and copied into the skb_shared_hwtstamps
struct of the skb.

TX timestamping is enabled by setting the tx_type to something else
than HWTSTAMP_TX_OFF with the SIOCSHWTSTAMP ioctl command. It is only
supported by eTSEC devices.

Signed-off-by: Manfred Rudigier <manfred.rudigier@omicron.at>
---
 drivers/net/gianfar.c |  118 +++++++++++++++++++++++++++++++++++++++++--------
 drivers/net/gianfar.h |    3 +-
 2 files changed, 101 insertions(+), 20 deletions(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 552e10d..becc3f3 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -795,8 +795,18 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
 	if (config.flags)
 		return -EINVAL;
 
-	if (config.tx_type)
+	switch (config.tx_type) {
+	case HWTSTAMP_TX_OFF:
+		priv->hwts_tx_en = 0;
+		break;
+	case HWTSTAMP_TX_ON:
+		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
+			return -ERANGE;
+		priv->hwts_tx_en = 1;
+		break;
+	default:
 		return -ERANGE;
+	}
 
 	switch (config.rx_filter) {
 	case HWTSTAMP_FILTER_NONE:
@@ -1972,23 +1982,29 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct netdev_queue *txq;
 	struct gfar __iomem *regs = NULL;
 	struct txfcb *fcb = NULL;
-	struct txbd8 *txbdp, *txbdp_start, *base;
+	struct txbd8 *txbdp, *txbdp_start, *base, *txbdp_tstamp = NULL;
 	u32 lstatus;
-	int i, rq = 0;
+	int i, rq = 0, do_tstamp = 0;
 	u32 bufaddr;
 	unsigned long flags;
-	unsigned int nr_frags, length;
-
+	unsigned int nr_frags, nr_txbds, length;
+	union skb_shared_tx *shtx;
 
 	rq = skb->queue_mapping;
 	tx_queue = priv->tx_queue[rq];
 	txq = netdev_get_tx_queue(dev, rq);
 	base = tx_queue->tx_bd_base;
 	regs = tx_queue->grp->regs;
+	shtx = skb_tx(skb);
+
+	/* check if time stamp should be generated */
+	if (unlikely(shtx->hardware && priv->hwts_tx_en))
+		do_tstamp = 1;
 
 	/* make space for additional header when fcb is needed */
 	if (((skb->ip_summed == CHECKSUM_PARTIAL) ||
-			(priv->vlgrp && vlan_tx_tag_present(skb))) &&
+			(priv->vlgrp && vlan_tx_tag_present(skb)) ||
+			unlikely(do_tstamp)) &&
 			(skb_headroom(skb) < GMAC_FCB_LEN)) {
 		struct sk_buff *skb_new;
 
@@ -2005,8 +2021,14 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* total number of fragments in the SKB */
 	nr_frags = skb_shinfo(skb)->nr_frags;
 
+	/* calculate the required number of TxBDs for this skb */
+	if (unlikely(do_tstamp))
+		nr_txbds = nr_frags + 2;
+	else
+		nr_txbds = nr_frags + 1;
+
 	/* check if there is space to queue this packet */
-	if ((nr_frags+1) > tx_queue->num_txbdfree) {
+	if (nr_txbds > tx_queue->num_txbdfree) {
 		/* no space, stop the queue */
 		netif_tx_stop_queue(txq);
 		dev->stats.tx_fifo_errors++;
@@ -2018,9 +2040,19 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	txq->tx_packets ++;
 
 	txbdp = txbdp_start = tx_queue->cur_tx;
+	lstatus = txbdp->lstatus;
+
+	/* Time stamp insertion requires one additional TxBD */
+	if (unlikely(do_tstamp))
+		txbdp_tstamp = txbdp = next_txbd(txbdp, base,
+				tx_queue->tx_ring_size);
 
 	if (nr_frags == 0) {
-		lstatus = txbdp->lstatus | BD_LFLAG(TXBD_LAST | TXBD_INTERRUPT);
+		if (unlikely(do_tstamp))
+			txbdp_tstamp->lstatus |= BD_LFLAG(TXBD_LAST |
+					TXBD_INTERRUPT);
+		else
+			lstatus |= BD_LFLAG(TXBD_LAST | TXBD_INTERRUPT);
 	} else {
 		/* Place the fragment addresses and lengths into the TxBDs */
 		for (i = 0; i < nr_frags; i++) {
@@ -2066,11 +2098,32 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		gfar_tx_vlan(skb, fcb);
 	}
 
-	/* setup the TxBD length and buffer pointer for the first BD */
+	/* Setup tx hardware time stamping if requested */
+	if (unlikely(do_tstamp)) {
+		shtx->in_progress = 1;
+		if (fcb == NULL)
+			fcb = gfar_add_fcb(skb);
+		fcb->ptp = 1;
+		lstatus |= BD_LFLAG(TXBD_TOE);
+	}
+
 	txbdp_start->bufPtr = dma_map_single(&priv->ofdev->dev, skb->data,
 			skb_headlen(skb), DMA_TO_DEVICE);
 
-	lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | skb_headlen(skb);
+	/*
+	 * If time stamping is requested one additional TxBD must be set up. The
+	 * first TxBD points to the FCB and must have a data length of
+	 * GMAC_FCB_LEN. The second TxBD points to the actual frame data with
+	 * the full frame length.
+	 */
+	if (unlikely(do_tstamp)) {
+		txbdp_tstamp->bufPtr = txbdp_start->bufPtr + GMAC_FCB_LEN;
+		txbdp_tstamp->lstatus |= BD_LFLAG(TXBD_READY) |
+				(skb_headlen(skb) - GMAC_FCB_LEN);
+		lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | GMAC_FCB_LEN;
+	} else {
+		lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | skb_headlen(skb);
+	}
 
 	/*
 	 * We can work in parallel with gfar_clean_tx_ring(), except
@@ -2110,7 +2163,7 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	tx_queue->cur_tx = next_txbd(txbdp, base, tx_queue->tx_ring_size);
 
 	/* reduce TxBD free count */
-	tx_queue->num_txbdfree -= (nr_frags + 1);
+	tx_queue->num_txbdfree -= (nr_txbds);
 
 	dev->trans_start = jiffies;
 
@@ -2301,16 +2354,18 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 	struct net_device *dev = tx_queue->dev;
 	struct gfar_private *priv = netdev_priv(dev);
 	struct gfar_priv_rx_q *rx_queue = NULL;
-	struct txbd8 *bdp;
+	struct txbd8 *bdp, *next = NULL;
 	struct txbd8 *lbdp = NULL;
 	struct txbd8 *base = tx_queue->tx_bd_base;
 	struct sk_buff *skb;
 	int skb_dirtytx;
 	int tx_ring_size = tx_queue->tx_ring_size;
-	int frags = 0;
+	int frags = 0, nr_txbds = 0;
 	int i;
 	int howmany = 0;
 	u32 lstatus;
+	size_t buflen;
+	union skb_shared_tx *shtx;
 
 	rx_queue = priv->rx_queue[tx_queue->qindex];
 	bdp = tx_queue->dirty_tx;
@@ -2320,7 +2375,18 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 		unsigned long flags;
 
 		frags = skb_shinfo(skb)->nr_frags;
-		lbdp = skip_txbd(bdp, frags, base, tx_ring_size);
+
+		/*
+		 * When time stamping, one additional TxBD must be freed.
+		 * Also, we need to dma_unmap_single() the TxPAL.
+		 */
+		shtx = skb_tx(skb);
+		if (unlikely(shtx->in_progress))
+			nr_txbds = frags + 2;
+		else
+			nr_txbds = frags + 1;
+
+		lbdp = skip_txbd(bdp, nr_txbds - 1, base, tx_ring_size);
 
 		lstatus = lbdp->lstatus;
 
@@ -2329,10 +2395,24 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 				(lstatus & BD_LENGTH_MASK))
 			break;
 
-		dma_unmap_single(&priv->ofdev->dev,
-				bdp->bufPtr,
-				bdp->length,
-				DMA_TO_DEVICE);
+		if (unlikely(shtx->in_progress)) {
+			next = next_txbd(bdp, base, tx_ring_size);
+			buflen = next->length + GMAC_FCB_LEN;
+		} else
+			buflen = bdp->length;
+
+		dma_unmap_single(&priv->ofdev->dev, bdp->bufPtr,
+				buflen, DMA_TO_DEVICE);
+
+		if (unlikely(shtx->in_progress)) {
+			struct skb_shared_hwtstamps shhwtstamps;
+			u64 *ns = (u64*) (((u32)skb->data + 0x10) & ~0x7);
+			memset(&shhwtstamps, 0, sizeof(shhwtstamps));
+			shhwtstamps.hwtstamp = ns_to_ktime(*ns);
+			skb_tstamp_tx(skb, &shhwtstamps);
+			bdp->lstatus &= BD_LFLAG(TXBD_WRAP);
+			bdp = next;
+		}
 
 		bdp->lstatus &= BD_LFLAG(TXBD_WRAP);
 		bdp = next_txbd(bdp, base, tx_ring_size);
@@ -2364,7 +2444,7 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 
 		howmany++;
 		spin_lock_irqsave(&tx_queue->txlock, flags);
-		tx_queue->num_txbdfree += frags + 1;
+		tx_queue->num_txbdfree += nr_txbds;
 		spin_unlock_irqrestore(&tx_queue->txlock, flags);
 	}
 
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index 1ea287c..ac4a92e 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -540,7 +540,7 @@ struct txbd8
 
 struct txfcb {
 	u8	flags;
-	u8	reserved;
+	u8	ptp;    /* Flag to enable tx timestamping */
 	u8	l4os;	/* Level 4 Header Offset */
 	u8	l3os; 	/* Level 3 Header Offset */
 	u16	phcs;	/* Pseudo-header Checksum */
@@ -1105,6 +1105,7 @@ struct gfar_private {
 
 	/* HW time stamping enabled flag */
 	int hwts_rx_en;
+	int hwts_tx_en;
 };
 
 extern unsigned int ftp_rqfpr[MAX_FILER_IDX + 1];
-- 
1.6.3.3

^ permalink raw reply related

* [PATCH v2 net-next 1/2] gianfar: Add hardware RX timestamping support
From: Manfred Rudigier @ 2010-04-09  9:10 UTC (permalink / raw)
  To: 'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'
  Cc: 'sandeep.kumar@freescale.com', 'David Miller',
	'Scott Wood'

The device is configured to insert hardware timestamps into all
received packets. The RX timestamps are extracted from the padding
alingment bytes during the clean_rx_ring operation and copied into the
skb_shared_hwtstamps struct of the skb. This extraction only happens if
the rx_filter was set to something else than HWTSTAMP_FILTER_NONE with
the SIOCSHWTSTAMP ioctl command.

Hardware timestamping is only supported for eTSEC devices. To indicate
device support the new FSL_GIANFAR_DEV_HAS_TIMER flag was introduced.

Signed-off-by: Manfred Rudigier <manfred.rudigier@omicron.at>
---
 drivers/net/gianfar.c |   66 +++++++++++++++++++++++++++++++++++++++++++++---
 drivers/net/gianfar.h |    5 +++
 2 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 080d1ce..552e10d 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -82,6 +82,7 @@
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/in.h>
+#include <linux/net_tstamp.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -377,6 +378,13 @@ static void gfar_init_mac(struct net_device *ndev)
 		rctrl |= RCTRL_PADDING(priv->padding);
 	}
 
+	/* Insert receive time stamps into padding alignment bytes */
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER) {
+		rctrl &= ~RCTRL_PAL_MASK;
+		rctrl |= RCTRL_PRSDEP_INIT | RCTRL_TS_ENABLE | RCTRL_PADDING(8);
+		priv->padding = 8;
+	}
+
 	/* keep vlan related bits if it's enabled */
 	if (priv->vlgrp) {
 		rctrl |= RCTRL_VLEX | RCTRL_PRSDEP_INIT;
@@ -501,7 +509,8 @@ void unlock_tx_qs(struct gfar_private *priv)
 /* Returns 1 if incoming frames use an FCB */
 static inline int gfar_uses_fcb(struct gfar_private *priv)
 {
-	return priv->vlgrp || priv->rx_csum_enable;
+	return priv->vlgrp || priv->rx_csum_enable ||
+		(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER);
 }
 
 static void free_tx_pointers(struct gfar_private *priv)
@@ -742,7 +751,8 @@ static int gfar_of_init(struct of_device *ofdev, struct net_device **pdev)
 			FSL_GIANFAR_DEV_HAS_CSUM |
 			FSL_GIANFAR_DEV_HAS_VLAN |
 			FSL_GIANFAR_DEV_HAS_MAGIC_PACKET |
-			FSL_GIANFAR_DEV_HAS_EXTENDED_HASH;
+			FSL_GIANFAR_DEV_HAS_EXTENDED_HASH |
+			FSL_GIANFAR_DEV_HAS_TIMER;
 
 	ctype = of_get_property(np, "phy-connection-type", NULL);
 
@@ -772,6 +782,38 @@ err_grp_init:
 	return err;
 }
 
+static int gfar_hwtstamp_ioctl(struct net_device *netdev,
+			struct ifreq *ifr, int cmd)
+{
+	struct hwtstamp_config config;
+	struct gfar_private *priv = netdev_priv(netdev);
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	/* reserved for future extensions */
+	if (config.flags)
+		return -EINVAL;
+
+	if (config.tx_type)
+		return -ERANGE;
+
+	switch (config.rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		priv->hwts_rx_en = 0;
+		break;
+	default:
+		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
+			return -ERANGE;
+		priv->hwts_rx_en = 1;
+		config.rx_filter = HWTSTAMP_FILTER_ALL;
+		break;
+	}
+
+	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
+		-EFAULT : 0;
+}
+
 /* Ioctl MII Interface */
 static int gfar_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
@@ -780,6 +822,9 @@ static int gfar_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (!netif_running(dev))
 		return -EINVAL;
 
+	if (cmd == SIOCSHWTSTAMP)
+		return gfar_hwtstamp_ioctl(dev, rq, cmd);
+
 	if (!priv->phydev)
 		return -ENODEV;
 
@@ -982,7 +1027,8 @@ static int gfar_probe(struct of_device *ofdev,
 	else
 		priv->padding = 0;
 
-	if (dev->features & NETIF_F_IP_CSUM)
+	if (dev->features & NETIF_F_IP_CSUM ||
+			priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER)
 		dev->hard_header_len += GMAC_FCB_LEN;
 
 	/* Program the isrg regs only if number of grps > 1 */
@@ -2474,6 +2520,17 @@ static int gfar_process_frame(struct net_device *dev, struct sk_buff *skb,
 		skb_pull(skb, amount_pull);
 	}
 
+	/* Get receive timestamp from the skb */
+	if (priv->hwts_rx_en) {
+		struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+		u64 *ns = (u64 *) skb->data;
+		memset(shhwtstamps, 0, sizeof(*shhwtstamps));
+		shhwtstamps->hwtstamp = ns_to_ktime(*ns);
+	}
+
+	if (priv->padding)
+		skb_pull(skb, priv->padding);
+
 	if (priv->rx_csum_enable)
 		gfar_rx_checksum(skb, fcb);
 
@@ -2510,8 +2567,7 @@ int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue, int rx_work_limit)
 	bdp = rx_queue->cur_rx;
 	base = rx_queue->rx_bd_base;
 
-	amount_pull = (gfar_uses_fcb(priv) ? GMAC_FCB_LEN : 0) +
-		priv->padding;
+	amount_pull = (gfar_uses_fcb(priv) ? GMAC_FCB_LEN : 0);
 
 	while (!((bdp->status & RXBD_EMPTY) || (--rx_work_limit < 0))) {
 		struct sk_buff *newskb;
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index 17d25e7..1ea287c 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -262,6 +262,7 @@ extern const char gfar_driver_version[];
 
 #define next_bd(bdp, base, ring_size) skip_bd(bdp, 1, base, ring_size)
 
+#define RCTRL_TS_ENABLE 	0x01000000
 #define RCTRL_PAL_MASK		0x001f0000
 #define RCTRL_VLEX		0x00002000
 #define RCTRL_FILREN		0x00001000
@@ -885,6 +886,7 @@ struct gfar {
 #define FSL_GIANFAR_DEV_HAS_MAGIC_PACKET	0x00000100
 #define FSL_GIANFAR_DEV_HAS_BD_STASHING		0x00000200
 #define FSL_GIANFAR_DEV_HAS_BUF_STASHING	0x00000400
+#define FSL_GIANFAR_DEV_HAS_TIMER		0x00000800
 
 #if (MAXGROUPS == 2)
 #define DEFAULT_MAPPING 	0xAA
@@ -1100,6 +1102,9 @@ struct gfar_private {
 
 	/* Network Statistics */
 	struct gfar_extra_stats extra_stats;
+
+	/* HW time stamping enabled flag */
+	int hwts_rx_en;
 };
 
 extern unsigned int ftp_rqfpr[MAX_FILER_IDX + 1];
-- 
1.6.3.3

^ permalink raw reply related

* [PATCH v2 net-next 0/2] timestamping support for gianfar
From: Manfred Rudigier @ 2010-04-09  9:09 UTC (permalink / raw)
  To: 'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'
  Cc: 'sandeep.kumar@freescale.com', 'David Miller',
	'Scott Wood'
In-Reply-To: <95DC1AA8EC908B48939B72CF375AA5E311A9F4DD@alice.at.omicron.at>

On 2010-04-07 Manfred Rudigier wrote:
> this patch series adds support for hardware time stamping to gianfar. It uses
> the new SO_TIMESTAMPING infrastructure to deliver raw hardware timestamps to
> user space applications.
> 
> Freescale CPUs with an eTSEC are able to time stamp all incoming network packets
> and can also time stamp transmit packets when instructed. The time stamps are
> generated by the eTSEC timer clock module which is running either from an
> external oscillator or internal clock.
> 
> The submitted patches do not initialize the timer clock module since the
> oscillator frequency might be different from board to board. Thus the user
> must configure the timer clock module by hand at the moment - otherwise no
> time stamps will be reported. Below is a simple example code which
> shows how to configure the timer clock module on the P2020DS/RDB. It can be
> used to quickly try out the patches.
> 
> Testing was done with the time stamping program from Patrick Ohly which can
> be found in the kernel sources under Documentation/networking/timestamping.
> I have verified the functionality on the MPC8313RDB, P2020DS and P2020RDB
> board with the latest net-2.6 kernel. Send and receive time stamps could be
> retrieved on all eTSEC ports.

Hello,

I've modified the patches according to your comments so far. 

Version 2 changes:
- Only two instead of four patches so every patch adds some useful functionality
- hwts_rx_en and hwts_tx_en are no bitfields any more

Manfred

^ permalink raw reply

* [PATCH net-next-2.6] net: sk_dst_cache RCUification
From: Eric Dumazet @ 2010-04-09  9:03 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Paul E. McKenney

With latest CONFIG_PROVE_RCU stuff, I felt more comfortable to make this
work.

sk->sk_dst_cache is currently protected by a rwlock (sk_dst_lock)

This rwlock is readlocked for a very small amount of time, and dst
entries are already freed after RCU grace period. This calls for RCU
again :)

This patch converts sk_dst_lock to a spinlock, and use RCU for readers.

__sk_dst_get() is supposed to be called with rcu_read_lock() or if
socket locked by user, so use appropriate rcu_dereference_check()
condition (rcu_read_lock_held() || sock_owned_by_user(sk))

This patch avoids two atomic ops per tx packet on UDP connected sockets,
for example, and permits sk_dst_lock to be much less dirtied.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/dst.h        |   15 -----------
 include/net/ip6_route.h  |    4 +--
 include/net/sock.h       |   47 +++++++++++++++++++++++--------------
 net/core/dev.c           |    2 -
 net/core/sock.c          |    8 +++---
 net/dccp/timer.c         |    4 +--
 net/decnet/af_decnet.c   |    6 ++--
 net/ipv4/af_inet.c       |    2 -
 net/ipv4/tcp_input.c     |    4 +--
 net/ipv4/tcp_timer.c     |    4 +--
 net/ipv6/ipv6_sockglue.c |   25 ++++++++++---------
 11 files changed, 60 insertions(+), 61 deletions(-)
diff --git a/include/net/dst.h b/include/net/dst.h
index ce078cd..aac5a5f 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -225,21 +225,6 @@ static inline void dst_confirm(struct dst_entry *dst)
 		neigh_confirm(dst->neighbour);
 }
 
-static inline void dst_negative_advice(struct dst_entry **dst_p,
-				       struct sock *sk)
-{
-	struct dst_entry * dst = *dst_p;
-	if (dst && dst->ops->negative_advice) {
-		*dst_p = dst->ops->negative_advice(dst);
-
-		if (dst != *dst_p) {
-			extern void sk_reset_txq(struct sock *sk);
-
-			sk_reset_txq(sk);
-		}
-	}
-}
-
 static inline void dst_link_failure(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 68f6783..278312c 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -152,9 +152,9 @@ static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst,
 static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
 				 struct in6_addr *daddr, struct in6_addr *saddr)
 {
-	write_lock(&sk->sk_dst_lock);
+	spin_lock(&sk->sk_dst_lock);
 	__ip6_dst_store(sk, dst, daddr, saddr);
-	write_unlock(&sk->sk_dst_lock);
+	spin_unlock(&sk->sk_dst_lock);
 }
 
 static inline int ipv6_unicast_destination(struct sk_buff *skb)
diff --git a/include/net/sock.h b/include/net/sock.h
index 092b055..6d298be 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -261,7 +261,7 @@ struct sock {
 #ifdef CONFIG_XFRM
 	struct xfrm_policy	*sk_policy[2];
 #endif
-	rwlock_t		sk_dst_lock;
+	spinlock_t		sk_dst_lock;
 	atomic_t		sk_rmem_alloc;
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
@@ -1191,7 +1191,8 @@ extern unsigned long sock_i_ino(struct sock *sk);
 static inline struct dst_entry *
 __sk_dst_get(struct sock *sk)
 {
-	return sk->sk_dst_cache;
+	return rcu_dereference_check(sk->sk_dst_cache, rcu_read_lock_held() ||
+						       sock_owned_by_user(sk));
 }
 
 static inline struct dst_entry *
@@ -1199,50 +1200,62 @@ sk_dst_get(struct sock *sk)
 {
 	struct dst_entry *dst;
 
-	read_lock(&sk->sk_dst_lock);
-	dst = sk->sk_dst_cache;
+	rcu_read_lock();
+	dst = rcu_dereference(sk->sk_dst_cache);
 	if (dst)
 		dst_hold(dst);
-	read_unlock(&sk->sk_dst_lock);
+	rcu_read_unlock();
 	return dst;
 }
 
+extern void sk_reset_txq(struct sock *sk);
+
+static inline void dst_negative_advice(struct sock *sk)
+{
+	struct dst_entry *ndst, *dst = __sk_dst_get(sk);
+
+	if (dst && dst->ops->negative_advice) {
+		ndst = dst->ops->negative_advice(dst);
+
+		if (ndst != dst) {
+			rcu_assign_pointer(sk->sk_dst_cache, ndst);
+			sk_reset_txq(sk);
+		}
+	}
+}
+
 static inline void
 __sk_dst_set(struct sock *sk, struct dst_entry *dst)
 {
 	struct dst_entry *old_dst;
 
 	sk_tx_queue_clear(sk);
-	old_dst = sk->sk_dst_cache;
-	sk->sk_dst_cache = dst;
+	old_dst = rcu_dereference_check(sk->sk_dst_cache,
+					lockdep_is_held(&sk->sk_dst_lock));
+	rcu_assign_pointer(sk->sk_dst_cache, dst);
 	dst_release(old_dst);
 }
 
 static inline void
 sk_dst_set(struct sock *sk, struct dst_entry *dst)
 {
-	write_lock(&sk->sk_dst_lock);
+	spin_lock(&sk->sk_dst_lock);
 	__sk_dst_set(sk, dst);
-	write_unlock(&sk->sk_dst_lock);
+	spin_unlock(&sk->sk_dst_lock);
 }
 
 static inline void
 __sk_dst_reset(struct sock *sk)
 {
-	struct dst_entry *old_dst;
-
-	sk_tx_queue_clear(sk);
-	old_dst = sk->sk_dst_cache;
-	sk->sk_dst_cache = NULL;
-	dst_release(old_dst);
+	__sk_dst_set(sk, NULL);
 }
 
 static inline void
 sk_dst_reset(struct sock *sk)
 {
-	write_lock(&sk->sk_dst_lock);
+	spin_lock(&sk->sk_dst_lock);
 	__sk_dst_reset(sk);
-	write_unlock(&sk->sk_dst_lock);
+	spin_unlock(&sk->sk_dst_lock);
 }
 
 extern struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
diff --git a/net/core/dev.c b/net/core/dev.c
index b98ddc6..e5dcb85 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2014,7 +2014,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 			if (dev->real_num_tx_queues > 1)
 				queue_index = skb_tx_hash(dev, skb);
 
-			if (sk && sk->sk_dst_cache)
+			if (sk && rcu_dereference_check(sk->sk_dst_cache, 1))
 				sk_tx_queue_set(sk, queue_index);
 		}
 	}
diff --git a/net/core/sock.c b/net/core/sock.c
index c5812bb..7effa1e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -364,11 +364,11 @@ EXPORT_SYMBOL(sk_reset_txq);
 
 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 {
-	struct dst_entry *dst = sk->sk_dst_cache;
+	struct dst_entry *dst = __sk_dst_get(sk);
 
 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 		sk_tx_queue_clear(sk);
-		sk->sk_dst_cache = NULL;
+		rcu_assign_pointer(sk->sk_dst_cache, NULL);
 		dst_release(dst);
 		return NULL;
 	}
@@ -1157,7 +1157,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		skb_queue_head_init(&newsk->sk_async_wait_queue);
 #endif
 
-		rwlock_init(&newsk->sk_dst_lock);
+		spin_lock_init(&newsk->sk_dst_lock);
 		rwlock_init(&newsk->sk_callback_lock);
 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
 				af_callback_keys + newsk->sk_family,
@@ -1898,7 +1898,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	} else
 		sk->sk_sleep	=	NULL;
 
-	rwlock_init(&sk->sk_dst_lock);
+	spin_lock_init(&sk->sk_dst_lock);
 	rwlock_init(&sk->sk_callback_lock);
 	lockdep_set_class_and_name(&sk->sk_callback_lock,
 			af_callback_keys + sk->sk_family,
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index bbfeb5e..1a9aa05 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -38,7 +38,7 @@ static int dccp_write_timeout(struct sock *sk)
 
 	if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
 		if (icsk->icsk_retransmits != 0)
-			dst_negative_advice(&sk->sk_dst_cache, sk);
+			dst_negative_advice(sk);
 		retry_until = icsk->icsk_syn_retries ?
 			    : sysctl_dccp_request_retries;
 	} else {
@@ -63,7 +63,7 @@ static int dccp_write_timeout(struct sock *sk)
 			   Golden words :-).
 		   */
 
-			dst_negative_advice(&sk->sk_dst_cache, sk);
+			dst_negative_advice(sk);
 		}
 
 		retry_until = sysctl_dccp_retries2;
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 2b494fa..55e3b6b 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -446,7 +446,7 @@ static void dn_destruct(struct sock *sk)
 	skb_queue_purge(&scp->other_xmit_queue);
 	skb_queue_purge(&scp->other_receive_queue);
 
-	dst_release(xchg(&sk->sk_dst_cache, NULL));
+	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
 }
 
 static int dn_memory_pressure;
@@ -1105,7 +1105,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
 	release_sock(sk);
 
 	dst = skb_dst(skb);
-	dst_release(xchg(&newsk->sk_dst_cache, dst));
+	sk_dst_set(newsk, dst);
 	skb_dst_set(skb, NULL);
 
 	DN_SK(newsk)->state        = DN_CR;
@@ -1956,7 +1956,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
 	}
 
 	if ((flags & MSG_TRYHARD) && sk->sk_dst_cache)
-		dst_negative_advice(&sk->sk_dst_cache, sk);
+		dst_negative_advice(sk);
 
 	mss = scp->segsize_rem;
 	fctype = scp->services_rem & NSP_FC_MASK;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b5924f1..b39600f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -153,7 +153,7 @@ void inet_sock_destruct(struct sock *sk)
 	WARN_ON(sk->sk_forward_alloc);
 
 	kfree(inet->opt);
-	dst_release(sk->sk_dst_cache);
+	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
 	sk_refcnt_debug_dec(sk);
 }
 EXPORT_SYMBOL(inet_sock_destruct);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7b31476..b4bca29 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3709,7 +3709,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	}
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
-		dst_confirm(sk->sk_dst_cache);
+		dst_confirm(__sk_dst_get(sk));
 
 	return 1;
 
@@ -5832,7 +5832,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			if (tp->snd_una == tp->write_seq) {
 				tcp_set_state(sk, TCP_FIN_WAIT2);
 				sk->sk_shutdown |= SEND_SHUTDOWN;
-				dst_confirm(sk->sk_dst_cache);
+				dst_confirm(__sk_dst_get(sk));
 
 				if (!sock_flag(sk, SOCK_DEAD))
 					/* Wake up lingering close() */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b2e6bbc..7b3741b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -171,14 +171,14 @@ static int tcp_write_timeout(struct sock *sk)
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 		if (icsk->icsk_retransmits)
-			dst_negative_advice(&sk->sk_dst_cache, sk);
+			dst_negative_advice(sk);
 		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 	} else {
 		if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
 			/* Black hole detection */
 			tcp_mtu_probing(icsk, sk);
 
-			dst_negative_advice(&sk->sk_dst_cache, sk);
+			dst_negative_advice(sk);
 		}
 
 		retry_until = sysctl_tcp_retries2;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 430454e..089c983 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -113,9 +113,9 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
 		}
 		opt = xchg(&inet6_sk(sk)->opt, opt);
 	} else {
-		write_lock(&sk->sk_dst_lock);
+		spin_lock(&sk->sk_dst_lock);
 		opt = xchg(&inet6_sk(sk)->opt, opt);
-		write_unlock(&sk->sk_dst_lock);
+		spin_unlock(&sk->sk_dst_lock);
 	}
 	sk_dst_reset(sk);
 
@@ -970,14 +970,13 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 	case IPV6_MTU:
 	{
 		struct dst_entry *dst;
+
 		val = 0;
-		lock_sock(sk);
-		dst = sk_dst_get(sk);
-		if (dst) {
+		rcu_read_lock();
+		dst = __sk_dst_get(sk);
+		if (dst)
 			val = dst_mtu(dst);
-			dst_release(dst);
-		}
-		release_sock(sk);
+		rcu_read_unlock();
 		if (!val)
 			return -ENOTCONN;
 		break;
@@ -1065,12 +1064,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		else
 			val = np->mcast_hops;
 
-		dst = sk_dst_get(sk);
-		if (dst) {
-			if (val < 0)
+		if (val < 0) {
+			rcu_read_lock();
+			dst = __sk_dst_get(sk);
+			if (dst)
 				val = ip6_dst_hoplimit(dst);
-			dst_release(dst);
+			rcu_read_unlock();
 		}
+
 		if (val < 0)
 			val = sock_net(sk)->ipv6.devconf_all->hop_limit;
 		break;



^ permalink raw reply related

* Re: Crashes in xfrm_lookup
From: Timo Teräs @ 2010-04-09  8:47 UTC (permalink / raw)
  To: Herbert Xu; +Cc: broonie, netdev
In-Reply-To: <20100409083934.GA2353@gondor.apana.org.au>

Herbert Xu wrote:
> On Fri, Apr 09, 2010 at 11:30:49AM +0300, Timo Teräs wrote:
>> It has been array all along. The only difference was that only
>> the first element was used if SUB_POLICY was not defined.
> 
> It was an array but prior to your patch it only had a single
> element when SUB_POLICY is not defined.  Your patch made it
> contain XFRM_POLICY_TYPE_MAX elements unconditionally.

No. Prior it had one element unconditionally. My patch made
it have zero or one element. The non-SUB_POLICY case crashed
because xfrm_pols_put(xxx, 0) unconditionally calls
xfrm_policy_put on unused pointer.

>> I still think xfrm_pols_put should do always what the function
>> name says it's doing.
>>
>> If we want to further optimize non-SUB_POLICY stuff, we should
>> probably make XFRM_POLICY_TYPE_MAX = 1 and arrange rest of code
>> so that the compiler can optimize things properly.
> 
> Anyway, the fact is prior to your patch SUB_POLICY had a minimal
> impact on people who don't like it (like me), and now its effect
> is being forced on everyone.

No. The effect is because the policies are now cached in bundles,
and lookup function should not anymore drop references to policies
which are kept in cache.

>> But the fact is, that in the new code we need to do conditional
>> xfrm_policy_put depending on if we had per-socket or global policy
>> which we matched. Thus we either end up with "if (x)" or the
>> inline functions for loop's implicit test. Or do you have better
>> ideas how to avoid that?
> 
> Which particular piece of code are you referring to?

__xfrm_lookup(). In the end it uses: "xfrm_pols_put(pols, drop_pols)"
to free up policies that are looked up with xfrm_sk_policy_lookup().
The only major code path there is, if per-socket policy has no
transformations (which is common case, ike daemons do this so they
can talk IKE without transformations).

If we have cached bundle, the policies are referenced to from the
bundle and we do not need to reference, or release them in the
lookup function.

It is a bit icky. But it's the only way to do it, since no one
wanted to cache per-socket bundles in the flow cache.

^ permalink raw reply

* Re: [PATCH] tcp: Set CHECKSUM_UNNECESSARY in tcp_init_nondata_skb
From: Herbert Xu @ 2010-04-09  8:42 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20100408.113420.268424579.davem@davemloft.net>

On Thu, Apr 08, 2010 at 11:34:20AM -0700, David Miller wrote:
>
> That still leaves that MC loopback code in ip_dev_loopback_xmit()
> which still sets CHECKSUM_UNNECESSARY unconditionally.
> 
> Should it do like the loopback driver and just leave the ip_summed
> value alone?

Yes I think so.

It will mean an unnecessary verification on the receiving end,
but we could always add CHECKSUM_PARTIAL to the spots that we
care about like your patch did for TCP.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: Crashes in xfrm_lookup
From: Herbert Xu @ 2010-04-09  8:39 UTC (permalink / raw)
  To: Timo Teräs; +Cc: broonie, netdev
In-Reply-To: <4BBEE5B9.2060509@iki.fi>

On Fri, Apr 09, 2010 at 11:30:49AM +0300, Timo Teräs wrote:
>
> It has been array all along. The only difference was that only
> the first element was used if SUB_POLICY was not defined.

It was an array but prior to your patch it only had a single
element when SUB_POLICY is not defined.  Your patch made it
contain XFRM_POLICY_TYPE_MAX elements unconditionally.

> I still think xfrm_pols_put should do always what the function
> name says it's doing.
>
> If we want to further optimize non-SUB_POLICY stuff, we should
> probably make XFRM_POLICY_TYPE_MAX = 1 and arrange rest of code
> so that the compiler can optimize things properly.

Anyway, the fact is prior to your patch SUB_POLICY had a minimal
impact on people who don't like it (like me), and now its effect
is being forced on everyone.

> But the fact is, that in the new code we need to do conditional
> xfrm_policy_put depending on if we had per-socket or global policy
> which we matched. Thus we either end up with "if (x)" or the
> inline functions for loop's implicit test. Or do you have better
> ideas how to avoid that?

Which particular piece of code are you referring to?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: Crashes in xfrm_lookup
From: Timo Teräs @ 2010-04-09  8:30 UTC (permalink / raw)
  To: Herbert Xu; +Cc: broonie, netdev
In-Reply-To: <20100409082239.GA2194@gondor.apana.org.au>

Herbert Xu wrote:
> On Fri, Apr 09, 2010 at 11:11:48AM +0300, Timo Teräs wrote:
>> It's still really misleading to have generic function that does not
>> do the expected thing based on some config. Compiler should know
>> how to optimize the for loop away if it's being called with fixed
>> array size.
> 
> But the compiler doesn't know because your patch makes it an
> array unconditionally.
> 
> The SUB_POLICY stuff was a hack from the very start, but at least
> you could compile it out previously.  Now it'll pollute things
> regardless of the configuration.

It has been array all along. The only difference was that only
the first element was used if SUB_POLICY was not defined.

I still think xfrm_pols_put should do always what the function
name says it's doing.

If we want to further optimize non-SUB_POLICY stuff, we should
probably make XFRM_POLICY_TYPE_MAX = 1 and arrange rest of code
so that the compiler can optimize things properly.

But the fact is, that in the new code we need to do conditional
xfrm_policy_put depending on if we had per-socket or global policy
which we matched. Thus we either end up with "if (x)" or the
inline functions for loop's implicit test. Or do you have better
ideas how to avoid that?

^ permalink raw reply

* Re: Crashes in xfrm_lookup
From: Herbert Xu @ 2010-04-09  8:22 UTC (permalink / raw)
  To: Timo Teräs; +Cc: broonie, netdev
In-Reply-To: <4BBEE144.8070600@iki.fi>

On Fri, Apr 09, 2010 at 11:11:48AM +0300, Timo Teräs wrote:
>
> It's still really misleading to have generic function that does not
> do the expected thing based on some config. Compiler should know
> how to optimize the for loop away if it's being called with fixed
> array size.

But the compiler doesn't know because your patch makes it an
array unconditionally.

The SUB_POLICY stuff was a hack from the very start, but at least
you could compile it out previously.  Now it'll pollute things
regardless of the configuration.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* RE: net-next: 2.6.34-rc1 regression: panic when running diagnostic on interface with IPv6
From: Tantilov, Emil S @ 2010-04-09  8:19 UTC (permalink / raw)
  To: Stephen Hemminger, David Miller; +Cc: netdev@vger.kernel.org
In-Reply-To: <20100408175444.1e0da2c6@nehalam>

Stephen Hemminger wrote:
> On Mon, 05 Apr 2010 16:53:17 -0700 (PDT)
> David Miller <davem@davemloft.net> wrote:
> 
>> From: "Tantilov, Emil S" <emil.s.tantilov@intel.com>
>> Date: Mon, 5 Apr 2010 17:50:38 -0600
>> 
>>> David Miller wrote:
>>>> From: "Tantilov, Emil S" <emil.s.tantilov@intel.com>
>>>> Date: Mon, 5 Apr 2010 17:03:56 -0600
>>>> 
>>>>> David Miller wrote:
>>>>>> From: "Tantilov, Emil S" <emil.s.tantilov@intel.com>
>>>>>> Date: Tue, 23 Mar 2010 12:28:08 -0600
>>>>>> 
>>>>>>> Bisecting points to this patch:
>>>>>>> http://git.kernel.org/?p=linux/kernel/git/davem/net-next-2.6.git;a=commitdiff;h=84e8b803f1e16f3a2b8b80f80a63fa2f2f8a9be6
>>>>>>> 
>>>>>>> And I confirmed that the issue goes away after reverting it.
>>>>>>> 
>>>>>>> Steps to reproduce:
>>>>>>> 1. Load the driver and configure IPv6 address.
>>>>>>> 2. Run ethtool diag:
>>>>>>> ethtool -t eth0
>>>>>>> 
>>>>>>> 3. If this doesn't brake it try again, or just do ifdown/up.
>>>>>>> Other operations on the interface will eventually panic the
>>>>>>> system: 
>>>>>> 
>>>>>> Stephen please fix this, thanks.
>>>>> 
>>>>> Just FYI - I still see this issue with latest pull from net-2.6.
>>>> 
>>>> It's net-next-2.6 that introduced the problem and has the follow-on
>>>> fixes, not net-2.6
>>> 
>>> Same in net-next:
>> 
>> Ok, Stephen please look into this, we've had this regression
>> for almost two weeks now.
> 
> I can't reproduce it, on e1000e.
> Since the symptoms match the original problem that you already fixed,
> I have to assume that it is fixed but the reporter (Emil) had not
> updated his kernel correctly.

My tree is up to date, but can you point me to the patch that is supposed to fix this?

Thanks,
Emil

^ permalink raw reply

* Re: FEC driver: rcv is not +last
From: Matthias Kaehlcke @ 2010-04-09  8:17 UTC (permalink / raw)
  To: Bryan Wu; +Cc: netdev, Sascha Hauer
In-Reply-To: <4BBEBE57.8020800@canonical.com>

Hi Bryan,

El Fri, Apr 09, 2010 at 01:42:47PM +0800 Bryan Wu ha dit:

> On 04/08/2010 06:40 PM, Matthias Kaehlcke wrote:
> >hi,
> >
> >i have problems with the FEC on a i.MX25 3-Stack board. the kernel is
> >v2.6.34-rc2 plus the following patch:
> >http://patchwork.ozlabs.org/patch/41235/
> >
> >the following traces are generated at boot time:
> >
> >FEC Ethernet Driver
> >fec: PHY @ 0x1, ID 0x20005ce1 -- unknown PHY!
> >...

> please try this patch on your hardware, it introduced phylib
> supporting in fec.c driver:
> http://lists.infradead.org/pipermail/linux-arm-kernel/2010-March/012214.html

sascha pointed me yesterday to your patch, i tried it with v2
(http://lkml.org/lkml/2010/3/31/130), but without success.

our distributor informed us that the board they let us is a
preliminary hardware version and that this is probably the cause of
the problem :/

i hope i'll get soon my hands on an offical board release and can
verify that the FEC just works ;)

thanks for your help!

-- 
Matthias Kaehlcke
Embedded Linux Developer
Barcelona

               If sharing a thing in no way diminishes it,
               it is not rightly owned if it is not shared
                                                                 .''`.
    using free software / Debian GNU/Linux | http://debian.org  : :'  :
                                                                `. `'`
gpg --keyserver pgp.mit.edu --recv-keys 47D8E5D4                  `-

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: Dont use netdev_warn()
From: Eric Dumazet @ 2010-04-09  8:13 UTC (permalink / raw)
  To: Joe Perches; +Cc: netdev, David Miller, Tom Herbert
In-Reply-To: <1270799606.19154.71.camel@Joe-Laptop.home>

Le vendredi 09 avril 2010 à 00:53 -0700, Joe Perches a écrit :
> On Fri, 2010-04-09 at 09:26 +0200, Eric Dumazet wrote:
> > Dont use netdev_warn() in dev_cap_txqueue() and get_rps_cpu() so that we
> > can catch following warnings without crash.
> 
> Crash?  Can you describe the crash a bit please.
> 
> > bond0.2240 received packet on queue 6, but number of RX queues is 1
> > bond0.2240 received packet on queue 11, but number of RX queues is 1
> 
> Bond0 doesn't have a valid (netdev)->dev.parent?
> 
> 	dev_printk(level, (netdev)->dev.parent,			\
> 
> Maybe there should be some additional check on
> netdev->dev.parent.
> 
> 

CC again original CC, no need to have a private conversation about this
Joe...


Yes, bonding have a NULL dev->dev.parent, I suspect other virtual
network device also have this property.


You can see in following patch Arjan's intent, and the difference
between :

dev->name and netdev_drivername(dev, drivername, 64))


commit 6579e57b31d79d31d9b806e41ba48774e73257dc
Author: Arjan van de Ven <arjan@linux.intel.com>
Date:   Mon Jul 21 13:31:48 2008 -0700

    net: Print the module name as part of the watchdog message
    
    As suggested by Dave:
    
    This patch adds a function to get the driver name from a struct net_device,
    and consequently uses this in the watchdog timeout handler to print as
    part of the message.
    
    Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 812bcd8..f5ea445 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1645,6 +1645,8 @@ extern void dev_seq_stop(struct seq_file *seq, void *v);
 extern int netdev_class_create_file(struct class_attribute *class_attr);
 extern void netdev_class_remove_file(struct class_attribute *class_attr);
 
+extern char *netdev_drivername(struct net_device *dev, char *buffer, int len);
+
 extern void linkwatch_run_queue(void);
 
 extern int netdev_compute_features(unsigned long all, unsigned long one);
diff --git a/net/core/dev.c b/net/core/dev.c
index 1698b39..ad5598d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4686,6 +4686,26 @@ err_name:
        return -ENOMEM;
 }
 
+char *netdev_drivername(struct net_device *dev, char *buffer, int len)
+{
+       struct device_driver *driver;
+       struct device *parent;
+
+       if (len <= 0 || !buffer)
+               return buffer;
+       buffer[0] = 0;
+
+       parent = dev->dev.parent;
+
+       if (!parent)
+               return buffer;
+
+       driver = parent->driver;
+       if (driver && driver->name)
+               strlcpy(buffer, driver->name, len);
+       return buffer;
+}
+
 static void __net_exit netdev_exit(struct net *net)
 {
        kfree(net->dev_name_head);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index cb625b4..4ac7e3a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -212,9 +212,9 @@ static void dev_watchdog(unsigned long arg)
                        if (some_queue_stopped &&
                            time_after(jiffies, (dev->trans_start +
                                                 dev->watchdog_timeo))) {
-                               printk(KERN_INFO "NETDEV WATCHDOG: %s: "
-                                      "transmit timed out\n",
-                                      dev->name);
+                               char drivername[64];
+                               printk(KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit timed out\n",
+                                      dev->name, netdev_drivername(dev, drivername, 64));
                                dev->tx_timeout(dev);
                                WARN_ON_ONCE(1);
                        }




^ permalink raw reply related

* Re: Crashes in xfrm_lookup
From: Timo Teräs @ 2010-04-09  8:11 UTC (permalink / raw)
  To: Herbert Xu; +Cc: broonie, netdev
In-Reply-To: <20100409080907.GA2029@gondor.apana.org.au>

Herbert Xu wrote:
> Timo Teräs <timo.teras@iki.fi> wrote:
>> Happens because CONFIG_XFRM_SUB_POLICY is not enabled, and one of
>> the helper functions I used did unexpected things in that case.
>>
>> Try the following:
> 
> Ugh, can we fix this some other way?
> 
> The policy array should really only exist if SUB_POLICY is defined.

The problem was that my code could call it with zero polcies
assuming it'd do the right thing.

It's still really misleading to have generic function that does not
do the expected thing based on some config. Compiler should know
how to optimize the for loop away if it's being called with fixed
array size.

^ permalink raw reply

* Re: Crashes in xfrm_lookup
From: Herbert Xu @ 2010-04-09  8:09 UTC (permalink / raw)
  To: Timo Teräs; +Cc: broonie, netdev
In-Reply-To: <4BBDBCF9.5060906@iki.fi>

Timo Teräs <timo.teras@iki.fi> wrote:
>
> Happens because CONFIG_XFRM_SUB_POLICY is not enabled, and one of
> the helper functions I used did unexpected things in that case.
> 
> Try the following:

Ugh, can we fix this some other way?

The policy array should really only exist if SUB_POLICY is defined.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [RFC PATCH 2/2] netdev: Add tracepoint to netif_receive_skb
From: Koki Sanagi @ 2010-04-09  7:41 UTC (permalink / raw)
  To: netdev; +Cc: izumi.taku, kaneshige.kenji, davem, nhorman
In-Reply-To: <4BBED951.8040406@jp.fujitsu.com>

This patch adds tracepoint to netif_receive_skb.
It notices that receive packet passes network/driver interface.
An output is below.

<idle>-0     [001] 68238.417058: netdev_receive_skb: dev=eth3 skbaddr=f3c81540 len=52
<idle>-0     [001] 68238.704363: netdev_receive_skb: dev=eth3 skbaddr=f3c81540 len=100
<idle>-0     [001] 68238.706891: netdev_receive_skb: dev=eth3 skbaddr=f3c81540 len=52
<idle>-0     [001] 68238.878736: netdev_receive_skb: dev=eth3 skbaddr=f3c81540 len=100

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
  include/trace/events/skb.h |   28 ++++++++++++++++++++++++++++
  net/core/dev.c             |    1 +
  2 files changed, 29 insertions(+), 0 deletions(-)

diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index 425a062..ca78d49 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -37,6 +37,34 @@ TRACE_EVENT(netdev_start_xmit,
  );
  
  /*
+ * netdev_receive_skb - invoked when skb is received from the driver
+ * @skb:		pointer to struct sk_buff
+ * @dev:		pointer to struct net_device
+ */
+TRACE_EVENT(netdev_receive_skb,
+
+	TP_PROTO(struct sk_buff *skb,
+		 struct net_device *dev),
+
+	TP_ARGS(skb, dev),
+
+	TP_STRUCT__entry(
+		__field(	const void *, 	skbaddr		)
+		__field(	unsigned int, 	len		)
+		__string(	name, 		dev->name	)
+       ),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		__entry->len = skb->len;
+		__assign_str(name, dev->name);
+	),
+
+	TP_printk("dev=%s skbaddr=%p len=%u",
+		__get_str(name), __entry->skbaddr, __entry->len)
+);
+
+/*
   * Tracepoint for free an sk_buff:
   */
  TRACE_EVENT(kfree_skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 4667a96..7281286 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2683,6 +2683,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
  
  	__get_cpu_var(netdev_rx_stat).total++;
  
+	trace_netdev_receive_skb(skb, orig_dev);
  	skb_reset_network_header(skb);
  	skb_reset_transport_header(skb);
  	skb->mac_len = skb->network_header - skb->mac_header;


^ permalink raw reply related

* [RFC PATCH 1/2] netdev: Add tracepoint to dev_hard_start_xmit
From: Koki Sanagi @ 2010-04-09  7:39 UTC (permalink / raw)
  To: netdev; +Cc: izumi.taku, kaneshige.kenji, davem, nhorman
In-Reply-To: <4BBED951.8040406@jp.fujitsu.com>

This patch adds tracepoint to dev_hard_start_xmit.
It notices that xmit packet passes network/driver interface.
An output is below.

   sshd-2443  [001] 68238.415621: netdev_start_xmit: dev=eth3 skbaddr=f3db5138 len=114
   sshd-2443  [001] 68238.705459: netdev_start_xmit: dev=eth3 skbaddr=f3db5138 len=114
   sshd-2443  [001] 68238.880361: netdev_start_xmit: dev=eth3 skbaddr=f3db5138 len=114

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
  include/trace/events/skb.h |   28 ++++++++++++++++++++++++++++
  net/core/dev.c             |    3 +++
  2 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index 4b2be6d..425a062 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -9,6 +9,34 @@
  #include <linux/tracepoint.h>
  
  /*
+ *  netdev_start_xmit - invoked when skb is passed to the driver
+ *  @skb:		pointer to struct sk_buff
+ *  @dev:		pointer to struct net_device
+ */
+TRACE_EVENT(netdev_start_xmit,
+
+	TP_PROTO(struct sk_buff *skb,
+		 struct net_device *dev),
+
+	TP_ARGS(skb, dev),
+
+	TP_STRUCT__entry(
+		__field(	const void *, 	skbaddr		)
+		__field(	unsigned int, 	len		)
+		__string(	name, 		dev->name	)
+       ),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		__entry->len = skb->len;
+		__assign_str(name, dev->name);
+	),
+
+	TP_printk("dev=%s skbaddr=%p len=%u",
+		__get_str(name), __entry->skbaddr, __entry->len)
+);
+
+/*
   * Tracepoint for free an sk_buff:
   */
  TRACE_EVENT(kfree_skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 2a9b7dd..4667a96 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -129,6 +129,7 @@
  #include <linux/jhash.h>
  #include <linux/random.h>
  #include <trace/events/napi.h>
+#include <trace/events/skb.h>
  #include <linux/pci.h>
  
  #include "net-sysfs.h"
@@ -1903,6 +1904,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
  		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
  			skb_dst_drop(skb);
  
+		trace_netdev_start_xmit(skb, dev);
  		rc = ops->ndo_start_xmit(skb, dev);
  		if (rc == NETDEV_TX_OK)
  			txq_trans_update(txq);
@@ -1937,6 +1939,7 @@ gso:
  		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
  			skb_dst_drop(nskb);
  
+		trace_netdev_start_xmit(nskb, dev);
  		rc = ops->ndo_start_xmit(nskb, dev);
  		if (unlikely(rc != NETDEV_TX_OK)) {
  			if (rc & ~NETDEV_TX_MASK)



^ permalink raw reply related

* Re: [PATCH v3] rfs: Receive Flow Steering
From: Eric Dumazet @ 2010-04-09  7:37 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev
In-Reply-To: <alpine.DEB.1.00.1004082331250.19688@pokey.mtv.corp.google.com>

Le jeudi 08 avril 2010 à 23:33 -0700, Tom Herbert a écrit :

>  #ifdef CONFIG_RPS
> +/* One global table that all flow-based protocols share. */
> +struct rps_sock_flow_table *rps_sock_flow_table;
> +EXPORT_SYMBOL(rps_sock_flow_table);
> +
> +int rps_sock_flow_sysctl(ctl_table *table, int write, void __user *buffer,
> +			 size_t *lenp, loff_t *ppos)
> +{
> +	unsigned int orig_size, size;
> +	int ret, i;
> +	ctl_table tmp = {
> +		.data = &size,
> +		.maxlen = sizeof(size),
> +		.mode = table->mode
> +	};
> +	struct rps_sock_flow_table *orig_sock_table, *sock_table;
> +
> +	rcu_read_lock();
> +
> +	orig_sock_table = rcu_dereference(rps_sock_flow_table);
> +	size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
> +
> +	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
> +
> +	if (write) {
> +		if (size) {
> +			size = roundup_pow_of_two(size);
> +			if (size != orig_size) {
> +				sock_table =
> +				    vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
> +				if (!sock_table) {
> +					rcu_read_unlock();
> +					return -ENOMEM;
> +				}
> +
> +				sock_table->mask = size - 1;
> +			} else
> +				sock_table = orig_sock_table;
> +
> +			for (i = 0; i < size; i++)
> +				sock_table->ents[i] = RPS_NO_CPU;
> +		} else
> +			sock_table = NULL;
> +
> +		if (sock_table != orig_sock_table) {
> +			rcu_assign_pointer(rps_sock_flow_table, sock_table);
> +			synchronize_rcu();
> +			vfree(orig_sock_table);
> +		}
> +	}
> +
> +	rcu_read_unlock();
> +
> +	return ret;
> +}
> +
>  /

It is not allowed to call vmalloc() inside rcu_read_unlock() section.

Anyway, rcu_read_unlock() is not appropriate (you want mutual exclusion
betwen concurrent writers here)

You should use a mutex here.
Or a spinlock (if you do the vmalloc()/vfree() things outside of the
locked section)





^ permalink raw reply

* [RFC PATCH 0/2] netdev: Add tracepoint to network/driver interface
From: Koki Sanagi @ 2010-04-09  7:37 UTC (permalink / raw)
  To: netdev; +Cc: izumi.taku, kaneshige.kenji, davem, nhorman

These patches add tracepoints to network/driver interface.

These tracepoints are helpful to investigate whether a packet passes or not.
For example, when Heart Beat is disconnected, that information is helpful
to investigate the cause is whether driver/device side or not.

An output is below.

   sshd-2443  [001] 68238.415621: netdev_start_xmit: dev=eth3 skbaddr=f3db5138 len=114
<idle>-0     [001] 68238.417058: netdev_receive_skb: dev=eth3 skbaddr=f3c81540 len=52
<idle>-0     [001] 68238.704363: netdev_receive_skb: dev=eth3 skbaddr=f3c81540 len=100
   sshd-2443  [001] 68238.705459: netdev_start_xmit: dev=eth3 skbaddr=f3db5138 len=114
<idle>-0     [001] 68238.706891: netdev_receive_skb: dev=eth3 skbaddr=f3c81540 len=52
<idle>-0     [001] 68238.878736: netdev_receive_skb: dev=eth3 skbaddr=f3c81540 len=100
   sshd-2443  [001] 68238.880361: netdev_start_xmit: dev=eth3 skbaddr=f3db5138 len=114

As other use case I have, we can get throughput per interface with some sort of
perf scripts. I plan to create it.

Thanks
Koki Sanagi

^ permalink raw reply

* Re: [PATCH v3] rfs: Receive Flow Steering
From: Eric Dumazet @ 2010-04-09  7:31 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev
In-Reply-To: <alpine.DEB.1.00.1004082331250.19688@pokey.mtv.corp.google.com>

Le jeudi 08 avril 2010 à 23:33 -0700, Tom Herbert a écrit :

> @@ -1257,8 +1258,12 @@ EXPORT_SYMBOL(udp_lib_unhash);
>  
>  static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
>  {
> -	int rc = sock_queue_rcv_skb(sk, skb);
> +	int rc;
> +
> +	if (inet_sk(sk)->inet_daddr)
> +		inet_rps_save_rxhash(sk, skb->rxhash);
>  
> +	 rc = sock_queue_rcv_skb(sk, skb);
>  	if (rc < 0) {
>  		int is_udplite = IS_UDPLITE(sk);
>  

There is an extra space before "rc = sock_queue_rcv_skb(sk, skb);"



^ permalink raw reply

* [PATCH net-next-2.6] net: Dont use netdev_warn()
From: Eric Dumazet @ 2010-04-09  7:26 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Tom Herbert, Joe Perches

Dont use netdev_warn() in dev_cap_txqueue() and get_rps_cpu() so that we
can catch following warnings without crash.

bond0.2240 received packet on queue 6, but number of RX queues is 1
bond0.2240 received packet on queue 11, but number of RX queues is 1

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
Or maybe netdev_warn() implementation should be changed ?

 net/core/dev.c |   12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index b98ddc6..ad51ffb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1986,9 +1986,9 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
 {
 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
 		if (net_ratelimit()) {
-			netdev_warn(dev, "selects TX queue %d, but "
-			     "real number of TX queues is %d\n",
-			     queue_index, dev->real_num_tx_queues);
+			pr_warning("%s selects TX queue %d, but "
+				"real number of TX queues is %d\n",
+				dev->name, queue_index, dev->real_num_tx_queues);
 		}
 		return 0;
 	}
@@ -2222,9 +2222,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
 		u16 index = skb_get_rx_queue(skb);
 		if (unlikely(index >= dev->num_rx_queues)) {
 			if (net_ratelimit()) {
-				netdev_warn(dev, "received packet on queue "
-				    "%u, but number of RX queues is %u\n",
-				     index, dev->num_rx_queues);
+				pr_warning("%s received packet on queue "
+					"%u, but number of RX queues is %u\n",
+					dev->name, index, dev->num_rx_queues);
 			}
 			goto done;
 		}



^ permalink raw reply related

* Re: [PATCH v3] rfs: Receive Flow Steering
From: Eric Dumazet @ 2010-04-09  7:17 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev
In-Reply-To: <alpine.DEB.1.00.1004082331250.19688@pokey.mtv.corp.google.com>

Le jeudi 08 avril 2010 à 23:33 -0700, Tom Herbert a écrit :
> Version 3 of RFS:
> - Use sysctl instead using kernel init parameter and alloc_large_system_hash
> - Created inline function for "queue->input_queue_head++" to reduce number of #ifdef's
> - Added RFS support for connected UDP sockets (thanks Eric!)
> ---
> This patch implements receive flow steering (RFS).  RFS steers received packets for layer 3 and 4 processing to the CPU where the application for the corresponding flow is running.  RFS is an extension of Receive Packet Steering (RPS).
> 
> The basic idea of RFS is that when an application calls recvmsg (or sendmsg) the application's running CPU is stored in a hash table that is indexed by the connection's rxhash which is stored in the socket structure.  The rxhash is passed in skb's received on the connection from netif_receive_skb.  For each received packet, the associated rxhash is used to look up the CPU in the hash table, if a valid CPU is set then the packet is steered to that CPU using the RPS mechanisms.
> 
> The convolution of the simple approach is that it would potentially allow OOO packets.  If threads are thrashing around CPUs or multiple threads are trying to read from the same sockets, a quickly changing CPU value in the hash table could cause rampant OOO packets-- we consider this a non-starter.
> 
> To avoid OOO packets, this solution implements two types of hash tables: rps_sock_flow_table and rps_dev_flow_table.
> 
> rps_sock_table is a global hash table.  Each entry is just a CPU number and it is populated in recvmsg and sendmsg as described above.  This table contains the "desired" CPUs for flows.
> 
> rps_dev_flow_table is specific to each device queue.  Each entry contains a CPU and a tail queue counter.  The CPU is the "current" CPU for a matching flow.  The tail queue counter holds the value of a tail queue counter for the associated CPU's backlog queue at the time of last enqueue for a flow matching the entry.
> 
> Each backlog queue has a queue head counter which is incremented on dequeue, and so a queue tail counter is computed as queue head count + queue length.  When a packet is enqueued on a backlog queue, the current value of the queue tail counter is saved in the hash entry of the rps_dev_flow_table.
> 
> And now the trick: when selecting the CPU for RPS (get_rps_cpu) the rps_sock_flow table and the rps_dev_flow table for the RX queue are consulted.  When the desired CPU for the flow (found in the rps_sock_flow table) does not match the current CPU (found in the rps_dev_flow table), the current CPU is changed to the desired CPU if one of the following is true:
> 
> - The current CPU is unset (equal to NR_CPUS)
> - Current CPU is offline
> - The current CPU's queue head counter >= queue tail counter in the rps_dev_flow table.  This checks if the queue tail has advanced beyond the last packet that was enqueued using this table entry.  This guarantees that all packets queued using this entry have been dequeued, thus preserving in order delivery.
> 
> Making each queue have its own rps_dev_flow table has two advantages: 1) the tail queue counters will be written on each receive, so keeping the table local to interrupting CPU s good for locality.  2) this allows lockless access to the table-- the CPU number and queue tail counter need to be accessed together under mutual exclusion from netif_receive_skb, we assume that this is only called from device napi_poll which is non-reentrant.
> 
> This patch implements RFS for TCP and connected UDP sockets.  It should be usable for other flow oriented protocols.
> 
> There are two configuration parameters for RFS.  The "rps_flow_entries" kernel init parameter sets the number of entries in the rps_sock_flow_table, the per rxqueue sysfs entry "rps_flow_cnt" contains the number of entries in the rps_dev_flow table for the rxqueue.  Both are rounded to power of two.
> 
> The obvious benefit of RFS (over just RPS) is that it achieves CPU locality between the receive processing for a flow and the applications processing; this can result in increased performance (higher pps, lower latency).
> 
> The benefits of RFS are dependent on cache hierarchy, application load, and other factors.  On simple benchmarks, we don't necessarily see improvement and sometimes see degradation.  However, for more complex benchmarks and for applications where cache pressure is much higher this technique seems to perform very well.
> 
> Below are some benchmark results which show the potential benfit of this patch.  The netperf test has 500 instances of netperf TCP_RR test with 1 byte req. and resp.  The RPC test is an request/response test similar in structure to netperf RR test ith 100 threads on each host, but does more work in userspace that netperf.
> 
> e1000e on 8 core Intel
>    No RFS or RPS		104K tps at 30% CPU
>    No RFS (best RPS config):    290K tps at 63% CPU
>    RFS				303K tps at 61% CPU
> 
> RPC test		tps	CPU%	50/90/99% usec latency	StdDev
>    No RFS or RPS	103K	48%	757/900/3185		4472.35
>    RPS only:		174K	73%	415/993/2468		491.66
>    RFS			223K	73%	379/651/1382		315.61
>    
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---

Changelog messages should be formatted with small lines (70 char limits)

> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index d1a21b5..573e775 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -530,14 +530,77 @@ struct rps_map {
>  };
>  #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
>  
> +/*
> + * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
> + * tail pointer for that CPU's input queue at the time of last enqueue.
> + */
> +struct rps_dev_flow {
> +	u16 cpu;
> +	u16 fill;
> +	unsigned int last_qtail;
> +};
> +
> +/*
> + * The rps_dev_flow_table structure contains a table of flow mappings.
> + */
> +struct rps_dev_flow_table {
> +	unsigned int mask;
> +	struct rcu_head rcu;
> +	struct work_struct free_work;
> +	struct rps_dev_flow flows[0];
> +};
> +#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
> +    (_num * sizeof(struct rps_dev_flow)))
> +
> +/*
> + * The rps_sock_flow_table contains mappings of flows to the last CPU
> + * on which they were processed by the application (set in recvmsg).
> + */
> +struct rps_sock_flow_table {
> +	unsigned int mask;
> +	u16 ents[0];
> +};
> +#define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \
> +    (_num * sizeof(u16)))
> +
> +extern int rps_sock_flow_sysctl(ctl_table *table, int write,
> +				void __user *buffer, size_t *lenp,
> +				loff_t *ppos);
> +
> +#define RPS_NO_CPU 0xffff
> +
> +static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
> +					u32 hash)
> +{
> +	if (table && hash) {
> +		unsigned int cpu, index = hash & table->mask;
> +
> +		/* We only give a hint, preemption can change cpu under us */
> +		cpu = raw_smp_processor_id();
> +
> +		if (table->ents[index] != cpu)
> +			table->ents[index] = cpu;
> +	}
> +}
> +
> +static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
> +				       u32 hash)
> +{
> +	if (table && hash)
> +		table->ents[hash & table->mask] = RPS_NO_CPU;
> +}
> +
> +extern struct rps_sock_flow_table *rps_sock_flow_table;
> +
>  /* This structure contains an instance of an RX queue. */
>  struct netdev_rx_queue {
>  	struct rps_map *rps_map;
> +	struct rps_dev_flow_table *rps_flow_table;
>  	struct kobject kobj;
>  	struct netdev_rx_queue *first;
>  	atomic_t count;
>  } ____cacheline_aligned_in_smp;
> -#endif
> +#endif /* CONFIG_RPS */
>  
>  /*
>   * This structure defines the management hooks for network devices.
> @@ -1331,13 +1394,21 @@ struct softnet_data {
>  	struct sk_buff		*completion_queue;
>  
>  	/* Elements below can be accessed between CPUs for RPS */
> -#ifdef CONFIG_SMP
> +#ifdef CONFIG_RPS
>  	struct call_single_data	csd ____cacheline_aligned_in_smp;
> +	unsigned int		input_queue_head;
>  #endif
>  	struct sk_buff_head	input_pkt_queue;
>  	struct napi_struct	backlog;
>  };
>  
> +static inline void incr_input_queue_head(struct softnet_data *queue)
> +{
> +#ifdef CONFIG_RPS
> +	queue->input_queue_head++;
> +#endif
> +}
> +
>  DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
>  
>  #define HAVE_NETIF_QUEUE
> diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
> index 83fd344..b487bc1 100644
> --- a/include/net/inet_sock.h
> +++ b/include/net/inet_sock.h
> @@ -21,6 +21,7 @@
>  #include <linux/string.h>
>  #include <linux/types.h>
>  #include <linux/jhash.h>
> +#include <linux/netdevice.h>
>  
>  #include <net/flow.h>
>  #include <net/sock.h>
> @@ -101,6 +102,7 @@ struct rtable;
>   * @uc_ttl - Unicast TTL
>   * @inet_sport - Source port
>   * @inet_id - ID counter for DF pkts
> + * @rxhash - flow hash received from netif layer
>   * @tos - TOS
>   * @mc_ttl - Multicasting TTL
>   * @is_icsk - is this an inet_connection_sock?
> @@ -124,6 +126,9 @@ struct inet_sock {
>  	__u16			cmsg_flags;
>  	__be16			inet_sport;
>  	__u16			inet_id;
> +#ifdef CONFIG_RPS
> +	__u32			rxhash;
> +#endif

I am a bit worried, because dirtying this cache line might hurt non RPS
setups (if network interrupts are balanced to all cpus)

Best place would be to put rxhash close to sk_refcnt (because we dirty
it to get a reference on rcu sk lookups)

I believe we have a 32bits hole on 64bit arches for this :)


While testint latest net-nex-2.6 on my nehalem machine, I got a crash
(in RPS I am afraid...)

I am going to correct this crash before testing RFS and let you know the
results.

Thanks



^ permalink raw reply

* [PATCH v3] rfs: Receive Flow Steering
From: Tom Herbert @ 2010-04-09  6:33 UTC (permalink / raw)
  To: davem, netdev; +Cc: eric.dumazet

Version 3 of RFS:
- Use sysctl instead using kernel init parameter and alloc_large_system_hash
- Created inline function for "queue->input_queue_head++" to reduce number of #ifdef's
- Added RFS support for connected UDP sockets (thanks Eric!)
---
This patch implements receive flow steering (RFS).  RFS steers received packets for layer 3 and 4 processing to the CPU where the application for the corresponding flow is running.  RFS is an extension of Receive Packet Steering (RPS).

The basic idea of RFS is that when an application calls recvmsg (or sendmsg) the application's running CPU is stored in a hash table that is indexed by the connection's rxhash which is stored in the socket structure.  The rxhash is passed in skb's received on the connection from netif_receive_skb.  For each received packet, the associated rxhash is used to look up the CPU in the hash table, if a valid CPU is set then the packet is steered to that CPU using the RPS mechanisms.

The convolution of the simple approach is that it would potentially allow OOO packets.  If threads are thrashing around CPUs or multiple threads are trying to read from the same sockets, a quickly changing CPU value in the hash table could cause rampant OOO packets-- we consider this a non-starter.

To avoid OOO packets, this solution implements two types of hash tables: rps_sock_flow_table and rps_dev_flow_table.

rps_sock_table is a global hash table.  Each entry is just a CPU number and it is populated in recvmsg and sendmsg as described above.  This table contains the "desired" CPUs for flows.

rps_dev_flow_table is specific to each device queue.  Each entry contains a CPU and a tail queue counter.  The CPU is the "current" CPU for a matching flow.  The tail queue counter holds the value of a tail queue counter for the associated CPU's backlog queue at the time of last enqueue for a flow matching the entry.

Each backlog queue has a queue head counter which is incremented on dequeue, and so a queue tail counter is computed as queue head count + queue length.  When a packet is enqueued on a backlog queue, the current value of the queue tail counter is saved in the hash entry of the rps_dev_flow_table.

And now the trick: when selecting the CPU for RPS (get_rps_cpu) the rps_sock_flow table and the rps_dev_flow table for the RX queue are consulted.  When the desired CPU for the flow (found in the rps_sock_flow table) does not match the current CPU (found in the rps_dev_flow table), the current CPU is changed to the desired CPU if one of the following is true:

- The current CPU is unset (equal to NR_CPUS)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the rps_dev_flow table.  This checks if the queue tail has advanced beyond the last packet that was enqueued using this table entry.  This guarantees that all packets queued using this entry have been dequeued, thus preserving in order delivery.

Making each queue have its own rps_dev_flow table has two advantages: 1) the tail queue counters will be written on each receive, so keeping the table local to interrupting CPU s good for locality.  2) this allows lockless access to the table-- the CPU number and queue tail counter need to be accessed together under mutual exclusion from netif_receive_skb, we assume that this is only called from device napi_poll which is non-reentrant.

This patch implements RFS for TCP and connected UDP sockets.  It should be usable for other flow oriented protocols.

There are two configuration parameters for RFS.  The "rps_flow_entries" kernel init parameter sets the number of entries in the rps_sock_flow_table, the per rxqueue sysfs entry "rps_flow_cnt" contains the number of entries in the rps_dev_flow table for the rxqueue.  Both are rounded to power of two.

The obvious benefit of RFS (over just RPS) is that it achieves CPU locality between the receive processing for a flow and the applications processing; this can result in increased performance (higher pps, lower latency).

The benefits of RFS are dependent on cache hierarchy, application load, and other factors.  On simple benchmarks, we don't necessarily see improvement and sometimes see degradation.  However, for more complex benchmarks and for applications where cache pressure is much higher this technique seems to perform very well.

Below are some benchmark results which show the potential benfit of this patch.  The netperf test has 500 instances of netperf TCP_RR test with 1 byte req. and resp.  The RPC test is an request/response test similar in structure to netperf RR test ith 100 threads on each host, but does more work in userspace that netperf.

e1000e on 8 core Intel
   No RFS or RPS		104K tps at 30% CPU
   No RFS (best RPS config):    290K tps at 63% CPU
   RFS				303K tps at 61% CPU

RPC test		tps	CPU%	50/90/99% usec latency	StdDev
   No RFS or RPS	103K	48%	757/900/3185		4472.35
   RPS only:		174K	73%	415/993/2468		491.66
   RFS			223K	73%	379/651/1382		315.61
   
Signed-off-by: Tom Herbert <therbert@google.com>
---
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d1a21b5..573e775 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -530,14 +530,77 @@ struct rps_map {
 };
 #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
 
+/*
+ * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
+ * tail pointer for that CPU's input queue at the time of last enqueue.
+ */
+struct rps_dev_flow {
+	u16 cpu;
+	u16 fill;
+	unsigned int last_qtail;
+};
+
+/*
+ * The rps_dev_flow_table structure contains a table of flow mappings.
+ */
+struct rps_dev_flow_table {
+	unsigned int mask;
+	struct rcu_head rcu;
+	struct work_struct free_work;
+	struct rps_dev_flow flows[0];
+};
+#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
+    (_num * sizeof(struct rps_dev_flow)))
+
+/*
+ * The rps_sock_flow_table contains mappings of flows to the last CPU
+ * on which they were processed by the application (set in recvmsg).
+ */
+struct rps_sock_flow_table {
+	unsigned int mask;
+	u16 ents[0];
+};
+#define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \
+    (_num * sizeof(u16)))
+
+extern int rps_sock_flow_sysctl(ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos);
+
+#define RPS_NO_CPU 0xffff
+
+static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
+					u32 hash)
+{
+	if (table && hash) {
+		unsigned int cpu, index = hash & table->mask;
+
+		/* We only give a hint, preemption can change cpu under us */
+		cpu = raw_smp_processor_id();
+
+		if (table->ents[index] != cpu)
+			table->ents[index] = cpu;
+	}
+}
+
+static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
+				       u32 hash)
+{
+	if (table && hash)
+		table->ents[hash & table->mask] = RPS_NO_CPU;
+}
+
+extern struct rps_sock_flow_table *rps_sock_flow_table;
+
 /* This structure contains an instance of an RX queue. */
 struct netdev_rx_queue {
 	struct rps_map *rps_map;
+	struct rps_dev_flow_table *rps_flow_table;
 	struct kobject kobj;
 	struct netdev_rx_queue *first;
 	atomic_t count;
 } ____cacheline_aligned_in_smp;
-#endif
+#endif /* CONFIG_RPS */
 
 /*
  * This structure defines the management hooks for network devices.
@@ -1331,13 +1394,21 @@ struct softnet_data {
 	struct sk_buff		*completion_queue;
 
 	/* Elements below can be accessed between CPUs for RPS */
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RPS
 	struct call_single_data	csd ____cacheline_aligned_in_smp;
+	unsigned int		input_queue_head;
 #endif
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
 };
 
+static inline void incr_input_queue_head(struct softnet_data *queue)
+{
+#ifdef CONFIG_RPS
+	queue->input_queue_head++;
+#endif
+}
+
 DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
 #define HAVE_NETIF_QUEUE
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 83fd344..b487bc1 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -21,6 +21,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/jhash.h>
+#include <linux/netdevice.h>
 
 #include <net/flow.h>
 #include <net/sock.h>
@@ -101,6 +102,7 @@ struct rtable;
  * @uc_ttl - Unicast TTL
  * @inet_sport - Source port
  * @inet_id - ID counter for DF pkts
+ * @rxhash - flow hash received from netif layer
  * @tos - TOS
  * @mc_ttl - Multicasting TTL
  * @is_icsk - is this an inet_connection_sock?
@@ -124,6 +126,9 @@ struct inet_sock {
 	__u16			cmsg_flags;
 	__be16			inet_sport;
 	__u16			inet_id;
+#ifdef CONFIG_RPS
+	__u32			rxhash;
+#endif
 
 	struct ip_options	*opt;
 	__u8			tos;
@@ -219,4 +224,37 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
 	return inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0;
 }
 
+static inline void inet_rps_record_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+	struct rps_sock_flow_table *sock_flow_table;
+
+	rcu_read_lock();
+	sock_flow_table = rcu_dereference(rps_sock_flow_table);
+	rps_record_sock_flow(sock_flow_table, inet_sk(sk)->rxhash);
+	rcu_read_unlock();
+#endif
+}
+
+static inline void inet_rps_reset_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+	struct rps_sock_flow_table *sock_flow_table;
+
+	rcu_read_lock();
+	sock_flow_table = rcu_dereference(rps_sock_flow_table);
+	rps_reset_sock_flow(sock_flow_table, inet_sk(sk)->rxhash);
+	rcu_read_unlock();
+#endif
+}
+
+static inline void inet_rps_save_rxhash(const struct sock *sk, u32 rxhash)
+{
+#ifdef CONFIG_RPS
+	if (unlikely(inet_sk(sk)->rxhash != rxhash)) {
+		inet_rps_reset_flow(sk);
+		inet_sk(sk)->rxhash = rxhash;
+	}
+#endif
+}
 #endif	/* _INET_SOCK_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index b98ddc6..29ef1db 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,7 @@
 #include <linux/random.h>
 #include <trace/events/napi.h>
 #include <linux/pci.h>
+#include <linux/bootmem.h>
 
 #include "net-sysfs.h"
 
@@ -2202,22 +2203,80 @@ int weight_p __read_mostly = 64;            /* old backlog weight */
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 #ifdef CONFIG_RPS
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+int rps_sock_flow_sysctl(ctl_table *table, int write, void __user *buffer,
+			 size_t *lenp, loff_t *ppos)
+{
+	unsigned int orig_size, size;
+	int ret, i;
+	ctl_table tmp = {
+		.data = &size,
+		.maxlen = sizeof(size),
+		.mode = table->mode
+	};
+	struct rps_sock_flow_table *orig_sock_table, *sock_table;
+
+	rcu_read_lock();
+
+	orig_sock_table = rcu_dereference(rps_sock_flow_table);
+	size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
+
+	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+
+	if (write) {
+		if (size) {
+			size = roundup_pow_of_two(size);
+			if (size != orig_size) {
+				sock_table =
+				    vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
+				if (!sock_table) {
+					rcu_read_unlock();
+					return -ENOMEM;
+				}
+
+				sock_table->mask = size - 1;
+			} else
+				sock_table = orig_sock_table;
+
+			for (i = 0; i < size; i++)
+				sock_table->ents[i] = RPS_NO_CPU;
+		} else
+			sock_table = NULL;
+
+		if (sock_table != orig_sock_table) {
+			rcu_assign_pointer(rps_sock_flow_table, sock_table);
+			synchronize_rcu();
+			vfree(orig_sock_table);
+		}
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
 /*
  * get_rps_cpu is called from netif_receive_skb and returns the target
  * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
  */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+		       struct rps_dev_flow **rflowp)
 {
 	struct ipv6hdr *ip6;
 	struct iphdr *ip;
 	struct netdev_rx_queue *rxqueue;
 	struct rps_map *map;
+	struct rps_dev_flow_table *flow_table;
+	struct rps_sock_flow_table *sock_flow_table;
 	int cpu = -1;
 	u8 ip_proto;
+	u16 tcpu;
 	u32 addr1, addr2, ports, ihl;
 
-	rcu_read_lock();
-
 	if (skb_rx_queue_recorded(skb)) {
 		u16 index = skb_get_rx_queue(skb);
 		if (unlikely(index >= dev->num_rx_queues)) {
@@ -2232,7 +2291,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
 	} else
 		rxqueue = dev->_rx;
 
-	if (!rxqueue->rps_map)
+	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
 		goto done;
 
 	if (skb->rxhash)
@@ -2284,9 +2343,48 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
 		skb->rxhash = 1;
 
 got_hash:
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
+	sock_flow_table = rcu_dereference(rps_sock_flow_table);
+	if (flow_table && sock_flow_table) {
+		u16 next_cpu;
+		struct rps_dev_flow *rflow;
+
+		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
+		tcpu = rflow->cpu;
+
+		next_cpu = sock_flow_table->ents[skb->rxhash &
+		    sock_flow_table->mask];
+
+		/*
+		 * If the desired CPU (where last recvmsg was done) is
+		 * different from current CPU (one in the rx-queue flow
+		 * table entry), switch if one of the following holds:
+		 *   - Current CPU is unset (equal to RPS_NO_CPU).
+		 *   - Current CPU is offline.
+		 *   - The current CPU's queue tail has advanced beyond the
+		 *     last packet that was enqueued using this table entry.
+		 *     This guarantees that all previous packets for the flow
+		 *     have been dequeued, thus preserving in order delivery.
+		 */
+		if (unlikely(tcpu != next_cpu) &&
+		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+		      rflow->last_qtail)) >= 0)) {
+			tcpu = rflow->cpu = next_cpu;
+			if (tcpu != RPS_NO_CPU)
+				rflow->last_qtail = per_cpu(softnet_data,
+				    tcpu).input_queue_head;
+		}
+		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+			*rflowp = rflow;
+			cpu = tcpu;
+			goto done;
+		}
+	}
+
 	map = rcu_dereference(rxqueue->rps_map);
 	if (map) {
-		u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
 
 		if (cpu_online(tcpu)) {
 			cpu = tcpu;
@@ -2295,7 +2393,6 @@ got_hash:
 	}
 
 done:
-	rcu_read_unlock();
 	return cpu;
 }
 
@@ -2321,13 +2418,14 @@ static void trigger_softirq(void *data)
 	__napi_schedule(&queue->backlog);
 	__get_cpu_var(netdev_rx_stat).received_rps++;
 }
-#endif /* CONFIG_SMP */
+#endif /* CONFIG_RPS */
 
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
  */
-static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
+			      unsigned int *qtail)
 {
 	struct softnet_data *queue;
 	unsigned long flags;
@@ -2342,6 +2440,10 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
 		if (queue->input_pkt_queue.qlen) {
 enqueue:
 			__skb_queue_tail(&queue->input_pkt_queue, skb);
+#ifdef CONFIG_RPS
+			*qtail = queue->input_queue_head +
+			    queue->input_pkt_queue.qlen;
+#endif
 			rps_unlock(queue);
 			local_irq_restore(flags);
 			return NET_RX_SUCCESS;
@@ -2356,11 +2458,10 @@ enqueue:
 
 				cpu_set(cpu, rcpus->mask[rcpus->select]);
 				__raise_softirq_irqoff(NET_RX_SOFTIRQ);
-			} else
-				__napi_schedule(&queue->backlog);
-#else
-			__napi_schedule(&queue->backlog);
+				goto enqueue;
+			}
 #endif
+			__napi_schedule(&queue->backlog);
 		}
 		goto enqueue;
 	}
@@ -2391,7 +2492,7 @@ enqueue:
 
 int netif_rx(struct sk_buff *skb)
 {
-	int cpu;
+	unsigned int qtail;
 
 	/* if netpoll wants it, pretend we never saw it */
 	if (netpoll_rx(skb))
@@ -2401,14 +2502,24 @@ int netif_rx(struct sk_buff *skb)
 		net_timestamp(skb);
 
 #ifdef CONFIG_RPS
-	cpu = get_rps_cpu(skb->dev, skb);
-	if (cpu < 0)
-		cpu = smp_processor_id();
-#else
-	cpu = smp_processor_id();
+	{
+		struct rps_dev_flow voidflow, *rflow = &voidflow;
+		int cpu, err;
+
+		rcu_read_lock();
+
+		cpu = get_rps_cpu(skb->dev, skb, &rflow);
+		if (cpu >= 0) {
+			err = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+			rcu_read_unlock();
+			return err;
+		}
+
+		rcu_read_unlock();
+	}
 #endif
 
-	return enqueue_to_backlog(skb, cpu);
+	return enqueue_to_backlog(skb, smp_processor_id(), &qtail);
 }
 EXPORT_SYMBOL(netif_rx);
 
@@ -2775,17 +2886,22 @@ out:
 int netif_receive_skb(struct sk_buff *skb)
 {
 #ifdef CONFIG_RPS
-	int cpu;
+	struct rps_dev_flow voidflow, *rflow = &voidflow;
+	int cpu, err;
+
+	rcu_read_lock();
 
-	cpu = get_rps_cpu(skb->dev, skb);
+	cpu = get_rps_cpu(skb->dev, skb, &rflow);
 
-	if (cpu < 0)
-		return __netif_receive_skb(skb);
-	else
-		return enqueue_to_backlog(skb, cpu);
-#else
-	return __netif_receive_skb(skb);
+	if (cpu >= 0) {
+		err = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+		rcu_read_unlock();
+		return err;
+	}
+
+	rcu_read_unlock();
 #endif
+	return __netif_receive_skb(skb);
 }
 EXPORT_SYMBOL(netif_receive_skb);
 
@@ -2801,6 +2917,7 @@ static void flush_backlog(void *arg)
 		if (skb->dev == dev) {
 			__skb_unlink(skb, &queue->input_pkt_queue);
 			kfree_skb(skb);
+			incr_input_queue_head(queue);
 		}
 	rps_unlock(queue);
 }
@@ -3124,6 +3241,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 			local_irq_enable();
 			break;
 		}
+		incr_input_queue_head(queue);
 		rps_unlock(queue);
 		local_irq_enable();
 
@@ -5487,8 +5605,10 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 	local_irq_enable();
 
 	/* Process offline CPU's input_pkt_queue */
-	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
+	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
+		incr_input_queue_head(oldsd);
+	}
 
 	return NOTIFY_OK;
 }
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 1e7fdd6..95863b2 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -600,22 +600,105 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
 	return len;
 }
 
+static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+					   struct rx_queue_attribute *attr,
+					   char *buf)
+{
+	struct rps_dev_flow_table *flow_table;
+	unsigned int val = 0;
+
+	rcu_read_lock();
+	flow_table = rcu_dereference(queue->rps_flow_table);
+	if (flow_table)
+		val = flow_table->mask + 1;
+	rcu_read_unlock();
+
+	return sprintf(buf, "%u\n", val);
+}
+
+static void rps_dev_flow_table_release_work(struct work_struct *work)
+{
+	struct rps_dev_flow_table *table = container_of(work,
+	    struct rps_dev_flow_table, free_work);
+
+	vfree(table);
+}
+
+static void rps_dev_flow_table_release(struct rcu_head *rcu)
+{
+	struct rps_dev_flow_table *table = container_of(rcu,
+	    struct rps_dev_flow_table, rcu);
+
+	INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
+	schedule_work(&table->free_work);
+}
+
+ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+				     struct rx_queue_attribute *attr,
+				     const char *buf, size_t len)
+{
+	unsigned int count;
+	char *endp;
+	struct rps_dev_flow_table *table, *old_table;
+	static DEFINE_SPINLOCK(rps_dev_flow_lock);
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	count = simple_strtoul(buf, &endp, 0);
+	if (endp == buf)
+		return -EINVAL;
+
+	if (count) {
+		int i;
+
+		count = roundup_pow_of_two(count);
+		table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
+		if (!table)
+			return -ENOMEM;
+
+		table->mask = count - 1;
+		for (i = 0; i < count; i++)
+			table->flows[i].cpu = RPS_NO_CPU;
+	} else
+		table = NULL;
+
+	spin_lock(&rps_dev_flow_lock);
+	old_table = queue->rps_flow_table;
+	rcu_assign_pointer(queue->rps_flow_table, table);
+	spin_unlock(&rps_dev_flow_lock);
+
+	if (old_table)
+		call_rcu(&old_table->rcu, rps_dev_flow_table_release);
+
+	return len;
+}
+
 static struct rx_queue_attribute rps_cpus_attribute =
 	__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
 
+
+static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
+	__ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
+	    show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+
 static struct attribute *rx_queue_default_attrs[] = {
 	&rps_cpus_attribute.attr,
+	&rps_dev_flow_table_cnt_attribute.attr,
 	NULL
 };
 
 static void rx_queue_release(struct kobject *kobj)
 {
 	struct netdev_rx_queue *queue = to_rx_queue(kobj);
-	struct rps_map *map = queue->rps_map;
 	struct netdev_rx_queue *first = queue->first;
 
-	if (map)
-		call_rcu(&map->rcu, rps_map_release);
+	if (queue->rps_map)
+		call_rcu(&queue->rps_map->rcu, rps_map_release);
+
+	if (queue->rps_flow_table)
+		call_rcu(&queue->rps_flow_table->rcu,
+		    rps_dev_flow_table_release);
 
 	if (atomic_dec_and_test(&first->count))
 		kfree(first);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0612487..f597189 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -81,6 +81,14 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+#ifdef CONFIG_RPS
+	{
+		.procname	= "rps_sock_flow_entries",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= rps_sock_flow_sysctl
+	},
+#endif
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b5924f1..cc46052 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -418,6 +418,8 @@ int inet_release(struct socket *sock)
 	if (sk) {
 		long timeout;
 
+		inet_rps_reset_flow(sk);
+
 		/* Applications forget to leave groups before exiting */
 		ip_mc_drop_socket(sk);
 
@@ -719,6 +721,8 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 {
 	struct sock *sk = sock->sk;
 
+	inet_rps_record_flow(sk);
+
 	/* We may need to bind the socket. */
 	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
@@ -727,12 +731,13 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 }
 EXPORT_SYMBOL(inet_sendmsg);
 
-
 static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 			     size_t size, int flags)
 {
 	struct sock *sk = sock->sk;
 
+	inet_rps_record_flow(sk);
+
 	/* We may need to bind the socket. */
 	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
@@ -742,6 +747,22 @@ static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 	return sock_no_sendpage(sock, page, offset, size, flags);
 }
 
+int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		 size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	int addr_len = 0;
+	int err;
+
+	inet_rps_record_flow(sk);
+
+	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+				   flags & ~MSG_DONTWAIT, &addr_len);
+	if (err >= 0)
+		msg->msg_namelen = addr_len;
+	return err;
+}
+EXPORT_SYMBOL(inet_recvmsg);
 
 int inet_shutdown(struct socket *sock, int how)
 {
@@ -871,7 +892,7 @@ const struct proto_ops inet_stream_ops = {
 	.setsockopt	   = sock_common_setsockopt,
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = tcp_sendmsg,
-	.recvmsg	   = sock_common_recvmsg,
+	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = tcp_sendpage,
 	.splice_read	   = tcp_splice_read,
@@ -898,7 +919,7 @@ const struct proto_ops inet_dgram_ops = {
 	.setsockopt	   = sock_common_setsockopt,
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
-	.recvmsg	   = sock_common_recvmsg,
+	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
 #ifdef CONFIG_COMPAT
@@ -928,7 +949,7 @@ static const struct proto_ops inet_sockraw_ops = {
 	.setsockopt	   = sock_common_setsockopt,
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
-	.recvmsg	   = sock_common_recvmsg,
+	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
 #ifdef CONFIG_COMPAT
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f4df5f9..2f40fe0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1674,6 +1674,8 @@ process:
 
 	skb->dev = NULL;
 
+	inet_rps_save_rxhash(sk, skb->rxhash);
+
 	bh_lock_sock_nested(sk);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7af756d..11c7ce3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1216,6 +1216,7 @@ int udp_disconnect(struct sock *sk, int flags)
 	sk->sk_state = TCP_CLOSE;
 	inet->inet_daddr = 0;
 	inet->inet_dport = 0;
+	inet_rps_save_rxhash(sk, 0);
 	sk->sk_bound_dev_if = 0;
 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
 		inet_reset_saddr(sk);
@@ -1257,8 +1258,12 @@ EXPORT_SYMBOL(udp_lib_unhash);
 
 static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
-	int rc = sock_queue_rcv_skb(sk, skb);
+	int rc;
+
+	if (inet_sk(sk)->inet_daddr)
+		inet_rps_save_rxhash(sk, skb->rxhash);
 
+	 rc = sock_queue_rcv_skb(sk, skb);
 	if (rc < 0) {
 		int is_udplite = IS_UDPLITE(sk);
 

^ permalink raw reply related

* Re: [v3 Patch 2/3] bridge: make bridge support netpoll
From: Cong Wang @ 2010-04-09  5:43 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: linux-kernel, netdev, bridge, Andy Gospodarek, Neil Horman,
	Jeff Moyer, Matt Mackall, bonding-devel, Jay Vosburgh,
	David Miller
In-Reply-To: <20100408083710.2b61ee44@nehalam>

Stephen Hemminger wrote:
> On Thu, 8 Apr 2010 02:18:58 -0400
> Amerigo Wang <amwang@redhat.com> wrote:
> 
>> Based on the previous patch, make bridge support netpoll by:
>>
>> 1) implement the 2 methods to support netpoll for bridge;
>>
>> 2) modify netpoll during forwarding packets via bridge;
>>
>> 3) disable netpoll support of bridge when a netpoll-unabled device
>>    is added to bridge;
>>
>> 4) enable netpoll support when all underlying devices support netpoll.
>>
>> Cc: David Miller <davem@davemloft.net>
>> Cc: Neil Horman <nhorman@tuxdriver.com>
>> Cc: Stephen Hemminger <shemminger@linux-foundation.org>
>> Cc: Matt Mackall <mpm@selenic.com>
>> Signed-off-by: WANG Cong <amwang@redhat.com>
>>
>> ---
>>
>> Index: linux-2.6/net/bridge/br_device.c
>> ===================================================================
>> --- linux-2.6.orig/net/bridge/br_device.c
>> +++ linux-2.6/net/bridge/br_device.c
>> @@ -13,8 +13,10 @@
>>  
>>  #include <linux/kernel.h>
>>  #include <linux/netdevice.h>
>> +#include <linux/netpoll.h>
>>  #include <linux/etherdevice.h>
>>  #include <linux/ethtool.h>
>> +#include <linux/list.h>
>>  
>>  #include <asm/uaccess.h>
>>  #include "br_private.h"
>> @@ -162,6 +164,59 @@ static int br_set_tx_csum(struct net_dev
>>  	return 0;
>>  }
>>  
>> +#ifdef CONFIG_NET_POLL_CONTROLLER
>> +bool br_devices_support_netpoll(struct net_bridge *br)
>> +{
>> +	struct net_bridge_port *p;
>> +	bool ret = true;
>> +	int count = 0;
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&br->lock, flags);
>> +	list_for_each_entry(p, &br->port_list, list) {
>> +		count++;
>> +		if (p->dev->priv_flags & IFF_DISABLE_NETPOLL
>> +				|| !p->dev->netdev_ops->ndo_poll_controller)
>> +			ret = false;
>> +	}
>> +	spin_unlock_irqrestore(&br->lock, flags);
>> +	return count != 0 && ret;
>> +}
>> +
>> +static void br_poll_controller(struct net_device *br_dev)
>> +{
>> +	struct netpoll *np = br_dev->npinfo->netpoll;
>> +
>> +	if (np->real_dev != br_dev)
>> +		netpoll_poll_dev(np->real_dev);
>> +}
>> +
>> +void br_netpoll_cleanup(struct net_device *br_dev)
>> +{
>> +	struct net_bridge *br = netdev_priv(br_dev);
>> +	struct net_bridge_port *p, *n;
>> +	const struct net_device_ops *ops;
>> +
>> +	br->dev->npinfo = NULL;
>> +	list_for_each_entry_safe(p, n, &br->port_list, list) {
>> +		if (p->dev) {
>> +			ops = p->dev->netdev_ops;
>> +			if (ops->ndo_netpoll_cleanup)
>> +				ops->ndo_netpoll_cleanup(p->dev);
>> +			else
>> +				p->dev->npinfo = NULL;
>> +		}
>> +	}
>> +}
>> +
>> +#else
>> +
>> +void br_netpoll_cleanup(struct net_device *br_dev)
>> +{
>> +}
>> +
>> +#endif
> 
> Could you use more stub functions to eliminate #ifdef's in code.


Probably no, because only br_netpoll_cleanup() will be called
no matter if CONFIG_NET_POLL_CONTROLLER is defined.


>> @@ -50,7 +51,13 @@ int br_dev_queue_push_xmit(struct sk_buf
>>  		else {
>>  			skb_push(skb, ETH_HLEN);
>>  
>> -			dev_queue_xmit(skb);
>> +#ifdef CONFIG_NET_POLL_CONTROLLER
>> +			if (skb->dev->priv_flags & IFF_IN_NETPOLL) {
>> +				netpoll_send_skb(skb->dev->npinfo->netpoll, skb);
>> +				skb->dev->priv_flags &= ~IFF_IN_NETPOLL;
>> +			} else
>> +#endif
> 
> There is no protection on dev->priv_flags for SMP access.
> It would better bit value in dev->state if you are using it as control flag.
> 
> Then you could use 
> 			if (unlikely(test_and_clear_bit(__IN_NETPOLL, &skb->dev->state)))
> 				netpoll_send_skb(...)
> 


Yes? netpoll_send_skb() needs to see IFF_IN_NETPOLL is set, so
we can't clear this bit before calling it.

But we do need a find a safe way to check/set this flag.


>>  static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
>>  {
>> +#ifdef CONFIG_NET_POLL_CONTROLLER
>> +	struct net_bridge *br = to->br;
>> +	if (br->dev->priv_flags & IFF_IN_NETPOLL) {
>> +		struct netpoll *np;
>> +		to->dev->npinfo = skb->dev->npinfo;
>> +		np = skb->dev->npinfo->netpoll;
>> +		np->real_dev = np->dev = to->dev;
>> +		to->dev->priv_flags |= IFF_IN_NETPOLL;
>> +	}
>> +#endif
> 
> This is n hot path, so use unlikely()


Ok, good point.


>> +#ifdef CONFIG_NET_POLL_CONTROLLER
>> +	if (br_devices_support_netpoll(br)) {
>> +		br->dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
>> +		if (br->dev->npinfo)
>> +			dev->npinfo = br->dev->npinfo;
>> +	} else if (!(br->dev->priv_flags & IFF_DISABLE_NETPOLL)) {
>> +		br->dev->priv_flags |= IFF_DISABLE_NETPOLL;
>> +		printk(KERN_INFO "New device %s does not support netpoll\n",
>> +			dev->name);
>> +		printk(KERN_INFO "Disabling netpoll for %s\n",
>> +			br->dev->name);
> 
> One message is sufficient.
> 

Yes? The first messages explains the reason for the second message.


Thanks.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox