Netdev List
 help / color / mirror / Atom feed
* [PATCH V5 5/6 net-next] macvtap: macvtap TX zero-copy support
From: Shirley Ma @ 2011-05-16 19:36 UTC (permalink / raw)
  To: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann
  Cc: netdev, kvm, linux-kernel

Only when buffer size is greater than GOODCOPY_LEN (256), macvtap
enables zero-copy.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 drivers/net/macvtap.c |  129 ++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 118 insertions(+), 11 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 6696e56..145e51c 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -60,6 +60,7 @@ static struct proto macvtap_proto = {
  */
 static dev_t macvtap_major;
 #define MACVTAP_NUM_DEVS 65536
+#define GOODCOPY_LEN 256
 static struct class *macvtap_class;
 static struct cdev macvtap_cdev;
 
@@ -340,6 +341,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
 {
 	struct net *net = current->nsproxy->net_ns;
 	struct net_device *dev = dev_get_by_index(net, iminor(inode));
+	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct macvtap_queue *q;
 	int err;
 
@@ -369,6 +371,15 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
 	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
 
+	/*
+	 * so far only KVM virtio_net uses macvtap, enable zero copy between
+	 * guest kernel and host kernel when lower device supports zerocopy
+	 */
+	if (vlan) {
+		if (vlan->lowerdev->features & NETIF_F_ZEROCOPY)
+			sock_set_flag(&q->sk, SOCK_ZEROCOPY);
+	}
+
 	err = macvtap_set_queue(dev, file, q);
 	if (err)
 		sock_put(&q->sk);
@@ -433,6 +444,80 @@ static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
 	return skb;
 }
 
+/* set skb frags from iovec, this can move to core network code for reuse */
+static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
+				  int offset, size_t count)
+{
+	int len = iov_length(from, count) - offset;
+	int copy = skb_headlen(skb);
+	int size, offset1 = 0;
+	int i = 0;
+	skb_frag_t *f;
+
+	/* Skip over from offset */
+	while (count && (offset >= from->iov_len)) {
+		offset -= from->iov_len;
+		++from;
+		--count;
+	}
+
+	/* copy up to skb headlen */
+	while (count && (copy > 0)) {
+		size = min_t(unsigned int, copy, from->iov_len - offset);
+		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
+				   size))
+			return -EFAULT;
+		if (copy > size) {
+			++from;
+			--count;
+		}
+		copy -= size;
+		offset1 += size;
+		offset = 0;
+	}
+
+	if (len == offset1)
+		return 0;
+
+	while (count--) {
+		struct page *page[MAX_SKB_FRAGS];
+		int num_pages;
+		unsigned long base;
+
+		len = from->iov_len - offset1;
+		if (!len) {
+			offset1 = 0;
+			++from;
+			continue;
+		}
+		base = (unsigned long)from->iov_base + offset1;
+		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
+		if ((num_pages != size) ||
+		    (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
+			/* put_page is in skb free */
+			return -EFAULT;
+		skb->data_len += len;
+		skb->len += len;
+		skb->truesize += len;
+		atomic_add(len, &skb->sk->sk_wmem_alloc);
+		while (len) {
+			f = &skb_shinfo(skb)->frags[i];
+			f->page = page[i];
+			f->page_offset = base & ~PAGE_MASK;
+			f->size = min_t(int, len, PAGE_SIZE - f->page_offset);
+			skb_shinfo(skb)->nr_frags++;
+			/* increase sk_wmem_alloc */
+			base += f->size;
+			len -= f->size;
+			i++;
+		}
+		offset1 = 0;
+		++from;
+	}
+	return 0;
+}
+
 /*
  * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
  * be shared with the tun/tap driver.
@@ -515,16 +600,18 @@ static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
 
 
 /* Get packet from user space buffer */
-static ssize_t macvtap_get_user(struct macvtap_queue *q,
-				const struct iovec *iv, size_t count,
-				int noblock)
+static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
+				const struct iovec *iv, unsigned long total_len,
+				size_t count, int noblock)
 {
 	struct sk_buff *skb;
 	struct macvlan_dev *vlan;
-	size_t len = count;
+	unsigned long len = total_len;
 	int err;
 	struct virtio_net_hdr vnet_hdr = { 0 };
 	int vnet_hdr_len = 0;
+	int copylen;
+	bool zerocopy = false;
 
 	if (q->flags & IFF_VNET_HDR) {
 		vnet_hdr_len = q->vnet_hdr_sz;
@@ -552,12 +639,32 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
 	if (unlikely(len < ETH_HLEN))
 		goto err;
 
-	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len, vnet_hdr.hdr_len,
-				noblock, &err);
+	if (m && m->msg_control)
+		zerocopy = true;
+
+	if (zerocopy) {
+		/* There are 256 bytes to be copied in skb, so there is enough
+		 * room for skb expand head in case it is used.
+		 * The rest buffer is mapped from userspace.
+		 */
+		copylen = vnet_hdr.hdr_len;
+		if (!copylen)
+			copylen = GOODCOPY_LEN;
+	} else
+		copylen = len;
+
+	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
+				vnet_hdr.hdr_len, noblock, &err);
 	if (!skb)
 		goto err;
 
-	err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len);
+	if (zerocopy) {
+		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
+		memcpy(&skb_shinfo(skb)->ubuf, m->msg_control,
+		       sizeof(struct skb_ubuf_info));
+	} else
+		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
+						   len);
 	if (err)
 		goto err_kfree;
 
@@ -579,7 +686,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
 		kfree_skb(skb);
 	rcu_read_unlock_bh();
 
-	return count;
+	return total_len;
 
 err_kfree:
 	kfree_skb(skb);
@@ -601,8 +708,8 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
 	ssize_t result = -ENOLINK;
 	struct macvtap_queue *q = file->private_data;
 
-	result = macvtap_get_user(q, iv, iov_length(iv, count),
-			      file->f_flags & O_NONBLOCK);
+	result = macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
+				  file->f_flags & O_NONBLOCK);
 	return result;
 }
 
@@ -815,7 +922,7 @@ static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
 			   struct msghdr *m, size_t total_len)
 {
 	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
-	return macvtap_get_user(q, m->msg_iov, total_len,
+	return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
 			    m->msg_flags & MSG_DONTWAIT);
 }
 

^ permalink raw reply related

* Re: linux-next: Tree for May 16 (net/ipv4/ping)
From: Randy Dunlap @ 2011-05-16 19:35 UTC (permalink / raw)
  To: Stephen Rothwell, netdev; +Cc: linux-next, LKML
In-Reply-To: <20110516151019.be338f1c.sfr@canb.auug.org.au>

On Mon, 16 May 2011 15:10:19 +1000 Stephen Rothwell wrote:

> Hi all,
> 
> Changes since 20110513:


when CONFIG_PROC_SYSCTL is not enabled:

ping.c:(.text+0x52af3): undefined reference to `inet_get_ping_group_range_net'

---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply

* Re: [PATCH V5 2/6 net-next] netdevice.h: Add zero-copy flag in netdevice
From: Ben Hutchings @ 2011-05-16 19:35 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann,
	netdev, kvm, linux-kernel
In-Reply-To: <1305574128.3456.23.camel@localhost.localdomain>

On Mon, 2011-05-16 at 12:28 -0700, Shirley Ma wrote:
> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
> 
>  include/linux/netdevice.h |   10 ++++++++++
>  1 files changed, 10 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index a134d80..2646251 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1066,6 +1066,16 @@ struct net_device {
>  #define NETIF_F_NOCACHE_COPY	(1 << 30) /* Use no-cache copyfromuser */
>  #define NETIF_F_LOOPBACK	(1 << 31) /* Enable loopback */
>  
> +/*
> + * Bit 31 is for device to map userspace buffers -- zerocopy
> + * Device can set this flag when it supports HIGHDMA.
> + * Device can't recycle this kind of skb buffers.
> + * There are 256 bytes copied, the rest of buffers are mapped.
> + * The userspace callback should only be called when last reference to this skb
> + * is gone.
> + */
> +#define NETIF_F_ZEROCOPY	(1 << 31)

Sorry, bit 31 is taken.  You get the job of turning features into a
wider bitmap.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* [PATCH V5 4/6 net-next] vhost: vhost TX zero-copy support
From: Shirley Ma @ 2011-05-16 19:34 UTC (permalink / raw)
  To: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann
  Cc: netdev, kvm, linux-kernel

This patch maintains the outstanding userspace buffers in the 
sequence it is delivered to vhost. The outstanding userspace buffers 
will be marked as done once the lower device buffers DMA has finished. 
This is monitored through last reference of kfree_skb callback. Two
buffer index are used for this purpose.

The vhost passes the userspace buffers info to lower device skb 
through message control. Since there will be some done DMAs when
entering vhost handle_tx. The worse case is all buffers in the vq are
in pending/done status, so we need to notify guest to release DMA done 
buffers first before get any new buffers from the vq.

Signed-off-by: Shirley <xma@us.ibm.com>
---

 drivers/vhost/net.c   |   39 +++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.c |   58 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |   12 ++++++++++
 3 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2f7c76a..c964000 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,10 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
+/* MAX number of TX used buffers for outstanding zerocopy */
+#define VHOST_MAX_PEND 128
+#define VHOST_GOODCOPY_LEN 256
+
 enum {
 	VHOST_NET_VQ_RX = 0,
 	VHOST_NET_VQ_TX = 1,
@@ -129,6 +133,7 @@ static void handle_tx(struct vhost_net *net)
 	int err, wmem;
 	size_t hdr_size;
 	struct socket *sock;
+	struct skb_ubuf_info pend;
 
 	/* TODO: check that we are running from vhost_worker? */
 	sock = rcu_dereference_check(vq->private_data, 1);
@@ -151,6 +156,10 @@ static void handle_tx(struct vhost_net *net)
 	hdr_size = vq->vhost_hlen;
 
 	for (;;) {
+		/* Release DMAs done buffers first */
+		if (atomic_read(&vq->refcnt) > VHOST_MAX_ZEROCOPY_PEND)
+			vhost_zerocopy_signal_used(vq, false);
+
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
 					 &out, &in,
@@ -166,6 +175,12 @@ static void handle_tx(struct vhost_net *net)
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
 			}
+			/* If more outstanding DMAs, queue the work */
+			if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND) {
+				tx_poll_start(net, sock);
+				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+				break;
+			}
 			if (unlikely(vhost_enable_notify(vq))) {
 				vhost_disable_notify(vq);
 				continue;
@@ -188,17 +203,37 @@ static void handle_tx(struct vhost_net *net)
 			       iov_length(vq->hdr, s), hdr_size);
 			break;
 		}
+		/* use msg_control to pass vhost zerocopy ubuf info to skb */
+		if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
+			vq->heads[vq->upend_idx].id = head;
+			if (len <= VHOST_GOODCOPY_LEN)
+				/* copy don't need to wait for DMA done */
+				vq->heads[vq->upend_idx].len =
+							VHOST_DMA_DONE_LEN;
+			else {
+				vq->heads[vq->upend_idx].len = len;
+				pend.callback = vhost_zerocopy_callback;
+				pend.arg = vq;
+				pend.desc = vq->upend_idx;
+				msg.msg_control = &pend;
+				msg.msg_controllen = sizeof(pend);
+			}
+			atomic_inc(&vq->refcnt);
+			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq, 1);
+			if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+				vhost_discard_vq_desc(vq, 1);
 			tx_poll_start(net, sock);
 			break;
 		}
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+			vhost_add_used_and_signal(&net->dev, vq, head, 0);
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ab2912..1a40310 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	vq->upend_idx = 0;
+	vq->done_idx = 0;
+	atomic_set(&vq->refcnt, 0);
 }
 
 static int vhost_worker(void *data)
@@ -385,16 +388,62 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
 	return 0;
 }
 
+/* Since we need to keep the order of used_idx as avail_idx, it's possible that
+ * DMA done not in order in lower device driver for some reason. To prevent
+ * used_idx out of order, upend_idx is used to track avail_idx order, done_idx
+ * is used to track used_idx order. Once lower device DMA done, then upend_idx
+ * can move to done_idx.
+ */
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown)
+{
+	int i, j = 0;
+
+	i = vq->done_idx;
+	while (i != vq->upend_idx) {
+		if ((vq->heads[i].len == VHOST_DMA_DONE_LEN) || shutdown) {
+			/* reset len = 0 */
+			vq->heads[i].len = 0;
+			i = (i + 1) % UIO_MAXIOV;
+			++j;
+		} else
+			break;
+	}
+	if (j) {
+		/* heads is an array, i is used to tracking the last done_idx,
+		 * if last done_idx is less than current done_idx, then
+		 * add_used_n is not contiguous */
+		if (i > vq->done_idx)
+			vhost_add_used_n(vq, &vq->heads[vq->done_idx], j);
+		else {
+			vhost_add_used_n(vq, &vq->heads[vq->done_idx],
+					 UIO_MAXIOV - vq->done_idx);
+			vhost_add_used_n(vq, vq->heads, i);
+		}
+		vq->done_idx = i;
+		vhost_signal(vq->dev, vq);
+		atomic_sub(j, &vq->refcnt);
+	}
+}
+
 /* Caller should have device mutex */
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
+	unsigned long begin = jiffies;
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
 			vhost_poll_stop(&dev->vqs[i].poll);
 			vhost_poll_flush(&dev->vqs[i].poll);
 		}
+		/* Wait for all lower device DMAs done, then notify virtio_net
+		 * or just notify it without waiting for all DMA done here ?
+		 * in case of DMAs never done for some reason */
+		if (atomic_read(&dev->vqs[i].refcnt)) {
+			/* how long should we wait ? */
+			msleep(1000);
+			vhost_zerocopy_signal_used(&dev->vqs[i], true);
+		}
 		if (dev->vqs[i].error_ctx)
 			eventfd_ctx_put(dev->vqs[i].error_ctx);
 		if (dev->vqs[i].error)
@@ -1416,3 +1465,12 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
 		vq_err(vq, "Failed to enable notification at %p: %d\n",
 		       &vq->used->flags, r);
 }
+
+void vhost_zerocopy_callback(struct sk_buff *skb)
+{
+	int idx = skb_shinfo(skb)->ubuf.desc;
+	struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
+
+	/* set len = 1 to mark this desc buffers done DMA */
+	vq->heads[idx].len = VHOST_DMA_DONE_LEN;
+}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b3363ae..8e3ecc7 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -13,6 +13,10 @@
 #include <linux/virtio_ring.h>
 #include <asm/atomic.h>
 
+/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
+ * done */
+#define VHOST_DMA_DONE_LEN	1
+
 struct vhost_device;
 
 struct vhost_work;
@@ -108,6 +112,12 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	/* vhost zerocopy support */
+	atomic_t refcnt; /* num of outstanding zerocopy DMAs */
+	/* copy of avail idx to monitor outstanding DMA zerocopy buffers */
+	int upend_idx;
+	/* copy of used idx to monintor DMA done zerocopy buffers */
+	int done_idx;
 };
 
 struct vhost_dev {
@@ -154,6 +164,8 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
+void vhost_zerocopy_callback(struct sk_buff *skb);
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq, bool shutdown);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \



^ permalink raw reply related

* [PATCH V5 3/6 net-next] skbuff: Add userspace zero-copy buffers in skb
From: Shirley Ma @ 2011-05-16 19:31 UTC (permalink / raw)
  To: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann
  Cc: netdev, kvm, linux-kernel

This patch adds userspace buffers support in skb shared info. A new 
struct skb_ubuf_info is needed to maintain the userspace buffers
argument and index, a callback is used to notify userspace to release
the buffers once lower device has done DMA (Last reference to that skb
has gone).

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 include/linux/skbuff.h |   26 ++++++++++++++++++++++++++
 net/core/skbuff.c      |   13 +++++++++++++
 2 files changed, 39 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 79aafbb..40faffe 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -189,6 +189,18 @@ enum {
 	SKBTX_DRV_NEEDS_SK_REF = 1 << 3,
 };
 
+/*
+ * The callback notifies userspace to release buffers when skb DMA is
done in
+ * lower device, the skb last reference should be 0 when calling this.
+ * The desc is used to track userspace buffer index.
+ */
+struct skb_ubuf_info {
+	/* support buffers allocation from userspace */
+	void		(*callback)(struct sk_buff *);
+	void		*arg;
+	size_t		desc;
+};
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -211,6 +223,10 @@ struct skb_shared_info {
 	/* Intermediate layers must ensure that destructor_arg
 	 * remains valid until skb destructor */
 	void *		destructor_arg;
+
+	/* DMA mapping from/to userspace buffers */
+	struct skb_ubuf_info ubuf;
+
 	/* must be last field, see pskb_expand_head() */
 	skb_frag_t	frags[MAX_SKB_FRAGS];
 };
@@ -2261,5 +2277,15 @@ static inline void
skb_checksum_none_assert(struct sk_buff *skb)
 }
 
 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
+
+/*
+ *	skb_ubuf - is the buffer from userspace
+ *	@skb: buffer to check
+ */
+static inline int skb_ubuf(const struct sk_buff *skb)
+{
+	return (skb_shinfo(skb)->ubuf.callback != NULL);
+}
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7ebeed0..9cbd3fc 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -210,6 +210,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t
gfp_mask,
 	shinfo = skb_shinfo(skb);
 	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 	atomic_set(&shinfo->dataref, 1);
+	shinfo->ubuf.callback = NULL;
+	shinfo->ubuf.arg = NULL;
 	kmemcheck_annotate_variable(shinfo->destructor_arg);
 
 	if (fclone) {
@@ -328,6 +330,14 @@ static void skb_release_data(struct sk_buff *skb)
 				put_page(skb_shinfo(skb)->frags[i].page);
 		}
 
+		/*
+		 * if skb buf is from userspace, we need to notify the caller
+		 * the lower device DMA has done;
+		 */
+		if (skb_shinfo(skb)->ubuf.callback) {
+			skb_shinfo(skb)->ubuf.callback(skb);
+			skb_shinfo(skb)->ubuf.callback = NULL;
+		}
 		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 
@@ -480,6 +490,9 @@ bool skb_recycle_check(struct sk_buff *skb, int
skb_size)
 	if (irqs_disabled())
 		return false;
 
+	if (skb_ubuf(skb))
+		return false;
+
 	if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE)
 		return false;
 

^ permalink raw reply related

* [PATCH V5 2/6 net-next] netdevice.h: Add zero-copy flag in netdevice
From: Shirley Ma @ 2011-05-16 19:28 UTC (permalink / raw)
  To: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann
  Cc: netdev, kvm, linux-kernel

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 include/linux/netdevice.h |   10 ++++++++++
 1 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a134d80..2646251 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1066,6 +1066,16 @@ struct net_device {
 #define NETIF_F_NOCACHE_COPY	(1 << 30) /* Use no-cache copyfromuser */
 #define NETIF_F_LOOPBACK	(1 << 31) /* Enable loopback */
 
+/*
+ * Bit 31 is for device to map userspace buffers -- zerocopy
+ * Device can set this flag when it supports HIGHDMA.
+ * Device can't recycle this kind of skb buffers.
+ * There are 256 bytes copied, the rest of buffers are mapped.
+ * The userspace callback should only be called when last reference to this skb
+ * is gone.
+ */
+#define NETIF_F_ZEROCOPY	(1 << 31)
+
 	/* Segmentation offload features */
 #define NETIF_F_GSO_SHIFT	16
 #define NETIF_F_GSO_MASK	0x00ff0000

^ permalink raw reply related

* [PATCH V5 1/6 net-next] sock.h: Add a new sock zero-copy flag
From: Shirley Ma @ 2011-05-16 19:26 UTC (permalink / raw)
  To: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann
  Cc: netdev, kvm, linux-kernel

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 include/net/sock.h |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index f2046e4..2229bd1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -563,6 +563,7 @@ enum sock_flags {
 	SOCK_TIMESTAMPING_SYS_HARDWARE, /* %SOF_TIMESTAMPING_SYS_HARDWARE */
 	SOCK_FASYNC, /* fasync() active */
 	SOCK_RXQ_OVFL,
+	SOCK_ZEROCOPY, /* buffers from userspace */
 };
 
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)

^ permalink raw reply related

* [PATCH V5 0/6 net-next] macvtap/vhost TX zero-copy support
From: Shirley Ma @ 2011-05-16 19:24 UTC (permalink / raw)
  To: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann
  Cc: netdev, kvm, linux-kernel

This patchset add supports for TX zero-copy between guest and host
kernel through vhost. It significantly reduces CPU utilization on the
local host on which the guest is located (It reduced 30-50% CPU usage
for vhost thread for single stream test). The patchset is based on
previous submission and comments from the community regarding when/how
to handle guest kernel buffers to be released. This is the simplest
approach I can think of after comparing with several other solutions.

This patchset has integrated V3 review comments from community: 

1. Add more comments on how to use device ZEROCOPY flag;

2. Change device ZEROCOPY to available bit 31

3. Fix skb header linear allocation when virtio_net GSO is not enabled

It has integrated V4 review comments from MST and Sridhar:

1. In vhost, using socket poll wake up for outstanding DMAs

2. Add detailed comments for vhost_zerocopy_signal_used call

3. Add sleep in vhost shutting down instead of busy-wait for outstanding
   DMAs.

4. Copy small packets, don't do zero-copy callback in mavtap, mark it's
   DMA done in vhost

5. change zerocopy to bool in macvtap.

One comment doesn't address:

Should we count the whole page in sk_wmem_alloc for zerocopy buffer? If
so, sock_wfree needs to change too.


This patchset includes:

1/6: Add a new sock zero-copy flag, SOCK_ZEROCOPY;

2/6: Add a new device flag, NETIF_F_ZEROCOPY for lower level device
support zero-copy;

3/6: Add a new struct skb_ubuf_info in skb_share_info for userspace
buffers release callback when lower device DMA has done for that skb,
which is the last reference count gone;

4/6: Add vhost zero-copy callback in vhost when skb last refcnt is gone;
add vhost_zerocopy_signal_used to notify guest to release TX skb
buffers.

5/6: Add macvtap zero-copy in lower device when sending packet is
greater than 256 bytes to make sure there is enough room for expanding
skb head.

6/6: example: enable Intel 10Gb NIC zero-copy feature flag

The patchset is built against most recent net-next linux 2.6.39-rc7. It
has passed netperf/netserver multiple streams stress test.

Single TCP_STREAM 120 secs test results over ixgbe 10Gb NIC results:

Message BW(Gb/s)qemu-kvm (NumCPU)vhost-net(NumCPU) PerfTop irq/s
4K      7408.57         92.1%           22.6%           1229
4K(Orig)4913.17         118.1%          84.1%           2086    
8K      9129.90         89.3%           23.3%           1141
8K(Orig)7094.55         115.9%          84.7%           2157
16K     9178.81         89.1%           23.3%           1139
16K(Orig)8927.1         118.7%          83.4%           2262
64K     9171.43         88.4%           24.9%           1253
64K(Orig)9085.85        115.9%          82.4%           2229

For message size less or equal than 2K, there is a known KVM guest TX
overrun issue. With this zero-copy patch, the issue becomes more severe,
guest io_exits has tripled than before, so the performance is not good.
Once the TX overrun problem has been addressed, I will retest the small
message size performance.


^ permalink raw reply

* Re: [PATCH net-next-2.6 v2] ipv4: more compliant RFC 3168 support
From: Joe Perches @ 2011-05-16 19:17 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Stefanos Harhalakis, David Miller, netdev
In-Reply-To: <1305571057.2581.2.camel@edumazet-laptop>

On Mon, 2011-05-16 at 20:37 +0200, Eric Dumazet wrote:
> Commit 6623e3b24a5e (ipv4: IP defragmentation must be ECN aware) was an
> attempt to not lose "Congestion Experienced" (CE) indications when
> performing datagram defragmentation.
> Stefanos Harhalakis raised the point that RFC 3168 requirements were not
> completely met by this commit.
> In particular, we MUST detect invalid combinations and eventually drop
> illegal frames.
> Reported-by: Stefanos Harhalakis <v13@v13.gr>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
> v2 : Use a table instead of a switch()

Just trivia:

bitmasks and hard coded constants can be a bit of a minefield.

> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> @@ -77,22 +77,42 @@ struct ipq {
[]
> +/* RFC 3168 support :
> + * We want to check ECN values of all fragments, do detect invalid combinations.
> + * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
> + */
> +enum {
> +	IPFRAG_ECN_NOT_ECT	= 0x01, /* one frag had ECN_NOT_ECT */
> +	IPFRAG_ECN_ECT_1	= 0x02, /* one frag had ECN_ECT_1 */
> +	IPFRAG_ECN_ECT_0	= 0x04, /* one frag had ECN_ECT_0 */
> +	IPFRAG_ECN_CE		= 0x08, /* one frag had ECN_CE */

Might be better to have a IPFRAG_ECN_TABLESIZE or some such.

	IPFRAG_ECN_TABLESIZE = 0x10,

> +static const u8 ip4_frag_ecn_table[16] = {

static const u8 ip4_frag_ecn_table[IPFRAG_ECN_TABLESIZE] = {



^ permalink raw reply

* Re: [PATCH] net: Change netdev_fix_features messages loglevel
From: David Miller @ 2011-05-16 19:14 UTC (permalink / raw)
  To: mirq-linux; +Cc: netdev, mst, herbert, bhutchings
In-Reply-To: <20110516131757.ED57E13A6A@rere.qmqm.pl>

From: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Date: Mon, 16 May 2011 15:17:57 +0200 (CEST)

> Those reduced to DEBUG can possibly be triggered by unprivileged processes
> and are nothing exceptional. Illegal checksum combinations can only be
> caused by driver bug, so promote those messages to WARN.
> 
> Since GSO without SG will now only cause DEBUG message from
> netdev_fix_features(), remove the workaround from register_netdevice().
> 
> Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH] CDC NCM: release interfaces fix in unbind()
From: David Miller @ 2011-05-16 19:13 UTC (permalink / raw)
  To: alexey.orishko; +Cc: netdev, linux-usb, gregkh, oliver, alexey.orishko
In-Reply-To: <1305491307-6548-1-git-send-email-alexey.orishko@stericsson.com>

From: Alexey Orishko <alexey.orishko@gmail.com>
Date: Sun, 15 May 2011 22:28:27 +0200

> Signed-off-by: Alexey Orishko <alexey.orishko@stericsson.com>

I don't know about this.

The usb_set_intfdata() calls at cdc_ncm_bind() time are unconditional,
and are performed regardless of whether ->data_claimed or
->control_claimed are set.

So conditionalizing the setting of them to NULL here doesn't seem
right at all.

I'm not applying this patch.

^ permalink raw reply

* Re: [RFC 2/3] RDMA/cma: Add support for netlink statistics export
From: Or Gerlitz @ 2011-05-16 19:13 UTC (permalink / raw)
  To: Hefty, Sean
  Cc: Or Gerlitz, Roland Dreier,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1828884A29C6694DAF28B7E6B8A82373F654-P5GAC/sN6hmkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>

Hefty, Sean <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:

> That assumes a 1:1 relationship between the port space and QP type, or more
> specifically, that the IB SID range is divided based on QP type.  This is outside of the
> architecture.  If we look at UC or XRC, it's not clear what port space those map to, but I
> believe it's desirable for the rdma_cm to support those.

makes sense, so we can add the qp type here.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] vmxnet3: Fix inconsistent LRO state after initialization
From: David Miller @ 2011-05-16 19:05 UTC (permalink / raw)
  To: shemminger; +Cc: thomas.jarosch, netdev, sbhatewara
In-Reply-To: <20110516101444.29eec9dc@nehalam>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 16 May 2011 10:14:44 -0700

> On Mon, 16 May 2011 18:28:15 +0200
> Thomas Jarosch <thomas.jarosch@intra2net.com> wrote:
> 
>> During initialization of vmxnet3, the state of LRO
>> gets out of sync with netdev->features.
 ...
>> Signed-off-by: Thomas Jarosch <thomas.jarosch@intra2net.com>
> 
> Thanks for finding this. Vyatta ended up changing the default of adapter->lro
> to workaround just this problem
> 
> Acked-by: Stephen Hemminger <shemminger@vyatta.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-2.6] sfc: Fix oops in register dump after mapping change
From: David Miller @ 2011-05-16 19:05 UTC (permalink / raw)
  To: bhutchings; +Cc: netdev, linux-net-drivers
In-Reply-To: <1305562429.2885.23.camel@bwh-desktop>

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 16 May 2011 17:13:49 +0100

> Commit 747df2258b1b9a2e25929ef496262c339c380009 ('sfc: Always map MCDI
> shared memory as uncacheable') introduced a separate mapping for the
> MCDI shared memory (MC_TREG_SMEM).  This means we can no longer easily
> include it in the register dump.  Since it is not particularly useful
> in debugging, substitute a recognisable dummy value.
> 
> Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>

Applied.

^ permalink raw reply

* Re: [PATCH] net: cpuvert to new cpumask API
From: David Miller @ 2011-05-16 18:53 UTC (permalink / raw)
  To: kosaki.motohiro; +Cc: linux-kernel, therbert, eric.dumazet, netdev
In-Reply-To: <20110428235813.3D5A.A69D9226@jp.fujitsu.com>

From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Thu, 28 Apr 2011 23:56:35 +0900 (JST)

> We plan to remove cpu_xx() old api later. Thus this patch
> convert it.
> 
> This patch has no functional change.
> 
> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH net-next-2.6 v2] ipv4: more compliant RFC 3168 support
From: David Miller @ 2011-05-16 18:49 UTC (permalink / raw)
  To: eric.dumazet; +Cc: v13, netdev
In-Reply-To: <1305571057.2581.2.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 16 May 2011 20:37:37 +0200

> Commit 6623e3b24a5e (ipv4: IP defragmentation must be ECN aware) was an
> attempt to not lose "Congestion Experienced" (CE) indications when
> performing datagram defragmentation.
> 
> Stefanos Harhalakis raised the point that RFC 3168 requirements were not
> completely met by this commit.
> 
> In particular, we MUST detect invalid combinations and eventually drop
> illegal frames.
> 
> Reported-by: Stefanos Harhalakis <v13@v13.gr>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
> v2 : Use a table instead of a switch()

Applied, thanks!

^ permalink raw reply

* [PATCH net-next-2.6 v2] ipv4: more compliant RFC 3168 support
From: Eric Dumazet @ 2011-05-16 18:37 UTC (permalink / raw)
  To: Stefanos Harhalakis; +Cc: David Miller, netdev
In-Reply-To: <1305566204.2898.49.camel@edumazet-laptop>

Commit 6623e3b24a5e (ipv4: IP defragmentation must be ECN aware) was an
attempt to not lose "Congestion Experienced" (CE) indications when
performing datagram defragmentation.

Stefanos Harhalakis raised the point that RFC 3168 requirements were not
completely met by this commit.

In particular, we MUST detect invalid combinations and eventually drop
illegal frames.

Reported-by: Stefanos Harhalakis <v13@v13.gr>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
v2 : Use a table instead of a switch()

 net/ipv4/ip_fragment.c |   60 ++++++++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 22 deletions(-)

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b1d282f..9e1458d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -77,22 +77,42 @@ struct ipq {
 	struct inet_peer *peer;
 };
 
-#define IPFRAG_ECN_CLEAR  0x01 /* one frag had INET_ECN_NOT_ECT */
-#define IPFRAG_ECN_SET_CE 0x04 /* one frag had INET_ECN_CE */
+/* RFC 3168 support :
+ * We want to check ECN values of all fragments, do detect invalid combinations.
+ * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
+ */
+enum {
+	IPFRAG_ECN_NOT_ECT	= 0x01, /* one frag had ECN_NOT_ECT */
+	IPFRAG_ECN_ECT_1	= 0x02, /* one frag had ECN_ECT_1 */
+	IPFRAG_ECN_ECT_0	= 0x04, /* one frag had ECN_ECT_0 */
+	IPFRAG_ECN_CE		= 0x08, /* one frag had ECN_CE */
+};
 
 static inline u8 ip4_frag_ecn(u8 tos)
 {
-	tos = (tos & INET_ECN_MASK) + 1;
-	/*
-	 * After the last operation we have (in binary):
-	 * INET_ECN_NOT_ECT => 001
-	 * INET_ECN_ECT_1   => 010
-	 * INET_ECN_ECT_0   => 011
-	 * INET_ECN_CE      => 100
-	 */
-	return (tos & 2) ? 0 : tos;
+	return 1 << (tos & INET_ECN_MASK);
 }
 
+/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
+ * Value : 0xff if frame should be dropped.
+ *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
+ */
+static const u8 ip4_frag_ecn_table[16] = {
+	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,
+
+	/* invalid combinations : drop frame */
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+};
+
 static struct inet_frags ip4_frags;
 
 int ip_frag_nqueues(struct net *net)
@@ -524,9 +544,15 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	int len;
 	int ihlen;
 	int err;
+	u8 ecn;
 
 	ipq_kill(qp);
 
+	ecn = ip4_frag_ecn_table[qp->ecn];
+	if (unlikely(ecn == 0xff)) {
+		err = -EINVAL;
+		goto out_fail;
+	}
 	/* Make the one we just received the head. */
 	if (prev) {
 		head = prev->next;
@@ -605,17 +631,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	iph = ip_hdr(head);
 	iph->frag_off = 0;
 	iph->tot_len = htons(len);
-	/* RFC3168 5.3 Fragmentation support
-	 * If one fragment had INET_ECN_NOT_ECT,
-	 *	reassembled frame also has INET_ECN_NOT_ECT
-	 * Elif one fragment had INET_ECN_CE
-	 *	reassembled frame also has INET_ECN_CE
-	 */
-	if (qp->ecn & IPFRAG_ECN_CLEAR)
-		iph->tos &= ~INET_ECN_MASK;
-	else if (qp->ecn & IPFRAG_ECN_SET_CE)
-		iph->tos |= INET_ECN_CE;
-
+	iph->tos |= ecn;
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
 	qp->q.fragments_tail = NULL;



^ permalink raw reply related

* Re: [PATCH net-next-2.6 0/3] be2net: patch series
From: David Miller @ 2011-05-16 18:14 UTC (permalink / raw)
  To: padmanabh.ratnakar; +Cc: netdev
In-Reply-To: <97ee12ca-4d87-4cff-bc15-fec8930d7ac3@exht1.ad.emulex.com>

From: Padmanabh Ratnakar <padmanabh.ratnakar@Emulex.Com>
Date: Mon, 16 May 2011 23:06:01 +0530

> Please Apply.

Applied, thanks.

^ permalink raw reply

* Re: [PATCH] output ipconfig info message as one printk
From: Micha Nelissen @ 2011-05-16 18:12 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110516.140035.72516501072074389.davem@davemloft.net>

David Miller wrote:
> Second, you should not do this because now the lines after the
> first won't have the default loglevel prepended.
> 
> I think this should just be left alone, and you should properly
> mark your netconsole logs so that you can discern which machine
> the individual messages come from so you can piece them together
> properly if you need to.

Hmm, perhaps the printks can at least be grouped per line?

Micha

^ permalink raw reply

* Re: [PATCH v3 0/2] Fix uevent race in register_netdevice()
From: David Miller @ 2011-05-16 18:11 UTC (permalink / raw)
  To: kvalo; +Cc: netdev, linux-wireless, linux-kernel
In-Reply-To: <20110516143913.13838.85357.stgit@localhost6.localdomain6>

From: Kalle Valo <kvalo@adurom.com>
Date: Mon, 16 May 2011 17:46:30 +0300

> I'm trying to fix a race in register_netdevice(). The problem is that
> there's a uevent to userspace before the netdevice is ready for use. The
> problem is described here:
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=15606
> 
> I have sent few different ways to fix this, but none of them have been
> really usable. Now I came up with a way which changes the driver core
> to make it possible send the uevent in a separate call. This is a clean
> and safe way to fix the race. Downside is that two new functions are
> added to the driver core interface.
> 
> Please comment.

This doesn't work.

The sysfs file will still be there before the uevent, so any
process can go in there, and see the inconsistent state.

^ permalink raw reply

* Re: [PATCH] ethtool: ETHTOOL_SFEATURES: remove NETIF_F_COMPAT return
From: David Miller @ 2011-05-16 18:09 UTC (permalink / raw)
  To: mirq-linux; +Cc: bhutchings, netdev
In-Reply-To: <20110516142340.GA2980@rere.qmqm.pl>


You guys really need to sort this out properly.

Please resubmit whatever final solution is agreed upon.

Thanks.

^ permalink raw reply

* Re: [PATCH 1/1] igmp: fix ip_mc_clear_src to not reset ip_mc_list->sf{mode,count}
From: David Miller @ 2011-05-16 18:03 UTC (permalink / raw)
  To: vfalico
  Cc: mmarek, kuznet, pekkas, jmorris, yoshfuji, kaber, linux-kbuild,
	linux-kernel, netdev, dlstevens
In-Reply-To: <20110515165945.GA20024@darkmag.usersys.redhat.com>

From: Veaceslav Falico <vfalico@redhat.com>
Date: Sun, 15 May 2011 18:59:45 +0200

> ip_mc_clear_src resets the imc->sfcount and imc->sfmode, without taking into
> account the current number of sockets listening on that multicast struct, which
> can lead to bogus routes for local listeners.
> 
> On NETDEV_DOWN/UP event, if there were 3 multicast listeners for that interface's
> address, the imc->sfcount[MCAST_EXCLUDE] will be reset to 1. And after that a
> listener socket destroys, multicast traffic will not be delivered to local
> listeners because __mkroute_output drops the local flag for the route (by
> checking ip_check_mc).
> 
> Signed-off-by: Veaceslav Falico <vfalico@redhat.com>

David, please take a look at this.  Thanks.

> diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
> index 1fd3d9c..b14f371 100644
> --- a/net/ipv4/igmp.c
> +++ b/net/ipv4/igmp.c
> @@ -1775,9 +1775,6 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
>  		kfree(psf);
>  	}
>  	pmc->sources = NULL;
> -	pmc->sfmode = MCAST_EXCLUDE;
> -	pmc->sfcount[MCAST_INCLUDE] = 0;
> -	pmc->sfcount[MCAST_EXCLUDE] = 1;
>  }
>  
>  

^ permalink raw reply

* Re: [PATCH] ipconfig wait for carrier
From: David Miller @ 2011-05-16 18:02 UTC (permalink / raw)
  To: micha; +Cc: netdev
In-Reply-To: <1305405386-25187-1-git-send-email-micha@neli.hopto.org>

From: Micha Nelissen <micha@neli.hopto.org>
Date: Sat, 14 May 2011 22:36:26 +0200

> Currently the ip auto configuration has a hardcoded delay of 1 second.
> When (ethernet) link takes longer to come up (e.g. more than 3 seconds),
> nfs root may not be found.
> 
> Remove the hardcoded delay, and wait for carrier on at least one network
> device.

Again, you are missing a proper "Signed-off-by: " line, also:

> 
> Index: atom-linux/net/ipv4/ipconfig.c
> ===================================================================
> --- atom-linux/net/ipv4/ipconfig.c	(revision 1445)
> +++ atom-linux/net/ipv4/ipconfig.c	(working copy)

I'm not taking patches which are against some random atom-linux tree,
you have to make your patches against the current upstream sources
only.

^ permalink raw reply

* Re: [PATCH] output ipconfig info message as one printk
From: David Miller @ 2011-05-16 18:00 UTC (permalink / raw)
  To: micha; +Cc: netdev
In-Reply-To: <1305409519-25404-1-git-send-email-micha@neli.hopto.org>

From: Micha Nelissen <micha@neli.hopto.org>
Date: Sat, 14 May 2011 23:45:19 +0200

> the "ip-config complete" message with ip address etc is output using
> many printks. When using the netconsole, and multiple agents are booting
> (and logging their boot) simultaneously, the syslog of the receiving
> host gets very messy, as the individual printks are interleaved.
> Combining the many printks into one printk improves syslog readability.

First, you are missing a proper "Signed-off-by: " tag in your
patch submissions.

Second, you should not do this because now the lines after the
first won't have the default loglevel prepended.

I think this should just be left alone, and you should properly
mark your netconsole logs so that you can discern which machine
the individual messages come from so you can piece them together
properly if you need to.

^ permalink raw reply

* Re: [PATCH 1/1] netlink: nla_nest_end return "unsigned init"
From: David Miller @ 2011-05-16 17:51 UTC (permalink / raw)
  To: linville; +Cc: wey-yi.w.guy, linux-wireless, ipw3945-devel, netdev
In-Reply-To: <20110516173548.GB6551@tuxdriver.com>

From: "John W. Linville" <linville@tuxdriver.com>
Date: Mon, 16 May 2011 13:35:48 -0400

> This should go to netdev@vger.kernel.org instead...
> 
> On Mon, May 16, 2011 at 07:23:30AM -0700, Wey-Yi Guy wrote:
>> skb->len has unsigned int, return the correct value from nla_nest_end call.
>> 
>> Signed-off-by: Wey-Yi Guy <wey-yi.w.guy@intel.com>

Plus, the only users of the return value use it to feed "int"
return values of functions.  F.e. net/core/neighbour.c's use
in neightbl_fill_parms().

I don't see this as being any better or worse, and we should
just leave the return value alone.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox