Netdev List
 help / color / mirror / Atom feed
* [RFC][PATCH v4 15/18] Manipulate external buffers in mp device.
From: xiaohui.xin @ 2010-04-25  9:20 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1272187206-18534-14-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

Add the code that creates the external buffers and the code that destroys them.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 drivers/vhost/mpassthru.c |  237 ++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 236 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index c5ede17..b171f21 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -159,6 +159,39 @@ static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
 	return ret;
 }
 
+/* The main function to allocate external buffers */
+static struct skb_external_page *page_ctor(struct mpassthru_port *port,
+		struct sk_buff *skb, int npages)
+{
+	int i;
+	unsigned long flags;
+	struct page_ctor *ctor;
+	struct page_info *info = NULL;
+
+	ctor = container_of(port, struct page_ctor, port);
+
+	spin_lock_irqsave(&ctor->read_lock, flags);
+	if (!list_empty(&ctor->readq)) {
+		info = list_first_entry(&ctor->readq, struct page_info, list);
+		list_del(&info->list);
+	}
+	spin_unlock_irqrestore(&ctor->read_lock, flags);
+	if (!info)
+		return NULL;
+
+	for (i = 0; i < info->pnum; i++) {
+		get_page(info->pages[i]);
+		info->frag[i].page = info->pages[i];
+		info->frag[i].page_offset = i ? 0 : info->offset;
+		info->frag[i].size = port->npages > 1 ? PAGE_SIZE :
+			port->data_len;
+	}
+	info->skb = skb;
+	info->ext_page.frags = info->frag;
+	info->ext_page.ushinfo = &info->ushinfo;
+	return &info->ext_page;
+}
+
 static int page_ctor_attach(struct mp_struct *mp)
 {
 	int rc;
@@ -191,7 +224,7 @@ static int page_ctor_attach(struct mp_struct *mp)
 
 	dev_hold(dev);
 	ctor->dev = dev;
-	ctor->port.ctor = NULL;
+	ctor->port.ctor = page_ctor;
 	ctor->port.sock = &mp->socket;
 	ctor->lock_pages = 0;
 	rc = netdev_mp_port_attach(dev, &ctor->port);
@@ -258,6 +291,52 @@ static int set_memlock_rlimit(struct page_ctor *ctor, int resource,
 	task_unlock(current->group_leader);
 	return 0;
 }
+static void mp_ki_dtor(struct kiocb *iocb)
+{
+	struct page_info *info = (struct page_info *)(iocb->private);
+	int i;
+
+	if (info->flags == INFO_READ) {
+		for (i = 0; i < info->pnum; i++) {
+			if (info->pages[i]) {
+				set_page_dirty_lock(info->pages[i]);
+				put_page(info->pages[i]);
+			}
+		}
+		skb_shinfo(info->skb)->destructor_arg = &info->ext_page;
+		info->skb->destructor = NULL;
+		kfree_skb(info->skb);
+	}
+	/* Decrement the number of locked pages */
+	info->ctor->lock_pages -= info->pnum;
+	kmem_cache_free(info->ctor->cache, info);
+
+	return;
+}
+
+static struct kiocb *create_iocb(struct page_info *info, int size)
+{
+	struct kiocb *iocb = NULL;
+
+	iocb = info->iocb;
+	if (!iocb)
+		return iocb;
+	iocb->ki_flags = 0;
+	iocb->ki_users = 1;
+	iocb->ki_key = 0;
+	iocb->ki_ctx = NULL;
+	iocb->ki_cancel = NULL;
+	iocb->ki_retry = NULL;
+	iocb->ki_iovec = NULL;
+	iocb->ki_eventfd = NULL;
+	iocb->ki_pos = info->desc_pos;
+	iocb->ki_nbytes = size;
+	iocb->ki_dtor(iocb);
+	iocb->private = (void *)info;
+	iocb->ki_dtor = mp_ki_dtor;
+
+	return iocb;
+}
 
 static int page_ctor_detach(struct mp_struct *mp)
 {
@@ -275,6 +354,7 @@ static int page_ctor_detach(struct mp_struct *mp)
 		for (i = 0; i < info->pnum; i++)
 			if (info->pages[i])
 				put_page(info->pages[i]);
+		create_iocb(info, 0);
 		kmem_cache_free(ctor->cache, info);
 	}
 	set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
@@ -328,6 +408,161 @@ static void mp_put(struct mp_file *mfile)
 		mp_detach(mfile->mp);
 }
 
+/* The callback to destruct the external buffers or skb */
+static void page_dtor(struct skb_external_page *ext_page)
+{
+	struct page_info *info;
+	struct page_ctor *ctor;
+	struct sock *sk;
+	struct sk_buff *skb;
+	struct kiocb *iocb = NULL;
+	unsigned long flags;
+
+	if (!ext_page)
+		return;
+	info = container_of(ext_page, struct page_info, ext_page);
+	if (!info)
+		return;
+	ctor = info->ctor;
+	skb = info->skb;
+
+	if ((info->flags == INFO_READ) && info->skb)
+		info->skb->head = NULL;
+
+	/* If the info->total is 0, make it to be reused */
+	if (!info->total) {
+		spin_lock_irqsave(&ctor->read_lock, flags);
+		list_add(&info->list, &ctor->readq);
+		spin_unlock_irqrestore(&ctor->read_lock, flags);
+		return;
+	}
+
+	if (info->flags == INFO_READ)
+		return;
+
+	/* For transmit, we should wait for the DMA finish by hardware.
+	 * Queue the notifier to wake up the backend driver
+	 */
+
+	iocb = create_iocb(info, info->total);
+
+	sk = ctor->port.sock->sk;
+	sk->sk_write_space(sk);
+
+	return;
+}
+
+/* For small external buffers transmit, we don't need to call
+ * get_user_pages().
+ */
+static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
+		struct kiocb *iocb, int total)
+{
+	struct page_info *info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);
+
+	if (!info)
+		return NULL;
+	info->total = total;
+	info->ext_page.dtor = page_dtor;
+	info->ctor = ctor;
+	info->flags = INFO_WRITE;
+	info->iocb = iocb;
+	return info;
+}
+
+/* The main function to transform the guest user space address
+ * to host kernel address via get_user_pages(). Thus the hardware
+ * can do DMA directly to the external buffer address.
+ */
+static struct page_info *alloc_page_info(struct page_ctor *ctor,
+		struct kiocb *iocb, struct iovec *iov,
+		int count, struct frag *frags,
+		int npages, int total)
+{
+	int rc;
+	int i, j, n = 0;
+	int len;
+	unsigned long base, lock_limit;
+	struct page_info *info = NULL;
+
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit >>= PAGE_SHIFT;
+
+	if (ctor->lock_pages + count > lock_limit) {
+		printk(KERN_INFO "exceed the locked memory rlimit.");
+		return NULL;
+	}
+
+	info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);
+
+	if (!info)
+		return NULL;
+
+	for (i = j = 0; i < count; i++) {
+		base = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+
+		if (!len)
+			continue;
+		n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+
+		rc = get_user_pages_fast(base, n, npages ? 1 : 0,
+				&info->pages[j]);
+		if (rc != n)
+			goto failed;
+
+		while (n--) {
+			frags[j].offset = base & ~PAGE_MASK;
+			frags[j].size = min_t(int, len,
+					PAGE_SIZE - frags[j].offset);
+			len -= frags[j].size;
+			base += frags[j].size;
+			j++;
+		}
+	}
+
+#ifdef CONFIG_HIGHMEM
+	if (npages && !(dev->features & NETIF_F_HIGHDMA)) {
+		for (i = 0; i < j; i++) {
+			if (PageHighMem(info->pages[i]))
+				goto failed;
+		}
+	}
+#endif
+
+	info->total = total;
+	info->ext_page.dtor = page_dtor;
+	info->ctor = ctor;
+	info->pnum = j;
+	info->iocb = iocb;
+	if (!npages)
+		info->flags = INFO_WRITE;
+	if (info->flags == INFO_READ) {
+		info->ext_page.start = (u8 *)(((unsigned long)
+				(pfn_to_kaddr(page_to_pfn(info->pages[0]))) +
+				frags[0].offset));
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+		info->ext_page.size = SKB_DATA_ALIGN(
+				iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD);
+#else
+		info->ext_page.size = SKB_DATA_ALIGN(
+				iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD) -
+			NET_IP_ALIGN - NET_SKB_PAD;
+#endif
+	}
+	/* increment the number of locked pages */
+	ctor->lock_pages += j;
+	return info;
+
+failed:
+	for (i = 0; i < j; i++)
+		put_page(info->pages[i]);
+
+	kmem_cache_free(ctor->cache, info);
+
+	return NULL;
+}
+
 /* Ops structure to mimic raw sockets with mp device */
 static const struct proto_ops mp_socket_ops = {
 };
-- 
1.5.4.4

^ permalink raw reply related

* [RFC][PATCH v4 16/18] Export proto_ops to vhost-net driver.
From: xiaohui.xin @ 2010-04-25  9:20 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1272187206-18534-15-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

Currently, vhost-net is the only user of the mp device.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 drivers/vhost/mpassthru.c |  321 ++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 317 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index b171f21..0ac1a71 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -563,8 +563,321 @@ failed:
 	return NULL;
 }
 
+static void mp_sock_destruct(struct sock *sk)
+{
+	struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+	kfree(mp);
+}
+
+static void mp_sock_state_change(struct sock *sk)
+{
+	if (sk_has_sleeper(sk))
+		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN);
+}
+
+static void mp_sock_write_space(struct sock *sk)
+{
+	if (sk_has_sleeper(sk))
+		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT);
+}
+
+static void mp_sock_data_ready(struct sock *sk, int coming)
+{
+	struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+	struct page_ctor *ctor = NULL;
+	struct sk_buff *skb = NULL;
+	struct page_info *info = NULL;
+	struct ethhdr *eth;
+	struct kiocb *iocb = NULL;
+	int len, i;
+
+	struct virtio_net_hdr hdr = {
+		.flags = 0,
+		.gso_type = VIRTIO_NET_HDR_GSO_NONE
+	};
+
+	ctor = rcu_dereference(mp->ctor);
+	if (!ctor)
+		return;
+
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		if (skb_shinfo(skb)->destructor_arg) {
+			info = container_of(skb_shinfo(skb)->destructor_arg,
+					struct page_info, ext_page);
+			info->skb = skb;
+			if (skb->len > info->len) {
+				mp->dev->stats.rx_dropped++;
+				DBG(KERN_INFO "Discarded truncated rx packet: "
+				    " len %d > %zd\n", skb->len, info->len);
+				info->total = skb->len;
+				goto clean;
+			} else {
+				int i;
+				struct skb_shared_info *gshinfo =
+					(struct skb_shared_info *)
+					(&info->ushinfo);
+				struct skb_shared_info *hshinfo =
+					skb_shinfo(skb);
+
+				if (gshinfo->nr_frags < hshinfo->nr_frags)
+					goto clean;
+				eth = eth_hdr(skb);
+				skb_push(skb, ETH_HLEN);
+
+				hdr.hdr_len = skb_headlen(skb);
+				info->total = skb->len;
+
+				for (i = 0; i < gshinfo->nr_frags; i++)
+					gshinfo->frags[i].size = 0;
+				for (i = 0; i < hshinfo->nr_frags; i++)
+					gshinfo->frags[i].size =
+						hshinfo->frags[i].size;
+				memcpy(skb_shinfo(skb), &info->ushinfo,
+						sizeof(struct skb_shared_info));
+			}
+		} else {
+			/* The skb composed with kernel buffers
+			 * in case external buffers are not sufficent.
+			 * The case should be rare.
+			 */
+			unsigned long flags;
+			int i;
+			struct skb_shared_info *gshinfo = NULL;
+
+			info = NULL;
+
+			spin_lock_irqsave(&ctor->read_lock, flags);
+			if (!list_empty(&ctor->readq)) {
+				info = list_first_entry(&ctor->readq,
+						struct page_info, list);
+				list_del(&info->list);
+			}
+			spin_unlock_irqrestore(&ctor->read_lock, flags);
+			if (!info) {
+				DBG(KERN_INFO
+				    "No external buffer avaliable %p\n",
+				    skb);
+				skb_queue_head(&sk->sk_receive_queue,
+						skb);
+				break;
+			}
+			info->skb = skb;
+			/* compute the guest skb frags info */
+			gshinfo = (struct skb_shared_info *)
+				  (info->ext_page.start +
+				  SKB_DATA_ALIGN(info->ext_page.size));
+
+			if (gshinfo->nr_frags < skb_shinfo(skb)->nr_frags)
+				goto clean;
+
+			eth = eth_hdr(skb);
+			skb_push(skb, ETH_HLEN);
+			info->total = skb->len;
+
+			for (i = 0; i < gshinfo->nr_frags; i++)
+				gshinfo->frags[i].size = 0;
+			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+				gshinfo->frags[i].size =
+					skb_shinfo(skb)->frags[i].size;
+			hdr.hdr_len = min_t(int, skb->len,
+					info->iov[1].iov_len);
+			skb_copy_datagram_iovec(skb, 0, info->iov, skb->len);
+		}
+
+		len = memcpy_toiovec(info->hdr, (unsigned char *)&hdr,
+				sizeof hdr);
+		if (len) {
+			DBG(KERN_INFO
+				"Unable to write vnet_hdr at addr %p: %d\n",
+				info->hdr->iov_base, len);
+			goto clean;
+		}
+
+		iocb = create_iocb(info, skb->len + sizeof(hdr));
+		continue;
+
+clean:
+		kfree_skb(skb);
+		for (i = 0; info->pages[i]; i++)
+			put_page(info->pages[i]);
+		kmem_cache_free(ctor->cache, info);
+	}
+	return;
+}
+
+static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *m, size_t total_len)
+{
+	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+	struct page_ctor *ctor;
+	struct iovec *iov = m->msg_iov;
+	struct page_info *info = NULL;
+	struct frag frags[MAX_SKB_FRAGS];
+	struct sk_buff *skb;
+	int count = m->msg_iovlen;
+	int total = 0, header, n, i, len, rc;
+	unsigned long base;
+
+	ctor = rcu_dereference(mp->ctor);
+	if (!ctor)
+		return -ENODEV;
+
+	total = iov_length(iov, count);
+
+	if (total < ETH_HLEN)
+		return -EINVAL;
+
+	if (total <= COPY_THRESHOLD)
+		goto copy;
+
+	n = 0;
+	for (i = 0; i < count; i++) {
+		base = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+		if (!len)
+			continue;
+		n += ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		if (n > MAX_SKB_FRAGS)
+			return -EINVAL;
+	}
+
+copy:
+	header = total > COPY_THRESHOLD ? COPY_HDR_LEN : total;
+
+	skb = alloc_skb(header + NET_IP_ALIGN, GFP_ATOMIC);
+	if (!skb)
+		goto drop;
+
+	skb_reserve(skb, NET_IP_ALIGN);
+
+	skb_set_network_header(skb, ETH_HLEN);
+
+	memcpy_fromiovec(skb->data, iov, header);
+	skb_put(skb, header);
+	skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);
+
+	if (header == total) {
+		rc = total;
+		info = alloc_small_page_info(ctor, iocb, total);
+	} else {
+		info = alloc_page_info(ctor, iocb, iov, count, frags, 0, total);
+		if (info)
+			for (i = 0; info->pages[i]; i++) {
+				skb_add_rx_frag(skb, i, info->pages[i],
+						frags[i].offset, frags[i].size);
+				info->pages[i] = NULL;
+			}
+	}
+	if (info != NULL) {
+		info->desc_pos = iocb->ki_pos;
+		info->total = total;
+		info->skb = skb;
+		skb_shinfo(skb)->destructor_arg = &info->ext_page;
+		skb->dev = mp->dev;
+		dev_queue_xmit(skb);
+		return 0;
+	}
+drop:
+	kfree_skb(skb);
+	if (info) {
+		for (i = 0; info->pages[i]; i++)
+			put_page(info->pages[i]);
+		kmem_cache_free(info->ctor->cache, info);
+	}
+	mp->dev->stats.tx_dropped++;
+	return -ENOMEM;
+}
+
+static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *m, size_t total_len,
+		int flags)
+{
+	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+	struct page_ctor *ctor;
+	struct iovec *iov = m->msg_iov;
+	int count = m->msg_iovlen;
+	int npages, payload;
+	struct page_info *info;
+	struct frag frags[MAX_SKB_FRAGS];
+	unsigned long base;
+	int i, len;
+	unsigned long flag;
+
+	if (!(flags & MSG_DONTWAIT))
+		return -EINVAL;
+
+	ctor = rcu_dereference(mp->ctor);
+	if (!ctor)
+		return -EINVAL;
+
+	/* Error detections in case invalid external buffer */
+	if (count > 2 && iov[1].iov_len < ctor->port.hdr_len &&
+			mp->dev->features & NETIF_F_SG) {
+		return -EINVAL;
+	}
+
+	npages = ctor->port.npages;
+	payload = ctor->port.data_len;
+
+	/* If KVM guest virtio-net FE driver use SG feature */
+	if (count > 2) {
+		for (i = 2; i < count; i++) {
+			base = (unsigned long)iov[i].iov_base & ~PAGE_MASK;
+			len = iov[i].iov_len;
+			if (npages == 1)
+				len = min_t(int, len, PAGE_SIZE - base);
+			else if (base)
+				break;
+			payload -= len;
+			if (payload <= 0)
+				goto proceed;
+			if (npages == 1 || (len & ~PAGE_MASK))
+				break;
+		}
+	}
+
+	if ((((unsigned long)iov[1].iov_base & ~PAGE_MASK)
+				- NET_SKB_PAD - NET_IP_ALIGN) >= 0)
+		goto proceed;
+
+	return -EINVAL;
+
+proceed:
+	/* skip the virtnet head */
+	iov++;
+	count--;
+
+	if (!ctor->lock_pages)
+		set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+				iocb->ki_user_data * 4096,
+				iocb->ki_user_data * 4096);
+
+	/* Translate address to kernel */
+	info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0);
+	if (!info)
+		return -ENOMEM;
+	info->len = total_len;
+	info->hdr[0].iov_base = iocb->ki_iovec[0].iov_base;
+	info->hdr[0].iov_len = iocb->ki_iovec[0].iov_len;
+	info->offset = frags[0].offset;
+	info->desc_pos = iocb->ki_pos;
+
+	iov--;
+	count++;
+
+	memcpy(info->iov, iov, sizeof(struct iovec) * count);
+
+	spin_lock_irqsave(&ctor->read_lock, flag);
+	list_add_tail(&info->list, &ctor->readq);
+	spin_unlock_irqrestore(&ctor->read_lock, flag);
+
+	return 0;
+}
+
 /* Ops structure to mimic raw sockets with mp device */
 static const struct proto_ops mp_socket_ops = {
+	.sendmsg = mp_sendmsg,
+	.recvmsg = mp_recvmsg,
 };
 
 static struct proto mp_proto = {
@@ -687,10 +1000,10 @@ static long mp_chr_ioctl(struct file *file, unsigned int cmd,
 		sk->sk_sndbuf = INT_MAX;
 		container_of(sk, struct mp_sock, sk)->mp = mp;
 
-		sk->sk_destruct = NULL;
-		sk->sk_data_ready = NULL;
-		sk->sk_write_space = NULL;
-		sk->sk_state_change = NULL;
+		sk->sk_destruct = mp_sock_destruct;
+		sk->sk_data_ready = mp_sock_data_ready;
+		sk->sk_write_space = mp_sock_write_space;
+		sk->sk_state_change = mp_sock_state_change;
 		ret = mp_attach(mp, file);
 		if (ret < 0)
 			goto err_free_sk;
-- 
1.5.4.4

^ permalink raw reply related

* [RFC][PATCH v4 17/18] Add a kconfig entry and make entry for mp device.
From: xiaohui.xin @ 2010-04-25  9:20 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1272187206-18534-16-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 drivers/vhost/Kconfig  |   10 ++++++++++
 drivers/vhost/Makefile |    2 ++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index e4e2fd1..a6b8cbf 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
 	  To compile this driver as a module, choose M here: the module will
 	  be called vhost_net.
 
+config MEDIATE_PASSTHRU
+	tristate "mediate passthru network driver (EXPERIMENTAL)"
+	depends on VHOST_NET
+	---help---
+	  Zero-copy network I/O support. We call it mediate passthru to
+	  distinguish it from hardware passthru.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
-- 
1.5.4.4

^ permalink raw reply related

* [RFC][PATCH v4 00/18] Provide a zero-copy method on KVM virtio-net.
From: xiaohui.xin @ 2010-04-25  9:20 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike
In-Reply-To: <1272187206-18534-18-git-send-email-xiaohui.xin@intel.com>

We provide a zero-copy method by which the driver side can get external
buffers to DMA into. Here "external" means the driver does not use kernel
space to allocate skb buffers. Currently the external buffers can come
from the guest virtio-net driver.

The idea is simple, just to pin the guest VM user space and then
let host NIC driver has the chance to directly DMA to it. 
The patches are based on the vhost-net backend driver. We add a device
which provides proto_ops (sendmsg/recvmsg) to vhost-net, so that it can
send/receive directly to/from the NIC driver. A KVM guest that uses the
vhost-net backend may bind any ethX interface on the host side to
get copyless data transfer through the guest virtio-net frontend.

patch 01-12:  	net core changes.
patch 13-17:  	new device as interface to manipulate external buffers.
patch 18: 	for vhost-net.

The guest virtio-net driver submits multiple requests thru vhost-net
backend driver to the kernel. And the requests are queued and then
completed after corresponding actions in h/w are done.

For read, user-space buffers are dispensed to the NIC driver for rx when
a page constructor API is invoked. This means NICs can allocate user
buffers from a page constructor. We add a hook in the netif_receive_skb()
function to intercept the incoming packets and notify the zero-copy device.

For write, the zero-copy device may allocate a new host skb, put the
payload on skb_shinfo(skb)->frags, and copy the header to skb->data.
The request remains pending until the skb is transmitted by h/w.

Here, we have considered two ways to utilize the page constructor
API to dispense the user buffers.

One:	Modify the __alloc_skb() function a bit so that it allocates only
	the sk_buff structure, with the data pointer pointing to a
	user buffer that comes from a page constructor API.
	The shinfo of the skb then also comes from the guest.
	When a packet is received from hardware, skb->data is filled
	directly by h/w. This is the way we have implemented.

	Pros:	We can avoid any copy here.
	Cons:	The guest virtio-net driver needs to allocate skbs in almost
		the same way as the host NIC drivers, e.g. the size
		used by netdev_alloc_skb() and the same reserved space at the
		head of the skb. Many NIC drivers match the guest here and
		are fine. But some of the latest NIC drivers reserve special
		room in the skb head. To deal with this, we suggest providing
		a method in the guest virtio-net driver to ask the NIC driver
		for the parameters we are interested in once we know which
		device we have bound for zero-copy. Then we ask the guest to
		allocate accordingly. Is that reasonable?

Two:	Modify the driver to get user buffers allocated from a page constructor
	API (substituting for alloc_page()); the user buffers are used as payload
	buffers and filled by h/w directly when a packet is received. The driver
	should associate the pages with the skb (skb_shinfo(skb)->frags). For
	the head buffer side, let the host allocate the skb, and h/w fills it.
	After that, the data filled in the host skb header will be copied into
	the guest header buffer, which is submitted together with the payload buffer.

	Pros:	We need to care less about how the guest or host allocates
		their buffers.
	Cons:	We still need a small copy here for the skb header.

We are not sure which way is better here. This is the first thing we want
comments on from the community. We would like the modification to the network
part to be generic, so that it is not used by the vhost-net backend only; a user
application may use it as well once the zero-copy device provides async
read/write operations later.

Please give comments especially for the network part modifications.


We provide multiple submits and asynchronous notification to
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later. But in a simple
test with netperf, we found that both bandwidth and CPU % went up,
though the bandwidth increase ratio is much larger than the CPU % increase ratio.

What we have not done yet:
	packet split support
	To support GRO
	Performance tuning

what we have done in v1:
	polish the RCU usage
	deal with write logging in asynchronous mode in vhost
	add notifier block for mp device
	rename page_ctor to mp_port in netdevice.h to make it look generic
	add mp_dev_change_flags() for mp device to change NIC state
	add CONFIG_VHOST_MPASSTHRU to limit the usage when module is not loaded
	a small fix for missing dev_put when fail
	using dynamic minor instead of static minor number
	a __KERNEL__ protect to mp_get_sock()

what we have done in v2:
	
	remove most of the RCU usage, since the ctor pointer is only
	changed by BIND/UNBIND ioctl, and during that time, NIC will be
	stopped to get good cleanup(all outstanding requests are finished),
	so the ctor pointer cannot be raced into wrong situation.

	Remove the struct vhost_notifier with struct kiocb.
	Let vhost-net backend to alloc/free the kiocb and transfer them
	via sendmsg/recvmsg.

	use get_user_pages_fast() and set_page_dirty_lock() when read.

	Add some comments for netdev_mp_port_prep() and handle_mpassthru().

what we have done in v3:
	the async write logging is rewritten 
	a drafted synchronous write function for qemu live migration
	a limit for locked pages from get_user_pages_fast() to prevent Dos
	by using RLIMIT_MEMLOCK
	

what we have done in v4:
	add iocb completion callback from vhost-net to queue iocb in mp device
	replace vq->receiver by mp_sock_data_ready()
	remove stuff in mp device which access structures from vhost-net
	modify skb_reserve() to ignore host NIC driver reserved space
	rebase to the latest vhost tree
	split large patches into small pieces, especially for net core part.
	
		
performance:
	using netperf with GSO/TSO disabled, 10G NIC, 
	disabled packet split mode, with raw socket case compared to vhost.

	bandwidth goes from 1.1Gbps to 1.7Gbps
	CPU % from 120%-140% to 140%-160%

^ permalink raw reply

* [PATCH net-next-2.6] netns: call ops_free right after ops_exit
From: Jiri Pirko @ 2010-04-25  9:26 UTC (permalink / raw)
  To: netdev; +Cc: davem, ebiederm

There's no need to iterate this twice. We can free net generic variables right
after exit is called.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index bd8c471..16217bc 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -82,7 +82,7 @@ static void ops_free_list(const struct pernet_operations *ops,
 static __net_init int setup_net(struct net *net)
 {
 	/* Must be called with net_mutex held */
-	const struct pernet_operations *ops, *saved_ops;
+	const struct pernet_operations *ops;
 	int error = 0;
 	LIST_HEAD(net_exit_list);
 
@@ -105,13 +105,10 @@ out_undo:
 	 * for the pernet modules whose init functions did not fail.
 	 */
 	list_add(&net->exit_list, &net_exit_list);
-	saved_ops = ops;
-	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+	list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
 		ops_exit_list(ops, &net_exit_list);
-
-	ops = saved_ops;
-	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
 		ops_free_list(ops, &net_exit_list);
+	}
 
 	rcu_barrier();
 	goto out;
@@ -231,13 +228,14 @@ static void cleanup_net(struct work_struct *work)
 	 */
 	synchronize_rcu();
 
-	/* Run all of the network namespace exit methods */
-	list_for_each_entry_reverse(ops, &pernet_list, list)
+	/*
+	 * Run all of the network namespace exit methods and free
+	 * the net generic variables
+	 */
+	list_for_each_entry_reverse(ops, &pernet_list, list) {
 		ops_exit_list(ops, &net_exit_list);
-
-	/* Free the net generic variables */
-	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_free_list(ops, &net_exit_list);
+	}
 
 	mutex_unlock(&net_mutex);
 

^ permalink raw reply related

* Re: [RFC][PATCH v4 05/18] Add a function to indicate if device use external buffer.
From: Changli Gao @ 2010-04-25  9:33 UTC (permalink / raw)
  To: xiaohui.xin; +Cc: netdev, kvm, linux-kernel, mst, mingo, davem, jdike
In-Reply-To: <1272187206-18534-5-git-send-email-xiaohui.xin@intel.com>

On Sun, Apr 25, 2010 at 5:19 PM,  <xiaohui.xin@intel.com> wrote:
> +static int dev_is_mpassthru(struct net_device *dev)
> +{
> +       if (dev && dev->mp_port)
> +               return 1;
> +       return 0;
> +}
> +

Please make it a inline function. And you would write it with less
lines of code.

return dev && dev->mp_port;


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [RFC][PATCH v4 05/18] Add a function to indicate if device use external buffer.
From: Changli Gao @ 2010-04-25  9:35 UTC (permalink / raw)
  To: xiaohui.xin; +Cc: netdev, kvm, linux-kernel, mst, mingo, davem, jdike
In-Reply-To: <1272187206-18534-5-git-send-email-xiaohui.xin@intel.com>

On Sun, Apr 25, 2010 at 5:19 PM,  <xiaohui.xin@intel.com> wrote:
> +static int dev_is_mpassthru(struct net_device *dev)

A bool return value would be better here.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [RFC][PATCH v4 05/18] Add a function to indicate if device use external buffer.
From: David Miller @ 2010-04-25  9:51 UTC (permalink / raw)
  To: xiaosuo; +Cc: xiaohui.xin, netdev, kvm, linux-kernel, mst, mingo, jdike
In-Reply-To: <t2g412e6f7f1004250233pbdfe4d8aj7dc0e0a4980d8eb9@mail.gmail.com>

From: Changli Gao <xiaosuo@gmail.com>
Date: Sun, 25 Apr 2010 17:33:02 +0800

> On Sun, Apr 25, 2010 at 5:19 PM,  <xiaohui.xin@intel.com> wrote:
>> +static int dev_is_mpassthru(struct net_device *dev)
>> +{
>> +       if (dev && dev->mp_port)
>> +               return 1;
>> +       return 0;
>> +}
>> +
> 
> Please make it a inline function. And you would write it with less
> lines of code.
> 
> return dev && dev->mp_port;

And use "bool" :-)

^ permalink raw reply

* Re: [RFC][PATCH v4 05/18] Add a function to indicate if device use external buffer.
From: David Miller @ 2010-04-25  9:51 UTC (permalink / raw)
  To: xiaosuo; +Cc: xiaohui.xin, netdev, kvm, linux-kernel, mst, mingo, jdike
In-Reply-To: <t2z412e6f7f1004250235ib1002b3doe604d786dadefaa5@mail.gmail.com>

From: Changli Gao <xiaosuo@gmail.com>
Date: Sun, 25 Apr 2010 17:35:01 +0800

> On Sun, Apr 25, 2010 at 5:19 PM,  <xiaohui.xin@intel.com> wrote:
>> +static int dev_is_mpassthru(struct net_device *dev)
> 
> bool return value should be better here.

Right.

^ permalink raw reply

* Re: [RFC][PATCH v4 00/18] Provide a zero-copy method on KVM virtio-net.
From: David Miller @ 2010-04-25  9:55 UTC (permalink / raw)
  To: xiaohui.xin; +Cc: netdev, kvm, linux-kernel, mst, mingo, jdike
In-Reply-To: <1272187206-18534-19-git-send-email-xiaohui.xin@intel.com>

From: xiaohui.xin@intel.com
Date: Sun, 25 Apr 2010 17:20:06 +0800

> The idea is simple, just to pin the guest VM user space and then let
> host NIC driver has the chance to directly DMA to it.

Isn't it much easier to map the RX ring of the network device into the
guest's address space, have DMA map calls translate guest addresses to
physical/DMA addresses as well as do all of this crazy page pinning
stuff, and provide the translations and protections via the IOMMU?

What's being proposed here looks a bit over-engineered.

^ permalink raw reply

* Re: [PATCH net-next-2.6] netns: call ops_free right after ops_exit
From: David Miller @ 2010-04-25  9:59 UTC (permalink / raw)
  To: jpirko; +Cc: netdev, ebiederm
In-Reply-To: <20100425092600.GB2866@psychotron.redhat.com>

From: Jiri Pirko <jpirko@redhat.com>
Date: Sun, 25 Apr 2010 11:26:01 +0200

> There's no need to iterate this twice. We can free net generic
> variables right after exit is called.
>
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>

Are you sure there are no problems with doing this?

What if there are inter-net variable reference dependencies
or something like that?

I really suspect it is being done this way on purpose, but
in the end I defer to experts like Eric B. :-)

^ permalink raw reply

* Re: [RFC][PATCH v4 00/18] Provide a zero-copy method on KVM virtio-net.
From: Michael S. Tsirkin @ 2010-04-25 10:46 UTC (permalink / raw)
  To: David Miller; +Cc: xiaohui.xin, netdev, kvm, linux-kernel, mingo, jdike
In-Reply-To: <20100425.025529.123989625.davem@davemloft.net>

On Sun, Apr 25, 2010 at 02:55:29AM -0700, David Miller wrote:
> From: xiaohui.xin@intel.com
> Date: Sun, 25 Apr 2010 17:20:06 +0800
> 
> > The idea is simple, just to pin the guest VM user space and then let
> > host NIC driver has the chance to directly DMA to it.
> 
> Isn't it much easier to map the RX ring of the network device into the
> guest's address space, have DMA map calls translate guest addresses to
> physical/DMA addresses as well as do all of this crazy page pinning
> stuff, and provide the translations and protections via the IOMMU?

This means the guest needs to know how the specific network device works,
so we won't be able to, for example, move a guest between different hosts.
There are other problems: many physical systems do not have an IOMMU,
some guest OSes do not support DMA map calls, and doing a VM exit
on each DMA map call might turn out to be very slow. And so on.

> What's being proposed here looks a bit over-engineered.

This is an attempt to reduce overhead for virtio (paravirtualization).
'Don't use PV' is kind of an alternative, but I do not
think it's a simpler one.

-- 
MST

^ permalink raw reply

* Re: [PATCH] can: Add driver for esd CAN-USB/2 device
From: Wolfgang Grandegger @ 2010-04-25 10:53 UTC (permalink / raw)
  To: Matthias Fuchs; +Cc: netdev, socketcan-core
In-Reply-To: <201004231015.16751.matthias.fuchs@esd.eu>

Hi Matthias,

Matthias Fuchs wrote:
> This patch adds a driver for esd's USB high speed
> CAN interface. The driver supports devices with
> multiple CAN interfaces.
> 
> Signed-off-by: Matthias Fuchs <matthias.fuchs@esd.eu>

Could you please add support for the recently added feature:

  commit 52c793f24054f5dc30d228e37e0e19cc8313f086
  Author: Wolfgang Grandegger <wg@grandegger.com>
  Date:   Mon Feb 22 22:21:17 2010 +0000

    can: netlink support for bus-error reporting and counters
    
    This patch makes the bus-error reporting configurable and allows to
    retrieve the CAN TX and RX bus error counters via netlink interface.
    I have added support for the SJA1000. The TX and RX bus error counters
    are also copied to the data fields 6..7 of error messages when state
    changes are reported.

Should not be a big deal. Also, please CC the Linux USB
mailing list. Some minor comments below:

> ---
>  drivers/net/can/usb/Kconfig    |    6 +
>  drivers/net/can/usb/Makefile   |    1 +
>  drivers/net/can/usb/esd_usb2.c | 1107 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 1114 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/net/can/usb/esd_usb2.c
> 
...
> diff --git a/drivers/net/can/usb/esd_usb2.c b/drivers/net/can/usb/esd_usb2.c
> new file mode 100644
> index 0000000..c714ce9
> --- /dev/null
> +++ b/drivers/net/can/usb/esd_usb2.c
...
> +struct id_filter_msg {
> +	u8 len;
> +	u8 cmd;
> +	u8 net;
> +	u8 option;
> +	__le32 mask[65];

ESD_MAX_ID_SEGMENT + 1 ?

...
> +static netdev_tx_t esd_usb2_start_xmit(struct sk_buff *skb,
> +				      struct net_device *netdev)
> +{
> +	struct esd_usb2_net_priv *priv = netdev_priv(netdev);
> +	struct esd_usb2 *dev = priv->usb2;
> +	struct esd_tx_urb_context *context = NULL;
> +	struct net_device_stats *stats = &netdev->stats;
> +	struct can_frame *cf = (struct can_frame *)skb->data;
> +	struct esd_usb2_msg *msg;
> +	struct urb *urb;
> +	u8 *buf;
> +	int i, err;
> +	int ret = NETDEV_TX_OK;
> +	size_t size = sizeof(struct esd_usb2_msg);
> +
> +	if (can_dropped_invalid_skb(netdev, skb))
> +		return NETDEV_TX_OK;
> +
> +	/* create a URB, and a buffer for it, and copy the data to the URB */
> +	urb = usb_alloc_urb(0, GFP_ATOMIC);
> +	if (!urb) {
> +		dev_err(netdev->dev.parent, "No memory left for URBs\n");
> +		stats->tx_dropped++;
> +		dev_kfree_skb(skb);
> +		goto nourbmem;
> +	}
> +
> +	buf = usb_buffer_alloc(dev->udev, size, GFP_ATOMIC, &urb->transfer_dma);
> +	if (!buf) {
> +		dev_err(netdev->dev.parent, "No memory left for USB buffer\n");
> +		stats->tx_dropped++;
> +		dev_kfree_skb(skb);
> +		goto nobufmem;
> +	}
> +
> +	msg = (struct esd_usb2_msg *)buf;
> +
> +	msg->msg.hdr.len = 3; /* minimal length */
> +	msg->msg.hdr.cmd = CMD_CAN_TX;
> +	msg->msg.tx.net = priv->index;
> +	msg->msg.tx.dlc = cf->can_dlc;
> +	msg->msg.tx.id = cpu_to_le32(cf->can_id & CAN_ERR_MASK);
> +
> +	if (cf->can_id & CAN_RTR_FLAG)
> +		msg->msg.tx.dlc |= ESD_RTR;
> +
> +	if (cf->can_id & CAN_EFF_FLAG)
> +		msg->msg.tx.id |= cpu_to_le32(ESD_EXTID);
> +
> +	for (i = 0; i < cf->can_dlc; i++)
> +		msg->msg.tx.data[i] = cf->data[i];
> +
> +	msg->msg.hdr.len += (cf->can_dlc + 3) >> 2;
> +
> +	for (i = 0; i < MAX_TX_URBS; i++) {
> +		if (priv->tx_contexts[i].echo_index == MAX_TX_URBS) {
> +			context = &priv->tx_contexts[i];
> +			break;
> +		}
> +	}
> +
> +	/*
> +	 * This may never happen.
> +	 */
> +	if (!context) {
> +		dev_warn(netdev->dev.parent, "couldn't find free context\n");
> +		ret = NETDEV_TX_BUSY;
> +		goto releasebuf;
> +	}
> +
> +	context->priv = priv;
> +	context->echo_index = i;
> +	context->dlc = cf->can_dlc;
> +
> +	/* hnd must not be 0 */
> +	msg->msg.tx.hnd = 0x80000000 | i; /* returned in TX done message */

ESD_USB2_UBR ?

Wolfgang.

^ permalink raw reply

* Re: [RFC][PATCH v4 00/18] Provide a zero-copy method on KVM virtio-net.
From: Michael S. Tsirkin @ 2010-04-25 12:14 UTC (permalink / raw)
  To: xiaohui.xin; +Cc: netdev, kvm, linux-kernel, mingo, davem, jdike
In-Reply-To: <1272187206-18534-19-git-send-email-xiaohui.xin@intel.com>

On Sun, Apr 25, 2010 at 05:20:06PM +0800, xiaohui.xin@intel.com wrote:
> We provide an zero-copy method which driver side may get external
> buffers to DMA. Here external means driver don't use kernel space
> to allocate skb buffers. Currently the external buffer can be from
> guest virtio-net driver.
> 
> The idea is simple, just to pin the guest VM user space and then
> let host NIC driver has the chance to directly DMA to it. 
> The patches are based on vhost-net backend driver. We add a device
> which provides proto_ops as sendmsg/recvmsg to vhost-net to
> send/recv directly to/from the NIC driver. KVM guest who use the
> vhost-net backend may bind any ethX interface in the host side to
> get copyless data transfer thru guest virtio-net frontend.
> 
> patch 01-12:  	net core changes.
> patch 13-17:  	new device as interface to mantpulate external buffers.
> patch 18: 	for vhost-net.
> 
> The guest virtio-net driver submits multiple requests thru vhost-net
> backend driver to the kernel. And the requests are queued and then
> completed after corresponding actions in h/w are done.
> 
> For read, user space buffers are dispensed to NIC driver for rx when
> a page constructor API is invoked. Means NICs can allocate user buffers
> from a page constructor. We add a hook in netif_receive_skb() function
> to intercept the incoming packets, and notify the zero-copy device.
> 
> For write, the zero-copy deivce may allocates a new host skb and puts
> payload on the skb_shinfo(skb)->frags, and copied the header to skb->data.
> The request remains pending until the skb is transmitted by h/w.
> 
> Here, we have ever considered 2 ways to utilize the page constructor
> API to dispense the user buffers.
> 
> One:	Modify __alloc_skb() function a bit, it can only allocate a 
> 	structure of sk_buff, and the data pointer is pointing to a 
> 	user buffer which is coming from a page constructor API.
> 	Then the shinfo of the skb is also from guest.
> 	When packet is received from hardware, the skb->data is filled
> 	directly by h/w. What we have done is in this way.
> 
> 	Pros:	We can avoid any copy here.
> 	Cons:	Guest virtio-net driver needs to allocate skb as almost
> 		the same method with the host NIC drivers, say the size
> 		of netdev_alloc_skb() and the same reserved space in the
> 		head of skb. Many NIC drivers are the same with guest and
> 		ok for this. But some lastest NIC drivers reserves special
> 		room in skb head. To deal with it, we suggest to provide
> 		a method in guest virtio-net driver to ask for parameter
> 		we interest from the NIC driver when we know which device 
> 		we have bind to do zero-copy. Then we ask guest to do so.
> 		Is that reasonable?

Do you still do this?

> Two:	Modify driver to get user buffer allocated from a page constructor
> 	API(to substitute alloc_page()), the user buffer are used as payload
> 	buffers and filled by h/w directly when packet is received. Driver
> 	should associate the pages with skb (skb_shinfo(skb)->frags). For 
> 	the head buffer side, let host allocates skb, and h/w fills it. 
> 	After that, the data filled in host skb header will be copied into
> 	guest header buffer which is submitted together with the payload buffer.
> 
> 	Pros:	We could less care the way how guest or host allocates their
> 		buffers.
> 	Cons:	We still need a bit copy here for the skb header.
> 
> We are not sure which way is the better here. This is the first thing we want
> to get comments from the community. We wish the modification to the network
> part will be generic which not used by vhost-net backend only, but a user
> application may use it as well when the zero-copy device may provides async
> read/write operations later.

I commented on this in the past. Do you still want comments?

> Please give comments especially for the network part modifications.
> 
> 
> We provide multiple submits and asynchronous notifiicaton to 
> vhost-net too.
> 
> Our goal is to improve the bandwidth and reduce the CPU usage.
> Exact performance data will be provided later. But for simple
> test with netperf, we found bindwidth up and CPU % up too,
> but the bindwidth up ratio is much more than CPU % up ratio.
> 
> What we have not done yet:
> 	packet split support
> 	To support GRO
> 	Performance tuning
> 
> what we have done in v1:
> 	polish the RCU usage
> 	deal with write logging in asynchroush mode in vhost
> 	add notifier block for mp device
> 	rename page_ctor to mp_port in netdevice.h to make it looks generic
> 	add mp_dev_change_flags() for mp device to change NIC state
> 	add CONIFG_VHOST_MPASSTHRU to limit the usage when module is not load
> 	a small fix for missing dev_put when fail
> 	using dynamic minor instead of static minor number
> 	a __KERNEL__ protect to mp_get_sock()
> 
> what we have done in v2:
> 	
> 	remove most of the RCU usage, since the ctor pointer is only
> 	changed by BIND/UNBIND ioctl, and during that time, NIC will be
> 	stopped to get good cleanup(all outstanding requests are finished),
> 	so the ctor pointer cannot be raced into wrong situation.
> 
> 	Remove the struct vhost_notifier with struct kiocb.
> 	Let vhost-net backend to alloc/free the kiocb and transfer them
> 	via sendmsg/recvmsg.
> 
> 	use get_user_pages_fast() and set_page_dirty_lock() when read.
> 
> 	Add some comments for netdev_mp_port_prep() and handle_mpassthru().
> 
> what we have done in v3:
> 	the async write logging is rewritten 
> 	a drafted synchronous write function for qemu live migration
> 	a limit for locked pages from get_user_pages_fast() to prevent Dos
> 	by using RLIMIT_MEMLOCK
> 	
> 
> what we have done in v4:
> 	add iocb completion callback from vhost-net to queue iocb in mp device
> 	replace vq->receiver by mp_sock_data_ready()
> 	remove stuff in mp device which access structures from vhost-net
> 	modify skb_reserve() to ignore host NIC driver reserved space
> 	rebase to the latest vhost tree
> 	split large patches into small pieces, especially for net core part.
> 	
> 		
> performance:
> 	using netperf with GSO/TSO disabled, 10G NIC, 
> 	disabled packet split mode, with raw socket case compared to vhost.
> 
> 	bindwidth will be from 1.1Gbps to 1.7Gbps
> 	CPU % from 120%-140% to 140%-160%

That's nice. The thing to do is probably to enable GSO/TSO
and see what we get this way. Also, mergeable buffer support
was recently posted and I hope to merge it for 2.6.35.
You might want to take a look.

-- 
MST

^ permalink raw reply

* Re: PROBLEM: Linux kernel 2.6.31 IPv4 TCP fails to open huge amount of outgoing connections (unable to bind ... )
From: Michael S. Tsirkin @ 2010-04-25 14:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Evgeniy Polyakov, Ben Greear, David Miller, Gaspar Chilingarov,
	netdev
In-Reply-To: <1271877975.7895.3171.camel@edumazet-laptop>

On Wed, Apr 21, 2010 at 09:26:15PM +0200, Eric Dumazet wrote:
> Le mercredi 21 avril 2010 à 22:58 +0400, Evgeniy Polyakov a écrit :
> 
> > Damn it, I tried multiple times :)
> > You are right of course!
> > 
> 
> Here is a formal patch then :)
> 
> [PATCH] tcp: bind() fix when many ports are bound
> 
> Port autoselection done by kernel only works when number of bound
> sockets is under a threshold (typically 30000).
> 
> When this threshold is over, we must check if there is a conflict before
> exiting first loop in inet_csk_get_port()
> 
> Change inet_csk_bind_conflict() to forbid two reuse-enabled sockets to
> bind on same (address,port) tuple (with a non ANY address)
> 
> Same change for inet6_csk_bind_conflict()
> 
> Reported-by: Gaspar Chilingarov <gasparch@gmail.com>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  net/ipv4/inet_connection_sock.c  |   16 +++++++++++-----
>  net/ipv6/inet6_connection_sock.c |   15 ++++++++++-----
>  2 files changed, 21 insertions(+), 10 deletions(-)
> 
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index e0a3e35..78cbc39 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -70,13 +70,17 @@ int inet_csk_bind_conflict(const struct sock *sk,
>  		    (!sk->sk_bound_dev_if ||
>  		     !sk2->sk_bound_dev_if ||
>  		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> +			const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
> +
>  			if (!reuse || !sk2->sk_reuse ||
>  			    sk2->sk_state == TCP_LISTEN) {
> -				const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
>  				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
>  				    sk2_rcv_saddr == sk_rcv_saddr)
>  					break;
> -			}
> +			} else if (reuse && sk2->sk_reuse &&
> +				   sk2_rcv_saddr &&
> +				   sk2_rcv_saddr == sk_rcv_saddr)
> +				break;
>  		}
>  	}
>  	return node != NULL;
> @@ -120,9 +124,11 @@ again:
>  						smallest_size = tb->num_owners;
>  						smallest_rover = rover;
>  						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
> -							spin_unlock(&head->lock);
> -							snum = smallest_rover;
> -							goto have_snum;
> +							if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
> +								spin_unlock(&head->lock);
> +								snum = smallest_rover;
> +								goto have_snum;
> +							}
>  						}
>  					}
>  					goto next;
> diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> index 0c5e3c3..fb6959c 100644
> --- a/net/ipv6/inet6_connection_sock.c
> +++ b/net/ipv6/inet6_connection_sock.c
> @@ -42,11 +42,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
>  		if (sk != sk2 &&
>  		    (!sk->sk_bound_dev_if ||
>  		     !sk2->sk_bound_dev_if ||
> -		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> -		    (!sk->sk_reuse || !sk2->sk_reuse ||
> -		     sk2->sk_state == TCP_LISTEN) &&
> -		     ipv6_rcv_saddr_equal(sk, sk2))
> -			break;
> +		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> +			if ((!sk->sk_reuse || !sk2->sk_reuse ||
> +			     sk2->sk_state == TCP_LISTEN) &&
> +			     ipv6_rcv_saddr_equal(sk, sk2))
> +				break;
> +			else if (sk->sk_reuse && sk2->sk_reuse &&
> +				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
> +				ipv6_rcv_saddr_equal(sk, sk2))
> +				break;
> +		}
>  	}
>  
>  	return node != NULL;
> 

With this applied, my box crashes on boot:
rhel6 beta userspace, v2.6.34-rc5-204-gddc9b34 kernel.
2.6.34-rc5 kernel boots fine.
the crash seems to be around net/ipv6/inet6_connection_sock.c:50
after reverting fda48a0d7a8412cedacda46a9c0bf8ef9cd13559,
the crash goes away.

I created https://bugzilla.kernel.org/show_bug.cgi?id=15847
to track this.

Oops below:

BUG: unable to handle kernel NULL pointer dereference at
0000000000000004
IP: [<ffffffffa02b99aa>] inet6_csk_bind_conflict+0x6a/0x110 [ipv6]
PGD 0 
Oops: 0000 [#1] SMP 
last sysfs file:
/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/net/eth0/ifindex
CPU 9 
Modules linked in: ip6t_REJECT nf_conntrack_ipv6 ip6table_filter
ip6_tables ipv6 dm_mirror dm_region_hash dm_log igb i2c_i801 sg iTCO_wdt
iTCO_vendor_support shpchp ioatdma dca pcspkr sr_mod cdrom ext4 mbcache
jbd2 sd_mod ata_generic crc_t10dif pata_acpi ahci pata_jmicron radeon
ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mod [last unloaded:
scsi_wait_scan]

Pid: 1640, comm: master Not tainted 2.6.34-rc5-mst #1 X8DTN/X8DTN
RIP: 0010:[<ffffffffa02b99aa>]  [<ffffffffa02b99aa>]
inet6_csk_bind_conflict+0x6a/0x110 [ipv6]
RSP: 0018:ffff8803357a7d98  EFLAGS: 00010293
RAX: 0000000000000000 RBX: ffff880335709440 RCX: 0000000000000000
RDX: 0000000000020011 RSI: ffff880335709440 RDI: ffff880334c61e78
RBP: ffff8803357a7db8 R08: 0000000000000019 R09: 0000000000000019
R10: 00000000000000d4 R11: 0000000000000400 R12: ffff880335709468
R13: ffff880334c61800 R14: ffff880335489500 R15: ffffffff8225d700
FS:  00007feacd26f7c0(0000) GS:ffff8801c5700000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000004 CR3: 00000003341ef000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process master (pid: 1640, threadinfo ffff8803357a6000, task
ffff880334225540)
Stack:
 0000000000000000 ffffffff8225b500 ffffc9001251ced0 ffff880334c61800
<0> ffff8803357a7e48 ffffffff81418fa8 ffff880300000019 ffffffff8149ceb6
<0> 0000000536306140 0000000000000246 ffff8803357a7e08 0000000000000246
Call Trace:
 [<ffffffff81418fa8>] inet_csk_get_port+0x238/0x450
 [<ffffffff8149ceb6>] ? _raw_spin_lock_bh+0x16/0x40
 [<ffffffff8149ce15>] ? _raw_read_unlock_bh+0x15/0x20
 [<ffffffffa0290226>] ? ipv6_chk_addr+0xe6/0x100 [ipv6]

-- 
MST

^ permalink raw reply

* 2.6.34-rc5+: oops in IPv6
From: Manuel Lauss @ 2010-04-25 14:45 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

2.6.34-rc5-00204-gddc9b34  dies when sshd (openssh 5.5) is started
Last pull I made on April 23 was fine.

(transcribed from a photo):

BUG: unable to handle kernel NULL pointer dereference at 00000004
IP: [<b1535b72>] inet6_csk_bind_conflict+0x6e/0xb0

EIP: 0060:[<b1535b72>] EFLAGS: 00010293 CPU: 0
EAX: 0000 EBX: ed49c8c0 ECX: 00000000 EDX: 00000000
ESI: ed49c8dc EDI: ee223040 EBP: ef940058 ESP: ed7e9e84
 DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068

Stack:
 ee223040 eebf12e0 b1927198 b14dee45 00000016 ffffffff 00000016 00000005
<0> e8dd766c c8dd75d0 ed7e9ee0 ee223040 ed7e9f04 ee22346c b1517846
<0> ed7e9ef4 00000000 00000016 0000001c 00000000 b166ae1c ef422800 affdb8ab
Call Trace:
b14dee45  inet_csk_get_port+0x1a5/0x27c
b1517846 inet6_bind+0x1b5/0x293
b14aec6c sys_bind+0x63
b1524fdc ipv6_setsockopt+0x38/0x88
b14e0cd7 tcp_setsockopt+0x1b/0x36
b14afa68 sock_common_setsockopt+0x12
b14ae653 sys_setsockopt+0x5e
sys_socketcall
...

GDB says:

0xb1535b72 is in inet6_csk_bind_conflict
(/usr/src/linux-2.6.git/include/net/ipv6.h:376).
371     void ip6_frag_init(struct inet_frag_queue *q, void *a);
372     int ip6_frag_match(struct inet_frag_queue *q, void *a);
373
374     static inline int ipv6_addr_any(const struct in6_addr *a)
375     {
376             return ((a->s6_addr32[0] | a->s6_addr32[1] |
377                      a->s6_addr32[2] | a->s6_addr32[3] ) == 0);
378     }
379
380     static inline int ipv6_addr_loopback(const struct in6_addr *a)


Thanks,
      Manuel Lauss

^ permalink raw reply

* Re: [PATCH net-next-2.6] netns: call ops_free right after ops_exit
From: Eric W. Biederman @ 2010-04-25 14:50 UTC (permalink / raw)
  To: David Miller; +Cc: jpirko, netdev
In-Reply-To: <20100425.025902.94572342.davem@davemloft.net>

David Miller <davem@davemloft.net> writes:

> From: Jiri Pirko <jpirko@redhat.com>
> Date: Sun, 25 Apr 2010 11:26:01 +0200
>
>> There's no need to iterate this twice. We can free net generic
>> variables right after exit is called.
>>
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
> Are you sure there are no problems with doing this?
>
> What if there are inter-net variable reference dependencies
> or something like that?
>
> I really suspect it is being done this way on purpose, but
> in the end I defer to experts like Eric B. :-)

I am pretty certain there is a problem.  My memory is fuzzy this
morning but I believe we can have rcu references between various
pieces of the networking stack for a single network namespace.  So we
need to cause all of the network namespace to exit before it is safe
to free those pieces.

Eric



^ permalink raw reply

* Re: 2.6.34-rc5+: oops in IPv6
From: Tetsuo Handa @ 2010-04-25 14:58 UTC (permalink / raw)
  To: manuel.lauss, netdev; +Cc: linux-kernel
In-Reply-To: <z2if861ec6f1004250745u94892bdbw9b17db4be57b131b@mail.gmail.com>

Manuel Lauss wrote:
> 2.6.34-rc5-00204-gddc9b34  dies when sshd (openssh 5.5) is started
> Last pull I made on April 23 was fine.

This seems to be a regression introduced while handling the
"PROBLEM: Linux kernel 2.6.31 IPv4 TCP fails to open huge amount of outgoing connections (unable to bind ... )"
problem. It is tracked at https://bugzilla.kernel.org/show_bug.cgi?id=15847 .

Regards.

^ permalink raw reply

* Re: [PATCH] RCU: don't turn off lockdep when find suspicious rcu_dereference_check() usage
From: Miles Lane @ 2010-04-25 15:49 UTC (permalink / raw)
  To: paulmck
  Cc: Vivek Goyal, Eric Paris, Lai Jiangshan, Ingo Molnar,
	Peter Zijlstra, LKML, nauman, eric.dumazet, netdev, Jens Axboe,
	Gui Jianfeng, Li Zefan, Johannes Berg
In-Reply-To: <20100425023455.GM2440@linux.vnet.ibm.com>

On Sat, Apr 24, 2010 at 10:34 PM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
> On Fri, Apr 23, 2010 at 06:59:12PM -0400, Miles Lane wrote:
>> On Fri, Apr 23, 2010 at 3:42 PM, Paul E. McKenney
>> <paulmck@linux.vnet.ibm.com> wrote:
>> > On Fri, Apr 23, 2010 at 08:50:59AM -0400, Miles Lane wrote:
>> >> Hi Paul,
>> >> There has been a bit of back and forth, and I am not sure what patches
>> >> I should test now.
>> >> Could you send me a bundle of whatever needs testing now?
>> >
>> > Hello, Miles,
>> >
>> > I am posting my set as replies to this message.  There are a couple
>> > of KVM fixes that are going up via Avi's tree, and a number of networking
>> > fixes that are going up via Dave Miller's tree -- a number of these
>> > are against quickly changing code, so it didn't make sense for me to
>> > keep them separately.
>> >
>> > I believe that the two splats below are addressed by this patch set
>> > carried in the networking tree:
>> >
>> >        https://patchwork.kernel.org/patch/90754/
>>
>> With your twelve patches and the one linked to above applied to
>> 2.6.34-rc5-git3, here are the warnings I see:
>>
>> [    0.173969] [ INFO: suspicious rcu_dereference_check() usage. ]
>> [    0.174097] ---------------------------------------------------
>> [    0.174226] include/linux/cgroup.h:534 invoked
>> rcu_dereference_check() without protection!
>> [    0.174429]
>> [    0.174430] other info that might help us debug this:
>> [    0.174431]
>> [    0.174792]
>> [    0.174793] rcu_scheduler_active = 1, debug_locks = 1
>> [    0.175037] no locks held by watchdog/0/5.
>> [    0.175162]
>> [    0.175163] stack backtrace:
>> [    0.175405] Pid: 5, comm: watchdog/0 Not tainted 2.6.34-rc5-git3 #22
>> [    0.175534] Call Trace:
>> [    0.175666]  [<ffffffff81067fbe>] lockdep_rcu_dereference+0x9d/0xa5
>> [    0.175799]  [<ffffffff8102d678>] task_subsys_state+0x59/0x70
>> [    0.175931]  [<ffffffff810328fa>] __sched_setscheduler+0x19d/0x300
>> [    0.176064]  [<ffffffff8102b477>] ? need_resched+0x1e/0x28
>> [    0.176196]  [<ffffffff813cd401>] ? schedule+0x5c3/0x66e
>> [    0.176327]  [<ffffffff81091943>] ? watchdog+0x0/0x8c
>> [    0.176457]  [<ffffffff81032a78>] sched_setscheduler+0xe/0x10
>> [    0.176587]  [<ffffffff8109196d>] watchdog+0x2a/0x8c
>> [    0.176677]  [<ffffffff81091943>] ? watchdog+0x0/0x8c
>> [    0.176808]  [<ffffffff81057152>] kthread+0x89/0x91
>> [    0.176939]  [<ffffffff8106891e>] ? trace_hardirqs_on_caller+0x114/0x13f
>> [    0.177073]  [<ffffffff81003994>] kernel_thread_helper+0x4/0x10
>> [    0.177204]  [<ffffffff813cfc40>] ? restore_args+0x0/0x30
>> [    0.177334]  [<ffffffff810570c9>] ? kthread+0x0/0x91
>> [    0.177463]  [<ffffffff81003990>] ? kernel_thread_helper+0x0/0x10
>
> According to Documentation/cgroups/cgroups.txt, we must hold cgroup_mutex,
> the task's task_alloc lock, or be in an RCU read-side critical section.
> We are in neither of these.
>
> I would argue that sched_setscheduler() should take care of
> synchronization, but am not sure which of these three are appropriate
> for sched_setscheduler() to acquire.  Peter, thoughts?
>
>> [    3.173419] [ INFO: suspicious rcu_dereference_check() usage. ]
>> [    3.173419] ---------------------------------------------------
>> [    3.173419] kernel/cgroup.c:4438 invoked rcu_dereference_check()
>> without protection!
>> [    3.173419]
>> [    3.173419] other info that might help us debug this:
>> [    3.173419]
>> [    3.173419]
>> [    3.173419] rcu_scheduler_active = 1, debug_locks = 1
>> [    3.173419] 2 locks held by async/0/668:
>> [    3.173419]  #0:  (&shost->scan_mutex){+.+.+.}, at:
>> [<ffffffff812df020>] __scsi_add_device+0x83/0xe4
>> [    3.173419]  #1:  (&(&blkcg->lock)->rlock){......}, at:
>> [<ffffffff811f2df9>] blkiocg_add_blkio_group+0x29/0x7f
>> [    3.173419]
>> [    3.173419] stack backtrace:
>> [    3.173419] Pid: 668, comm: async/0 Not tainted 2.6.34-rc5-git3 #22
>> [    3.173419] Call Trace:
>> [    3.173419]  [<ffffffff81067fbe>] lockdep_rcu_dereference+0x9d/0xa5
>> [    3.173419]  [<ffffffff8107f9ad>] css_id+0x3f/0x51
>> [    3.173419]  [<ffffffff811f2e08>] blkiocg_add_blkio_group+0x38/0x7f
>> [    3.173419]  [<ffffffff811f4dd0>] cfq_init_queue+0xdf/0x2dc
>> [    3.173419]  [<ffffffff811e33b1>] elevator_init+0xba/0xf5
>> [    3.173419]  [<ffffffff812dbfaa>] ? scsi_request_fn+0x0/0x451
>> [    3.173419]  [<ffffffff811e68d7>] blk_init_queue_node+0x12f/0x135
>> [    3.173419]  [<ffffffff811e68e9>] blk_init_queue+0xc/0xe
>> [    3.173419]  [<ffffffff812dc41c>] __scsi_alloc_queue+0x21/0x111
>> [    3.173419]  [<ffffffff812dc524>] scsi_alloc_queue+0x18/0x64
>> [    3.173419]  [<ffffffff812de520>] scsi_alloc_sdev+0x19e/0x256
>> [    3.173419]  [<ffffffff812de6be>] scsi_probe_and_add_lun+0xe6/0x9c5
>> [    3.173419]  [<ffffffff8106891e>] ? trace_hardirqs_on_caller+0x114/0x13f
>> [    3.173419]  [<ffffffff813ce056>] ? __mutex_lock_common+0x3e4/0x43a
>> [    3.173419]  [<ffffffff812df020>] ? __scsi_add_device+0x83/0xe4
>> [    3.173419]  [<ffffffff812d09dc>] ? transport_setup_classdev+0x0/0x17
>> [    3.173419]  [<ffffffff812df020>] ? __scsi_add_device+0x83/0xe4
>> [    3.173419]  [<ffffffff812df055>] __scsi_add_device+0xb8/0xe4
>> [    3.173419]  [<ffffffff812ea945>] ata_scsi_scan_host+0x74/0x16e
>> [    3.173419]  [<ffffffff81057699>] ? autoremove_wake_function+0x0/0x34
>> [    3.173419]  [<ffffffff812e8de4>] async_port_probe+0xab/0xb7
>> [    3.173419]  [<ffffffff8105e1b1>] ? async_thread+0x0/0x1f4
>> [    3.173419]  [<ffffffff8105e2b6>] async_thread+0x105/0x1f4
>> [    3.173419]  [<ffffffff81033d8e>] ? default_wake_function+0x0/0xf
>> [    3.173419]  [<ffffffff8105e1b1>] ? async_thread+0x0/0x1f4
>> [    3.173419]  [<ffffffff81057152>] kthread+0x89/0x91
>> [    3.173419]  [<ffffffff8106891e>] ? trace_hardirqs_on_caller+0x114/0x13f
>> [    3.173419]  [<ffffffff81003994>] kernel_thread_helper+0x4/0x10
>> [    3.173419]  [<ffffffff813cfc40>] ? restore_args+0x0/0x30
>> [    3.173419]  [<ffffffff810570c9>] ? kthread+0x0/0x91
>> [    3.173419]  [<ffffffff81003990>] ? kernel_thread_helper+0x0/0x10
>
> Please see below for a patch for this based on my earlier conversation
> with Vivek Goyal.  (Vivek, if you are already pushing a fix elsewhere,
> please let me know, and I will drop my patch in favor of yours.)
>
>> [   32.905446] [ INFO: suspicious rcu_dereference_check() usage. ]
>> [   32.905449] ---------------------------------------------------
>> [   32.905453] net/core/dev.c:1993 invoked rcu_dereference_check()
>> without protection!
>> [   32.905456]
>> [   32.905457] other info that might help us debug this:
>> [   32.905458]
>> [   32.905461]
>> [   32.905462] rcu_scheduler_active = 1, debug_locks = 1
>> [   32.905466] 2 locks held by canberra-gtk-pl/4182:
>> [   32.905469]  #0:  (sk_lock-AF_INET){+.+.+.}, at:
>> [<ffffffff81394f7d>] inet_stream_connect+0x3a/0x24d
>> [   32.905483]  #1:  (rcu_read_lock_bh){.+....}, at:
>> [<ffffffff8134a789>] dev_queue_xmit+0x14e/0x4b8
>> [   32.905495]
>> [   32.905496] stack backtrace:
>> [   32.905500] Pid: 4182, comm: canberra-gtk-pl Not tainted 2.6.34-rc5-git3 #22
>> [   32.905504] Call Trace:
>> [   32.905512]  [<ffffffff81067fbe>] lockdep_rcu_dereference+0x9d/0xa5
>> [   32.905518]  [<ffffffff8134a894>] dev_queue_xmit+0x259/0x4b8
>> [   32.905524]  [<ffffffff8134a789>] ? dev_queue_xmit+0x14e/0x4b8
>> [   32.905531]  [<ffffffff81041c66>] ? _local_bh_enable_ip+0xcd/0xda
>> [   32.905538]  [<ffffffff813536da>] neigh_resolve_output+0x234/0x285
>> [   32.905544]  [<ffffffff8136f69f>] ip_finish_output2+0x257/0x28c
>> [   32.905549]  [<ffffffff8136f73c>] ip_finish_output+0x68/0x6a
>> [   32.905554]  [<ffffffff81370433>] T.866+0x52/0x59
>> [   32.905559]  [<ffffffff8137067e>] ip_output+0xaa/0xb4
>> [   32.905565]  [<ffffffff8136eb38>] ip_local_out+0x20/0x24
>> [   32.905571]  [<ffffffff8136f184>] ip_queue_xmit+0x309/0x368
>> [   32.905578]  [<ffffffff810e4226>] ? __kmalloc_track_caller+0x111/0x155
>> [   32.905585]  [<ffffffff8138316f>] ? tcp_connect+0x223/0x3d3
>> [   32.905591]  [<ffffffff813818f1>] tcp_transmit_skb+0x707/0x745
>> [   32.905597]  [<ffffffff813832c2>] tcp_connect+0x376/0x3d3
>> [   32.905604]  [<ffffffff81268a43>] ? secure_tcp_sequence_number+0x55/0x6f
>> [   32.905610]  [<ffffffff81387270>] tcp_v4_connect+0x3df/0x455
>> [   32.905617]  [<ffffffff8133cb59>] ? lock_sock_nested+0xf3/0x102
>> [   32.905623]  [<ffffffff81394fe7>] inet_stream_connect+0xa4/0x24d
>> [   32.905629]  [<ffffffff8133b398>] sys_connect+0x90/0xd0
>> [   32.905636]  [<ffffffff81002b9c>] ? sysret_check+0x27/0x62
>> [   32.905642]  [<ffffffff8106891e>] ? trace_hardirqs_on_caller+0x114/0x13f
>> [   32.905649]  [<ffffffff813cec80>] ? trace_hardirqs_on_thunk+0x3a/0x3f
>> [   32.905655]  [<ffffffff81002b6b>] system_call_fastpath+0x16/0x1b
>
> A fix for the above is already in Dave Miller's tree.
>
>> [   51.912282] [ INFO: suspicious rcu_dereference_check() usage. ]
>> [   51.912285] ---------------------------------------------------
>> [   51.912289] net/mac80211/sta_info.c:886 invoked
>> rcu_dereference_check() without protection!
>> [   51.912293]
>> [   51.912293] other info that might help us debug this:
>> [   51.912295]
>> [   51.912298]
>> [   51.912298] rcu_scheduler_active = 1, debug_locks = 1
>> [   51.912302] no locks held by wpa_supplicant/3951.
>> [   51.912305]
>> [   51.912306] stack backtrace:
>> [   51.912310] Pid: 3951, comm: wpa_supplicant Not tainted 2.6.34-rc5-git3 #22
>> [   51.912314] Call Trace:
>> [   51.912317]  <IRQ>  [<ffffffff81067fbe>] lockdep_rcu_dereference+0x9d/0xa5
>> [   51.912345]  [<ffffffffa014f9ae>]
>> ieee80211_find_sta_by_hw+0x46/0x10f [mac80211]
>> [   51.912358]  [<ffffffffa014fa8e>] ieee80211_find_sta+0x17/0x19 [mac80211]
>> [   51.912373]  [<ffffffffa01e50f2>] iwl_tx_queue_reclaim+0xdb/0x1b1 [iwlcore]
>> [   51.912380]  [<ffffffff8106842b>] ? mark_lock+0x2d/0x235
>> [   51.912391]  [<ffffffffa0252f1c>] iwl5000_rx_reply_tx+0x4a9/0x556 [iwlagn]
>> [   51.912399]  [<ffffffff8120a353>] ? is_swiotlb_buffer+0x2e/0x3b
>> [   51.912407]  [<ffffffffa024bbf4>] iwl_rx_handle+0x163/0x2b5 [iwlagn]
>> [   51.912414]  [<ffffffff81068904>] ? trace_hardirqs_on_caller+0xfa/0x13f
>> [   51.912422]  [<ffffffffa024c3ac>] iwl_irq_tasklet+0x2bb/0x3c0 [iwlagn]
>> [   51.912429]  [<ffffffff810411f3>] tasklet_action+0xa7/0x10f
>> [   51.912435]  [<ffffffff81042205>] __do_softirq+0x144/0x252
>> [   51.912442]  [<ffffffff81003a8c>] call_softirq+0x1c/0x34
>> [   51.912447]  [<ffffffff810050e4>] do_softirq+0x38/0x80
>> [   51.912452]  [<ffffffff81041cd2>] irq_exit+0x45/0x94
>> [   51.912457]  [<ffffffff81004829>] do_IRQ+0xad/0xc4
>> [   51.912463]  [<ffffffff810cbbd3>] ? might_fault+0x63/0xb3
>> [   51.912470]  [<ffffffff813cfb93>] ret_from_intr+0x0/0xf
>> [   51.912474]  <EOI>  [<ffffffff810cbbd3>] ? might_fault+0x63/0xb3
>> [   51.912484]  [<ffffffff8106a75d>] ? lock_release+0x208/0x215
>> [   51.912490]  [<ffffffff810cbc1c>] might_fault+0xac/0xb3
>> [   51.912495]  [<ffffffff810cbbd3>] ? might_fault+0x63/0xb3
>> [   51.912501]  [<ffffffff812025e3>] __clear_user+0x15/0x59
>> [   51.912508]  [<ffffffff8100b2bc>] save_i387_xstate+0x9c/0x1bc
>> [   51.912515]  [<ffffffff81002276>] do_signal+0x240/0x686
>> [   51.912521]  [<ffffffff81002b9c>] ? sysret_check+0x27/0x62
>> [   51.912527]  [<ffffffff8106891e>] ? trace_hardirqs_on_caller+0x114/0x13f
>> [   51.912533]  [<ffffffff813cec80>] ? trace_hardirqs_on_thunk+0x3a/0x3f
>> [   51.912539]  [<ffffffff810026e3>] do_notify_resume+0x27/0x5f
>> [   51.912545]  [<ffffffff813cec80>] ? trace_hardirqs_on_thunk+0x3a/0x3f
>> [   51.912551]  [<ffffffff81002e86>] int_signal+0x12/0x17
>
> This is a repeat from last time that confused me at the time.  I could
> do a hacky "fix" by putting an RCU read-side critical section around
> the for_each_sta_info() in ieee80211_find_sta_by_hw(), but I do not
> understand this code well enough to feel comfortable doing so.
>
> Johannes, any enlightenment?
>
>> [   51.929529] [ INFO: suspicious rcu_dereference_check() usage. ]
>> [   51.929532] ---------------------------------------------------
>> [   51.929536] net/mac80211/sta_info.c:886 invoked
>> rcu_dereference_check() without protection!
>> [   51.929540]
>> [   51.929541] other info that might help us debug this:
>> [   51.929542]
>> [   51.929545]
>> [   51.929546] rcu_scheduler_active = 1, debug_locks = 1
>> [   51.929550] 1 lock held by Xorg/4013:
>> [   51.929553]  #0:  (clock-AF_UNIX){++.+..}, at: [<ffffffff8133cebd>]
>> sock_def_readable+0x19/0x62
>> [   51.929567]
>> [   51.929568] stack backtrace:
>> [   51.929573] Pid: 4013, comm: Xorg Not tainted 2.6.34-rc5-git3 #22
>> [   51.929576] Call Trace:
>> [   51.929579]  <IRQ>  [<ffffffff81067fbe>] lockdep_rcu_dereference+0x9d/0xa5
>> [   51.929603]  [<ffffffffa014f9fe>]
>> ieee80211_find_sta_by_hw+0x96/0x10f [mac80211]
>> [   51.929615]  [<ffffffffa014fa8e>] ieee80211_find_sta+0x17/0x19 [mac80211]
>> [   51.929631]  [<ffffffffa01e50f2>] iwl_tx_queue_reclaim+0xdb/0x1b1 [iwlcore]
>> [   51.929642]  [<ffffffffa0252f1c>] iwl5000_rx_reply_tx+0x4a9/0x556 [iwlagn]
>> [   51.929649]  [<ffffffff81068685>] ? mark_held_locks+0x52/0x70
>> [   51.929656]  [<ffffffff813cf46c>] ? _raw_spin_unlock_irqrestore+0x3a/0x69
>> [   51.929662]  [<ffffffff8120a353>] ? is_swiotlb_buffer+0x2e/0x3b
>> [   51.929671]  [<ffffffffa024bbf4>] iwl_rx_handle+0x163/0x2b5 [iwlagn]
>> [   51.929680]  [<ffffffffa024c3ac>] iwl_irq_tasklet+0x2bb/0x3c0 [iwlagn]
>> [   51.929687]  [<ffffffff810411f3>] tasklet_action+0xa7/0x10f
>> [   51.929693]  [<ffffffff81042205>] __do_softirq+0x144/0x252
>> [   51.929700]  [<ffffffff81003a8c>] call_softirq+0x1c/0x34
>> [   51.929705]  [<ffffffff810050e4>] do_softirq+0x38/0x80
>> [   51.929711]  [<ffffffff81041cd2>] irq_exit+0x45/0x94
>> [   51.929717]  [<ffffffff81019b10>] smp_apic_timer_interrupt+0x87/0x95
>> [   51.929724]  [<ffffffff81003553>] apic_timer_interrupt+0x13/0x20
>> [   51.929727]  <EOI>  [<ffffffff813cf46e>] ?
>> _raw_spin_unlock_irqrestore+0x3c/0x69
>> [   51.929739]  [<ffffffff8102d3fb>] __wake_up_sync_key+0x49/0x52
>> [   51.929745]  [<ffffffff8133cee7>] sock_def_readable+0x43/0x62
>> [   51.929751]  [<ffffffff813b1c61>] unix_stream_sendmsg+0x243/0x2e2
>> [   51.929758]  [<ffffffff8133b912>] ? sock_aio_write+0x0/0xcf
>> [   51.929764]  [<ffffffff81339342>] __sock_sendmsg+0x59/0x64
>> [   51.929770]  [<ffffffff8133b9cd>] sock_aio_write+0xbb/0xcf
>> [   51.929777]  [<ffffffff810e9909>] do_sync_readv_writev+0xbc/0xfb
>> [   51.929785]  [<ffffffff811c1792>] ? selinux_file_permission+0xa2/0xaf
>> [   51.929790]  [<ffffffff810e9690>] ? copy_from_user+0x2a/0x2c
>> [   51.929797]  [<ffffffff811baff1>] ? security_file_permission+0x11/0x13
>> [   51.929804]  [<ffffffff810ea6a6>] do_readv_writev+0xa2/0x122
>> [   51.929810]  [<ffffffff810ead93>] ? fcheck_files+0x8f/0xc9
>> [   51.929816]  [<ffffffff810ea764>] vfs_writev+0x3e/0x49
>> [   51.929821]  [<ffffffff810ea84a>] sys_writev+0x45/0x8e
>> [   51.929828]  [<ffffffff81002b6b>] system_call_fastpath+0x16/0x1b
>
> Ditto.
>
>                                                Thanx, Paul
>
> ------------------------------------------------------------------------
>
> commit 0868dd631def762ba00c2f0f397a53c5cdf24ae2
> Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Date:   Sat Apr 24 19:23:30 2010 -0700
>
>    block-cgroup: fix RCU-lockdep splat in blkiocg_add_blkio_group()
>
>    It is necessary to be in an RCU read-side critical section when invoking
>    css_id(), so this patch adds one to blkiocg_add_blkio_group().  This is
>    actually a false positive, because this is called at initialization time,
>    and hence always refers to the root cgroup, which cannot go away.
>
>    Located-by: Miles Lane <miles.lane@gmail.com>
>    Suggested-by: Vivek Goyal <vgoyal@redhat.com>
>    Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 5fe03de..55c8c73 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -71,7 +71,9 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
>
>        spin_lock_irqsave(&blkcg->lock, flags);
>        rcu_assign_pointer(blkg->key, key);
> +       rcu_read_lock();
>        blkg->blkcg_id = css_id(&blkcg->css);
> +       rcu_read_unlock();
>        hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
>        spin_unlock_irqrestore(&blkcg->lock, flags);
>  #ifdef CONFIG_DEBUG_BLK_CGROUP
>

I am down to seeing three suspicious rcu_dereference_check traces when
I apply this patch and all the previous patches to 2.6.34-rc5-git6.

1. The "__sched_setscheduler+0x19d/0x300" trace.
2. The two "is_swiotlb_buffer+0x2e/0x3b" traces (waiting to see
Johannes' patch show up in a Linux snapshot)

Did I miss a patch for the setscheduler issue?

Thanks!
        Miles

^ permalink raw reply

* Re: PROBLEM: Linux kernel 2.6.31 IPv4 TCP fails to open huge amount of outgoing connections (unable to bind ... )
From: Evgeniy Polyakov @ 2010-04-25 15:56 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Eric Dumazet, Ben Greear, David Miller, Gaspar Chilingarov,
	netdev
In-Reply-To: <20100425142642.GA11411@redhat.com>

On Sun, Apr 25, 2010 at 05:26:42PM +0300, Michael S. Tsirkin (mst@redhat.com) wrote:

> > diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> > index 0c5e3c3..fb6959c 100644
> > --- a/net/ipv6/inet6_connection_sock.c
> > +++ b/net/ipv6/inet6_connection_sock.c
> > @@ -42,11 +42,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
> >  		if (sk != sk2 &&
> >  		    (!sk->sk_bound_dev_if ||
> >  		     !sk2->sk_bound_dev_if ||
> > -		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> > -		    (!sk->sk_reuse || !sk2->sk_reuse ||
> > -		     sk2->sk_state == TCP_LISTEN) &&
> > -		     ipv6_rcv_saddr_equal(sk, sk2))
> > -			break;
> > +		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> > +			if ((!sk->sk_reuse || !sk2->sk_reuse ||
> > +			     sk2->sk_state == TCP_LISTEN) &&
> > +			     ipv6_rcv_saddr_equal(sk, sk2))
> > +				break;
> > +			else if (sk->sk_reuse && sk2->sk_reuse &&
> > +				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&

I suppose above line is guilty when inet6_rcv_saddr() returns NULL?

-- 
	Evgeniy Polyakov

^ permalink raw reply

* Re: PROBLEM: Linux kernel 2.6.31 IPv4 TCP fails to open huge amount of outgoing connections (unable to bind ... )
From: Eric Dumazet @ 2010-04-25 16:13 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Michael S. Tsirkin, Ben Greear, David Miller, Gaspar Chilingarov,
	netdev
In-Reply-To: <20100425155600.GA13319@ioremap.net>

Le dimanche 25 avril 2010 à 19:56 +0400, Evgeniy Polyakov a écrit :
> On Sun, Apr 25, 2010 at 05:26:42PM +0300, Michael S. Tsirkin (mst@redhat.com) wrote:
> 
> > > diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> > > index 0c5e3c3..fb6959c 100644
> > > --- a/net/ipv6/inet6_connection_sock.c
> > > +++ b/net/ipv6/inet6_connection_sock.c
> > > @@ -42,11 +42,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
> > >  		if (sk != sk2 &&
> > >  		    (!sk->sk_bound_dev_if ||
> > >  		     !sk2->sk_bound_dev_if ||
> > > -		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> > > -		    (!sk->sk_reuse || !sk2->sk_reuse ||
> > > -		     sk2->sk_state == TCP_LISTEN) &&
> > > -		     ipv6_rcv_saddr_equal(sk, sk2))
> > > -			break;
> > > +		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> > > +			if ((!sk->sk_reuse || !sk2->sk_reuse ||
> > > +			     sk2->sk_state == TCP_LISTEN) &&
> > > +			     ipv6_rcv_saddr_equal(sk, sk2))
> > > +				break;
> > > +			else if (sk->sk_reuse && sk2->sk_reuse &&
> > > +				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
> 
> I suppose above line is guilty when inet6_rcv_saddr() returns NULL?
> 

Oh, it's a typo

we should test ipv6_addr_any(inet6_rcv_saddr(sk))

instead of ipv6_addr_any(inet6_rcv_saddr(sk2))

(sk is AF_INET6, while sk2 could be AF_INET)

I'll submit a patch promptly



^ permalink raw reply

* Re: PROBLEM: Linux kernel 2.6.31 IPv4 TCP fails to open huge amount of outgoing connections (unable to bind ... )
From: Eric Dumazet @ 2010-04-25 16:21 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Michael S. Tsirkin, Ben Greear, David Miller, Gaspar Chilingarov,
	netdev
In-Reply-To: <20100425155600.GA13319@ioremap.net>

Le dimanche 25 avril 2010 à 19:56 +0400, Evgeniy Polyakov a écrit :
> On Sun, Apr 25, 2010 at 05:26:42PM +0300, Michael S. Tsirkin (mst@redhat.com) wrote:
> 
> > > diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> > > index 0c5e3c3..fb6959c 100644
> > > --- a/net/ipv6/inet6_connection_sock.c
> > > +++ b/net/ipv6/inet6_connection_sock.c
> > > @@ -42,11 +42,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
> > >  		if (sk != sk2 &&
> > >  		    (!sk->sk_bound_dev_if ||
> > >  		     !sk2->sk_bound_dev_if ||
> > > -		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> > > -		    (!sk->sk_reuse || !sk2->sk_reuse ||
> > > -		     sk2->sk_state == TCP_LISTEN) &&
> > > -		     ipv6_rcv_saddr_equal(sk, sk2))
> > > -			break;
> > > +		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> > > +			if ((!sk->sk_reuse || !sk2->sk_reuse ||
> > > +			     sk2->sk_state == TCP_LISTEN) &&
> > > +			     ipv6_rcv_saddr_equal(sk, sk2))
> > > +				break;
> > > +			else if (sk->sk_reuse && sk2->sk_reuse &&
> > > +				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
> 
> I suppose above line is guilty when inet6_rcv_saddr() returns NULL?
> 

Sorry, I can't test this at the moment (I am travelling)

Evgeniy, David could you double check ?

Michael, could you test this patch ?

Thanks !

[PATCH] ipv6: Fix inet6_csk_bind_conflict()

Commit fda48a0d7a84 (tcp: bind() fix when many ports are bound)
introduced a bug on IPV6 part.
We should not call ipv6_addr_any(inet6_rcv_saddr(sk2)) but
ipv6_addr_any(inet6_rcv_saddr(sk)) because sk2 can be IPV4, while sk is
IPV6.

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index b4b7d40..3a4d92b 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -48,7 +48,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
 			     ipv6_rcv_saddr_equal(sk, sk2))
 				break;
 			else if (sk->sk_reuse && sk2->sk_reuse &&
-				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
+				!ipv6_addr_any(inet6_rcv_saddr(sk)) &&
 				ipv6_rcv_saddr_equal(sk, sk2))
 				break;
 		}



^ permalink raw reply related

* Re: [2.6.34-rc5-git6] EIP: is at inet6_csk_bind_conflict + 06xe/0xb7 [ipv6]
From: Sedat Dilek @ 2010-04-25 16:31 UTC (permalink / raw)
  To: piotr; +Cc: LKML, netdev, David Miller, Jiri Olsa, Eric Dumazet, Jongman Heo
In-Reply-To: <4BD45E68.4080900@example.com>

[ CCing netdev ML ]

Confirmed: The revert-patch [1] fixes the problem here.

See also "Bug 15847 -  crash in inet6_csk_bind_conflict" [2].

Feel free to add a... Tested-by: Sedat Dilek <sedat.dilek@gmail.com>

- Sedat -

[1] https://patchwork.kernel.org/patch/94959/
[2] https://bugzilla.kernel.org/show_bug.cgi?id=15847

2010/4/25 Piotr Hosowicz <piotr@hosowicz.com>:
> On 25.04.2010 17:11, Sedat Dilek wrote:
>>
>> [ Please CC - I am not subscribed to LKML ]
>>
>> [QUOTE]
>>
>> On 25.04.2010 16:26, Jongman Heo wrote:
>>
>>> I also hit this bug today.
>>
>> I also hit similar bug, maybe it is the same.
>>
>>> Doing git bisect, first bad commit was
>>>
>>>   commit fda48a0d7a8412cedacda46a9c0bf8ef9cd13559
>>>   tcp: bind() fix when many ports are bound
>>>
>>> Reverting above commit fixes the problem.
>>
>> How to do it? Would you please publish a reverting patch?
>>
>> Regards,
>>
>> Piotr Hosowicz
>>
>> [/QUOTE]
>>
>> Hi,
>>
>> did forget to mention 2.6.34-rc5-git5 was OK.
>>
>> Reverting this commit did not help:
>>
>> commit f4f914b58019f0e50d521bbbadfaee260d766f95
>> net: ipv6 bind to device issue
>>
>> After looking into net-2.6 GIT repository, "tcp: bind() fix when many
>> ports are bound" could cause indeed the problems here, too.
>> Building now....
>>
>> Regards,
>> - Sedat -
>>
>> P.S.: Attached 0001-Revert-tcp-bind-fix-when-many-ports-are-bound.patch
>
> Thanks a lot. Applied and building now.
>
> Regards,
>
> Piotr Hosowicz
>
> --
> Z cyklu "Uroki demokracji", czyli pytania i odpowiedzi w teledurniejach:
> - W którym kraju znajduje się Mount Everest?
> - Hm, to nie Szkocja, prawda?
> NP: Mark Knopfler - Cleaning My Gun
> NB: 2.6.34-rc5-git5
>

^ permalink raw reply

* Re: [2.6.34-rc5-git6] EIP: is at inet6_csk_bind_conflict + 06xe/0xb7 [ipv6]
From: Piotr Hosowicz @ 2010-04-25 16:36 UTC (permalink / raw)
  To: sedat.dilek
  Cc: Sedat Dilek, LKML, netdev, David Miller, Jiri Olsa, Eric Dumazet,
	Jongman Heo
In-Reply-To: <g2z2d0a357f1004250931pf5880c60l32fd0643e0f14bde@mail.gmail.com>

On 25.04.2010 18:31, Sedat Dilek wrote:
> [ CCing netdev ML ]
>
> Confirmed: The revert-patch [1] fixes the problem here.

I confirm, I've built a git6 kernel and it works fine.

> See also "Bug 15847 -  crash in inet6_csk_bind_conflict" [2].
>
> Feel free to add a... Tested-by: Sedat Dilek<sedat.dilek@gmail.com>

I added created and tested phrase in my archive. ;-) Thank you a lot. I 
hope this error will be gone in git7.

Regards,

Piotr Hosowicz

> - Sedat -
>
> [1] https://patchwork.kernel.org/patch/94959/
> [2] https://bugzilla.kernel.org/show_bug.cgi?id=15847
>
> 2010/4/25 Piotr Hosowicz<piotr@hosowicz.com>:
>> On 25.04.2010 17:11, Sedat Dilek wrote:
>>>
>>> [ Please CC - I am not subscribed to LKML ]
>>>
>>> [QUOTE]
>>>
>>> On 25.04.2010 16:26, Jongman Heo wrote:
>>>
>>>> I also hit this bug today.
>>>
>>> I also hit similar bug, maybe it is the same.
>>>
>>>> Doing git bisect, first bad commit was
>>>>
>>>>    commit fda48a0d7a8412cedacda46a9c0bf8ef9cd13559
>>>>    tcp: bind() fix when many ports are bound
>>>>
>>>> Reverting above commit fixes the problem.
>>>
>>> How to do it? Would you please publish a reverting patch?
>>>
>>> Regards,
>>>
>>> Piotr Hosowicz
>>>
>>> [/QUOTE]
>>>
>>> Hi,
>>>
>>> did forget to mention 2.6.34-rc5-git5 was OK.
>>>
>>> Reverting this commit did not help:
>>>
>>> commit f4f914b58019f0e50d521bbbadfaee260d766f95
>>> net: ipv6 bind to device issue
>>>
>>> After looking into net-2.6 GIT repository, "tcp: bind() fix when many
>>> ports are bound" could cause indeed the problems here, too.
>>> Building now....
>>>
>>> Regards,
>>> - Sedat -
>>>
>>> P.S.: Attached 0001-Revert-tcp-bind-fix-when-many-ports-are-bound.patch
>>
>> Thanks a lot. Applied and building now.
>>
>> Regards,
>>
>> Piotr Hosowicz
>>
>> --
>> Z cyklu "Uroki demokracji", czyli pytania i odpowiedzi w teledurniejach:
>> - W którym kraju znajduje się Mount Everest?
>> - Hm, to nie Szkocja, prawda?
>> NP: Mark Knopfler - Cleaning My Gun
>> NB: 2.6.34-rc5-git5
>>
>


-- 
Grupa marzeń w eliminacjach MŚ :
Zimbabwe, Alaska, Grenlandia, Antarktyda i Zair.
NP: Chickenfoot - Oh Yeah
NB: 2.6.34-rc5-git6

^ permalink raw reply

* Re: [2.6.34-rc5-git6] EIP: is at inet6_csk_bind_conflict + 06xe/0xb7 [ipv6]
From: Eric Dumazet @ 2010-04-25 16:39 UTC (permalink / raw)
  To: piotr
  Cc: sedat.dilek, Sedat Dilek, LKML, netdev, David Miller, Jiri Olsa,
	Jongman Heo
In-Reply-To: <4BD46F9C.5060500@example.com>

Le dimanche 25 avril 2010 à 18:36 +0200, Piotr Hosowicz a écrit :
> On 25.04.2010 18:31, Sedat Dilek wrote:
> > [ CCing netdev ML ]
> >
> > Confirmed: The revert-patch [1] fixes the problem here.
> 
> I confirm, I've built a git6 kernel and it works fine.
> 
> > See also "Bug 15847 -  crash in inet6_csk_bind_conflict" [2].
> >
> > Feel free to add a... Tested-by: Sedat Dilek<sedat.dilek@gmail.com>
> 
> I added created and tested phrase in my archive. ;-) Thank you a lot. I 
> hope this error will be gone in git7.
> 

Did you test the proposed fix ?


[PATCH] ipv6: Fix inet6_csk_bind_conflict()

Commit fda48a0d7a84 (tcp: bind() fix when many ports are bound)
introduced a bug on IPV6 part.
We should not call ipv6_addr_any(inet6_rcv_saddr(sk2)) but
ipv6_addr_any(inet6_rcv_saddr(sk)) because sk2 can be IPV4, while sk is
IPV6.

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index b4b7d40..3a4d92b 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -48,7 +48,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
 			     ipv6_rcv_saddr_equal(sk, sk2))
 				break;
 			else if (sk->sk_reuse && sk2->sk_reuse &&
-				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
+				!ipv6_addr_any(inet6_rcv_saddr(sk)) &&
 				ipv6_rcv_saddr_equal(sk, sk2))
 				break;
 		}

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox